sched/fair: Optimize ___update_sched_avg()
author Yuyang Du <yuyang.du@intel.com>
Sun, 12 Feb 2017 21:44:23 +0000 (05:44 +0800)
committer Ingo Molnar <mingo@kernel.org>
Thu, 30 Mar 2017 07:43:41 +0000 (09:43 +0200)
The main PELT function ___update_load_avg(), which implements the
accumulation and progression of the geometric average series, is
implemented along the following lines for the scenario where the time
delta spans all 3 possible sections (see figure below):

  1. add the remainder of the last incomplete period
  2. decay old sum
  3. accumulate new sum in full periods since last_update_time
  4. accumulate the current incomplete period
  5. update averages

Or:

            d1          d2           d3
            ^           ^            ^
            |           |            |
          |<->|<----------------->|<--->|
  ... |---x---|------| ... |------|-----x (now)

  load_sum' = (load_sum + weight * scale * d1) * y^(p+1) +       (1,2)

                                          p
              weight * scale * 1024 * \Sum y^n +                  (3)
                                         n=1

              weight * scale * d3 * y^0                           (4)

  load_avg' = load_sum' / LOAD_AVG_MAX                            (5)

Where:

 d1 - is the delta part completing the remainder of the last
      incomplete period,
 d2 - is the delta part spanning complete periods, and
 d3 - is the delta part starting the current incomplete period.

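As a rough, purely illustrative sketch (not part of the patch), the d1/p/d3
split of a boundary-crossing delta follows from the time already accrued in
the open period; the 1024us period and the example values below are
assumptions:

  /* Hypothetical example: split a delta (in us) into d1, p full periods
   * and d3, assuming a 1024us PELT period and that at least one period
   * boundary is crossed. */
  #include <stdio.h>

  int main(void)
  {
          unsigned long period_contrib = 512;  /* time already in the open period */
          unsigned long delta = 2800;          /* new time since the last update */

          unsigned long d1 = 1024 - period_contrib;   /* completes the open period */
          unsigned long p  = (delta - d1) / 1024;     /* full periods spanned (d2) */
          unsigned long d3 = (delta - d1) % 1024;     /* starts the new open period */

          printf("d1=%lu p=%lu d3=%lu\n", d1, p, d3); /* d1=512 p=2 d3=240 */
          return 0;
  }
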
We can simplify the code in two steps; the first step is to separate
the first term into new and old parts like:

  (load_sum + weight * scale * d1) * y^(p+1) = load_sum * y^(p+1) +
       weight * scale * d1 * y^(p+1)

Once we've done that, it's easy to see that all new terms carry the
common factors:

  weight * scale

If we factor those out, we arrive at the form:

  load_sum' = load_sum * y^(p+1) +

              weight * scale * (d1 * y^(p+1) +

                                           p
                                1024 * \Sum y^n +
                                          n=1

                                d3 * y^0)

Which results in a simpler, smaller and faster implementation.
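
For intuition only, the equivalence of the original term-by-term form (1)-(4)
and the factored form above can be checked numerically; the following
floating-point sketch assumes y^32 = 1/2 and arbitrary example values, and is
not taken from the patch:

  #include <math.h>
  #include <stdio.h>

  int main(void)
  {
          double y = pow(0.5, 1.0 / 32.0);     /* per-period decay, y^32 == 1/2 */
          double load_sum = 12345.0, weight = 3.0, scale = 1.0;
          double d1 = 300.0, d3 = 200.0;
          int p = 5;                           /* full periods spanned */

          double sum_y = 0.0;                  /* \Sum_{n=1..p} y^n */
          for (int n = 1; n <= p; n++)
                  sum_y += pow(y, n);

          /* Original form: terms (1,2), (3) and (4) summed separately */
          double orig = (load_sum + weight * scale * d1) * pow(y, p + 1)
                      + weight * scale * 1024.0 * sum_y
                      + weight * scale * d3;

          /* Factored form: decay the old sum, add the new contribution once */
          double factored = load_sum * pow(y, p + 1)
                          + weight * scale * (d1 * pow(y, p + 1) + 1024.0 * sum_y + d3);

          printf("orig=%.6f factored=%.6f\n", orig, factored);  /* prints equal values */
          return 0;
  }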

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: matt@codeblueprint.co.uk
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1486935863-25251-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/sched/fair.c

index 2ac00cfbf29f81448a43ea2dbee7fe4052f6a135..76f67b3e34d6d3f80264e309954ef59083829f45 100644
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] = {
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
        unsigned int local_n;
 
@@ -2795,31 +2795,112 @@ static __always_inline u64 decay_load(u64 val, u64 n)
        return val;
 }
 
-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 {
-       u32 contrib = 0;
+       u32 c1, c2, c3 = remainder; /* y^0 == 1 */
 
-       if (likely(n <= LOAD_AVG_PERIOD))
-               return runnable_avg_yN_sum[n];
-       else if (unlikely(n >= LOAD_AVG_MAX_N))
+       if (!periods)
+               return remainder - period_contrib;
+
+       if (unlikely(periods >= LOAD_AVG_MAX_N))
                return LOAD_AVG_MAX;
 
-       /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-       contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-       n %= LOAD_AVG_PERIOD;
-       contrib = decay_load(contrib, n);
-       return contrib + runnable_avg_yN_sum[n];
+       /*
+        * c1 = d1 y^(p+1)
+        */
+       c1 = decay_load((u64)(1024 - period_contrib), periods);
+
+       periods -= 1;
+       /*
+        * For updates fully spanning n periods, the contribution to runnable
+        * average will be:
+        *
+        *   c2 = 1024 \Sum y^n
+        *
+        * We can compute this reasonably efficiently by combining:
+        *
+        *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+        */
+       if (likely(periods <= LOAD_AVG_PERIOD)) {
+               c2 = runnable_avg_yN_sum[periods];
+       } else {
+               c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+               periods %= LOAD_AVG_PERIOD;
+               c2 = decay_load(c2, periods);
+               c2 += runnable_avg_yN_sum[periods];
+       }
+
+       return c1 + c2 + c3;
 }
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
+/*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ *           d1          d2           d3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                                p
+ * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
+ *                               n=1
+ *
+ *    = u y^(p+1) +                            (Step 1)
+ *
+ *                          p
+ *      d1 y^(p+1) + 1024 \Sum y^n + d3 y^0    (Step 2)
+ *                         n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+              unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+       unsigned long scale_freq, scale_cpu;
+       u64 periods;
+       u32 contrib;
+
+       scale_freq = arch_scale_freq_capacity(NULL, cpu);
+       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+       delta += sa->period_contrib;
+       periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+       /*
+        * Step 1: decay old *_sum if we crossed period boundaries.
+        */
+       if (periods) {
+               sa->load_sum = decay_load(sa->load_sum, periods);
+               if (cfs_rq) {
+                       cfs_rq->runnable_load_sum =
+                               decay_load(cfs_rq->runnable_load_sum, periods);
+               }
+               sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+       }
+
+       /*
+        * Step 2
+        */
+       delta %= 1024;
+       contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+       sa->period_contrib = delta;
+
+       contrib = cap_scale(contrib, scale_freq);
+       if (weight) {
+               sa->load_sum += weight * contrib;
+               if (cfs_rq)
+                       cfs_rq->runnable_load_sum += weight * contrib;
+       }
+       if (running)
+               sa->util_sum += contrib * scale_cpu;
+
+       return periods;
+}
+
 /*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
@@ -2852,10 +2933,7 @@ static __always_inline int
 ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-       u64 delta, scaled_delta, periods;
-       u32 contrib;
-       unsigned int delta_w, scaled_delta_w, decayed = 0;
-       unsigned long scale_freq, scale_cpu;
+       u64 delta;
 
        delta = now - sa->last_update_time;
        /*
@@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                return 0;
        sa->last_update_time = now;
 
-       scale_freq = arch_scale_freq_capacity(NULL, cpu);
-       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-       /* delta_w is the amount already accumulated against our next period */
-       delta_w = sa->period_contrib;
-       if (delta + delta_w >= 1024) {
-               decayed = 1;
-
-               /* how much left for next period will start over, we don't know yet */
-               sa->period_contrib = 0;
-
-               /*
-                * Now that we know we're crossing a period boundary, figure
-                * out how much from delta we need to complete the current
-                * period and accrue it.
-                */
-               delta_w = 1024 - delta_w;
-               scaled_delta_w = cap_scale(delta_w, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * scaled_delta_w;
-                       if (cfs_rq) {
-                               cfs_rq->runnable_load_sum +=
-                                               weight * scaled_delta_w;
-                       }
-               }
-               if (running)
-                       sa->util_sum += scaled_delta_w * scale_cpu;
-
-               delta -= delta_w;
-
-               /* Figure out how many additional periods this update spans */
-               periods = delta / 1024;
-               delta %= 1024;
-
-               sa->load_sum = decay_load(sa->load_sum, periods + 1);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_sum =
-                               decay_load(cfs_rq->runnable_load_sum, periods + 1);
-               }
-               sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-               /* Efficiently calculate \sum (1..n_period) 1024*y^i */
-               contrib = __compute_runnable_contrib(periods);
-               contrib = cap_scale(contrib, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * contrib;
-                       if (cfs_rq)
-                               cfs_rq->runnable_load_sum += weight * contrib;
-               }
-               if (running)
-                       sa->util_sum += contrib * scale_cpu;
-       }
-
-       /* Remainder of delta accrued against u_0` */
-       scaled_delta = cap_scale(delta, scale_freq);
-       if (weight) {
-               sa->load_sum += weight * scaled_delta;
-               if (cfs_rq)
-                       cfs_rq->runnable_load_sum += weight * scaled_delta;
-       }
-       if (running)
-               sa->util_sum += scaled_delta * scale_cpu;
-
-       sa->period_contrib += delta;
+       /*
+        * Now we know we crossed measurement unit boundaries. The *_avg
+        * accrues by two steps:
+        *
+        * Step 1: accumulate *_sum since last_update_time. If we haven't
+        * crossed period boundaries, finish.
+        */
+       if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+               return 0;
 
-       if (decayed) {
-               sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_avg =
-                               div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
-               }
-               sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+       /*
+        * Step 2: update *_avg.
+        */
+       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+       if (cfs_rq) {
+               cfs_rq->runnable_load_avg =
+                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
        }
+       sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-       return decayed;
+       return 1;
 }
 
 static int