sched: Use group weight, idle cpu metrics to fix imbalances during idle

[mv-sheeva.git] / kernel / sched_fair.c
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 933f3d1b62ea0affb63767f87152545f7659dc6f..034c4f410b36ef8d0d2fd2a7a108984e3d281ae0 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2035,13 +2035,16 @@ struct sd_lb_stats {
         unsigned long this_load_per_task;
         unsigned long this_nr_running;
         unsigned long this_has_capacity;
+       unsigned int  this_idle_cpus;
  
         /* Statistics of the busiest group */
+       unsigned int  busiest_idle_cpus;
         unsigned long max_load;
         unsigned long busiest_load_per_task;
         unsigned long busiest_nr_running;
         unsigned long busiest_group_capacity;
         unsigned long busiest_has_capacity;
+       unsigned int  busiest_group_weight;
  
         int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2066,8 @@ struct sg_lb_stats {
         unsigned long sum_nr_running; /* Nr tasks running in the group */
         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
         unsigned long group_capacity;
+       unsigned long idle_cpus;
+       unsigned long group_weight;
         int group_imb; /* Is there an imbalance in the group ? */
         int group_has_capacity; /* Is there extra capacity in the group? */
  };
@@ -2431,7 +2436,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                 sgs->group_load += load;
                 sgs->sum_nr_running += rq->nr_running;
                 sgs->sum_weighted_load += weighted_cpuload(i);
-
+               if (idle_cpu(i))
+                       sgs->idle_cpus++;
         }
  
         /*
@@ -2469,6 +2475,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
         if (!sgs->group_capacity)
                 sgs->group_capacity = fix_small_capacity(sd, group);
+       sgs->group_weight = group->group_weight;
  
         if (sgs->group_capacity > sgs->sum_nr_running)
                 sgs->group_has_capacity = 1;
@@ -2576,13 +2583,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
                         sds->this_has_capacity = sgs.group_has_capacity;
+                       sds->this_idle_cpus = sgs.idle_cpus;
                 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                         sds->max_load = sgs.avg_load;
                         sds->busiest = sg;
                         sds->busiest_nr_running = sgs.sum_nr_running;
+                       sds->busiest_idle_cpus = sgs.idle_cpus;
                         sds->busiest_group_capacity = sgs.group_capacity;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
                         sds->busiest_has_capacity = sgs.group_has_capacity;
+                       sds->busiest_group_weight = sgs.group_weight;
                         sds->group_imb = sgs.group_imb;
                 }
  
@@ -2860,8 +2870,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (sds.this_load >= sds.avg_load)
                 goto out_balanced;
  
-       if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-               goto out_balanced;
+       /*
+        * In the CPU_NEWLY_IDLE case, use imbalance_pct to be conservative.
+        * To check for busy balance, use !idle_cpu() instead of
+        * CPU_NOT_IDLE, because HT siblings will use CPU_NOT_IDLE
+        * even when they are idle.
+        */
+       if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+               if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                       goto out_balanced;
+       } else {
+               /*
+                * This cpu is idle. If the busiest group doesn't have
+                * more tasks than the number of available cpus and
+                * there is no imbalance between this and the busiest
+                * group wrt idle cpus, it is balanced.
+                */
+               if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+                   sds.busiest_nr_running <= sds.busiest_group_weight)
+                       goto out_balanced;
+       }
  
  force_balance:
         /* Looks like there is an imbalance. Compute it */
@@ -3869,13 +3897,26 @@ static void set_curr_task_fair(struct rq *rq)
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
  {
-       struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
-       update_curr(cfs_rq);
+       /*
+        * If the task was not on the rq at the time of this cgroup movement
+        * it must have been asleep, sleeping tasks keep their ->vruntime
+        * absolute on their old rq until wakeup (needed for the fair sleeper
+        * bonus in place_entity()).
+        *
+        * If it was on the rq, we've just 'preempted' it, which does convert
+        * ->vruntime to a relative base.
+        *
+        * Make sure both cases convert their relative position when migrating
+        * to another cgroup's rq. This does somewhat interfere with the
+        * fair sleeper stuff for the first placement, but who cares.
+        */
+       if (!on_rq)
+               p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+       set_task_rq(p, task_cpu(p));
         if (!on_rq)
-               place_entity(cfs_rq, &p->se, 1);
+               p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
  }
  #endif
  
@@ -3927,7 +3968,7 @@ static const struct sched_class fair_sched_class = {
         .get_rr_interval        = get_rr_interval_fair,
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       .moved_group            = moved_group_fair,
+       .task_move_group        = task_move_group_fair,
  #endif
  };