sched/numa: More NUMA_PULL_BIAS fixes

[karo-tx-linux.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 60be6bf582789d49ea432f9b8bed1e388a8e0dd6..7d522d08448f6ded33a9fd9ec08f9897ec4ff0cc 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -787,104 +787,108 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
   * they act !NUMA until we've established the task is busy enough to bother
   * with placement.
   *
- * Once we start doing NUMA placement there's two modes, 'small' process-wide
- * and 'big' per-task. For the small mode we have a process-wide home node
- * and lazily mirgrate all memory only when this home-node changes.
- *
- * For big mode we keep a home-node per task and use periodic fault scans
- * to try and estalish a task<->page relation. This assumes the task<->page
- * relation is a compute<->data relation, this is false for things like virt.
- * and n:m threading solutions but its the best we can do given the
- * information we have.
+ * We keep a home-node per task and use periodic fault scans to try and
+ * establish a task<->page relation. This assumes the task<->page relation is
+ * a compute<->data relation; this is false for things like virt. and n:m
+ * threading solutions, but it's the best we can do given the information we
+ * have.
   */
  
  static unsigned long task_h_load(struct task_struct *p);
  
  #ifdef CONFIG_SCHED_NUMA
-static void account_offnode_enqueue(struct rq *rq, struct task_struct *p)
+static struct list_head *account_numa_enqueue(struct rq *rq, struct task_struct *p)
  {
-       p->numa_contrib = task_h_load(p);
-       rq->offnode_weight += p->numa_contrib;
-       rq->offnode_running++;
+       struct list_head *tasks = &rq->cfs_tasks;
+
+       if (tsk_home_node(p) != cpu_to_node(task_cpu(p))) {
+               p->numa_contrib = task_h_load(p);
+               rq->offnode_weight += p->numa_contrib;
+               rq->offnode_running++;
+               tasks = &rq->offnode_tasks;
+       } else
+               rq->onnode_running++;
+
+       return tasks;
  }
  
-static void account_offnode_dequeue(struct rq *rq, struct task_struct *p)
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
  {
-       rq->offnode_weight -= p->numa_contrib;
-       rq->offnode_running--;
+       if (tsk_home_node(p) != cpu_to_node(task_cpu(p))) {
+               rq->offnode_weight -= p->numa_contrib;
+               rq->offnode_running--;
+       } else
+               rq->onnode_running--;
  }
  
  /*
- * numa task sample period in ms: 2.5s
+ * numa task sample period bounds in ms: 5s minimum, 80s (16 * 5s) maximum
   */
-unsigned int sysctl_sched_numa_task_period = 2500;
+unsigned int sysctl_sched_numa_task_period_min = 5000;
+unsigned int sysctl_sched_numa_task_period_max = 5000*16;
  
  /*
- * Determine if a process is 'big'.
- *
- * Currently only looks at CPU-time used, maybe we should also add an RSS
- * heuristic.
+ * Wait for the 2-sample stuff to settle before migrating again
+ */
+unsigned int sysctl_sched_numa_settle_count = 2;
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
   */
-static bool task_numa_big(struct task_struct *p)
+void task_numa_fault(int node)
  {
-       struct sched_domain *sd;
-       struct task_struct *t;
-       u64 walltime = local_clock();
-       u64 runtime = 0;
-       int weight = 0;
+       struct task_struct *p = current;
  
-       if (sched_feat(NUMA_FORCE_BIG))
-               return true;
+       if (unlikely(!p->numa_faults)) {
+               p->numa_faults = kzalloc(sizeof(unsigned long) * nr_node_ids,
+                                        GFP_KERNEL);
+               if (!p->numa_faults)
+                       return;
+       }
  
-       rcu_read_lock();
-       t = p;
-       do {
-               if (t->sched_class == &fair_sched_class)
-                       runtime += t->se.sum_exec_runtime;
-       } while ((t = next_thread(t)) != p);
+       p->numa_faults[node]++;
+}
  
-       sd = rcu_dereference(__raw_get_cpu_var(sd_node));
-       if (sd)
-               weight = sd->span_weight;
-       rcu_read_unlock();
+void task_numa_placement(void)
+{
+       unsigned long faults, max_faults = 0;
+       struct task_struct *p = current;
+       int node, max_node = -1;
+       int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
  
-       runtime -= p->numa_runtime_stamp;
-       walltime -= p->numa_walltime_stamp;
+       if (p->numa_scan_seq == seq)
+               return;
  
-       p->numa_runtime_stamp += runtime;
-       p->numa_walltime_stamp += walltime;
+       p->numa_scan_seq = seq;
  
-       /*
-        * We're 'big' when we burn more than half a node's worth
-        * of cputime.
-        */
-       return runtime > walltime * max(1, weight / 2);
-}
+       if (unlikely(!p->numa_faults))
+               return;
  
-static bool had_many_migrate_failures(struct task_struct *p)
-{
-       /* More than 1/4 of the attempted NUMA page migrations failed. */
-       return p->mm->numa_migrate_failed * 3 > p->mm->numa_migrate_success;
-}
+       for (node = 0; node < nr_node_ids; node++) {
+               faults = p->numa_faults[node];
  
-static inline bool need_numa_migration(struct task_struct *p)
-{
-       /*
-        * We need to change our home-node, its been different for 2 samples.
-        * See the whole P(n)^2 story in task_tick_numa().
-        */
-       return p->node_curr == p->node_last && p->node != p->node_curr;
-}
+               if (faults > max_faults) {
+                       max_faults = faults;
+                       max_node = node;
+               }
  
-static void sched_setnode_process(struct task_struct *p, int node)
-{
-       struct task_struct *t = p;
+               p->numa_faults[node] /= 2;
+       }
  
-       rcu_read_lock();
-       do {
-               sched_setnode(t, node);
-       } while ((t = next_thread(t)) != p);
-       rcu_read_unlock();
+       if (max_node == -1)
+               return;
+
+       if (p->node != max_node) {
+               p->numa_task_period = sysctl_sched_numa_task_period_min;
+               if (sched_feat(NUMA_SETTLE) &&
+                   (seq - p->numa_migrate_seq) <= (int)sysctl_sched_numa_settle_count)
+                       return;
+               p->numa_migrate_seq = seq;
+               sched_setnode(p, max_node);
+       } else {
+               p->numa_task_period = min(sysctl_sched_numa_task_period_max,
+                               p->numa_task_period * 2);
+       }
  }
  
  /*
@@ -895,8 +899,7 @@ void task_numa_work(struct callback_head *work)
  {
         unsigned long migrate, next_scan, now = jiffies;
         struct task_struct *p = current;
-       bool need_migration;
-       int big;
+       struct mm_struct *mm = p->mm;
  
         WARN_ON_ONCE(p != container_of(work, struct task_struct, rcu));
  
@@ -911,52 +914,23 @@ void task_numa_work(struct callback_head *work)
         if (p->flags & PF_EXITING)
                 return;
  
-       big = p->mm->numa_big;
-       need_migration = need_numa_migration(p);
-
-       /*
-        * Change per-task state before the process wide freq. throttle,
-        * otherwise it might be a long while ere this task wins the
-        * lottery and gets its home-node set.
-        */
-       if (big && need_migration)
-               sched_setnode(p, p->node_curr);
-
         /*
          * Enforce maximal scan/migration frequency..
          */
-       migrate = p->mm->numa_next_scan;
+       migrate = mm->numa_next_scan;
         if (time_before(now, migrate))
                 return;
  
-       next_scan = now + 2*msecs_to_jiffies(sysctl_sched_numa_task_period);
-       if (cmpxchg(&p->mm->numa_next_scan, migrate, next_scan) != migrate)
+       next_scan = now + 2*msecs_to_jiffies(sysctl_sched_numa_task_period_min);
+       if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
                 return;
  
-       if (!big) {
-               /* Age the numa migrate statistics. */
-               p->mm->numa_migrate_failed /= 2;
-               p->mm->numa_migrate_success /= 2;
-
-               big = p->mm->numa_big = task_numa_big(p);
-       }
-
-       if (need_migration) {
-               if (big)
-                       sched_setnode(p, p->node_curr);
-               else
-                       sched_setnode_process(p, p->node_curr);
-       }
-
-       if (big || need_migration || had_many_migrate_failures(p))
-               lazy_migrate_process(p->mm);
+       ACCESS_ONCE(mm->numa_scan_seq)++;
+       lazy_migrate_process(mm);
  }
  
  /*
- * Sample task location from hardirq context (tick), this has minimal bias with
- * obvious exceptions of frequency interference and tick avoidance techniques.
- * If this were to become a problem we could move this sampling into the
- * sleep/wakeup path -- but we'd prefer to avoid that for obvious reasons.
+ * Drive the periodic memory faults.
   */
  void task_tick_numa(struct rq *rq, struct task_struct *curr)
  {
@@ -969,35 +943,18 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
                 return;
  
         /*
-        * Sample our node location every @sysctl_sched_numa_task_period
-        * runtime ms. We use a two stage selection in order to filter
-        * unlikely locations.
-        *
-        * If P(n) is the probability we're on node 'n', then the probability
-        * we sample the same node twice is P(n)^2. This quadric squishes small
-        * values and makes it more likely we end up on nodes where we have
-        * significant presence.
-        *
          * Using runtime rather than walltime has the dual advantage that
          * we (mostly) drive the selection from busy threads and that the
          * task needs to have done some actual work before we bother with
          * NUMA placement.
          */
         now = curr->se.sum_exec_runtime;
-       period = (u64)sysctl_sched_numa_task_period * NSEC_PER_MSEC;
+       period = (u64)curr->numa_task_period * NSEC_PER_MSEC;
  
         if (now - curr->node_stamp > period) {
                 curr->node_stamp = now;
  
-               curr->node_last = curr->node_curr;
-               curr->node_curr = numa_node_id();
-
-               /*
-                * We need to do expensive work to either migrate or
-                * drive priodic state update or scanning for 'big' processes.
-                */
-               if (need_numa_migration(curr) ||
-                   !time_before(jiffies, curr->mm->numa_next_scan)) {
+               if (!time_before(jiffies, curr->mm->numa_next_scan)) {
                         /*
                          * We can re-use curr->rcu because we checked curr->mm
                          * != NULL so release_task()->call_rcu() was not called
@@ -1010,11 +967,11 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
         }
  }
  #else
-static void account_offnode_enqueue(struct rq *rq, struct task_struct *p)
+static struct list_head *account_numa_enqueue(struct rq *rq, struct task_struct *p)
  {
  }
  
-static void account_offnode_dequeue(struct rq *rq, struct task_struct *p)
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
  {
  }
  
@@ -1039,10 +996,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 struct task_struct *p = task_of(se);
                 struct list_head *tasks = &rq->cfs_tasks;
  
-               if (offnode_task(p)) {
-                       account_offnode_enqueue(rq, p);
-                       tasks = offnode_tasks(rq);
-               }
+               if (tsk_home_node(p) != -1)
+                       tasks = account_numa_enqueue(rq, p);
  
                 list_add(&se->group_node, tasks);
         }
@@ -1061,8 +1016,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  
                 list_del_init(&se->group_node);
  
-               if (offnode_task(p))
-                       account_offnode_dequeue(rq_of(cfs_rq), p);
+               if (tsk_home_node(p) != -1)
+                       account_numa_dequeue(rq_of(cfs_rq), p);
         }
         cfs_rq->nr_running--;
  }
@@ -3410,7 +3365,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
         check_preempt_curr(env->dst_rq, p, 0);
  }
  
-static int task_numa_hot(struct task_struct *p, int from_cpu, int to_cpu)
+static int task_numa_hot(struct task_struct *p, struct lb_env *env)
  {
         int from_dist, to_dist;
         int node = tsk_home_node(p);
@@ -3418,8 +3373,8 @@ static int task_numa_hot(struct task_struct *p, int from_cpu, int to_cpu)
         if (!sched_feat_numa(NUMA_HOT) || node == -1)
                 return 0; /* no node preference */
  
-       from_dist = node_distance(cpu_to_node(from_cpu), node);
-       to_dist = node_distance(cpu_to_node(to_cpu), node);
+       from_dist = node_distance(cpu_to_node(env->src_cpu), node);
+       to_dist = node_distance(cpu_to_node(env->dst_cpu), node);
  
         if (to_dist < from_dist)
                 return 0; /* getting closer is ok */
@@ -3431,7 +3386,7 @@ static int task_numa_hot(struct task_struct *p, int from_cpu, int to_cpu)
   * Is this task likely cache-hot:
   */
  static int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+task_hot(struct task_struct *p, struct lb_env *env)
  {
         s64 delta;
  
@@ -3454,7 +3409,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         if (sysctl_sched_migration_cost == 0)
                 return 0;
  
-       delta = now - p->se.exec_start;
+       delta = env->src_rq->clock_task - p->se.exec_start;
  
         return delta < (s64)sysctl_sched_migration_cost;
  }
@@ -3511,8 +3466,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
          * 2) too many balance attempts have failed.
          */
  
-       tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
-       tsk_cache_hot |= task_numa_hot(p, env->src_cpu, env->dst_cpu);
+       tsk_cache_hot = task_hot(p, env);
+       if (env->idle == CPU_NOT_IDLE)
+               tsk_cache_hot |= task_numa_hot(p, env);
         if (!tsk_cache_hot ||
                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
@@ -3815,6 +3771,9 @@ struct sd_lb_stats {
         struct sched_group *numa_group; /* group which has offnode_tasks */
         unsigned long numa_group_weight;
         unsigned long numa_group_running;
+
+       unsigned long this_offnode_running;
+       unsigned long this_onnode_running;
  #endif
  };
  
@@ -3832,8 +3791,9 @@ struct sg_lb_stats {
         int group_imb; /* Is there an imbalance in the group ? */
         int group_has_capacity; /* Is there extra capacity in the group? */
  #ifdef CONFIG_SCHED_NUMA
-       unsigned long numa_weight;
-       unsigned long numa_running;
+       unsigned long numa_offnode_weight;
+       unsigned long numa_offnode_running;
+       unsigned long numa_onnode_running;
  #endif
  };
  
@@ -3866,8 +3826,9 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
  #ifdef CONFIG_SCHED_NUMA
  static inline void update_sg_numa_stats(struct sg_lb_stats *sgs, struct rq *rq)
  {
-       sgs->numa_weight += rq->offnode_weight;
-       sgs->numa_running += rq->offnode_running;
+       sgs->numa_offnode_weight += rq->offnode_weight;
+       sgs->numa_offnode_running += rq->offnode_running;
+       sgs->numa_onnode_running += rq->onnode_running;
  }
  
  /*
@@ -3885,16 +3846,19 @@ static inline void update_sd_numa_stats(struct sched_domain *sd,
         if (!(sd->flags & SD_NUMA))
                 return;
  
-       if (local_group)
+       if (local_group) {
+               sds->this_offnode_running = sgs->numa_offnode_running;
+               sds->this_onnode_running  = sgs->numa_onnode_running;
                 return;
+       }
  
-       if (!sgs->numa_running)
+       if (!sgs->numa_offnode_running)
                 return;
  
         if (!sds->numa_group || pick_numa_rand(sd->span_weight / group->group_weight)) {
                 sds->numa_group = group;
-               sds->numa_group_weight = sgs->numa_weight;
-               sds->numa_group_running = sgs->numa_running;
+               sds->numa_group_weight = sgs->numa_offnode_weight;
+               sds->numa_group_running = sgs->numa_offnode_running;
         }
  }
  
@@ -3931,6 +3895,13 @@ static inline int check_numa_busiest_group(struct lb_env *env, struct sd_lb_stat
         if (!sds->numa_group)
                 return 0;
  
+       /*
+        * Only pull an offnode task home if we've got offnode or !numa tasks to trade for it.
+        */
+       if (!sds->this_offnode_running &&
+           !(sds->this_nr_running - sds->this_onnode_running - sds->this_offnode_running))
+               return 0;
+
         env->imbalance = sds->numa_group_weight / sds->numa_group_running;
         sds->busiest = sds->numa_group;
         env->find_busiest_queue = find_busiest_numa_queue;