sched/numa: More NUMA_PULL_BIAS fixes
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1774723643838ea565a9419838bc0509ee48353..23ad8b94aeb2fef96493f91b9a8225c8f0bb206c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,20 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_SCHED_NUMA
+       if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+               p->mm->numa_next_scan = jiffies;
+               p->mm->numa_scan_seq = 0;
+       }
+
+       p->node = -1;
+       p->node_stamp = 0ULL;
+       p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+       p->numa_faults = NULL;
+       p->numa_task_period = sysctl_sched_numa_task_period_min;
+#endif /* CONFIG_SCHED_NUMA */
 }
 
 /*
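Note: the fork-time initialization above assumes per-mm and per-task NUMA state introduced elsewhere in this series. A minimal sketch of those fields, reconstructed only from the accesses in this hunk (exact types and placement in mm_struct/task_struct may differ):

#ifdef CONFIG_SCHED_NUMA
	/* In struct mm_struct: per-address-space scan state. */
	unsigned long numa_next_scan;	/* jiffies of the next NUMA scan pass */
	int numa_scan_seq;		/* completed scan sequence number */

	/* In struct task_struct: per-task home-node state. */
	int node;			/* home node, -1 if unset */
	u64 node_stamp;			/* runtime at last placement check */
	int numa_scan_seq;		/* mm scan seq this task has caught up to */
	int numa_migrate_seq;		/* scan seq at last migration */
	unsigned long *numa_faults;	/* per-node fault counts, lazily allocated */
	unsigned int numa_task_period;	/* current per-task scan period */
#endif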
@@ -1774,6 +1788,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
+               task_numa_free(prev);
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
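task_numa_free() itself is not part of this hunk. A plausible minimal body, consistent with numa_faults being NULL-initialized at fork and lazily allocated by the fault path, would be:

void task_numa_free(struct task_struct *p)
{
	/* Drop the lazily allocated per-node fault array, if any. */
	kfree(p->numa_faults);
	p->numa_faults = NULL;
}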
@@ -5484,7 +5499,9 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_id);
 
-static void update_top_cache_domain(int cpu)
+DEFINE_PER_CPU(struct sched_domain *, sd_node);
+
+static void update_domain_cache(int cpu)
 {
        struct sched_domain *sd;
        int id = cpu;
@@ -5495,6 +5512,15 @@ static void update_top_cache_domain(int cpu)
 
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_id, cpu) = id;
+
+       for_each_domain(cpu, sd) {
+               if (cpumask_equal(sched_domain_span(sd),
+                                 cpumask_of_node(cpu_to_node(cpu))))
+                       goto got_node;
+       }
+       sd = NULL;
+got_node:
+       rcu_assign_pointer(per_cpu(sd_node, cpu), sd);
 }
 
 /*
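Since sd_node is published with rcu_assign_pointer(), readers are expected to pair it with rcu_dereference() under rcu_read_lock(). A hypothetical consumer (the helper name and body are illustrative, not from this series) might look like:

/* Hypothetical helper: walk the CPUs of @cpu's node-level domain. */
static void walk_node_domain(int cpu)
{
	struct sched_domain *sd;
	int i;

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_node, cpu));
	if (sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			/* ... balance against CPU i ... */
		}
	}
	rcu_read_unlock();
}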
@@ -5537,7 +5563,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        rcu_assign_pointer(rq->sd, sd);
        destroy_sched_domains(tmp, cpu);
 
-       update_top_cache_domain(cpu);
+       update_domain_cache(cpu);
 }
 
 /* cpus with isolated domains */
@@ -5959,6 +5985,42 @@ static struct sched_domain_topology_level default_topology[] = {
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_SCHED_NUMA
+
+/*
+ * Requeue a task, ensuring it is on the right load-balance list so
+ * that it might get migrated to its new home.
+ *
+ * Since the home node is a pure preference, there is no hard migration
+ * to force us anywhere; this also allows us to call this from atomic
+ * context if required.
+ */
+void sched_setnode(struct task_struct *p, int node)
+{
+       unsigned long flags;
+       int on_rq, running;
+       struct rq *rq;
+
+       rq = task_rq_lock(p, &flags);
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       p->node = node;
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, p, 0);
+       task_rq_unlock(rq, p, &flags);
+}
+
+#endif /* CONFIG_SCHED_NUMA */
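+
+/*
+ * No caller of sched_setnode() appears in this hunk; the placement code
+ * elsewhere in the series is expected to invoke it roughly as in the
+ * hypothetical sketch below (the node-selection policy and helper name
+ * are illustrative only):
+ *
+ *	static void set_home_node_from_faults(struct task_struct *p,
+ *					      int nr_nodes)
+ *	{
+ *		int node, best = -1;
+ *		unsigned long best_faults = 0;
+ *
+ *		for (node = 0; node < nr_nodes; node++) {
+ *			if (p->numa_faults &&
+ *			    p->numa_faults[node] > best_faults) {
+ *				best_faults = p->numa_faults[node];
+ *				best = node;
+ *			}
+ *		}
+ *
+ *		if (best != -1 && best != p->node)
+ *			sched_setnode(p, best);
+ *	}
+ */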
+
 #ifdef CONFIG_NUMA
 
 static int sched_domains_numa_levels;
@@ -6004,6 +6066,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 1*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
+                                       | 1*SD_NUMA
                                        | sd_local_flags(level)
                                        ,
                .last_balance           = jiffies,
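With SD_NUMA set on these domains, balancing code can recognize node-crossing levels with an ordinary flag test, e.g.:

	if (sd->flags & SD_NUMA) {
		/* crossing node boundaries: apply NUMA placement bias */
	}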
@@ -6122,6 +6185,17 @@ static void sched_init_numa(void)
         * numbers.
         */
 
+       /*
+        * Temporarily reset sched_domains_numa_levels to 0 here.
+        * If any allocation for the sched_domains_numa_masks[][] array
+        * fails, the array will contain fewer than 'level' members. That
+        * would be dangerous for anyone iterating the array via
+        * sched_domains_numa_levels in other functions.
+        *
+        * We reset it to 'level' at the end of this function.
+        */
+       sched_domains_numa_levels = 0;
+
        sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
        if (!sched_domains_numa_masks)
                return;
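The allocation loop that follows (elided from this diff) is what can leave the array partially populated. Schematically, assuming the mainline-style shape of sched_init_numa():

	for (i = 0; i < level; i++) {
		sched_domains_numa_masks[i] =
			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
		if (!sched_domains_numa_masks[i])
			return;	/* masks[] now has only i valid members */
		/* ... per-node cpumask allocation and setup ... */
	}

Bailing out here with sched_domains_numa_levels still 0 keeps the iterators below from walking past the last successfully allocated level.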
@@ -6176,11 +6250,68 @@ static void sched_init_numa(void)
        }
 
        sched_domain_topology = tl;
+
+       sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+       int i, j;
+       int node = cpu_to_node(cpu);
+
+       for (i = 0; i < sched_domains_numa_levels; i++) {
+               for (j = 0; j < nr_node_ids; j++) {
+                       if (node_distance(j, node) <= sched_domains_numa_distance[i])
+                               cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+               }
+       }
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+       int i, j;
+       for (i = 0; i < sched_domains_numa_levels; i++) {
+               for (j = 0; j < nr_node_ids; j++)
+                       cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+       }
+}
+
+/*
+ * Update the sched_domains_numa_masks[level][node] arrays when CPUs
+ * are brought online or taken offline.
+ */
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+                                          unsigned long action,
+                                          void *hcpu)
+{
+       int cpu = (long)hcpu;
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_ONLINE:
+               sched_domains_numa_masks_set(cpu);
+               break;
+
+       case CPU_DEAD:
+               sched_domains_numa_masks_clear(cpu);
+               break;
+
+       default:
+               return NOTIFY_DONE;
+       }
+
+       return NOTIFY_OK;
 }
 #else
 static inline void sched_init_numa(void)
 {
 }
+
+static int sched_domains_numa_masks_update(struct notifier_block *nfb,
+                                          unsigned long action,
+                                          void *hcpu)
+{
+       return 0;
+}
 #endif /* CONFIG_NUMA */
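
As a worked example of sched_domains_numa_masks_set(): on a hypothetical two-node box with distances 10 (local) and 20 (remote), and hence two levels, onlining a CPU on node 1 sets its bit in masks[0][1] (distance 10 reaches only the local node) and in masks[1][0] and masks[1][1] (distance 20 reaches both). A standalone simulation with these made-up numbers:

#include <stdio.h>

#define NR_NODES  2
#define NR_LEVELS 2

/* Stand-in for node_distance(): 10 = local, 20 = remote. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20 },
	{ 20, 10 },
};
/* Stand-in for sched_domains_numa_distance[]. */
static const int level_distance[NR_LEVELS] = { 10, 20 };

int main(void)
{
	int node = 1;	/* node of the CPU coming online */
	int i, j;

	for (i = 0; i < NR_LEVELS; i++)
		for (j = 0; j < NR_NODES; j++)
			if (distance[j][node] <= level_distance[i])
				printf("set cpu bit in masks[%d][%d]\n", i, j);
	return 0;
}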
 
 static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -6629,6 +6760,7 @@ void __init sched_init_smp(void)
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
 
+       hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
        hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
        hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
@@ -6796,6 +6928,12 @@ void __init sched_init(void)
                rq->avg_idle = 2*sysctl_sched_migration_cost;
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
+#ifdef CONFIG_SCHED_NUMA
+               INIT_LIST_HEAD(&rq->offnode_tasks);
+               rq->onnode_running = 0;
+               rq->offnode_running = 0;
+               rq->offnode_weight = 0;
+#endif
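+
+/*
+ * As with the fork path, this assumes new fields on struct rq; a minimal
+ * sketch inferred from the initializers above (the actual declarations
+ * live in the series' sched.h changes):
+ *
+ *	#ifdef CONFIG_SCHED_NUMA
+ *		unsigned long onnode_running;	// running tasks on home node
+ *		unsigned long offnode_running;	// running tasks away from home
+ *		unsigned long offnode_weight;	// load weight of off-node tasks
+ *		struct list_head offnode_tasks;	// off-node tasks, for pull bias
+ *	#endif
+ */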
 
                rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ