sched, numa, mm: Implement slow start for working set sampling
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712f5ee1e19f1fe364fbad557d8a736..9dbbe452ed636b760f6a0058ccf8aa61225efa34 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -952,6 +952,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        trace_sched_migrate_task(p, new_cpu);
 
        if (task_cpu(p) != new_cpu) {
+               if (p->sched_class->migrate_task_rq)
+                       p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
                perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
        }
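
For readers of the hunk above: migrate_task_rq() is a new, optional sched_class
callback, invoked only when the task really changes CPU and only if the class
provides it. The stand-alone C sketch below illustrates that optional-callback
pattern; the names (struct sched_class_ops, fair_ops, rt_ops) are invented for
the example and model only the NULL check, not the fair class's real migration
handling.

#include <stdio.h>

struct task;

struct sched_class_ops {
        /* Optional: left NULL by classes with no per-migration state. */
        void (*migrate_task_rq)(struct task *t, int new_cpu);
};

struct task {
        const struct sched_class_ops *ops;
        int cpu;
};

static void fair_migrate_task_rq(struct task *t, int new_cpu)
{
        (void)t;
        printf("fair class: fixing up per-entity state for CPU %d\n", new_cpu);
}

static const struct sched_class_ops fair_ops = {
        .migrate_task_rq = fair_migrate_task_rq,
};
static const struct sched_class_ops rt_ops = {
        .migrate_task_rq = NULL,        /* class opts out of the hook */
};

static void set_task_cpu(struct task *t, int new_cpu)
{
        if (t->cpu != new_cpu) {
                /* Mirror of the hunk above: call the hook only if present. */
                if (t->ops->migrate_task_rq)
                        t->ops->migrate_task_rq(t, new_cpu);
                t->cpu = new_cpu;
        }
}

int main(void)
{
        struct task a = { .ops = &fair_ops, .cpu = 0 };
        struct task b = { .ops = &rt_ops,   .cpu = 0 };

        set_task_cpu(&a, 2);    /* callback fires */
        set_task_cpu(&b, 2);    /* callback skipped */
        return 0;
}
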
@@ -1524,6 +1526,15 @@ static void __sched_fork(struct task_struct *p)
        p->se.vruntime                  = 0;
        INIT_LIST_HEAD(&p->se.group_node);
 
+/*
+ * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
+ * may be removed once load tracking is useful beyond shares distribution
+ * (e.g. load balancing).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+       p->se.avg.runnable_avg_period = 0;
+       p->se.avg.runnable_avg_sum = 0;
+#endif
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
@@ -1533,6 +1544,21 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+       if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+               p->mm->numa_next_scan = jiffies;
+               p->mm->numa_scan_seq = 0;
+       }
+
+       p->numa_shared = -1;
+       p->node_stamp = 0ULL;
+       p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+       p->numa_migrate_seq = 2;
+       p->numa_faults = NULL;
+       p->numa_scan_period = sysctl_sched_numa_scan_delay;
+       p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
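
The CONFIG_NUMA_BALANCING block above is where the commit's "slow start" lands:
a freshly forked task starts out with numa_scan_period set to
sysctl_sched_numa_scan_delay rather than to the steady-state scan period, so
its working set is not sampled right away. The user-space sketch below models
only that idea; the constants, helper names and the exact way node_stamp is
armed are assumptions for illustration, not code taken from the patch.

#include <stdio.h>

/*
 * "Slow start": a new task begins with a long sampling period (scan_delay)
 * and drops to the short steady-state period only after its first sample.
 * All values are fake ticks; nothing here is a kernel tunable.
 */
static unsigned long now;                               /* fake clock */
static const unsigned long scan_delay      = 1000;      /* first sample after 1000 ticks */
static const unsigned long scan_period_min = 100;       /* then every 100 ticks */

struct task {
        unsigned long node_stamp;       /* when the next sample is due */
        unsigned long scan_period;      /* current sampling period */
};

static void sched_fork_numa(struct task *t)
{
        t->scan_period = scan_delay;
        t->node_stamp  = now + t->scan_period;
}

static void task_tick_numa(struct task *t)
{
        if (now < t->node_stamp)
                return;

        printf("tick %5lu: sample working set\n", now);

        /* After the delayed first sample, fall back to the normal period. */
        t->scan_period = scan_period_min;
        t->node_stamp  = now + t->scan_period;
}

int main(void)
{
        struct task t;

        sched_fork_numa(&t);
        for (now = 0; now < 1500; now++)
                task_tick_numa(&t);
        return 0;
}
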
@@ -1774,6 +1800,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
+               task_numa_free(prev);
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
@@ -5484,7 +5511,9 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_id);
 
-static void update_top_cache_domain(int cpu)
+DEFINE_PER_CPU(struct sched_domain *, sd_node);
+
+static void update_domain_cache(int cpu)
 {
        struct sched_domain *sd;
        int id = cpu;
@@ -5495,6 +5524,15 @@ static void update_top_cache_domain(int cpu)
 
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_id, cpu) = id;
+
+       for_each_domain(cpu, sd) {
+               if (cpumask_equal(sched_domain_span(sd),
+                                 cpumask_of_node(cpu_to_node(cpu))))
+                       goto got_node;
+       }
+       sd = NULL;
+got_node:
+       rcu_assign_pointer(per_cpu(sd_node, cpu), sd);
 }
 
 /*
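
update_domain_cache() (renamed from update_top_cache_domain()) now also caches
a per-CPU sd_node pointer: the lowest sched_domain whose span is exactly the
CPUs of the local NUMA node, or NULL if no such level exists. Below is a toy
user-space model of that lookup; domains form a plain parent-linked list and
CPU sets are single bitmask words, which glosses over the kernel's cpumask and
RCU handling.

#include <stdio.h>

/*
 * Walk a CPU's domain hierarchy from smallest to largest span and return
 * the first domain that covers exactly the CPUs of the given node.
 */
struct domain {
        const char *name;
        unsigned long span;             /* bit i set => CPU i is in the domain */
        struct domain *parent;
};

static struct domain *find_node_domain(struct domain *sd, unsigned long node_mask)
{
        for (; sd; sd = sd->parent)
                if (sd->span == node_mask)
                        return sd;
        return NULL;                    /* no level matches the node exactly */
}

int main(void)
{
        /* 8 CPUs: 0-3 on node 0, 4-7 on node 1; CPU 0's hierarchy: */
        struct domain numa = { "NUMA", 0xffUL, NULL  };
        struct domain node = { "NODE", 0x0fUL, &numa };
        struct domain smt  = { "SMT",  0x03UL, &node };

        struct domain *sd_node = find_node_domain(&smt, 0x0fUL);

        printf("sd_node = %s\n", sd_node ? sd_node->name : "(none)");
        return 0;
}
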
@@ -5537,7 +5575,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        rcu_assign_pointer(rq->sd, sd);
        destroy_sched_domains(tmp, cpu);
 
-       update_top_cache_domain(cpu);
+       update_domain_cache(cpu);
 }
 
 /* cpus with isolated domains */
@@ -5959,6 +5997,37 @@ static struct sched_domain_topology_level default_topology[] = {
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA_BALANCING
+
+/*
+ * Change a task's NUMA placement hints: the node it should prefer and
+ * whether its working set is shared with other tasks.  Dequeue and requeue
+ * the task so the scheduler class re-derives any state that depends on
+ * these fields.
+ */
+void sched_setnuma(struct task_struct *p, int node, int shared)
+{
+       unsigned long flags;
+       int on_rq, running;
+       struct rq *rq;
+
+       rq = task_rq_lock(p, &flags);
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       p->numa_shared = shared;
+       p->numa_max_node = node;
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, p, 0);
+       task_rq_unlock(rq, p, &flags);
+}
+
+#endif /* CONFIG_NUMA_BALANCING */
+
 #ifdef CONFIG_NUMA
 
 static int sched_domains_numa_levels;
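
sched_setnuma() follows the scheduler's standard recipe for changing a field
that queued state may depend on: dequeue the task (and give up the curr slot)
first, update numa_shared/numa_max_node, then requeue. The sketch below shows
one concrete reason the ordering matters, using the rq->nr_shared_running
counter this patch initializes further down in sched_init(); how the real
enqueue/dequeue paths maintain that counter lives in sched/fair.c and is only
assumed here.

#include <stdio.h>
#include <stdbool.h>

/*
 * If a runqueue counts its queued "shared" tasks at enqueue/dequeue time,
 * the per-task flag must not change while the task is queued, or the
 * counter drifts.  Hence: dequeue, flip the flag, requeue.
 */
struct rq {
        int nr_running;
        int nr_shared_running;
};

struct task {
        bool queued;
        bool numa_shared;
};

static void enqueue_task(struct rq *rq, struct task *t)
{
        rq->nr_running++;
        if (t->numa_shared)
                rq->nr_shared_running++;
        t->queued = true;
}

static void dequeue_task(struct rq *rq, struct task *t)
{
        rq->nr_running--;
        if (t->numa_shared)
                rq->nr_shared_running--;
        t->queued = false;
}

static void set_numa_shared(struct rq *rq, struct task *t, bool shared)
{
        bool queued = t->queued;

        if (queued)
                dequeue_task(rq, t);    /* counter drops under the old flag */
        t->numa_shared = shared;
        if (queued)
                enqueue_task(rq, t);    /* counter rises under the new flag */
}

int main(void)
{
        struct rq rq = { 0, 0 };
        struct task t = { .queued = false, .numa_shared = false };

        enqueue_task(&rq, &t);
        set_numa_shared(&rq, &t, true);
        printf("nr_running=%d nr_shared_running=%d\n",
               rq.nr_running, rq.nr_shared_running);
        return 0;
}
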
@@ -6004,6 +6073,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 1*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
+                                       | 1*SD_NUMA
                                        | sd_local_flags(level)
                                        ,
                .last_balance           = jiffies,
@@ -6858,7 +6928,6 @@ void __init sched_init(void)
                rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
-               rq->push_cpu = 0;
                rq->cpu = i;
                rq->online = 0;
                rq->idle_stamp = 0;
@@ -6866,6 +6935,10 @@ void __init sched_init(void)
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
 
+#ifdef CONFIG_NUMA_BALANCING
+               rq->nr_shared_running = 0;
+#endif
+
                rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
                rq->nohz_flags = 0;