sched, numa, mm: Implement slow start for working set sampling
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712f5ee1e19f1fe364fbad557d8a736..9dbbe452ed636b760f6a0058ccf8aa61225efa34 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -952,6 +952,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        trace_sched_migrate_task(p, new_cpu);
 
        if (task_cpu(p) != new_cpu) {
+               if (p->sched_class->migrate_task_rq)
+                       p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
                perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
        }
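
For readers of the hunk above: migrate_task_rq() is a new, optional sched_class
callback, invoked only when the task really changes CPU and only if the class
provides it. The stand-alone C sketch below illustrates that optional-callback
pattern; the names (struct sched_class_ops, fair_ops, rt_ops) are invented for
the example and model only the NULL check, not the fair class's real migration
handling.

#include <stdio.h>

struct task;

struct sched_class_ops {
        /* Optional: left NULL by classes with no per-migration state. */
        void (*migrate_task_rq)(struct task *t, int new_cpu);
};

struct task {
        const struct sched_class_ops *ops;
        int cpu;
};

static void fair_migrate_task_rq(struct task *t, int new_cpu)
{
        (void)t;
        printf("fair class: fixing up per-entity state for CPU %d\n", new_cpu);
}

static const struct sched_class_ops fair_ops = {
        .migrate_task_rq = fair_migrate_task_rq,
};
static const struct sched_class_ops rt_ops = {
        .migrate_task_rq = NULL,        /* class opts out of the hook */
};

static void set_task_cpu(struct task *t, int new_cpu)
{
        if (t->cpu != new_cpu) {
                /* Mirror of the hunk above: call the hook only if present. */
                if (t->ops->migrate_task_rq)
                        t->ops->migrate_task_rq(t, new_cpu);
                t->cpu = new_cpu;
        }
}

int main(void)
{
        struct task a = { .ops = &fair_ops, .cpu = 0 };
        struct task b = { .ops = &rt_ops,   .cpu = 0 };

        set_task_cpu(&a, 2);    /* callback fires */
        set_task_cpu(&b, 2);    /* callback skipped */
        return 0;
}
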
@@ -1524,6 +1526,15 @@ static void __sched_fork(struct task_struct *p)
        p->se.vruntime                  = 0;
        INIT_LIST_HEAD(&p->se.group_node);
 
+/*
+ * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
+ * may be removed once load tracking is useful beyond shares distribution
+ * (e.g. load balancing).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+       p->se.avg.runnable_avg_period = 0;
+       p->se.avg.runnable_avg_sum = 0;
+#endif
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
@@ -1533,6 +1544,21 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+       if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+               p->mm->numa_next_scan = jiffies;
+               p->mm->numa_scan_seq = 0;
+       }
+
+       p->numa_shared = -1;
+       p->node_stamp = 0ULL;
+       p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+       p->numa_migrate_seq = 2;
+       p->numa_faults = NULL;
+       p->numa_scan_period = sysctl_sched_numa_scan_delay;
+       p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
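
The CONFIG_NUMA_BALANCING block above is where the commit's "slow start" lands:
a freshly forked task starts out with numa_scan_period set to
sysctl_sched_numa_scan_delay rather than to the steady-state scan period, so
its working set is not sampled right away. The user-space sketch below models
only that idea; the constants, helper names and the exact way node_stamp is
armed are assumptions for illustration, not code taken from the patch.

#include <stdio.h>

/*
 * "Slow start": a new task begins with a long sampling period (scan_delay)
 * and drops to the short steady-state period only after its first sample.
 * All values are fake ticks; nothing here is a kernel tunable.
 */
static unsigned long now;                               /* fake clock */
static const unsigned long scan_delay      = 1000;      /* first sample after 1000 ticks */
static const unsigned long scan_period_min = 100;       /* then every 100 ticks */

struct task {
        unsigned long node_stamp;       /* when the next sample is due */
        unsigned long scan_period;      /* current sampling period */
};

static void sched_fork_numa(struct task *t)
{
        t->scan_period = scan_delay;
        t->node_stamp  = now + t->scan_period;
}

static void task_tick_numa(struct task *t)
{
        if (now < t->node_stamp)
                return;

        printf("tick %5lu: sample working set\n", now);

        /* After the delayed first sample, fall back to the normal period. */
        t->scan_period = scan_period_min;
        t->node_stamp  = now + t->scan_period;
}

int main(void)
{
        struct task t;

        sched_fork_numa(&t);
        for (now = 0; now < 1500; now++)
                task_tick_numa(&t);
        return 0;
}
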
@@ -1774,6 +1800,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
+               task_numa_free(prev);
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
@@ -5484,7 +5511,9 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_id);
 
-static void update_top_cache_domain(int cpu)
+DEFINE_PER_CPU(struct sched_domain *, sd_node);
+
+static void update_domain_cache(int cpu)
 {
        struct sched_domain *sd;
        int id = cpu;
@@ -5495,6 +5524,15 @@ static void update_top_cache_domain(int cpu)
 
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_id, cpu) = id;
+
+       for_each_domain(cpu, sd) {
+               if (cpumask_equal(sched_domain_span(sd),
+                                 cpumask_of_node(cpu_to_node(cpu))))
+                       goto got_node;
+       }
+       sd = NULL;
+got_node:
+       rcu_assign_pointer(per_cpu(sd_node, cpu), sd);
 }
 
 /*
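
update_domain_cache() (renamed from update_top_cache_domain()) now also caches
a per-CPU sd_node pointer: the lowest sched_domain whose span is exactly the
CPUs of the local NUMA node, or NULL if no such level exists. Below is a toy
user-space model of that lookup; domains form a plain parent-linked list and
CPU sets are single bitmask words, which glosses over the kernel's cpumask and
RCU handling.

#include <stdio.h>

/*
 * Walk a CPU's domain hierarchy from smallest to largest span and return
 * the first domain that covers exactly the CPUs of the given node.
 */
struct domain {
        const char *name;
        unsigned long span;             /* bit i set => CPU i is in the domain */
        struct domain *parent;
};

static struct domain *find_node_domain(struct domain *sd, unsigned long node_mask)
{
        for (; sd; sd = sd->parent)
                if (sd->span == node_mask)
                        return sd;
        return NULL;                    /* no level matches the node exactly */
}

int main(void)
{
        /* 8 CPUs: 0-3 on node 0, 4-7 on node 1; CPU 0's hierarchy: */
        struct domain numa = { "NUMA", 0xffUL, NULL  };
        struct domain node = { "NODE", 0x0fUL, &numa };
        struct domain smt  = { "SMT",  0x03UL, &node };

        struct domain *sd_node = find_node_domain(&smt, 0x0fUL);

        printf("sd_node = %s\n", sd_node ? sd_node->name : "(none)");
        return 0;
}
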
@@ -5537,7 +5575,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        rcu_assign_pointer(rq->sd, sd);
        destroy_sched_domains(tmp, cpu);
 
-       update_top_cache_domain(cpu);
+       update_domain_cache(cpu);
 }
 
 /* cpus with isolated domains */
@@ -5959,6 +5997,37 @@ static struct sched_domain_topology_level default_topology[] = {
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA_BALANCING
+
+/*
+ * Change a task's NUMA placement hints: the node it should prefer and
+ * whether its working set is shared with other tasks.  Dequeue and requeue
+ * the task so the scheduler class re-derives any state that depends on
+ * these fields.
+ */
+void sched_setnuma(struct task_struct *p, int node, int shared)
+{
+       unsigned long flags;
+       int on_rq, running;
+       struct rq *rq;
+
+       rq = task_rq_lock(p, &flags);
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       p->numa_shared = shared;
+       p->numa_max_node = node;
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, p, 0);
+       task_rq_unlock(rq, p, &flags);
+}
+
+#endif /* CONFIG_NUMA_BALANCING */
+
 #ifdef CONFIG_NUMA
 
 static int sched_domains_numa_levels;
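
sched_setnuma() follows the scheduler's standard recipe for changing a field
that queued state may depend on: dequeue the task (and give up the curr slot)
first, update numa_shared/numa_max_node, then requeue. The sketch below shows
one concrete reason the ordering matters, using the rq->nr_shared_running
counter this patch initializes further down in sched_init(); how the real
enqueue/dequeue paths maintain that counter lives in sched/fair.c and is only
assumed here.

#include <stdio.h>
#include <stdbool.h>

/*
 * If a runqueue counts its queued "shared" tasks at enqueue/dequeue time,
 * the per-task flag must not change while the task is queued, or the
 * counter drifts.  Hence: dequeue, flip the flag, requeue.
 */
struct rq {
        int nr_running;
        int nr_shared_running;
};

struct task {
        bool queued;
        bool numa_shared;
};

static void enqueue_task(struct rq *rq, struct task *t)
{
        rq->nr_running++;
        if (t->numa_shared)
                rq->nr_shared_running++;
        t->queued = true;
}

static void dequeue_task(struct rq *rq, struct task *t)
{
        rq->nr_running--;
        if (t->numa_shared)
                rq->nr_shared_running--;
        t->queued = false;
}

static void set_numa_shared(struct rq *rq, struct task *t, bool shared)
{
        bool queued = t->queued;

        if (queued)
                dequeue_task(rq, t);    /* counter drops under the old flag */
        t->numa_shared = shared;
        if (queued)
                enqueue_task(rq, t);    /* counter rises under the new flag */
}

int main(void)
{
        struct rq rq = { 0, 0 };
        struct task t = { .queued = false, .numa_shared = false };

        enqueue_task(&rq, &t);
        set_numa_shared(&rq, &t, true);
        printf("nr_running=%d nr_shared_running=%d\n",
               rq.nr_running, rq.nr_shared_running);
        return 0;
}
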
@@ -6004,6 +6073,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 1*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
+                                       | 1*SD_NUMA
                                        | sd_local_flags(level)
                                        ,
                .last_balance           = jiffies,
@@ -6858,7 +6928,6 @@ void __init sched_init(void)
                rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
-               rq->push_cpu = 0;
                rq->cpu = i;
                rq->online = 0;
                rq->idle_stamp = 0;
@@ -6866,6 +6935,10 @@ void __init sched_init(void)
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
 
+#ifdef CONFIG_NUMA_BALANCING
+               rq->nr_shared_running = 0;
+#endif
+
                rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ
                rq->nohz_flags = 0;