From bb527e7df7bd5f05dca74868f61b7a9a17ee54c1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Oct 2012 13:06:22 +0200
Subject: [PATCH] sched/numa: More NUMA_PULL_BIAS fixes

Consider the simple case of 2 runnable tasks, each with a different
preferred node, and each running on the wrong node. Weight-wise, this
is a well-balanced situation, so the load-balancer will not find
anything to do and is content.

NUMA_PULL_BIAS is the 'force' introduced to break this situation and
promote task movement despite the weight balance so that we might find
the situation where each task runs on its preferred node.

It was (again) observed that this force was too strong and created
undue task movement. In particular, suppose two runnable tasks both
have the same preferred node; the weight balance will then move one
of them to the other node. This results in one task running on the
right node and one on the wrong node.

NUMA_PULL_BIAS would in this case try and move the off-node task back
to its home-node in order to promote better node-preference. This is
clearly sub-optimal since either we end up moving that same task back,
or worse, we'll move the other task to the now empty node.

This results in endless undue task movement.

Solve this by limiting NUMA_PULL_BIAS so that it only moves an
offnode task to its home-node when the home-node has either another
offnode task or a non-NUMA task to trade back for it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-2sz5f6n058jxkvuwmupjpv7x@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  1 +
 kernel/sched/fair.c  | 70 +++++++++++++++++++++++++++++---------------
 kernel/sched/sched.h | 11 +------
 3 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c386297e05c5..23ad8b94aeb2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6930,6 +6930,7 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 #ifdef CONFIG_SCHED_NUMA
 		INIT_LIST_HEAD(&rq->offnode_tasks);
+		rq->onnode_running = 0;
 		rq->offnode_running = 0;
 		rq->offnode_weight = 0;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab2f11be7e96..7d522d08448f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -797,17 +797,28 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static unsigned long task_h_load(struct task_struct *p);
 
 #ifdef CONFIG_SCHED_NUMA
-static void account_offnode_enqueue(struct rq *rq, struct task_struct *p)
+static struct list_head *account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
-	p->numa_contrib = task_h_load(p);
-	rq->offnode_weight += p->numa_contrib;
-	rq->offnode_running++;
+	struct list_head *tasks = &rq->cfs_tasks;
+
+	if (tsk_home_node(p) != cpu_to_node(task_cpu(p))) {
+		p->numa_contrib = task_h_load(p);
+		rq->offnode_weight += p->numa_contrib;
+		rq->offnode_running++;
+		tasks = &rq->offnode_tasks;
+	} else
+		rq->onnode_running++;
+
+	return tasks;
 }
 
-static void account_offnode_dequeue(struct rq *rq, struct task_struct *p)
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
-	rq->offnode_weight -= p->numa_contrib;
-	rq->offnode_running--;
+	if (tsk_home_node(p) != cpu_to_node(task_cpu(p))) {
+		rq->offnode_weight -= p->numa_contrib;
+		rq->offnode_running--;
+	} else
+		rq->onnode_running--;
 }
 
 /*
@@ -956,11 +967,13 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	}
 }
 #else
-static void account_offnode_enqueue(struct rq *rq, struct task_struct *p)
+static struct list_head *account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
+	/* !CONFIG_SCHED_NUMA stub: nothing to account, no list to pick (same as the offnode_tasks() stub). */
+	return NULL;
 }
 
-static void account_offnode_dequeue(struct rq *rq, struct task_struct *p)
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
 
@@ -985,10 +996,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		struct task_struct *p = task_of(se);
 		struct list_head *tasks = &rq->cfs_tasks;
 
-		if (offnode_task(p)) {
-			account_offnode_enqueue(rq, p);
-			tasks = offnode_tasks(rq);
-		}
+		if (tsk_home_node(p) != -1)
+			tasks = account_numa_enqueue(rq, p);
 
 		list_add(&se->group_node, tasks);
 	}
@@ -1007,8 +1016,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 		list_del_init(&se->group_node);
 
-		if (offnode_task(p))
-			account_offnode_dequeue(rq_of(cfs_rq), p);
+		if (tsk_home_node(p) != -1)
+			account_numa_dequeue(rq_of(cfs_rq), p);
 	}
 	cfs_rq->nr_running--;
 }
@@ -3762,6 +3771,9 @@ struct sd_lb_stats {
 	struct sched_group *numa_group; /* group which has offnode_tasks */
 	unsigned long numa_group_weight;
 	unsigned long numa_group_running;
+
+	unsigned long this_offnode_running;
+	unsigned long this_onnode_running;
 #endif
 };
 
@@ -3779,8 +3791,9 @@ struct sg_lb_stats {
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
 #ifdef CONFIG_SCHED_NUMA
-	unsigned long numa_weight;
-	unsigned long numa_running;
+	unsigned long numa_offnode_weight;
+	unsigned long numa_offnode_running;
+	unsigned long numa_onnode_running;
 #endif
 };
 
@@ -3813,8 +3826,9 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 #ifdef CONFIG_SCHED_NUMA
 static inline void update_sg_numa_stats(struct sg_lb_stats *sgs, struct rq *rq)
 {
-	sgs->numa_weight += rq->offnode_weight;
-	sgs->numa_running += rq->offnode_running;
+	sgs->numa_offnode_weight += rq->offnode_weight;
+	sgs->numa_offnode_running += rq->offnode_running;
+	sgs->numa_onnode_running += rq->onnode_running;
 }
 
 /*
@@ -3832,16 +3846,19 @@ static inline void update_sd_numa_stats(struct sched_domain *sd,
 	if (!(sd->flags & SD_NUMA))
 		return;
 
-	if (local_group)
+	if (local_group) {
+		sds->this_offnode_running = sgs->numa_offnode_running;
+		sds->this_onnode_running  = sgs->numa_onnode_running;
 		return;
+	}
 
-	if (!sgs->numa_running)
+	if (!sgs->numa_offnode_running)
 		return;
 
 	if (!sds->numa_group || pick_numa_rand(sd->span_weight / group->group_weight)) {
 		sds->numa_group = group;
-		sds->numa_group_weight = sgs->numa_weight;
-		sds->numa_group_running = sgs->numa_running;
+		sds->numa_group_weight = sgs->numa_offnode_weight;
+		sds->numa_group_running = sgs->numa_offnode_running;
 	}
 }
 
@@ -3878,6 +3895,13 @@ static inline int check_numa_busiest_group(struct lb_env *env, struct sd_lb_stat
 	if (!sds->numa_group)
 		return 0;
 
+	/*
+	 * Only pull an offnode task home if we've got offnode or !numa tasks to trade for it.
+	 */
+	if (!sds->this_offnode_running &&
+	    !(sds->this_nr_running - sds->this_onnode_running - sds->this_offnode_running))
+		return 0;
+
 	env->imbalance = sds->numa_group_weight / sds->numa_group_running;
 	sds->busiest = sds->numa_group;
 	env->find_busiest_queue = find_busiest_numa_queue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3060136ef896..e68cef1a024c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -420,6 +420,7 @@ struct rq {
 	struct list_head cfs_tasks;
 
 #ifdef CONFIG_SCHED_NUMA
+	unsigned long    onnode_running;
 	unsigned long    offnode_running;
 	unsigned long	 offnode_weight;
 	struct list_head offnode_tasks;
@@ -494,11 +495,6 @@ DECLARE_PER_CPU(struct rq, runqueues);
 #define raw_rq()		(&__raw_get_cpu_var(runqueues))
 
 #ifdef CONFIG_SCHED_NUMA
-static inline bool offnode_task(struct task_struct *t)
-{
-	return t->node != -1 && t->node != cpu_to_node(task_cpu(t));
-}
-
 static inline struct list_head *offnode_tasks(struct rq *rq)
 {
 	return &rq->offnode_tasks;
@@ -509,11 +505,6 @@ static inline void task_numa_free(struct task_struct *p)
 	kfree(p->numa_faults);
 }
 #else /* CONFIG_SCHED_NUMA */
-static inline bool offnode_task(struct task_struct *t)
-{
-	return false;
-}
-
 static inline struct list_head *offnode_tasks(struct rq *rq)
 {
 	return NULL;
-- 
2.39.2