mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node

author Mel Gorman <mgorman@suse.de>

Thu, 22 Nov 2012 14:40:03 +0000 (14:40 +0000)

committer Mel Gorman <mgorman@suse.de>

Tue, 11 Dec 2012 14:42:56 +0000 (14:42 +0000)
author Mel Gorman <mgorman@suse.de>
Thu, 22 Nov 2012 14:40:03 +0000 (14:40 +0000)
committer Mel Gorman <mgorman@suse.de>
Tue, 11 Dec 2012 14:42:56 +0000 (14:42 +0000)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index e850a23dd6ecd9c76743f579bad5f5f4df0d3cd5..197422a1598c38d21a01695286bd6f9e473357bc 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -418,10 +418,20 @@ struct mm_struct {
  
         /* numa_scan_seq prevents two threads setting pte_numa */
         int numa_scan_seq;
+
+       /*
+        * The first node a task was scheduled on. If a task runs on
+        * a different node, then Make PTE Scan Go Now.
+        */
+       int first_nid;
  #endif
         struct uprobes_state uprobes_state;
  };
  
+/* first nid will either be a valid NID or one of these values */
+#define NUMA_PTE_SCAN_INIT     -1
+#define NUMA_PTE_SCAN_ACTIVE   -2
+
  static inline void mm_init_cpumask(struct mm_struct *mm)
  {
  #ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/kernel/fork.c b/kernel/fork.c

index 8b20ab7d3aa2951eff91a4e09e0af23a90992747..296ea308096d4c37bd03ff45d833bd32474d7081 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -820,6 +820,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
         mm->pmd_huge_pte = NULL;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+       mm->first_nid = NUMA_PTE_SCAN_INIT;
  #endif
         if (!mm_init(mm, tsk))
                 goto fail_nomem;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 7a02a2082e95154d5094ded219b20f76ac98a525..3e18f611a5aa6d15e41c2e32186da7587e386ef7 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -860,6 +860,24 @@ void task_numa_work(struct callback_head *work)
         if (p->flags & PF_EXITING)
                 return;
  
+       /*
+        * We do not care about task placement until a task runs on a node
+        * other than the first one used by the address space. This is
+        * largely because migrations are driven by what CPU the task
+        * is running on. If it's never scheduled on another node, it'll
+        * not migrate, so why bother trapping the fault?
+        */
+       if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+               mm->first_nid = numa_node_id();
+       if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+               /* Are we running on a new node yet? */
+               if (numa_node_id() == mm->first_nid &&
+                   !sched_feat_numa(NUMA_FORCE))
+                       return;
+
+               mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+       }
+
         /*
          * Reset the scan period if enough time has gone by. Objective is that
          * scanning will be reduced if pages are properly placed. As tasks
diff --git a/kernel/sched/features.h b/kernel/sched/features.h

index d2373a3e32528ae6869248188108dc2283cd249e..e7c25fff1e94df70faab87d4d97772ba2a2cdd4e 100644 (file)
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -65,8 +65,10 @@ SCHED_FEAT(LB_MIN, false)
  /*
   * Apply the automatic NUMA scheduling policy. Enabled automatically
   * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
+ * numa_balancing=. Allow PTE scanning to be forced on UMA machines
+ * for debugging the core machinery.
   */
  #ifdef CONFIG_NUMA_BALANCING
  SCHED_FEAT(NUMA,       false)
+SCHED_FEAT(NUMA_FORCE, false)
  #endif
author	Mel Gorman <mgorman@suse.de>
	Thu, 22 Nov 2012 14:40:03 +0000 (14:40 +0000)
committer	Mel Gorman <mgorman@suse.de>
	Tue, 11 Dec 2012 14:42:56 +0000 (14:42 +0000)
include/linux/mm_types.h		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/features.h		patch \| blob \| history