Merge branch 'numa/misc'

[karo-tx-linux.git] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index fb135ba4aba90349e58c79d49838860cb7db8105..dbe32bd28642b0262a544f2932cec1f897111df1 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -36,6 +36,8 @@
   *             (Gerhard.Wichert@pdb.siemens.de)
   *
   * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
+ *
+ * 2012 - NUMA placement page faults (Andrea Arcangeli, Peter Zijlstra)
   */
  
  #include <linux/kernel_stat.h>
@@ -57,6 +59,7 @@
  #include <linux/swapops.h>
  #include <linux/elf.h>
  #include <linux/gfp.h>
+#include <linux/migrate.h>
  
  #include <asm/io.h>
  #include <asm/pgalloc.h>
@@ -67,6 +70,10 @@
  
  #include "internal.h"
  
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA config, growing page-frame for last_nid.
+#endif
+
  #ifndef CONFIG_NEED_MULTIPLE_NODES
  /* use the per-pgdat data instead for discontigmem - mbligh */
  unsigned long max_mapnr;
@@ -3433,6 +3440,84 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
  }
  
+static bool pte_prot_none(struct vm_area_struct *vma, pte_t pte)
+{
+       /*
+        * True only for a 'special' PROT_NONE pte: one without the normal
+        * vma->vm_page_prot protections but with those of vma_prot_none(vma).
+        *
+        * Because the normal protections are checked first, we cannot get
+        * 'special' PROT_NONE faults from genuine PROT_NONE maps, nor from
+        * PROT_WRITE file maps that do dirty tracking.
+        *
+        * Neither case is really interesting for our current use though so we
+        * don't care.
+        */
+       if (pte_same(pte, pte_modify(pte, vma->vm_page_prot)))
+               return false;
+
+       return pte_same(pte, pte_modify(pte, vma_prot_none(vma)));
+}
+
+static int do_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pte_t *ptep, pmd_t *pmd,
+                       unsigned int flags, pte_t entry)
+{
+       struct page *page = NULL;
+       int node, page_nid = -1;
+       spinlock_t *ptl;
+       /* Fault on a 'special' PROT_NONE pte: recheck it under the pte lock. */
+       ptl = pte_lockptr(mm, pmd);
+       spin_lock(ptl);
+       if (unlikely(!pte_same(*ptep, entry)))
+               goto unlock;
+       /* Pin the mapped page, if any; mpol_misplaced() != -1 means migrate. */
+       page = vm_normal_page(vma, address, entry);
+       if (page) {
+               get_page(page);
+               page_nid = page_to_nid(page);
+               node = mpol_misplaced(page, vma, address);
+               if (node != -1)
+                       goto migrate;
+       }
+
+fixup:
+       flush_cache_page(vma, address, pte_pfn(entry));
+       /* Rewrite the pte with the vma's normal vma->vm_page_prot protections. */
+       ptep_modify_prot_start(mm, address, ptep);
+       entry = pte_modify(entry, vma->vm_page_prot);
+       ptep_modify_prot_commit(mm, address, ptep, entry);
+
+       update_mmu_cache(vma, address, ptep);
+
+unlock:
+       pte_unmap_unlock(ptep, ptl);
+out:
+       if (page) {
+               task_numa_fault(page_nid, 1);
+               put_page(page);
+       }
+
+       return 0;       /* every path through this function returns 0 */
+
+migrate:
+       pte_unmap_unlock(ptep, ptl);
+       /* Lock dropped. On a zero return, task_numa_fault() is given 'node'. */
+       if (!migrate_misplaced_page(page, node)) {
+               page_nid = node;
+               goto out;
+       }
+       /* Otherwise retake the lock; if the pte changed, drop the ref and bail. */
+       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_same(*ptep, entry)) {
+               put_page(page);
+               page = NULL;
+               goto unlock;
+       }
+
+       goto fixup;
+}
+
  /*
   * These routines also need to handle stuff like marking pages dirty
   * and/or accessed for architectures that don't do it in hardware (most
@@ -3453,7 +3538,7 @@ int handle_pte_fault(struct mm_struct *mm,
         pte_t entry;
         spinlock_t *ptl;
  
-       entry = *pte;
+       entry = ACCESS_ONCE(*pte);
         if (!pte_present(entry)) {
                 if (pte_none(entry)) {
                         if (vma->vm_ops) {
@@ -3471,6 +3556,9 @@ int handle_pte_fault(struct mm_struct *mm,
                                         pte, pmd, flags, entry);
         }
  
+       if (pte_prot_none(vma, entry))
+               return do_prot_none(mm, vma, address, pte, pmd, flags, entry);
+
         ptl = pte_lockptr(mm, pmd);
         spin_lock(ptl);
         if (unlikely(!pte_same(*pte, entry)))
@@ -3535,13 +3623,16 @@ retry:
                                                           pmd, flags);
         } else {
                 pmd_t orig_pmd = *pmd;
-               int ret;
+               int ret = 0;
  
                 barrier();
-               if (pmd_trans_huge(orig_pmd)) {
-                       if (flags & FAULT_FLAG_WRITE &&
-                           !pmd_write(orig_pmd) &&
-                           !pmd_trans_splitting(orig_pmd)) {
+               if (pmd_trans_huge(orig_pmd) && !pmd_trans_splitting(orig_pmd)) {
+                       if (pmd_prot_none(vma, orig_pmd)) {
+                               do_huge_pmd_prot_none(mm, vma, address, pmd,
+                                                     flags, orig_pmd);
+                       }
+
+                       if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) {
                                 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                                                           orig_pmd);
                                 /*
@@ -3551,18 +3642,20 @@ retry:
                                  */
                                 if (unlikely(ret & VM_FAULT_OOM))
                                         goto retry;
-                               return ret;
                         }
-                       return 0;
+
+                       return ret;
                 }
         }
  
+
         /*
          * Use __pte_alloc instead of pte_alloc_map, because we can't
          * run pte_offset_map on the pmd, if an huge pmd could
          * materialize from under us from a different thread.
          */
-       if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+       if (unlikely(pmd_none(*pmd)) &&
+           unlikely(__pte_alloc(mm, vma, pmd, address)))
                 return VM_FAULT_OOM;
         /* if an huge pmd materialized from under us just retry later */
         if (unlikely(pmd_trans_huge(*pmd)))