Merge branch 'numa/misc'

diff --git a/mm/memory.c b/mm/memory.c
index fb135ba4aba90349e58c79d49838860cb7db8105..dbe32bd28642b0262a544f2932cec1f897111df1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -36,6 +36,8 @@
  *             (Gerhard.Wichert@pdb.siemens.de)
  *
  * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
+ *
+ * 2012 - NUMA placement page faults (Andrea Arcangeli, Peter Zijlstra)
  */
 
 #include <linux/kernel_stat.h>
@@ -57,6 +59,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
 
 #include "internal.h"
 
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA config, growing page-frame for last_nid.
+#endif
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
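
[Editor's note] The #warning above fires on configurations where the node id of the last fault cannot be packed into page->flags alongside the zone/node/section bits, so the series has to grow struct page instead. A minimal sketch of what that fallback looks like (the field name _last_nid and its placement are assumptions, not part of this hunk):

	/* Sketch (assumption): fallback storage for the last faulting node id. */
	struct page {
		/* ... existing struct page fields ... */
	#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
		int _last_nid;	/* node of the task that last faulted this page */
	#endif
	};
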
@@ -3433,6 +3440,84 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+static bool pte_prot_none(struct vm_area_struct *vma, pte_t pte)
+{
+       /*
+        * If we have the normal vma->vm_page_prot protections we're not a
+        * 'special' PROT_NONE page.
+        *
+        * This means we cannot get 'special' PROT_NONE faults from genuine
+        * PROT_NONE maps, nor from PROT_WRITE file maps that do dirty
+        * tracking.
+        *
+        * Neither case is really interesting for our current use though so we
+        * don't care.
+        */
+       if (pte_same(pte, pte_modify(pte, vma->vm_page_prot)))
+               return false;
+
+       return pte_same(pte, pte_modify(pte, vma_prot_none(vma)));
+}
+
+static int do_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pte_t *ptep, pmd_t *pmd,
+                       unsigned int flags, pte_t entry)
+{
+       struct page *page = NULL;
+       int node, page_nid = -1;
+       spinlock_t *ptl;
+
+       ptl = pte_lockptr(mm, pmd);
+       spin_lock(ptl);
+       if (unlikely(!pte_same(*ptep, entry)))
+               goto unlock;
+
+       page = vm_normal_page(vma, address, entry);
+       if (page) {
+               get_page(page);
+               page_nid = page_to_nid(page);
+               node = mpol_misplaced(page, vma, address);
+               if (node != -1)
+                       goto migrate;
+       }
+
+fixup:
+       flush_cache_page(vma, address, pte_pfn(entry));
+
+       ptep_modify_prot_start(mm, address, ptep);
+       entry = pte_modify(entry, vma->vm_page_prot);
+       ptep_modify_prot_commit(mm, address, ptep, entry);
+
+       update_mmu_cache(vma, address, ptep);
+
+unlock:
+       pte_unmap_unlock(ptep, ptl);
+out:
+       if (page) {
+               task_numa_fault(page_nid, 1);
+               put_page(page);
+       }
+
+       return 0;
+
+migrate:
+       pte_unmap_unlock(ptep, ptl);
+
+       if (!migrate_misplaced_page(page, node)) {
+               page_nid = node;
+               goto out;
+       }
+
+       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_same(*ptep, entry)) {
+               put_page(page);
+               page = NULL;
+               goto unlock;
+       }
+
+       goto fixup;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
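
[Editor's note] The two functions added above form the core of the NUMA hinting fault path: pte_prot_none() reports true only when the pte's protections differ from the vma's normal vm_page_prot yet match the special PROT_NONE variant, and do_prot_none() then revalidates the pte under the page-table lock, asks mpol_misplaced() whether the page should live on another node, tries migrate_misplaced_page() if so, and in the non-migrated case restores vma->vm_page_prot and accounts the fault via task_numa_fault(). The vma_prot_none() helper being compared against is defined elsewhere in this series; a minimal sketch of what it is expected to do (the exact definition here is an assumption):

	/*
	 * Sketch (assumption): the vma's protections with READ/WRITE/EXEC
	 * removed, so a pte carrying them always faults.
	 */
	static inline pgprot_t vma_prot_none(struct vm_area_struct *vma)
	{
		vm_flags_t vmflags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);

		return pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vmflags));
	}
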
@@ -3453,7 +3538,7 @@ int handle_pte_fault(struct mm_struct *mm,
        pte_t entry;
        spinlock_t *ptl;
 
-       entry = *pte;
+       entry = ACCESS_ONCE(*pte);
        if (!pte_present(entry)) {
                if (pte_none(entry)) {
                        if (vma->vm_ops) {
@@ -3471,6 +3556,9 @@ int handle_pte_fault(struct mm_struct *mm,
                                        pte, pmd, flags, entry);
        }
 
+       if (pte_prot_none(vma, entry))
+               return do_prot_none(mm, vma, address, pte, pmd, flags, entry);
+
        ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        if (unlikely(!pte_same(*pte, entry)))
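
[Editor's note] Two details in the handle_pte_fault() changes above are worth spelling out: the pte is sampled without the page-table lock held, so ACCESS_ONCE() forces a single read of *pte and the prot_none path re-checks pte_same() once it holds the ptl; and the pte_prot_none() test sits after the !pte_present() handling, so swap and file faults are dispatched exactly as before. For reference, ACCESS_ONCE() in kernels of this vintage is the volatile cast from <linux/compiler.h>:

	/* <linux/compiler.h>: force the compiler to access x exactly as written. */
	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
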
@@ -3535,13 +3623,16 @@ retry:
                                                          pmd, flags);
        } else {
                pmd_t orig_pmd = *pmd;
-               int ret;
+               int ret = 0;
 
                barrier();
-               if (pmd_trans_huge(orig_pmd)) {
-                       if (flags & FAULT_FLAG_WRITE &&
-                           !pmd_write(orig_pmd) &&
-                           !pmd_trans_splitting(orig_pmd)) {
+               if (pmd_trans_huge(orig_pmd) && !pmd_trans_splitting(orig_pmd)) {
+                       if (pmd_prot_none(vma, orig_pmd)) {
+                               do_huge_pmd_prot_none(mm, vma, address, pmd,
+                                                     flags, orig_pmd);
+                       }
+
+                       if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) {
                                ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                                                          orig_pmd);
                                /*
@@ -3551,18 +3642,20 @@ retry:
                                 */
                                if (unlikely(ret & VM_FAULT_OOM))
                                        goto retry;
-                               return ret;
                        }
-                       return 0;
+
+                       return ret;
                }
        }
 
+
        /*
         * Use __pte_alloc instead of pte_alloc_map, because we can't
         * run pte_offset_map on the pmd, if an huge pmd could
         * materialize from under us from a different thread.
         */
-       if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+       if (unlikely(pmd_none(*pmd)) &&
+           unlikely(__pte_alloc(mm, vma, pmd, address)))
                return VM_FAULT_OOM;
        /* if an huge pmd materialized from under us just retry later */
        if (unlikely(pmd_trans_huge(*pmd)))
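
[Editor's note] In the handle_mm_fault() hunks above, the transparent-huge-page branch now filters out splitting pmds up front, lets do_huge_pmd_prot_none() handle a NUMA hinting fault on the huge pmd, and only then falls through to the write-protect handling; ret is initialised to 0 so the branch can return it even when neither handler ran. pmd_prot_none() and do_huge_pmd_prot_none() live outside this file; presumably the predicate simply mirrors pte_prot_none() at the pmd level, roughly as follows (a sketch under that assumption):

	/* Sketch (assumption): huge-pmd analogue of pte_prot_none() above. */
	static bool pmd_prot_none(struct vm_area_struct *vma, pmd_t pmd)
	{
		/* Normal protections for this vma: not a NUMA hinting fault. */
		if (pmd_same(pmd, pmd_modify(pmd, vma->vm_page_prot)))
			return false;

		/* Only a hinting fault if it matches the vma's PROT_NONE variant. */
		return pmd_same(pmd, pmd_modify(pmd, vma_prot_none(vma)));
	}
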