mm: thp: fix pmd_bad() triggering in code paths holding mmap_sem read mode

author Andrea Arcangeli <aarcange@redhat.com>

Wed, 21 Mar 2012 23:33:42 +0000 (16:33 -0700)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 2 Apr 2012 17:31:53 +0000 (10:31 -0700)
author Andrea Arcangeli <aarcange@redhat.com>
Wed, 21 Mar 2012 23:33:42 +0000 (16:33 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 2 Apr 2012 17:31:53 +0000 (10:31 -0700)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c

index b466cab5ba15d171cb4b1fd61a1590bd0cca9956..328cb37bb827915ccc3e87cc6518acd7ccf25686 100644 (file)
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
         spinlock_t *ptl;
         int i;
  
+       down_write(&mm->mmap_sem);
         pgd = pgd_offset(mm, 0xA0000);
         if (pgd_none_or_clear_bad(pgd))
                 goto out;
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
         }
         pte_unmap_unlock(pte, ptl);
  out:
+       up_write(&mm->mmap_sem);
         flush_tlb();
  }
  
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c

index 7dcd2a250495d9a1777e6d7992637fb0a4897a81..3efa7253523e77c7a07d47d340f6df7a88deb741 100644 (file)
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         } else {
                 spin_unlock(&walk->mm->page_table_lock);
         }
+
+       if (pmd_trans_unstable(pmd))
+               return 0;
         /*
          * The mmap_sem held all the way back in m_start() is what
          * keeps khugepaged out of here and from collapsing things
@@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
         struct page *page;
  
         split_huge_page_pmd(walk->mm, pmd);
+       if (pmd_trans_unstable(pmd))
+               return 0;
  
         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         int err = 0;
  
         split_huge_page_pmd(walk->mm, pmd);
+       if (pmd_trans_unstable(pmd))
+               return 0;
  
         /* find the first VMA at or above 'addr' */
         vma = find_vma(walk->mm, addr);
@@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
                 spin_unlock(&walk->mm->page_table_lock);
         }
  
+       if (pmd_trans_unstable(pmd))
+               return 0;
         orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
         do {
                 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h

index 76bff2bff15e346532be60dc1ce1a13aff070471..a03c098b0cce94e06c87e022fb33f9e19b8f89cb 100644 (file)
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
                                 unsigned long size);
  #endif
  
+#ifdef CONFIG_MMU
+
  #ifndef CONFIG_TRANSPARENT_HUGEPAGE
  static inline int pmd_trans_huge(pmd_t pmd)
  {
@@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd)
         return 0;
  }
  #endif /* __HAVE_ARCH_PMD_WRITE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * This function is meant to be used by sites walking pagetables with
+ * the mmap_sem held in read mode to protect against MADV_DONTNEED and
+ * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
+ * into a null pmd and the transhuge page fault can convert a null pmd
+ * into a hugepmd or into a regular pmd (if the hugepage allocation
+ * fails). While holding the mmap_sem in read mode the pmd becomes
+ * stable and stops changing under us only if it's not null and not a
+ * transhuge pmd. When those races occur and this function makes a
+ * difference vs the standard pmd_none_or_clear_bad, the result is
+ * undefined so behaving as if the pmd was none is safe (because it
+ * can return none anyway). The compiler level barrier() is critically
+ * important to compute the two checks atomically on the same pmdval.
+ */
+static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
+{
+       /* depend on compiler for an atomic pmd read */
+       pmd_t pmdval = *pmd;
+       /*
+        * The barrier will stabilize the pmdval in a register or on
+        * the stack so that it will stop changing under the code.
+        */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       barrier();
+#endif
+       if (pmd_none(pmdval))
+               return 1;
+       if (unlikely(pmd_bad(pmdval))) {
+               if (!pmd_trans_huge(pmdval))
+                       pmd_clear_bad(pmd);
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * This is a noop if Transparent Hugepage Support is not built into
+ * the kernel. Otherwise it is equivalent to
+ * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
+ * places that already verified the pmd is not none and they want to
+ * walk ptes while holding the mmap_sem in read mode (write mode doesn't
+ * need this). If THP is not enabled, the pmd can't go away under the
+ * code even if MADV_DONTNEED runs, but if THP is enabled we need to
+ * run a pmd_trans_unstable before walking the ptes after
+ * split_huge_page_pmd returns (because it may have run when the pmd
+ * became null, but then a page fault can map in a THP and not a
+ * regular page).
+ */
+static inline int pmd_trans_unstable(pmd_t *pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       return pmd_none_or_trans_huge_or_clear_bad(pmd);
+#else
+       return 0;
  #endif
+}
+
+#endif /* CONFIG_MMU */
  
  #endif /* !__ASSEMBLY__ */
  
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 58a08fc7414aaf4de59b57a924994dfa43452468..cd412fc9b802c69a279adaf7d74c4dfcb142f089 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5234,6 +5234,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
         spinlock_t *ptl;
  
         split_huge_page_pmd(walk->mm, pmd);
+       if (pmd_trans_unstable(pmd))
+               return 0;
  
         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -5396,6 +5398,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
         spinlock_t *ptl;
  
         split_huge_page_pmd(walk->mm, pmd);
+       if (pmd_trans_unstable(pmd))
+               return 0;
  retry:
         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c

index fa2f04e0337c437e739e75a683ff5b80a070a95a..10b4ddadc37ebca11133a74c8081fcde8e259bae 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1247,16 +1247,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
         do {
                 next = pmd_addr_end(addr, end);
                 if (pmd_trans_huge(*pmd)) {
-                       if (next-addr != HPAGE_PMD_SIZE) {
+                       if (next - addr != HPAGE_PMD_SIZE) {
                                 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
                                 split_huge_page_pmd(vma->vm_mm, pmd);
                         } else if (zap_huge_pmd(tlb, vma, pmd, addr))
-                               continue;
+                               goto next;
                         /* fall through */
                 }
-               if (pmd_none_or_clear_bad(pmd))
-                       continue;
+               /*
+                * Here there can be other concurrent MADV_DONTNEED or
+                * trans huge page faults running, and if the pmd is
+                * none or trans huge it can change under us. This is
+                * because MADV_DONTNEED holds the mmap_sem in read
+                * mode.
+                */
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+                       goto next;
                 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
                 cond_resched();
         } while (pmd++, addr = next, addr != end);
  
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 47296fee23dbaa67c3408227d7653037a371bcdf..0a3757067631413e97ff1d843364321027be32b2 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
         do {
                 next = pmd_addr_end(addr, end);
                 split_huge_page_pmd(vma->vm_mm, pmd);
-               if (pmd_none_or_clear_bad(pmd))
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                         continue;
                 if (check_pte_range(vma, pmd, addr, next, nodes,
                                     flags, private))
diff --git a/mm/mincore.c b/mm/mincore.c

index 636a86876ff217a7f6d18f29c18524c2f2645a63..936b4cee8cb1ee126e7c78672d54062437ba5c45 100644 (file)
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                         }
                         /* fall through */
                 }
-               if (pmd_none_or_clear_bad(pmd))
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                         mincore_unmapped_range(vma, addr, next, vec);
                 else
                         mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c

index 2f5cf10ff6607483433cc3a059d28023afa9a042..aa9701e12714af2ce7ead752def02200910bf2b1 100644 (file)
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
                         continue;
  
                 split_huge_page_pmd(walk->mm, pmd);
-               if (pmd_none_or_clear_bad(pmd))
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                         goto again;
                 err = walk_pte_range(pmd, addr, next, walk);
                 if (err)
diff --git a/mm/swapfile.c b/mm/swapfile.c

index d999f090dfdabb6e5282aaaf3429029c5e3f839d..f31b29d2ca4e5062d889e74bd3f531f092748dad 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
         pmd = pmd_offset(pud, addr);
         do {
                 next = pmd_addr_end(addr, end);
-               if (unlikely(pmd_trans_huge(*pmd)))
-                       continue;
-               if (pmd_none_or_clear_bad(pmd))
+               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                         continue;
                 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
                 if (ret)
author	Andrea Arcangeli <aarcange@redhat.com>
	Wed, 21 Mar 2012 23:33:42 +0000 (16:33 -0700)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 2 Apr 2012 17:31:53 +0000 (10:31 -0700)
arch/x86/kernel/vm86_32.c		patch \| blob \| history
fs/proc/task_mmu.c		patch \| blob \| history
include/asm-generic/pgtable.h		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history
mm/mempolicy.c		patch \| blob \| history
mm/mincore.c		patch \| blob \| history
mm/pagewalk.c		patch \| blob \| history
mm/swapfile.c		patch \| blob \| history