thp: avoid breaking huge pmd invariants in case of vma_adjust failures

author Andrea Arcangeli <aarcange@redhat.com>

Thu, 13 Jan 2011 23:47:08 +0000 (15:47 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 14 Jan 2011 01:32:45 +0000 (17:32 -0800)
author Andrea Arcangeli <aarcange@redhat.com>
Thu, 13 Jan 2011 23:47:08 +0000 (15:47 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Jan 2011 01:32:45 +0000 (17:32 -0800)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h

index c590b08c6fa60321ea546aa24b2e0d19b3b448c4..827595228734fcfe4deebf24f6ce3d76f1f6115e 100644 (file)
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -104,6 +104,19 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
  #error "hugepages can't be allocated by the buddy allocator"
  #endif
  extern int hugepage_madvise(unsigned long *vm_flags);
+extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+                                   unsigned long start,
+                                   unsigned long end,
+                                   long adjust_next);
+static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
+                                        unsigned long start,
+                                        unsigned long end,
+                                        long adjust_next)
+{
+       if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+               return;
+       __vma_adjust_trans_huge(vma, start, end, adjust_next);
+}
  #else /* CONFIG_TRANSPARENT_HUGEPAGE */
  #define HPAGE_PMD_SHIFT ({ BUG(); 0; })
  #define HPAGE_PMD_MASK ({ BUG(); 0; })
@@ -125,6 +138,12 @@ static inline int hugepage_madvise(unsigned long *vm_flags)
         BUG();
         return 0;
  }
+static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
+                                        unsigned long start,
+                                        unsigned long end,
+                                        long adjust_next)
+{
+}
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
  #endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 30c3cec8202396a0cc223361f999fc094083b7ab..b6facc35e8932d8b612e0d78ac48056f4c2aa815 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1075,8 +1075,16 @@ pmd_t *page_check_address_pmd(struct page *page,
                 goto out;
         if (pmd_page(*pmd) != page)
                 goto out;
-       VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
-                 pmd_trans_splitting(*pmd));
+       /*
+        * split_vma() may create temporary aliased mappings. There is
+        * no risk as long as all huge pmd are found and have their
+        * splitting bit set before __split_huge_page_refcount
+        * runs. Finding the same huge pmd more than once during the
+        * same rmap walk is not a problem.
+        */
+       if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+           pmd_trans_splitting(*pmd))
+               goto out;
         if (pmd_trans_huge(*pmd)) {
                 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
                           !pmd_trans_splitting(*pmd));
@@ -2196,3 +2204,71 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
         put_page(page);
         BUG_ON(pmd_trans_huge(*pmd));
  }
+
+static void split_huge_page_address(struct mm_struct *mm,
+                                   unsigned long address)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+       pgd = pgd_offset(mm, address);
+       if (!pgd_present(*pgd))
+               return;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return;
+       /*
+        * Caller holds the mmap_sem write mode, so a huge pmd cannot
+        * materialize from under us.
+        */
+       split_huge_page_pmd(mm, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+                            unsigned long start,
+                            unsigned long end,
+                            long adjust_next)
+{
+       /*
+        * If the new start address isn't hpage aligned and it could
+        * previously contain an hugepage: check if we need to split
+        * an huge pmd.
+        */
+       if (start & ~HPAGE_PMD_MASK &&
+           (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+           (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+               split_huge_page_address(vma->vm_mm, start);
+
+       /*
+        * If the new end address isn't hpage aligned and it could
+        * previously contain an hugepage: check if we need to split
+        * an huge pmd.
+        */
+       if (end & ~HPAGE_PMD_MASK &&
+           (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+           (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+               split_huge_page_address(vma->vm_mm, end);
+
+       /*
+        * If we're also updating the vma->vm_next->vm_start, if the new
+        * vm_next->vm_start isn't page aligned and it could previously
+        * contain an hugepage: check if we need to split an huge pmd.
+        */
+       if (adjust_next > 0) {
+               struct vm_area_struct *next = vma->vm_next;
+               unsigned long nstart = next->vm_start;
+               nstart += adjust_next << PAGE_SHIFT;
+               if (nstart & ~HPAGE_PMD_MASK &&
+                   (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+                   (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+                       split_huge_page_address(next->vm_mm, nstart);
+       }
+}
diff --git a/mm/mmap.c b/mm/mmap.c

index 753f44d17047d80ff1b1cb64e9d53d09fd054cb4..73cc648873d6d00712457b7aa67d13db31b5c781 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -589,6 +589,8 @@ again:                      remove_next = 1 + (end > next->vm_end);
                 }
         }
  
+       vma_adjust_trans_huge(vma, start, end, adjust_next);
+
         /*
          * When changing only vma->vm_end, we don't really need anon_vma
          * lock. This is a fairly rare case by itself, but the anon_vma
author	Andrea Arcangeli <aarcange@redhat.com>
	Thu, 13 Jan 2011 23:47:08 +0000 (15:47 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 14 Jan 2011 01:32:45 +0000 (17:32 -0800)
include/linux/huge_mm.h		patch \| blob \| history
mm/huge_memory.c		patch \| blob \| history
mm/mmap.c		patch \| blob \| history