mm: thp: tail page refcounting fix

author Andrea Arcangeli <aarcange@redhat.com>

Wed, 2 Nov 2011 20:36:59 +0000 (13:36 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 2 Nov 2011 23:06:57 +0000 (16:06 -0700)
author Andrea Arcangeli <aarcange@redhat.com>
Wed, 2 Nov 2011 20:36:59 +0000 (13:36 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 2 Nov 2011 23:06:57 +0000 (16:06 -0700)
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c

index fec13200868f8e6ab40fd834abe7ed4bf6dc89f8..b9e1c7ff5f6d33d8a111c10ab9b3551f6df6f335 100644 (file)
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -22,8 +22,9 @@ static inline void get_huge_page_tail(struct page *page)
          * __split_huge_page_refcount() cannot run
          * from under us.
          */
-       VM_BUG_ON(atomic_read(&page->_count) < 0);
-       atomic_inc(&page->_count);
+       VM_BUG_ON(page_mapcount(page) < 0);
+       VM_BUG_ON(atomic_read(&page->_count) != 0);
+       atomic_inc(&page->_mapcount);
  }
  
  /*
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c

index dbe34b9313743f1cae72ac28aee5c99e5d2c9369..3b5032a62b0f3e41c98b98c0842500215e9e3e18 100644 (file)
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -114,8 +114,9 @@ static inline void get_huge_page_tail(struct page *page)
          * __split_huge_page_refcount() cannot run
          * from under us.
          */
-       VM_BUG_ON(atomic_read(&page->_count) < 0);
-       atomic_inc(&page->_count);
+       VM_BUG_ON(page_mapcount(page) < 0);
+       VM_BUG_ON(atomic_read(&page->_count) != 0);
+       atomic_inc(&page->_mapcount);
  }
  
  static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 3b3e3b8bb70652fa1d1669a9520b4b5969d6c935..f81b7b41930cc10d7525ca7272419acca1b356d5 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -356,36 +356,39 @@ static inline struct page *compound_head(struct page *page)
         return page;
  }
  
+/*
+ * The atomic page->_mapcount starts from -1, so that transitions
+ * both from it and to it can be tracked using atomic_inc_and_test()
+ * and atomic_add_negative(-1); page_mapcount() therefore adds 1.
+ */
+static inline void reset_page_mapcount(struct page *page)
+{
+       atomic_set(&(page)->_mapcount, -1);
+}
+
+static inline int page_mapcount(struct page *page)
+{
+       return atomic_read(&(page)->_mapcount) + 1;
+}
+
  static inline int page_count(struct page *page)
  {
         return atomic_read(&compound_head(page)->_count);
  }
  
+extern bool __get_page_tail(struct page *page);
+
  static inline void get_page(struct page *page)
  {
+       if (unlikely(PageTail(page)))
+               if (likely(__get_page_tail(page)))
+                       return;
         /*
          * Getting a normal page or the head of a compound page
-        * requires to already have an elevated page->_count. Only if
-        * we're getting a tail page, the elevated page->_count is
-        * required only in the head page, so for tail pages the
-        * bugcheck only verifies that the page->_count isn't
-        * negative.
+        * requires an already elevated page->_count.
          */
-       VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
+       VM_BUG_ON(atomic_read(&page->_count) <= 0);
         atomic_inc(&page->_count);
-       /*
-        * Getting a tail page will elevate both the head and tail
-        * page->_count(s).
-        */
-       if (unlikely(PageTail(page))) {
-               /*
-                * This is safe only because
-                * __split_huge_page_refcount can't run under
-                * get_page().
-                */
-               VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
-               atomic_inc(&page->first_page->_count);
-       }
  }
  
  static inline struct page *virt_to_head_page(const void *x)
@@ -803,21 +806,6 @@ static inline pgoff_t page_index(struct page *page)
         return page->index;
  }
  
-/*
- * The atomic page->_mapcount, like _count, starts from -1:
- * so that transitions both from it and to it can be tracked,
- * using atomic_inc_and_test and atomic_add_negative(-1).
- */
-static inline void reset_page_mapcount(struct page *page)
-{
-       atomic_set(&(page)->_mapcount, -1);
-}
-
-static inline int page_mapcount(struct page *page)
-{
-       return atomic_read(&(page)->_mapcount) + 1;
-}
-
  /*
   * Return true if this page is mapped into pagetables.
   */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index 3e01a19a91e8025f6f6f7499369071af53aef947..5b42f1b34eb74b59964f9a717d436cad44481c05 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -62,10 +62,23 @@ struct page {
                         struct {
  
                                 union {
-                                       atomic_t _mapcount;     /* Count of ptes mapped in mms,
-                                                        * to show when page is mapped
-                                                        * & limit reverse map searches.
-                                                        */
+                                       /*
+                                        * Count of ptes mapped in
+                                        * mms, to show when page is
+                                        * mapped & limit reverse map
+                                        * searches.
+                                        *
+                                        * Used also for tail pages
+                                        * refcounting instead of
+                                        * _count. Tail pages cannot
+                                        * be mapped and keeping the
+                                        * tail page _count zero at
+                                        * all times guarantees
+                                        * get_page_unless_zero() will
+                                        * never succeed on tail
+                                        * pages.
+                                        */
+                                       atomic_t _mapcount;
  
                                         struct {
                                                 unsigned inuse:16;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 860ec211ddd667175eca0abb95e82a01cdaa8e1e..4298abaae153033caafe1f8ac641fdef1dee097e 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -990,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
         page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
         VM_BUG_ON(!PageCompound(page));
         if (flags & FOLL_GET)
-               get_page(page);
+               get_page_foll(page);
  
  out:
         return page;
@@ -1202,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page)
         unsigned long head_index = page->index;
         struct zone *zone = page_zone(page);
         int zonestat;
+       int tail_count = 0;
  
         /* prevent PageLRU to go away from under us, and freeze lru stats */
         spin_lock_irq(&zone->lru_lock);
@@ -1210,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page)
         for (i = 1; i < HPAGE_PMD_NR; i++) {
                 struct page *page_tail = page + i;
  
-               /* tail_page->_count cannot change */
-               atomic_sub(atomic_read(&page_tail->_count), &page->_count);
-               BUG_ON(page_count(page) <= 0);
-               atomic_add(page_mapcount(page) + 1, &page_tail->_count);
-               BUG_ON(atomic_read(&page_tail->_count) <= 0);
+               /* tail_page->_mapcount cannot change */
+               BUG_ON(page_mapcount(page_tail) < 0);
+               tail_count += page_mapcount(page_tail);
+               /* check for overflow */
+               BUG_ON(tail_count < 0);
+               BUG_ON(atomic_read(&page_tail->_count) != 0);
+               /*
+                * tail_page->_count is zero and not changing from
+                * under us. But get_page_unless_zero() may be running
+                * from under us on the tail_page. If we used
+                * atomic_set() below instead of atomic_add(), we
+                * would then run atomic_set() concurrently with
+                * get_page_unless_zero(), and atomic_set() is
+                * implemented in C not using locked ops. spin_unlock
+                * on x86 sometimes uses locked ops because of PPro
+                * errata 66, 92, so unless somebody can guarantee
+                * atomic_set() here would be safe on all archs (and
+                * not only on x86), it's safer to use atomic_add().
+                */
+               atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+                          &page_tail->_count);
  
                 /* after clearing PageTail the gup refcount can be released */
                 smp_mb();
@@ -1232,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page)
                                       (1L << PG_uptodate)));
                 page_tail->flags |= (1L << PG_dirty);
  
-               /*
-                * 1) clear PageTail before overwriting first_page
-                * 2) clear PageTail before clearing PageHead for VM_BUG_ON
-                */
+               /* clear PageTail before overwriting first_page */
                 smp_wmb();
  
                 /*
@@ -1252,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page)
                  * status is achieved setting a reserved bit in the
                  * pmd, not by clearing the present bit.
                 */
-               BUG_ON(page_mapcount(page_tail));
                 page_tail->_mapcount = page->_mapcount;
  
                 BUG_ON(page_tail->mapping);
@@ -1269,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page)
  
                 lru_add_page_tail(zone, page, page_tail);
         }
+       atomic_sub(tail_count, &page->_count);
+       BUG_ON(atomic_read(&page->_count) <= 0);
  
         __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
         __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
diff --git a/mm/internal.h b/mm/internal.h

index d071d380fb498ab36ac6700343bf86485463bdbe..2189af491783f958c1337ebf1f1bb14dc5cd8b5d 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
         atomic_dec(&page->_count);
  }
  
+static inline void __get_page_tail_foll(struct page *page,
+                                       bool get_page_head)
+{
+       /*
+        * For a tail page the elevated page->_count is required only
+        * in the head page: we elevate the head page->_count (only if
+        * get_page_head is true) and always the tail page->_mapcount.
+        *
+        * We elevate page_tail->_mapcount for tail pages to force
+        * page_tail->_count to be zero at all times to avoid getting
+        * false positives from get_page_unless_zero() with
+        * speculative page access (like in
+        * page_cache_get_speculative()) on tail pages.
+        */
+       VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+       VM_BUG_ON(atomic_read(&page->_count) != 0);
+       VM_BUG_ON(page_mapcount(page) < 0);
+       if (get_page_head)
+               atomic_inc(&page->first_page->_count);
+       atomic_inc(&page->_mapcount);
+}
+
+/*
+ * Take a page reference as the FOLL_GET operation of follow_page().
+ * It must be called with the proper PT lock held, while the pte
+ * (or pmd_trans_huge) is still mapping the page.
+ */
+static inline void get_page_foll(struct page *page)
+{
+       if (unlikely(PageTail(page)))
+               /*
+                * Tail page: this is safe only because we hold the
+                * proper PT lock, so __split_huge_page_refcount()
+                * can't run under get_page_foll().
+                */
+               __get_page_tail_foll(page, true);
+       else {
+               /*
+                * Getting a normal page or the head of a compound page
+                * requires an already elevated page->_count.
+                */
+               VM_BUG_ON(atomic_read(&page->_count) <= 0);
+               atomic_inc(&page->_count);
+       }
+}
+
  extern unsigned long highest_memmap_pfn;
  
  /*
diff --git a/mm/memory.c b/mm/memory.c

index a56e3ba816b21e52016a1a46be99479afe45ad08..b2b87315cdc638bea2f99775113bf736d9aac505 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1503,7 +1503,7 @@ split_fallthrough:
         }
  
         if (flags & FOLL_GET)
-               get_page(page);
+               get_page_foll(page);
         if (flags & FOLL_TOUCH) {
                 if ((flags & FOLL_WRITE) &&
                     !pte_dirty(pte) && !PageDirty(page))
diff --git a/mm/swap.c b/mm/swap.c

index 3a442f18b0b3dab5acfd99b5fc4e85389b9b5439..87627f181c3f333075cb15773bd3d45db3ec1ea9 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
  {
         if (unlikely(PageTail(page))) {
                 /* __split_huge_page_refcount can run under us */
-               struct page *page_head = page->first_page;
-               smp_rmb();
-               /*
-                * If PageTail is still set after smp_rmb() we can be sure
-                * that the page->first_page we read wasn't a dangling pointer.
-                * See __split_huge_page_refcount() smp_wmb().
-                */
-               if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+               struct page *page_head = compound_trans_head(page);
+
+               if (likely(page != page_head &&
+                          get_page_unless_zero(page_head))) {
                         unsigned long flags;
                         /*
-                        * Verify that our page_head wasn't converted
-                        * to a a regular page before we got a
-                        * reference on it.
+                        * page_head wasn't a dangling pointer but it
+                        * may not be a head page anymore by the time
+                        * we obtain the lock. That is ok as long as it
+                        * can't be freed from under us.
                          */
-                       if (unlikely(!PageHead(page_head))) {
-                               /* PageHead is cleared after PageTail */
-                               smp_rmb();
-                               VM_BUG_ON(PageTail(page));
-                               goto out_put_head;
-                       }
-                       /*
-                        * Only run compound_lock on a valid PageHead,
-                        * after having it pinned with
-                        * get_page_unless_zero() above.
-                        */
-                       smp_mb();
-                       /* page_head wasn't a dangling pointer */
                         flags = compound_lock_irqsave(page_head);
                         if (unlikely(!PageTail(page))) {
                                 /* __split_huge_page_refcount run before us */
                                 compound_unlock_irqrestore(page_head, flags);
                                 VM_BUG_ON(PageHead(page_head));
-                       out_put_head:
                                 if (put_page_testzero(page_head))
                                         __put_single_page(page_head);
                         out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
                         VM_BUG_ON(page_head != page->first_page);
                         /*
                          * We can release the refcount taken by
-                        * get_page_unless_zero now that
-                        * split_huge_page_refcount is blocked on the
-                        * compound_lock.
+                        * get_page_unless_zero() now that
+                        * __split_huge_page_refcount() is blocked on
+                        * the compound_lock.
                          */
                         if (put_page_testzero(page_head))
                                 VM_BUG_ON(1);
                         /* __split_huge_page_refcount will wait now */
-                       VM_BUG_ON(atomic_read(&page->_count) <= 0);
-                       atomic_dec(&page->_count);
+                       VM_BUG_ON(page_mapcount(page) <= 0);
+                       atomic_dec(&page->_mapcount);
                         VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+                       VM_BUG_ON(atomic_read(&page->_count) != 0);
                         compound_unlock_irqrestore(page_head, flags);
                         if (put_page_testzero(page_head)) {
                                 if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
  }
  EXPORT_SYMBOL(put_page);
  
+/*
+ * Slow path of get_page() for tail pages: exported, but only get_page()
+ * may call it. Returns true if the reference on the tail was taken.
+ */
+bool __get_page_tail(struct page *page)
+{
+       /*
+        * This takes care of get_page() if run on a tail page
+        * returned by one of the get_user_pages/follow_page variants.
+        * get_user_pages/follow_page itself doesn't need the compound
+        * lock because it runs __get_page_tail_foll() under the
+        * proper PT lock that already serializes against
+        * split_huge_page().
+        */
+       unsigned long flags;
+       bool got = false;
+       struct page *page_head = compound_trans_head(page);
+
+       if (likely(page != page_head && get_page_unless_zero(page_head))) {
+               /*
+                * page_head wasn't a dangling pointer but it
+                * may not be a head page anymore by the time
+                * we obtain the lock. That is ok as long as it
+                * can't be freed from under us.
+                */
+               flags = compound_lock_irqsave(page_head);
+               /* here __split_huge_page_refcount won't run anymore */
+               if (likely(PageTail(page))) {
+                       __get_page_tail_foll(page, false);
+                       got = true;
+               }
+               compound_unlock_irqrestore(page_head, flags);
+               if (unlikely(!got))
+                       put_page(page_head);
+       }
+       return got;
+}
+EXPORT_SYMBOL(__get_page_tail);
+
  /**
   * put_pages_list() - release a list of pages
   * @pages: list of pages threaded on page->lru
author	Andrea Arcangeli <aarcange@redhat.com>
	Wed, 2 Nov 2011 20:36:59 +0000 (13:36 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 2 Nov 2011 23:06:57 +0000 (16:06 -0700)
arch/powerpc/mm/gup.c		patch \| blob \| history
arch/x86/mm/gup.c		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/mm_types.h		patch \| blob \| history
mm/huge_memory.c		patch \| blob \| history
mm/internal.h		patch \| blob \| history
mm/memory.c		patch \| blob \| history
mm/swap.c		patch \| blob \| history