rmap: do not call mmu_notifier_invalidate_page() under ptl
index ced14f1af6dc29ac2337eac9806eb6415238536b..c1286d47aa1fad7fee7ea5bb865a2dc7efd672f2 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -604,6 +604,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
        arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
        tlb_ubc->flush_required = true;
 
+       /*
+        * Ensure compiler does not re-order the setting of tlb_flush_batched
+        * before the PTE is cleared.
+        */
+       barrier();
+       mm->tlb_flush_batched = true;
+
        /*
         * If the PTE was dirty then it's best to assume it's writable. The
         * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
@@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 
        return should_defer;
 }
+
+/*
+ * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the TLB. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and munmap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+       if (mm->tlb_flush_batched) {
+               flush_tlb_mm(mm);
+
+               /*
+                * Do not allow the compiler to re-order the clearing of
+                * tlb_flush_batched before the tlb is flushed.
+                */
+               barrier();
+               mm->tlb_flush_batched = false;
+       }
+}
 #else
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
 {
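
A minimal caller-side sketch of how an operation such as mprotect would use flush_tlb_batched_pending() (the function example_change_range() and its arguments are illustrative only, not part of this patch): take the PTL, drain any flush that reclaim deferred for this mm while holding that lock, and only then rewrite the PTEs.

static void example_change_range(struct vm_area_struct *vma, pmd_t *pmd,
                                 unsigned long addr, unsigned long end,
                                 pgprot_t newprot)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pte_t *pte, entry;

        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

        /*
         * Drain any TLB flush that reclaim batched for this mm. This runs
         * under the same PTL that reclaim's PTE clearing ran under, so the
         * access to tlb_flush_batched is serialised as required above.
         */
        flush_tlb_batched_pending(mm);

        for (; addr != end; pte++, addr += PAGE_SIZE) {
                if (!pte_present(*pte))
                        continue;
                /* Clear, modify and reinstall the PTE, mprotect-style. */
                entry = ptep_get_and_clear(mm, addr, pte);
                entry = pte_modify(entry, newprot);
                set_pte_at(mm, addr, pte, entry);
        }
        pte_unmap_unlock(pte - 1, ptl);
}
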
@@ -852,10 +888,10 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                .flags = PVMW_SYNC,
        };
        int *cleaned = arg;
+       bool invalidation_needed = false;
 
        while (page_vma_mapped_walk(&pvmw)) {
                int ret = 0;
-               address = pvmw.address;
                if (pvmw.pte) {
                        pte_t entry;
                        pte_t *pte = pvmw.pte;
@@ -863,11 +899,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                        if (!pte_dirty(*pte) && !pte_write(*pte))
                                continue;
 
-                       flush_cache_page(vma, address, pte_pfn(*pte));
-                       entry = ptep_clear_flush(vma, address, pte);
+                       flush_cache_page(vma, pvmw.address, pte_pfn(*pte));
+                       entry = ptep_clear_flush(vma, pvmw.address, pte);
                        entry = pte_wrprotect(entry);
                        entry = pte_mkclean(entry);
-                       set_pte_at(vma->vm_mm, address, pte, entry);
+                       set_pte_at(vma->vm_mm, pvmw.address, pte, entry);
                        ret = 1;
                } else {
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -877,11 +913,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                        if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
                                continue;
 
-                       flush_cache_page(vma, address, page_to_pfn(page));
-                       entry = pmdp_huge_clear_flush(vma, address, pmd);
+                       flush_cache_page(vma, pvmw.address, page_to_pfn(page));
+                       entry = pmdp_huge_clear_flush(vma, pvmw.address, pmd);
                        entry = pmd_wrprotect(entry);
                        entry = pmd_mkclean(entry);
-                       set_pmd_at(vma->vm_mm, address, pmd, entry);
+                       set_pmd_at(vma->vm_mm, pvmw.address, pmd, entry);
                        ret = 1;
 #else
                        /* unexpected pmd-mapped page? */
@@ -890,11 +926,16 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                }
 
                if (ret) {
-                       mmu_notifier_invalidate_page(vma->vm_mm, address);
                        (*cleaned)++;
+                       invalidation_needed = true;
                }
        }
 
+       if (invalidation_needed) {
+               mmu_notifier_invalidate_range(vma->vm_mm, address,
+                               address + (PAGE_SIZE << compound_order(page)));
+       }
+
        return true;
 }
 
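
For the hunk above, a sketch of what this change means for a secondary MMU: a registered mmu_notifier now receives a single ->invalidate_range() call covering the whole page after the rmap walk has finished (and the PTL has been dropped), instead of one mmu_notifier_invalidate_page() per mapping issued while the lock was still held. The callback and ops names below are hypothetical.

static void example_invalidate_range(struct mmu_notifier *mn,
                                     struct mm_struct *mm,
                                     unsigned long start, unsigned long end)
{
        /* Drop secondary-MMU TLB entries covering [start, end). */
        pr_debug("invalidate [%lx, %lx)\n", start, end);
}

static const struct mmu_notifier_ops example_mn_ops = {
        .invalidate_range = example_invalidate_range,
};
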
@@ -1287,7 +1328,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        };
        pte_t pteval;
        struct page *subpage;
-       bool ret = true;
+       bool ret = true, invalidation_needed = false;
        enum ttu_flags flags = (enum ttu_flags)arg;
 
        /* munlock has nothing to gain from examining un-locked vmas */
@@ -1327,11 +1368,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                VM_BUG_ON_PAGE(!pvmw.pte, page);
 
                subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
-               address = pvmw.address;
-
 
                if (!(flags & TTU_IGNORE_ACCESS)) {
-                       if (ptep_clear_flush_young_notify(vma, address,
+                       if (ptep_clear_flush_young_notify(vma, pvmw.address,
                                                pvmw.pte)) {
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
@@ -1340,7 +1379,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                }
 
                /* Nuke the page table entry. */
-               flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+               flush_cache_page(vma, pvmw.address, pte_pfn(*pvmw.pte));
                if (should_defer_flush(mm, flags)) {
                        /*
                         * We clear the PTE but do not flush so potentially
@@ -1350,11 +1389,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         * transition on a cached TLB entry is written through
                         * and traps if the PTE is unmapped.
                         */
-                       pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+                       pteval = ptep_get_and_clear(mm, pvmw.address,
+                                                   pvmw.pte);
 
                        set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
                } else {
-                       pteval = ptep_clear_flush(vma, address, pvmw.pte);
+                       pteval = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
                }
 
                /* Move the dirty bit to the page. Now the pte is gone. */
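
The deferred-flush branch above relies on the reclaim caller issuing the batched flush later, as the comment in the first hunk notes. A rough sketch of that contract with a hypothetical function (reclaim clears PTEs without flushing, then flushes every queued mm before the pages can be reused):

static void example_reclaim_unmap(struct list_head *page_list)
{
        struct page *page, *next;

        /* Clear PTEs without flushing; each mm is queued for a later flush. */
        list_for_each_entry_safe(page, next, page_list, lru)
                try_to_unmap(page, TTU_BATCH_FLUSH);

        /* Flush every mm queued via set_tlb_ubc_flush_pending() above. */
        try_to_unmap_flush();
}
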
@@ -1369,12 +1409,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        if (PageHuge(page)) {
                                int nr = 1 << compound_order(page);
                                hugetlb_count_sub(nr, mm);
-                               set_huge_swap_pte_at(mm, address,
+                               set_huge_swap_pte_at(mm, pvmw.address,
                                                     pvmw.pte, pteval,
                                                     vma_mmu_pagesize(vma));
                        } else {
                                dec_mm_counter(mm, mm_counter(page));
-                               set_pte_at(mm, address, pvmw.pte, pteval);
+                               set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
                        }
 
                } else if (pte_unused(pteval)) {
@@ -1398,7 +1438,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
-                       set_pte_at(mm, address, pvmw.pte, swp_pte);
+                       set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
                } else if (PageAnon(page)) {
                        swp_entry_t entry = { .val = page_private(subpage) };
                        pte_t swp_pte;
@@ -1424,7 +1464,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                 * If the page was redirtied, it cannot be
                                 * discarded. Remap the page to page table.
                                 */
-                               set_pte_at(mm, address, pvmw.pte, pteval);
+                               set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
                                SetPageSwapBacked(page);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
@@ -1432,7 +1472,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        }
 
                        if (swap_duplicate(entry) < 0) {
-                               set_pte_at(mm, address, pvmw.pte, pteval);
+                               set_pte_at(mm, pvmw.address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
@@ -1448,14 +1488,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
-                       set_pte_at(mm, address, pvmw.pte, swp_pte);
+                       set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
                } else
                        dec_mm_counter(mm, mm_counter_file(page));
 discard:
                page_remove_rmap(subpage, PageHuge(page));
                put_page(page);
-               mmu_notifier_invalidate_page(mm, address);
+               invalidation_needed = true;
        }
+
+       if (invalidation_needed)
+               mmu_notifier_invalidate_range(mm, address,
+                               address + (PAGE_SIZE << compound_order(page)));
        return ret;
 }
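
A note on the invalidation span used after both rmap walks: the range must cover the whole (possibly compound) page in bytes, i.e. PAGE_SIZE shifted by the compound order, which is 2MiB for a PMD-sized THP with 4KiB base pages. A tiny sketch with a hypothetical helper:

static inline unsigned long invalidate_span(struct page *page)
{
        /* Bytes spanned by the page: PAGE_SIZE * 2^compound_order. */
        return PAGE_SIZE << compound_order(page);
}

With such a helper, the end of the invalidated range is simply address + invalidate_span(page).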