thp: clear_copy_huge_page

[mv-sheeva.git] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index 02e48aa0ed136ff8e4d808d954a20d0b46e6d23d..60e1c68d821845f4bf75fe23e8d070ccc80eb360 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
         }
  }
  
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+               pmd_t *pmd, unsigned long address)
  {
         pgtable_t new = pte_alloc_one(mm, address);
+       int wait_split_huge_page;
         if (!new)
                 return -ENOMEM;
  
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
         smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
  
         spin_lock(&mm->page_table_lock);
-       if (!pmd_present(*pmd)) {       /* Has another populated it ? */
+       wait_split_huge_page = 0;
+       if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                 mm->nr_ptes++;
                 pmd_populate(mm, pmd, new);
                 new = NULL;
-       }
+       } else if (unlikely(pmd_trans_splitting(*pmd)))
+               wait_split_huge_page = 1;
         spin_unlock(&mm->page_table_lock);
         if (new)
                 pte_free(mm, new);
+       if (wait_split_huge_page)
+               wait_split_huge_page(vma->anon_vma, pmd);
         return 0;
  }
  
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
         smp_wmb(); /* See comment in __pte_alloc */
  
         spin_lock(&init_mm.page_table_lock);
-       if (!pmd_present(*pmd)) {       /* Has another populated it ? */
+       if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                 pmd_populate_kernel(&init_mm, pmd, new);
                 new = NULL;
-       }
+       } else
+               VM_BUG_ON(pmd_trans_splitting(*pmd));
         spin_unlock(&init_mm.page_table_lock);
         if (new)
                 pte_free_kernel(&init_mm, new);
@@ -1310,6 +1317,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                  */
                 mark_page_accessed(page);
         }
+       if (flags & FOLL_MLOCK) {
+               /*
+                * The preliminary mapping check is mainly to avoid the
+                * pointless overhead of lock_page on the ZERO_PAGE
+                * which might bounce very badly if there is contention.
+                *
+                * If the page is already locked, we don't need to
+                * handle it now - vmscan will handle it later if and
+                * when it attempts to reclaim the page.
+                */
+               if (page->mapping && trylock_page(page)) {
+                       lru_add_drain();  /* push cached pages to LRU */
+                       /*
+                        * Because we lock page here and migration is
+                        * blocked by the pte's page reference, we need
+                        * only check for file-cache page truncation.
+                        */
+                       if (page->mapping)
+                               mlock_vma_page(page);
+                       unlock_page(page);
+               }
+       }
  unlock:
         pte_unmap_unlock(ptep, ptl);
  out:
@@ -1341,7 +1370,8 @@ no_page_table:
  
  int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                      unsigned long start, int nr_pages, unsigned int gup_flags,
-                    struct page **pages, struct vm_area_struct **vmas)
+                    struct page **pages, struct vm_area_struct **vmas,
+                    int *nonblocking)
  {
         int i;
         unsigned long vm_flags;
@@ -1441,10 +1471,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         cond_resched();
                         while (!(page = follow_page(vma, start, foll_flags))) {
                                 int ret;
+                               unsigned int fault_flags = 0;
+
+                               if (foll_flags & FOLL_WRITE)
+                                       fault_flags |= FAULT_FLAG_WRITE;
+                               if (nonblocking)
+                                       fault_flags |= FAULT_FLAG_ALLOW_RETRY;
  
                                 ret = handle_mm_fault(mm, vma, start,
-                                       (foll_flags & FOLL_WRITE) ?
-                                       FAULT_FLAG_WRITE : 0);
+                                                       fault_flags);
  
                                 if (ret & VM_FAULT_ERROR) {
                                         if (ret & VM_FAULT_OOM)
@@ -1460,6 +1495,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                 else
                                         tsk->min_flt++;
  
+                               if (ret & VM_FAULT_RETRY) {
+                                       *nonblocking = 0;
+                                       return i;
+                               }
+
                                 /*
                                  * The VM_FAULT_WRITE bit tells us that
                                  * do_wp_page has broken COW when necessary,
@@ -1559,7 +1599,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
         if (force)
                 flags |= FOLL_FORCE;
  
-       return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+       return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+                               NULL);
  }
  EXPORT_SYMBOL(get_user_pages);
  
@@ -1584,7 +1625,8 @@ struct page *get_dump_page(unsigned long addr)
         struct page *page;
  
         if (__get_user_pages(current, current->mm, addr, 1,
-                       FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+                            FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+                            NULL) < 1)
                 return NULL;
         flush_cache_page(vma, addr, page_to_pfn(page));
         return page;
@@ -2048,19 +2090,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
         return same;
  }
  
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
- * servicing faults for write access.  In the normal case, do always want
- * pte_mkwrite.  But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-{
-       if (likely(vma->vm_flags & VM_WRITE))
-               pte = pte_mkwrite(pte);
-       return pte;
-}
-
  static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
  {
         /*
@@ -2112,7 +2141,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
  {
         struct page *old_page, *new_page;
         pte_t entry;
-       int reuse = 0, ret = 0;
+       int ret = 0;
         int page_mkwrite = 0;
         struct page *dirty_page = NULL;
  
@@ -2149,14 +2178,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         }
                         page_cache_release(old_page);
                 }
-               reuse = reuse_swap_page(old_page);
-               if (reuse)
+               if (reuse_swap_page(old_page)) {
                         /*
                          * The page is all ours.  Move it to our anon_vma so
                          * the rmap code will not search our parent or siblings.
                          * Protected against the rmap code by the page lock.
                          */
                         page_move_anon_rmap(old_page, vma, address);
+                       unlock_page(old_page);
+                       goto reuse;
+               }
                 unlock_page(old_page);
         } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                                         (VM_WRITE|VM_SHARED))) {
@@ -2220,18 +2251,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 }
                 dirty_page = old_page;
                 get_page(dirty_page);
-               reuse = 1;
-       }
  
-       if (reuse) {
  reuse:
                 flush_cache_page(vma, address, pte_pfn(orig_pte));
                 entry = pte_mkyoung(orig_pte);
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                 if (ptep_set_access_flags(vma, address, page_table, entry,1))
                         update_mmu_cache(vma, address, page_table);
+               pte_unmap_unlock(page_table, ptl);
                 ret |= VM_FAULT_WRITE;
-               goto unlock;
+
+               if (!dirty_page)
+                       return ret;
+
+               /*
+                * Yes, Virginia, this is actually required to prevent a race
+                * with clear_page_dirty_for_io() from clearing the page dirty
+                * bit after it clear all dirty ptes, but before a racing
+                * do_wp_page installs a dirty pte.
+                *
+                * do_no_page is protected similarly.
+                */
+               if (!page_mkwrite) {
+                       wait_on_page_locked(dirty_page);
+                       set_page_dirty_balance(dirty_page, page_mkwrite);
+               }
+               put_page(dirty_page);
+               if (page_mkwrite) {
+                       struct address_space *mapping = dirty_page->mapping;
+
+                       set_page_dirty(dirty_page);
+                       unlock_page(dirty_page);
+                       page_cache_release(dirty_page);
+                       if (mapping)    {
+                               /*
+                                * Some device drivers do not set page.mapping
+                                * but still dirty their pages
+                                */
+                               balance_dirty_pages_ratelimited(mapping);
+                       }
+               }
+
+               /* file_update_time outside page_lock */
+               if (vma->vm_file)
+                       file_update_time(vma->vm_file);
+
+               return ret;
         }
  
         /*
@@ -2337,39 +2402,6 @@ gotten:
                 page_cache_release(old_page);
  unlock:
         pte_unmap_unlock(page_table, ptl);
-       if (dirty_page) {
-               /*
-                * Yes, Virginia, this is actually required to prevent a race
-                * with clear_page_dirty_for_io() from clearing the page dirty
-                * bit after it clear all dirty ptes, but before a racing
-                * do_wp_page installs a dirty pte.
-                *
-                * do_no_page is protected similarly.
-                */
-               if (!page_mkwrite) {
-                       wait_on_page_locked(dirty_page);
-                       set_page_dirty_balance(dirty_page, page_mkwrite);
-               }
-               put_page(dirty_page);
-               if (page_mkwrite) {
-                       struct address_space *mapping = dirty_page->mapping;
-
-                       set_page_dirty(dirty_page);
-                       unlock_page(dirty_page);
-                       page_cache_release(dirty_page);
-                       if (mapping)    {
-                               /*
-                                * Some device drivers do not set page.mapping
-                                * but still dirty their pages
-                                */
-                               balance_dirty_pages_ratelimited(mapping);
-                       }
-               }
-
-               /* file_update_time outside page_lock */
-               if (vma->vm_file)
-                       file_update_time(vma->vm_file);
-       }
         return ret;
  oom_free_new:
         page_cache_release(new_page);
@@ -3228,7 +3260,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         pmd = pmd_alloc(mm, pud, address);
         if (!pmd)
                 return VM_FAULT_OOM;
-       pte = pte_alloc_map(mm, pmd, address);
+       pte = pte_alloc_map(mm, vma, pmd, address);
         if (!pte)
                 return VM_FAULT_OOM;
  
@@ -3296,7 +3328,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
         vma = find_vma(current->mm, addr);
         if (!vma)
                 return -ENOMEM;
-       write = (vma->vm_flags & VM_WRITE) != 0;
+       /*
+        * We want to touch writable mappings with a write fault in order
+        * to break COW, except for shared mappings because these don't COW
+        * and we would not want to dirty them for nothing.
+        */
+       write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
         BUG_ON(addr >= end);
         BUG_ON(end > vma->vm_end);
         len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3608,3 +3645,74 @@ void might_fault(void)
  }
  EXPORT_SYMBOL(might_fault);
  #endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+/* Clear a huge page subpage by subpage, stepping with mem_map_next(). */
+static void clear_gigantic_page(struct page *page, unsigned long addr,
+                               unsigned int pages_per_huge_page)
+{
+       struct page *subpage = page;
+       int idx;
+
+       might_sleep();
+       for (idx = 0; idx < pages_per_huge_page; idx++) {
+               cond_resched();
+               clear_user_highpage(subpage, addr + idx * PAGE_SIZE);
+               subpage = mem_map_next(subpage, page, idx + 1);
+       }
+}
+/* Clear a huge page subpage by subpage; large counts go to the gigantic path. */
+void clear_huge_page(struct page *page, unsigned long addr,
+                    unsigned int pages_per_huge_page)
+{
+       int idx;
+
+       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+               clear_gigantic_page(page, addr, pages_per_huge_page);
+               return;
+       }
+       might_sleep();
+       for (idx = 0; idx < pages_per_huge_page; idx++, page++) {
+               cond_resched();
+               clear_user_highpage(page, addr + idx * PAGE_SIZE);
+       }
+}
+
+/* Copy src to dst subpage by subpage, stepping both with mem_map_next(). */
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+                                   unsigned long addr,
+                                   struct vm_area_struct *vma,
+                                   unsigned int pages_per_huge_page)
+{
+       struct page *dst_head = dst;
+       struct page *src_head = src;
+       int idx;
+
+       for (idx = 0; idx < pages_per_huge_page; idx++) {
+               cond_resched();
+               copy_user_highpage(dst, src, addr + idx * PAGE_SIZE, vma);
+               /* mem_map_next() is handed the index being stepped to */
+               dst = mem_map_next(dst, dst_head, idx + 1);
+               src = mem_map_next(src, src_head, idx + 1);
+       }
+}
+
+/* Copy a huge page subpage by subpage; large counts go to the gigantic path. */
+void copy_user_huge_page(struct page *dst, struct page *src,
+                        unsigned long addr, struct vm_area_struct *vma,
+                        unsigned int pages_per_huge_page)
+{
+       int idx;
+
+       if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+               copy_user_gigantic_page(dst, src, addr, vma,
+                                       pages_per_huge_page);
+               return;
+       }
+       might_sleep();
+       for (idx = 0; idx < pages_per_huge_page; idx++, dst++, src++) {
+               cond_resched();
+               copy_user_highpage(dst, src, addr + idx * PAGE_SIZE, vma);
+       }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */