mm: avoid taking rmap locks in move_ptes()

author Michel Lespinasse <walken@google.com>

Fri, 21 Sep 2012 00:57:44 +0000 (10:57 +1000)

committer Stephen Rothwell <sfr@canb.auug.org.au>

Fri, 21 Sep 2012 05:58:51 +0000 (15:58 +1000)
author Michel Lespinasse <walken@google.com>
Fri, 21 Sep 2012 00:57:44 +0000 (10:57 +1000)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Fri, 21 Sep 2012 05:58:51 +0000 (15:58 +1000)
diff --git a/fs/exec.c b/fs/exec.c

index df8b282061e668dab6d028eba32a4b10f0009b75..bdeecd819dc03e48f77fa43312b53ee3e657b9fb 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -612,7 +612,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
          * process cleanup to remove whatever mess we made.
          */
         if (length != move_page_tables(vma, old_start,
-                                      vma, new_start, length))
+                                      vma, new_start, length, false))
                 return -ENOMEM;
  
         lru_add_drain();
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 82da3d3350a39005790fb7d14fad8345d6a08b35..392d6d37cbf6d1ac57dc4ea0358f2d49b9c0ab27 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1061,7 +1061,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
  
  extern unsigned long move_page_tables(struct vm_area_struct *vma,
                 unsigned long old_addr, struct vm_area_struct *new_vma,
-               unsigned long new_addr, unsigned long len);
+               unsigned long new_addr, unsigned long len,
+               bool need_rmap_locks);
  extern unsigned long do_mremap(unsigned long addr,
                                unsigned long old_len, unsigned long new_len,
                                unsigned long flags, unsigned long new_addr);
@@ -1411,7 +1412,8 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
         struct rb_node **, struct rb_node *);
  extern void unlink_file_vma(struct vm_area_struct *);
  extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-       unsigned long addr, unsigned long len, pgoff_t pgoff);
+       unsigned long addr, unsigned long len, pgoff_t pgoff,
+       bool *need_rmap_locks);
  extern void exit_mmap(struct mm_struct *);
  
  extern int mm_take_all_locks(struct mm_struct *mm);
diff --git a/mm/mmap.c b/mm/mmap.c

index 69b8f8cce3ca4fb73cc6d11084dae305e829e7a0..d8880368cd158662a3d2469899c006bb1b4b42a1 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2372,7 +2372,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
   * prior to moving page table entries, to effect an mremap move.
   */
  struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-       unsigned long addr, unsigned long len, pgoff_t pgoff)
+       unsigned long addr, unsigned long len, pgoff_t pgoff,
+       bool *need_rmap_locks)
  {
         struct vm_area_struct *vma = *vmap;
         unsigned long vma_start = vma->vm_start;
@@ -2414,8 +2415,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                          * linear if there are no pages mapped yet.
                          */
                         VM_BUG_ON(faulted_in_anon_vma);
-                       *vmap = new_vma;
+                       *vmap = vma = new_vma;
                 }
+               *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
         } else {
                 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
                 if (new_vma) {
@@ -2435,6 +2437,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                         if (new_vma->vm_ops && new_vma->vm_ops->open)
                                 new_vma->vm_ops->open(new_vma);
                         vma_link(mm, new_vma, prev, rb_link, rb_parent);
+                       *need_rmap_locks = false;
                 }
         }
         return new_vma;
diff --git a/mm/mremap.c b/mm/mremap.c

index 5588bb6e92956c049932baac68826fbaa6ce4a24..3b639a4b26bd3a29d3f46318cb4f5be23abb54c6 100644 (file)
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
  static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                 unsigned long old_addr, unsigned long old_end,
                 struct vm_area_struct *new_vma, pmd_t *new_pmd,
-               unsigned long new_addr)
+               unsigned long new_addr, bool need_rmap_locks)
  {
         struct address_space *mapping = NULL;
-       struct anon_vma *anon_vma = vma->anon_vma;
+       struct anon_vma *anon_vma = NULL;
         struct mm_struct *mm = vma->vm_mm;
         pte_t *old_pte, *new_pte, pte;
         spinlock_t *old_ptl, *new_ptl;
  
-       if (vma->vm_file) {
-               /*
-                * Subtle point from Rajesh Venkatasubramanian: before
-                * moving file-based ptes, we must lock truncate_pagecache
-                * out, since it might clean the dst vma before the src vma,
-                * and we propagate stale pages into the dst afterward.
-                */
-               mapping = vma->vm_file->f_mapping;
-               mutex_lock(&mapping->i_mmap_mutex);
+       /*
+        * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+        * locks to ensure that rmap will always observe either the old or the
+        * new ptes. This is the easiest way to avoid races with
+        * truncate_pagecache(), page migration, etc...
+        *
+        * When need_rmap_locks is false, we use other ways to avoid
+        * such races:
+        *
+        * - During exec() shift_arg_pages(), we use a specially tagged vma
+        *   which rmap call sites look for using is_vma_temporary_stack().
+        *
+        * - During mremap(), new_vma is often known to be placed after vma
+        *   in rmap traversal order. This ensures rmap will always observe
+        *   either the old pte, or the new pte, or both (the page table locks
+        *   serialize access to individual ptes, but only rmap traversal
+        *   order guarantees that we won't miss both the old and new ptes).
+        */
+       if (need_rmap_locks) {
+               if (vma->vm_file) {
+                       mapping = vma->vm_file->f_mapping;
+                       mutex_lock(&mapping->i_mmap_mutex);
+               }
+               if (vma->anon_vma) {
+                       anon_vma = vma->anon_vma;
+                       anon_vma_lock(anon_vma);
+               }
         }
-       if (anon_vma)
-               anon_vma_lock(anon_vma);
  
         /*
          * We don't have to worry about the ordering of src and dst
@@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
  
  unsigned long move_page_tables(struct vm_area_struct *vma,
                 unsigned long old_addr, struct vm_area_struct *new_vma,
-               unsigned long new_addr, unsigned long len)
+               unsigned long new_addr, unsigned long len,
+               bool need_rmap_locks)
  {
         unsigned long extent, next, old_end;
         pmd_t *old_pmd, *new_pmd;
@@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                 if (extent > LATENCY_LIMIT)
                         extent = LATENCY_LIMIT;
                 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-                               new_vma, new_pmd, new_addr);
+                         new_vma, new_pmd, new_addr, need_rmap_locks);
                 need_flush = true;
         }
         if (likely(need_flush))
@@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
         unsigned long hiwater_vm;
         int split = 0;
         int err;
+       bool need_rmap_locks;
  
         /*
          * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                 return err;
  
         new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-       new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+       new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+                          &need_rmap_locks);
         if (!new_vma)
                 return -ENOMEM;
  
-       moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+       moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+                                    need_rmap_locks);
         if (moved_len < old_len) {
                 /*
                  * On error, move entries back from new area to old,
                  * which will succeed since page tables still there,
                  * and then proceed to unmap new area instead of old.
                  */
-               move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+               move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+                                true);
                 vma = new_vma;
                 old_len = new_len;
                 old_addr = new_addr;
author	Michel Lespinasse <walken@google.com>
	Fri, 21 Sep 2012 00:57:44 +0000 (10:57 +1000)
committer	Stephen Rothwell <sfr@canb.auug.org.au>
	Fri, 21 Sep 2012 05:58:51 +0000 (15:58 +1000)
fs/exec.c		patch | blob | history
include/linux/mm.h		patch | blob | history
mm/mmap.c		patch | blob | history
mm/mremap.c		patch | blob | history