diff --git a/mm/ksm.c b/mm/ksm.c
index 7ee101eaacdfe9eb82061585bb820df243a3650b..659e2b5119c043671cf7a0e19afcf1c68a78a3c0 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -349,6 +349,24 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
        return atomic_read(&mm->mm_users) == 0;
 }
 
+/*
+ * If the mm isn't the one associated with the current
+ * ksm_scan.mm_slot, ksm_exit() will not do the down_write();up_write()
+ * dance, so a ksm_test_exit() check run inside an mm->mmap_sem
+ * critical section will not prevent exit_mmap() from running under
+ * us. Therefore, whenever we work with an "mm" that isn't guaranteed
+ * to be the one associated with the current ksm_scan.mm_slot,
+ * ksm_get_mm() must be used instead of a ksm_test_exit() check under
+ * the mmap_sem. Return true if mm_users was incremented, or false if
+ * we failed to take the mm because it was already being freed from
+ * under us. If this returns true, the caller must call mmput() once
+ * it has finished using the mm.
+ */
+static __always_inline bool ksm_get_mm(struct mm_struct *mm)
+{
+       return likely(atomic_inc_not_zero(&mm->mm_users));
+}
+
 /*
  * We use break_ksm to break COW on a ksm page: it's a stripped down
  *
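
(Not part of the patch: a minimal sketch of the caller pattern the new helper
implies, mirroring the break_cow() hunk further down. The function name
use_foreign_mm() and its arguments are made up; everything else uses helpers
visible in this diff.)

static void use_foreign_mm(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;

        /* Pin mm_users so exit_mmap() cannot run while we use this mm. */
        if (!ksm_get_mm(mm))
                return; /* the mm was already being torn down */

        down_read(&mm->mmap_sem);
        vma = find_mergeable_vma(mm, addr);
        if (vma) {
                /* ... operate on the vma ... */
        }
        up_read(&mm->mmap_sem);

        /* Drop the pin; this mmput() may itself end up in exit_mmap(). */
        mmput(mm);
}
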
@@ -412,8 +430,6 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
                unsigned long addr)
 {
        struct vm_area_struct *vma;
-       if (ksm_test_exit(mm))
-               return NULL;
        vma = find_vma(mm, addr);
        if (!vma || vma->vm_start > addr)
                return NULL;
@@ -434,25 +450,21 @@ static void break_cow(struct rmap_item *rmap_item)
         */
        put_anon_vma(rmap_item->anon_vma);
 
+       /*
+        * The "mm" of the unstable tree rmap_item isn't necessarily
+        * associated with the current ksm_scan.mm_slot; it could be
+        * any random mm. So we need ksm_get_mm() here to prevent
+        * exit_mmap() from running under us via mmput().
+        */
+       if (!ksm_get_mm(mm))
+               return;
+
        down_read(&mm->mmap_sem);
        vma = find_mergeable_vma(mm, addr);
        if (vma)
                break_ksm(vma, addr);
        up_read(&mm->mmap_sem);
-}
-
-static struct page *page_trans_compound_anon(struct page *page)
-{
-       if (PageTransCompound(page)) {
-               struct page *head = compound_head(page);
-               /*
-                * head may actually be splitted and freed from under
-                * us but it's ok here.
-                */
-               if (PageAnon(head))
-                       return head;
-       }
-       return NULL;
+       mmput(mm);
 }
 
 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
@@ -462,6 +474,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
        struct vm_area_struct *vma;
        struct page *page;
 
+       /*
+        * The "mm" of the unstable tree rmap_item isn't necessarily
+        * associated with the current ksm_scan.mm_slot; it could be
+        * any random mm. So we need ksm_get_mm() here to prevent
+        * exit_mmap() from running under us via mmput().
+        */
+       if (!ksm_get_mm(mm))
+               return NULL;
+
        down_read(&mm->mmap_sem);
        vma = find_mergeable_vma(mm, addr);
        if (!vma)
@@ -470,7 +491,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
        page = follow_page(vma, addr, FOLL_GET);
        if (IS_ERR_OR_NULL(page))
                goto out;
-       if (PageAnon(page) || page_trans_compound_anon(page)) {
+       if (PageAnon(page)) {
                flush_anon_page(vma, page, addr);
                flush_dcache_page(page);
        } else {
@@ -478,6 +499,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 out:
                page = NULL;
        }
        up_read(&mm->mmap_sem);
+       mmput(mm);
        return page;
 }
 
@@ -625,7 +647,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
                unlock_page(page);
                put_page(page);
 
-               if (stable_node->hlist.first)
+               if (!hlist_empty(&stable_node->hlist))
                        ksm_pages_sharing--;
                else
                        ksm_pages_shared--;
@@ -957,13 +979,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
        }
 
        get_page(kpage);
-       page_add_anon_rmap(kpage, vma, addr);
+       page_add_anon_rmap(kpage, vma, addr, false);
 
        flush_cache_page(vma, addr, pte_pfn(*ptep));
        ptep_clear_flush_notify(vma, addr, ptep);
        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
-       page_remove_rmap(page);
+       page_remove_rmap(page, false);
        if (!page_mapped(page))
                try_to_free_swap(page);
        put_page(page);
@@ -976,33 +998,6 @@ out:
        return err;
 }
 
-static int page_trans_compound_anon_split(struct page *page)
-{
-       int ret = 0;
-       struct page *transhuge_head = page_trans_compound_anon(page);
-       if (transhuge_head) {
-               /* Get the reference on the head to split it. */
-               if (get_page_unless_zero(transhuge_head)) {
-                       /*
-                        * Recheck we got the reference while the head
-                        * was still anonymous.
-                        */
-                       if (PageAnon(transhuge_head))
-                               ret = split_huge_page(transhuge_head);
-                       else
-                               /*
-                                * Retry later if split_huge_page run
-                                * from under us.
-                                */
-                               ret = 1;
-                       put_page(transhuge_head);
-               } else
-                       /* Retry later if split_huge_page run from under us. */
-                       ret = 1;
-       }
-       return ret;
-}
-
 /*
  * try_to_merge_one_page - take two pages and merge them into one
  * @vma: the vma that holds the pte pointing to page
@@ -1021,11 +1016,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
        if (page == kpage)                      /* ksm page forked */
                return 0;
 
-       if (!(vma->vm_flags & VM_MERGEABLE))
-               goto out;
-       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
-               goto out;
-       BUG_ON(PageTransCompound(page));
        if (!PageAnon(page))
                goto out;
 
@@ -1038,6 +1028,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
         */
        if (!trylock_page(page))
                goto out;
+
+       if (PageTransCompound(page)) {
+               err = split_huge_page(page);
+               if (err)
+                       goto out_unlock;
+       }
+
        /*
         * If this anonymous page is mapped only here, its pte may need
         * to be write-protected.  If it's mapped elsewhere, all of its
@@ -1053,6 +1050,18 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
                         */
                        set_page_stable_node(page, NULL);
                        mark_page_accessed(page);
+                       /*
+                        * A stable page can be shared by several
+                        * processes. After CoW or zapping happens in
+                        * every process except the last one, that last
+                        * process owns the page alone, and its page
+                        * table entry may have no dirty bit set. In
+                        * that case MADV_FREE could wrongly discard
+                        * the page. To prevent this, mark the stable
+                        * page dirty here.
+                        */
+                       if (!PageDirty(page))
+                               SetPageDirty(page);
                        err = 0;
                } else if (pages_identical(page, kpage))
                        err = replace_page(vma, page, kpage, orig_pte);
@@ -1068,6 +1077,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
                }
        }
 
+out_unlock:
        unlock_page(page);
 out:
        return err;
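
(Not part of the patch: a condensed sketch of the control flow that
try_to_merge_one_page() ends up with, now that page_trans_compound_anon_split()
is gone and the THP split happens under the page lock. The name
merge_one_page_sketch() is made up and the middle is elided; only the shape of
the new error handling is shown.)

static int merge_one_page_sketch(struct vm_area_struct *vma,
                                 struct page *page, struct page *kpage)
{
        int err = -EFAULT;

        if (!PageAnon(page))
                goto out;

        if (!trylock_page(page))
                goto out;               /* page busy: retry on a later KSM pass */

        if (PageTransCompound(page)) {
                /* Split the huge page while we hold the page lock. */
                err = split_huge_page(page);
                if (err)
                        goto out_unlock;        /* split failed: retry later */
        }

        /* ... write-protect the pte and merge with kpage as before ... */
        err = 0;

out_unlock:
        unlock_page(page);
out:
        return err;
}
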
@@ -1086,11 +1096,21 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
        struct vm_area_struct *vma;
        int err = -EFAULT;
 
+       /*
+        * The "mm" of the unstable tree rmap_item isn't necessarily
+        * associated with the current ksm_scan.mm_slot; it could be
+        * any random mm. So we need ksm_get_mm() here to prevent
+        * exit_mmap() from running under us via mmput(). Otherwise
+        * rmap_item->anon_vma could point to an anon_vma that has
+        * already been freed (i.e. get_anon_vma() below would run too
+        * late).
+        */
+       if (!ksm_get_mm(mm))
+               return err;
+
        down_read(&mm->mmap_sem);
-       if (ksm_test_exit(mm))
-               goto out;
-       vma = find_vma(mm, rmap_item->address);
-       if (!vma || vma->vm_start > rmap_item->address)
+       vma = find_mergeable_vma(mm, rmap_item->address);
+       if (!vma)
                goto out;
 
        err = try_to_merge_one_page(vma, page, kpage);
@@ -1105,6 +1125,7 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
        get_anon_vma(vma->anon_vma);
 out:
        up_read(&mm->mmap_sem);
+       mmput(mm);
        return err;
 }
 
@@ -1178,7 +1199,18 @@ again:
                stable_node = rb_entry(*new, struct stable_node, node);
                tree_page = get_ksm_page(stable_node, false);
                if (!tree_page)
-                       return NULL;
+                       /*
+                        * If we walked over a stale stable_node,
+                        * get_ksm_page() will call rb_erase() and it
+                        * may rebalance the tree from under us. So
+                        * restart the search from scratch. Returning
+                        * NULL would be safe too, but we'd generate
+                        * false negative insertions just because some
+                        * stable_node was stale, which would waste CPU
+                        * by doing the preparation work twice at the
+                        * next KSM pass.
+                        */
+                       goto again;
 
                ret = memcmp_pages(page, tree_page);
                put_page(tree_page);
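
(Not part of the patch: a condensed sketch of the rbtree descent that
stable_tree_search() and stable_tree_insert() now share, showing why the walk
restarts at "again:" instead of returning NULL when a stale stable_node is
erased. The name stable_tree_walk_sketch() is made up and the match/insert
handling is elided.)

static struct stable_node *stable_tree_walk_sketch(struct page *page,
                                                   struct rb_root *root)
{
        struct rb_node **new, *parent;

again:
        new = &root->rb_node;
        parent = NULL;

        while (*new) {
                struct stable_node *stable_node;
                struct page *tree_page;
                int ret;

                cond_resched();
                stable_node = rb_entry(*new, struct stable_node, node);
                tree_page = get_ksm_page(stable_node, false);
                if (!tree_page)
                        /*
                         * get_ksm_page() rb_erase()d the stale node and
                         * may have rebalanced the tree, so the pointers
                         * gathered during this descent can no longer be
                         * trusted: restart from the root instead of
                         * returning NULL and redoing the work next pass.
                         */
                        goto again;

                ret = memcmp_pages(page, tree_page);
                put_page(tree_page);

                parent = *new;
                if (ret < 0)
                        new = &parent->rb_left;
                else if (ret > 0)
                        new = &parent->rb_right;
                else
                        return stable_node;     /* identical content found */
        }
        return NULL;    /* no match; stable_tree_insert() would link here */
}
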
@@ -1254,12 +1286,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
        unsigned long kpfn;
        struct rb_root *root;
        struct rb_node **new;
-       struct rb_node *parent = NULL;
+       struct rb_node *parent;
        struct stable_node *stable_node;
 
        kpfn = page_to_pfn(kpage);
        nid = get_kpfn_nid(kpfn);
        root = root_stable_tree + nid;
+again:
+       parent = NULL;
        new = &root->rb_node;
 
        while (*new) {
@@ -1270,7 +1304,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
                stable_node = rb_entry(*new, struct stable_node, node);
                tree_page = get_ksm_page(stable_node, false);
                if (!tree_page)
-                       return NULL;
+                       /*
+                        * If we walked over a stale stable_node,
+                        * get_ksm_page() will call rb_erase() and it
+                        * may rebalance the tree from under us. So
+                        * restart the search from scratch. Returning
+                        * NULL would be safe too, but we'd generate
+                        * false negative insertions just because some
+                        * stable_node was stale, which would waste CPU
+                        * by doing the preparation work twice at the
+                        * next KSM pass.
+                        */
+                       goto again;
 
                ret = memcmp_pages(kpage, tree_page);
                put_page(tree_page);
@@ -1340,7 +1385,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
                cond_resched();
                tree_rmap_item = rb_entry(*new, struct rmap_item, node);
                tree_page = get_mergeable_page(tree_rmap_item);
-               if (IS_ERR_OR_NULL(tree_page))
+               if (!tree_page)
                        return NULL;
 
                /*
@@ -1620,8 +1665,7 @@ next_mm:
                                cond_resched();
                                continue;
                        }
-                       if (PageAnon(*page) ||
-                           page_trans_compound_anon(*page)) {
+                       if (PageAnon(*page)) {
                                flush_anon_page(vma, *page, ksm_scan.address);
                                flush_dcache_page(*page);
                                rmap_item = get_next_rmap_item(slot,
@@ -1884,7 +1928,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 
                SetPageDirty(new_page);
                __SetPageUptodate(new_page);
-               __set_page_locked(new_page);
+               __SetPageLocked(new_page);
        }
 
        return new_page;
@@ -1914,9 +1958,11 @@ again:
                struct anon_vma_chain *vmac;
                struct vm_area_struct *vma;
 
+               cond_resched();
                anon_vma_lock_read(anon_vma);
                anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
                                               0, ULONG_MAX) {
+                       cond_resched();
                        vma = vmac->vma;
                        if (rmap_item->address < vma->vm_start ||
                            rmap_item->address >= vma->vm_end)