return atomic_read(&mm->mm_users) == 0;
}
+/*
+ * If the mm isn't the one associated with the current
+ * ksm_scan.mm_slot, ksm_exit() will not down_write();up_write(), so
+ * the ksm_test_exit() check run inside a mm->mmap_sem critical
+ * section cannot prevent exit_mmap() from running from under us.
+ * Hence, whenever we work with an "mm" that isn't guaranteed to be
+ * associated with the current ksm_scan.mm_slot, ksm_get_mm() is
+ * needed instead of the ksm_test_exit() run inside the mmap_sem.
+ * Return true if mm_users was incremented, or false if we failed to
+ * take the mm because it was already being freed from under us. If
+ * it returns true, the caller must call mmput() after it finishes
+ * using the mm.
+ */
+static __always_inline bool ksm_get_mm(struct mm_struct *mm)
+{
+ return likely(atomic_inc_not_zero(&mm->mm_users));
+}
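+
+/*
+ * Callers of ksm_get_mm() follow the pattern:
+ *
+ *	if (!ksm_get_mm(mm))
+ *		return ...;
+ *	down_read(&mm->mmap_sem);
+ *	... operate on the mm ...
+ *	up_read(&mm->mmap_sem);
+ *	mmput(mm);
+ */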
+
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
unsigned long addr)
{
struct vm_area_struct *vma;
- if (ksm_test_exit(mm))
- return NULL;
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
return NULL;
*/
put_anon_vma(rmap_item->anon_vma);
+ /*
+ * The "mm" of the unstable tree rmap_item isn't necessarily
+ * associated with the current ksm_scan.mm_slot; it could be
+ * any random mm. So we need ksm_get_mm() here to prevent
+ * exit_mmap() from running from under us in mmput().
+ */
+ if (!ksm_get_mm(mm))
+ return;
+
down_read(&mm->mmap_sem);
vma = find_mergeable_vma(mm, addr);
if (vma)
break_ksm(vma, addr);
up_read(&mm->mmap_sem);
-}
-
-static struct page *page_trans_compound_anon(struct page *page)
-{
- if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
- /*
- * head may actually be splitted and freed from under
- * us but it's ok here.
- */
- if (PageAnon(head))
- return head;
- }
- return NULL;
+ mmput(mm);
}
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
struct vm_area_struct *vma;
struct page *page;
+ /*
+ * The "mm" of the unstable tree rmap_item isn't necessarily
+ * associated with the current ksm_scan.mm_slot; it could be
+ * any random mm. So we need ksm_get_mm() here to prevent
+ * exit_mmap() from running from under us in mmput().
+ */
+ if (!ksm_get_mm(mm))
+ return NULL;
+
down_read(&mm->mmap_sem);
vma = find_mergeable_vma(mm, addr);
if (!vma)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page) || page_trans_compound_anon(page)) {
+ if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
out: page = NULL;
}
up_read(&mm->mmap_sem);
+ mmput(mm);
return page;
}
unlock_page(page);
put_page(page);
- if (stable_node->hlist.first)
+ if (!hlist_empty(&stable_node->hlist))
ksm_pages_sharing--;
else
ksm_pages_shared--;
}
get_page(kpage);
- page_add_anon_rmap(kpage, vma, addr);
+ page_add_anon_rmap(kpage, vma, addr, false);
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
return err;
}
-static int page_trans_compound_anon_split(struct page *page)
-{
- int ret = 0;
- struct page *transhuge_head = page_trans_compound_anon(page);
- if (transhuge_head) {
- /* Get the reference on the head to split it. */
- if (get_page_unless_zero(transhuge_head)) {
- /*
- * Recheck we got the reference while the head
- * was still anonymous.
- */
- if (PageAnon(transhuge_head))
- ret = split_huge_page(transhuge_head);
- else
- /*
- * Retry later if split_huge_page run
- * from under us.
- */
- ret = 1;
- put_page(transhuge_head);
- } else
- /* Retry later if split_huge_page run from under us. */
- ret = 1;
- }
- return ret;
-}
-
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
if (page == kpage) /* ksm page forked */
return 0;
- if (!(vma->vm_flags & VM_MERGEABLE))
- goto out;
- if (PageTransCompound(page) && page_trans_compound_anon_split(page))
- goto out;
- BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
*/
if (!trylock_page(page))
goto out;
+
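+ /*
+ * KSM only merges individual small pages, so a transparent
+ * huge page has to be split (now that the page is locked)
+ * before this page can be considered for merging.
+ */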
+ if (PageTransCompound(page)) {
+ err = split_huge_page(page);
+ if (err)
+ goto out_unlock;
+ }
+
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
*/
set_page_stable_node(page, NULL);
mark_page_accessed(page);
+ /*
+ * A stable page could be shared by several processes. After
+ * CoW or zapping has happened in every process except the
+ * last one, that last process may be the only owner of the
+ * page, yet its page table entry for the page may have no
+ * dirty bit set. In this case MADV_FREE could wrongly
+ * discard the page.
+ * To prevent that, mark the stable page dirty.
+ */
+ if (!PageDirty(page))
+ SetPageDirty(page);
err = 0;
} else if (pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);
}
}
+out_unlock:
unlock_page(page);
out:
return err;
struct vm_area_struct *vma;
int err = -EFAULT;
+ /*
+ * The "mm" of the unstable tree rmap_item isn't necessarily
+ * associated with the current ksm_scan.mm_slot; it could be
+ * any random mm. So we need ksm_get_mm() here to prevent
+ * exit_mmap() from running from under us in mmput(). Otherwise
+ * rmap_item->anon_vma could point to an anon_vma that has
+ * already been freed (i.e. get_anon_vma() below would run too
+ * late).
+ */
+ if (!ksm_get_mm(mm))
+ return err;
+
down_read(&mm->mmap_sem);
- if (ksm_test_exit(mm))
- goto out;
- vma = find_vma(mm, rmap_item->address);
- if (!vma || vma->vm_start > rmap_item->address)
+ vma = find_mergeable_vma(mm, rmap_item->address);
+ if (!vma)
goto out;
err = try_to_merge_one_page(vma, page, kpage);
get_anon_vma(vma->anon_vma);
out:
up_read(&mm->mmap_sem);
+ mmput(mm);
return err;
}
stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
- return NULL;
+ /*
+ * If we walked over a stale stable_node,
+ * get_ksm_page() will call rb_erase() and it
+ * may rebalance the tree from under us. So
+ * restart the search from scratch. Returning
+ * NULL would be safe too, but we'd generate
+ * false negative insertions just because some
+ * stable_node was stale which would waste CPU
+ * by doing the preparation work twice at the
+ * next KSM pass.
+ */
+ goto again;
ret = memcmp_pages(page, tree_page);
put_page(tree_page);
unsigned long kpfn;
struct rb_root *root;
struct rb_node **new;
- struct rb_node *parent = NULL;
+ struct rb_node *parent;
struct stable_node *stable_node;
kpfn = page_to_pfn(kpage);
nid = get_kpfn_nid(kpfn);
root = root_stable_tree + nid;
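+/*
+ * On each restart from the "again" label the rb_tree walk below
+ * starts over from the root, so "parent" has to be re-initialized
+ * on every pass rather than at declaration time.
+ */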
+again:
+ parent = NULL;
new = &root->rb_node;
while (*new) {
stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
- return NULL;
+ /*
+ * If we walked over a stale stable_node,
+ * get_ksm_page() will call rb_erase() and it
+ * may rebalance the tree from under us. So
+ * restart the search from scratch. Returning
+ * NULL would be safe too, but we'd generate
+ * false negative insertions just because some
+ * stable_node was stale which would waste CPU
+ * by doing the preparation work twice at the
+ * next KSM pass.
+ */
+ goto again;
ret = memcmp_pages(kpage, tree_page);
put_page(tree_page);
cond_resched();
tree_rmap_item = rb_entry(*new, struct rmap_item, node);
tree_page = get_mergeable_page(tree_rmap_item);
- if (IS_ERR_OR_NULL(tree_page))
+ if (!tree_page)
return NULL;
/*
cond_resched();
continue;
}
- if (PageAnon(*page) ||
- page_trans_compound_anon(*page)) {
+ if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
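+ /*
+ * A long-lived KSM page can be mapped by a large number of
+ * anon_vmas and VMAs; reschedule periodically while walking
+ * them so this loop doesn't hog the CPU.
+ */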
+ cond_resched();
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
+ cond_resched();
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)