#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
+#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
+#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
- if (vma->vm_flags & VM_LOCKED)
- return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+ pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
spinlock_t *ptl;
ptl = pmd_lock(mm, pmd);
- if (pmd_none(*pmd)) {
- entry = pmd_mkhuge(pfn_pmd(pfn, prot));
- if (write) {
- entry = pmd_mkyoung(pmd_mkdirty(entry));
- entry = maybe_pmd_mkwrite(entry, vma);
- }
- set_pmd_at(mm, addr, pmd, entry);
- update_mmu_cache_pmd(vma, addr, pmd);
- }
+ entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
+ if (pfn_t_devmap(pfn))
+ entry = pmd_mkdevmap(entry);
+ if (write) {
+ entry = pmd_mkyoung(pmd_mkdirty(entry));
+ entry = maybe_pmd_mkwrite(entry, vma);
+ }
+ set_pmd_at(mm, addr, pmd, entry);
+ update_mmu_cache_pmd(vma, addr, pmd);
spin_unlock(ptl);
}
int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned long pfn, bool write)
+ pmd_t *pmd, pfn_t pfn, bool write)
{
pgprot_t pgprot = vma->vm_page_prot;
/*
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+ BUG_ON(!pfn_t_devmap(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
ret = -EAGAIN;
pmd = *src_pmd;
- if (unlikely(!pmd_trans_huge(pmd))) {
+ if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
pte_free(dst_mm, pgtable);
goto out_unlock;
}
goto out_unlock;
}
- src_page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
- get_page(src_page);
- page_dup_rmap(src_page, true);
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ if (pmd_trans_huge(pmd)) {
+ /* thp accounting separate from pmd_devmap accounting */
+ src_page = pmd_page(pmd);
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+ get_page(src_page);
+ page_dup_rmap(src_page, true);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ atomic_long_inc(&dst_mm->nr_ptes);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ }
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
- pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- atomic_long_inc(&dst_mm->nr_ptes);
ret = 0;
out_unlock:
update_mmu_cache_pmd(vma, addr, pmd);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- if (page->mapping && trylock_page(page)) {
+ /*
+ * We don't mlock() pte-mapped THPs. This way we can avoid
+ * leaking mlocked pages into non-VM_LOCKED VMAs.
+ *
+ * In most cases the pmd is the only mapping of the page as we
+ * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+ * writable private mappings in populate_vma_page_range().
+ *
+ * The only scenario when we have the page shared here is if we
+ * mlocking read-only mapping shared over fork(). We skip
+ * mlocking such pages.
+ */
+ if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+ page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
mlock_vma_page(page);
return 0;
}
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr, unsigned long next)
+
+{
+ spinlock_t *ptl;
+ pmd_t orig_pmd;
+ struct page *page;
+ struct mm_struct *mm = tlb->mm;
+ int ret = 0;
+
+ if (!pmd_trans_huge_lock(pmd, vma, &ptl))
+ goto out;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd)) {
+ ret = 1;
+ goto out;
+ }
+
+ page = pmd_page(orig_pmd);
+ /*
+ * If other processes are mapping this page, we couldn't discard
+ * the page unless they all do MADV_FREE so let's skip the page.
+ */
+ if (page_mapcount(page) != 1)
+ goto out;
+
+ if (!trylock_page(page))
+ goto out;
+
+ /*
+ * If user want to discard part-pages of THP, split it so MADV_FREE
+ * will deactivate only them.
+ */
+ if (next - addr != HPAGE_PMD_SIZE) {
+ get_page(page);
+ spin_unlock(ptl);
+ if (split_huge_page(page)) {
+ put_page(page);
+ unlock_page(page);
+ goto out_unlocked;
+ }
+ put_page(page);
+ unlock_page(page);
+ ret = 1;
+ goto out_unlocked;
+ }
+
+ if (PageDirty(page))
+ ClearPageDirty(page);
+ unlock_page(page);
+
+ if (PageActive(page))
+ deactivate_page(page);
+
+ if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
+ orig_pmd = pmd_mkold(orig_pmd);
+ orig_pmd = pmd_mkclean(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+ ret = 1;
+out:
+ spin_unlock(ptl);
+out_unlocked:
+ return ret;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
spinlock_t **ptl)
{
*ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(pmd_trans_huge(*pmd)))
+ if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
return true;
spin_unlock(*ptl);
return false;
}
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
-pmd_t *page_check_address_pmd(struct page *page,
- struct mm_struct *mm,
- unsigned long address,
- spinlock_t **ptl)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- if (address & ~HPAGE_PMD_MASK)
- return NULL;
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return NULL;
- pmd = pmd_offset(pud, address);
-
- *ptl = pmd_lock(mm, pmd);
- if (!pmd_present(*pmd))
- goto unlock;
- if (pmd_page(*pmd) != page)
- goto unlock;
- if (pmd_trans_huge(*pmd))
- return pmd;
-unlock:
- spin_unlock(*ptl);
- return NULL;
-}
-
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
int hugepage_madvise(struct vm_area_struct *vma,
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
(vma->vm_flags & VM_NOHUGEPAGE))
return false;
- if (vma->vm_flags & VM_LOCKED)
- return false;
if (!vma->anon_vma || vma->vm_ops)
return false;
if (is_vma_temporary_stack(vma))
struct page *page;
pgtable_t pgtable;
pmd_t _pmd;
- bool young, write;
+ bool young, write, dirty;
int i;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!pmd_trans_huge(*pmd));
+ VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
count_vm_event(THP_SPLIT_PMD);
atomic_add(HPAGE_PMD_NR - 1, &page->_count);
write = pmd_write(*pmd);
young = pmd_young(*pmd);
+ dirty = pmd_dirty(*pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
entry = swp_entry_to_pte(swp_entry);
} else {
entry = mk_pte(page + i, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = maybe_mkwrite(entry, vma);
if (!write)
entry = pte_wrprotect(entry);
if (!young)
entry = pte_mkold(entry);
}
+ if (dirty)
+ SetPageDirty(page + i);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
{
spinlock_t *ptl;
struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
unsigned long haddr = address & HPAGE_PMD_MASK;
mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd)))
- __split_huge_pmd_locked(vma, pmd, haddr, false);
+ if (pmd_trans_huge(*pmd)) {
+ page = pmd_page(*pmd);
+ if (PageMlocked(page))
+ get_page(page);
+ else
+ page = NULL;
+ } else if (!pmd_devmap(*pmd))
+ goto out;
+ __split_huge_pmd_locked(vma, pmd, haddr, false);
+out:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ if (page) {
+ lock_page(page);
+ munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
}
static void split_huge_pmd_address(struct vm_area_struct *vma,
return;
pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
+ if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
unsigned long address)
{
+ unsigned long haddr = address & HPAGE_PMD_MASK;
spinlock_t *ptl;
pgd_t *pgd;
pud_t *pud;
}
if (pmd_trans_huge(*pmd)) {
if (page == pmd_page(*pmd))
- __split_huge_pmd_locked(vma, pmd, address, true);
+ __split_huge_pmd_locked(vma, pmd, haddr, true);
spin_unlock(ptl);
return;
}
spin_unlock(ptl);
pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
pte_t entry, swp_pte;
swp_entry_t swp_entry;
- if (!pte_present(pte[i]))
+ /*
+ * We've just crossed page table boundary: need to map next one.
+ * It can happen if THP was mremaped to non PMD-aligned address.
+ */
+ if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+ pte_unmap_unlock(pte - 1, ptl);
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd,
+ address, &ptl);
+ }
+
+ if (!pte_present(*pte))
continue;
- if (page_to_pfn(page) != pte_pfn(pte[i]))
+ if (page_to_pfn(page) != pte_pfn(*pte))
continue;
flush_cache_page(vma, address, page_to_pfn(page));
- entry = ptep_clear_flush(vma, address, pte + i);
+ entry = ptep_clear_flush(vma, address, pte);
+ if (pte_dirty(entry))
+ SetPageDirty(page);
swp_entry = make_migration_entry(page, pte_write(entry));
swp_pte = swp_entry_to_pte(swp_entry);
if (pte_soft_dirty(entry))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+ set_pte_at(vma->vm_mm, address, pte, swp_pte);
page_remove_rmap(page, false);
put_page(page);
}
- pte_unmap_unlock(pte, ptl);
+ pte_unmap_unlock(pte - 1, ptl);
}
static void freeze_page(struct anon_vma *anon_vma, struct page *page)
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
pgoff + HPAGE_PMD_NR - 1) {
- unsigned long haddr;
+ unsigned long address = __vma_address(page, avc->vma);
- haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
- freeze_page_vma(avc->vma, page, haddr);
+ address, address + HPAGE_PMD_SIZE);
+ freeze_page_vma(avc->vma, page, address);
mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
+ address, address + HPAGE_PMD_SIZE);
}
}
pmd_t *pmd;
pte_t *pte, entry;
swp_entry_t swp_entry;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
int i, nr = HPAGE_PMD_NR;
/* Skip pages which doesn't belong to the VMA */
pmd = mm_find_pmd(vma->vm_mm, address);
if (!pmd)
return;
+
pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
- if (!is_swap_pte(pte[i]))
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+ /*
+ * We've just crossed page table boundary: need to map next one.
+ * It can happen if THP was mremaped to non-PMD aligned address.
+ */
+ if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+ pte_unmap_unlock(pte - 1, ptl);
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd,
+ address, &ptl);
+ }
+
+ if (!is_swap_pte(*pte))
continue;
- swp_entry = pte_to_swp_entry(pte[i]);
+ swp_entry = pte_to_swp_entry(*pte);
if (!is_migration_entry(swp_entry))
continue;
if (migration_entry_to_page(swp_entry) != page)
page_add_anon_rmap(page, vma, address, false);
entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
- entry = pte_mkdirty(entry);
+ if (PageDirty(page))
+ entry = pte_mkdirty(entry);
if (is_write_migration_entry(swp_entry))
entry = maybe_mkwrite(entry, vma);
flush_dcache_page(page);
- set_pte_at(vma->vm_mm, address, pte + i, entry);
+ set_pte_at(vma->vm_mm, address, pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, pte + i);
+ update_mmu_cache(vma, address, pte);
}
- pte_unmap_unlock(pte, ptl);
+ pte_unmap_unlock(pte - 1, ptl);
}
static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
}
}
-static int total_mapcount(struct page *page)
-{
- int i, ret;
-
- ret = compound_mapcount(page);
- for (i = 0; i < HPAGE_PMD_NR; i++)
- ret += atomic_read(&page[i]._mapcount) + 1;
-
- if (PageDoubleMap(page))
- ret -= HPAGE_PMD_NR;
-
- return ret;
-}
-
static int __split_huge_page_tail(struct page *head, int tail,
struct lruvec *lruvec, struct list_head *list)
{
(1L << PG_uptodate) |
(1L << PG_active) |
(1L << PG_locked) |
- (1L << PG_unevictable)));
- page_tail->flags |= (1L << PG_dirty);
+ (1L << PG_unevictable) |
+ (1L << PG_dirty)));
/*
* After clearing PageTail the gup refcount can be released.
}
}
+int total_mapcount(struct page *page)
+{
+ int i, ret;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
+
+ ret = compound_mapcount(page);
+ if (PageHuge(page))
+ return ret;
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ ret += atomic_read(&page[i]._mapcount) + 1;
+ if (PageDoubleMap(page))
+ ret -= HPAGE_PMD_NR;
+ return ret;
+}
+
/*
* This function splits huge page into normal pages. @page can point to any
* subpage of huge page to split. Split doesn't change the position of @page.
struct page *head = compound_head(page);
struct anon_vma *anon_vma;
int count, mapcount, ret;
+ bool mlocked;
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
goto out_unlock;
}
+ mlocked = PageMlocked(page);
freeze_page(anon_vma, head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
+ /* Make sure the page is not on per-CPU pagevec as it takes pin */
+ if (mlocked)
+ lru_add_drain();
+
/* Prevent deferred_split_scan() touching ->_count */
spin_lock(&split_queue_lock);
count = page_count(head);
mapcount = total_mapcount(head);
- if (mapcount == count - 1) {
+ if (!mapcount && count == 1) {
if (!list_empty(page_deferred_list(head))) {
split_queue_len--;
list_del(page_deferred_list(head));
spin_unlock(&split_queue_lock);
__split_huge_page(page, list);
ret = 0;
- } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+ } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
spin_unlock(&split_queue_lock);
pr_alert("total_mapcount: %u, page_count(): %u\n",
mapcount, count);
if (PageTail(page))
dump_page(head, NULL);
- dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+ dump_page(page, "total_mapcount(head) > 0");
BUG();
} else {
spin_unlock(&split_queue_lock);
.scan_objects = deferred_split_scan,
.seeks = DEFAULT_SEEKS,
};
+
+#ifdef CONFIG_DEBUG_FS
+static int split_huge_pages_set(void *data, u64 val)
+{
+ struct zone *zone;
+ struct page *page;
+ unsigned long pfn, max_zone_pfn;
+ unsigned long total = 0, split = 0;
+
+ if (val != 1)
+ return -EINVAL;
+
+ for_each_populated_zone(zone) {
+ max_zone_pfn = zone_end_pfn(zone);
+ for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (!get_page_unless_zero(page))
+ continue;
+
+ if (zone != page_zone(page))
+ goto next;
+
+ if (!PageHead(page) || !PageAnon(page) ||
+ PageHuge(page))
+ goto next;
+
+ total++;
+ lock_page(page);
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
+next:
+ put_page(page);
+ }
+ }
+
+ pr_info("%lu of %lu THP split", split, total);
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+ "%llu\n");
+
+static int __init split_huge_pages_debugfs(void)
+{
+ void *ret;
+
+ ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
+ &split_huge_pages_fops);
+ if (!ret)
+ pr_warn("Failed to create split_huge_pages in debugfs");
+ return 0;
+}
+late_initcall(split_huge_pages_debugfs);
+#endif