mm, dax: dax-pmd vs thp-pmd vs hugetlbfs-pmd
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b6ac6c43d6a41257794e58874d37e9a25a99713f..82bed2bec3ed8afdf787d78b0ad14f7b70df6cb1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
+#include <linux/pfn_t.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+#include <linux/debugfs.h>
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
@@ -135,6 +137,10 @@ static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
 {
@@ -667,6 +673,9 @@ static int __init hugepage_init(void)
        err = register_shrinker(&huge_zero_page_shrinker);
        if (err)
                goto err_hzp_shrinker;
+       err = register_shrinker(&deferred_split_shrinker);
+       if (err)
+               goto err_split_shrinker;
 
        /*
         * By default disable transparent hugepages on smaller systems,
@@ -684,6 +693,8 @@ static int __init hugepage_init(void)
 
        return 0;
 err_khugepaged:
+       unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
        unregister_shrinker(&huge_zero_page_shrinker);
 err_hzp_shrinker:
        khugepaged_slab_exit();
@@ -740,6 +751,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
        return entry;
 }
 
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+       /*
+        * ->lru in the tail pages is occupied by compound_head.
+        * Let's use ->mapping + ->index in the second tail page as list_head.
+        */
+       return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+       /*
+        * we use page->mapping and page->index in second tail page
+        * as list_head: assuming THP order >= 2
+        */
+       BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+       INIT_LIST_HEAD(page_deferred_list(page));
+       set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
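+
+page_deferred_list() and prep_transhuge_page() above rely on the ->mapping/->index
+pair of the second tail page being reusable as a struct list_head. A minimal
+userspace sketch of the same layout trick (plain C, not kernel code; struct
+fake_page is an illustrative stand-in, the kernel depends on the real struct
+page layout):
+
+	#include <stdio.h>
+
+	struct list_head { struct list_head *next, *prev; };
+
+	/* Loosely mirrors the fields of struct page that matter here. */
+	struct fake_page {
+		unsigned long flags;
+		void *mapping;          /* reused as list_head.next ...        */
+		unsigned long index;    /* ... and list_head.prev while queued */
+	};
+
+	static struct list_head *fake_deferred_list(struct fake_page *head)
+	{
+		/* Same cast as page_deferred_list(): the node lives in the
+		 * second tail page, head[2]; valid only because ->mapping and
+		 * ->index are adjacent and pointer-sized. */
+		return (struct list_head *)&head[2].mapping;
+	}
+
+	int main(void)
+	{
+		struct fake_page thp[4];        /* head page plus three tails */
+		struct list_head *node = fake_deferred_list(thp);
+
+		node->next = node->prev = node; /* INIT_LIST_HEAD() by hand */
+		printf("need %zu bytes, have %zu\n", sizeof(struct list_head),
+		       sizeof(thp[2].mapping) + sizeof(thp[2].index));
+		return 0;
+	}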
+
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmd,
@@ -844,8 +876,6 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
                return VM_FAULT_FALLBACK;
-       if (vma->vm_flags & VM_LOCKED)
-               return VM_FAULT_FALLBACK;
        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;
        if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
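+
+The VM_LOCKED fallback removed above (together with the matching check dropped
+from hugepage_vma_check() later in this diff) means huge page faults are no
+longer refused in mlocked VMAs. A userspace sketch to observe the effect,
+assuming THP is enabled and HPAGE_PMD_SIZE is 2 MiB; whether a huge page is
+actually used still depends on the system's THP settings, so check
+AnonHugePages in /proc/self/smaps:
+
+	#include <stdio.h>
+	#include <string.h>
+	#include <sys/mman.h>
+
+	#ifndef MADV_HUGEPAGE
+	#define MADV_HUGEPAGE 14        /* for older libc headers */
+	#endif
+
+	#define SZ_2M (2UL << 20)
+
+	int main(void)
+	{
+		/* Over-allocate so a 2 MiB aligned region exists inside. */
+		size_t len = 4 * SZ_2M;
+		char *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
+				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		unsigned long aligned;
+
+		if (map == MAP_FAILED) {
+			perror("mmap");
+			return 1;
+		}
+		aligned = ((unsigned long)map + SZ_2M - 1) & ~(SZ_2M - 1);
+
+		madvise((void *)aligned, SZ_2M, MADV_HUGEPAGE); /* a hint only */
+		if (mlock((void *)aligned, SZ_2M))  /* used to force 4k pages */
+			perror("mlock");
+		memset((void *)aligned, 0x5a, SZ_2M);   /* fault the range in */
+
+		printf("touched 2 MiB at 0x%lx; inspect /proc/self/smaps\n",
+		       aligned);
+		getchar();                      /* keep the mapping alive */
+		return 0;
+	}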
@@ -896,32 +926,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
+       prep_transhuge_page(page);
        return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
                                            flags);
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-               pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+               pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
 {
        struct mm_struct *mm = vma->vm_mm;
        pmd_t entry;
        spinlock_t *ptl;
 
        ptl = pmd_lock(mm, pmd);
-       if (pmd_none(*pmd)) {
-               entry = pmd_mkhuge(pfn_pmd(pfn, prot));
-               if (write) {
-                       entry = pmd_mkyoung(pmd_mkdirty(entry));
-                       entry = maybe_pmd_mkwrite(entry, vma);
-               }
-               set_pmd_at(mm, addr, pmd, entry);
-               update_mmu_cache_pmd(vma, addr, pmd);
-       }
+       entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
+       if (pfn_t_devmap(pfn))
+               entry = pmd_mkdevmap(entry);
+       if (write) {
+               entry = pmd_mkyoung(pmd_mkdirty(entry));
+               entry = maybe_pmd_mkwrite(entry, vma);
+       }
+       set_pmd_at(mm, addr, pmd, entry);
+       update_mmu_cache_pmd(vma, addr, pmd);
        spin_unlock(ptl);
 }
 
 int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-                       pmd_t *pmd, unsigned long pfn, bool write)
+                       pmd_t *pmd, pfn_t pfn, bool write)
 {
        pgprot_t pgprot = vma->vm_page_prot;
        /*
@@ -933,7 +964,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+       BUG_ON(!pfn_t_devmap(pfn));
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;
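+
+With the pfn_t conversion above, vmf_insert_pfn_pmd() now demands a
+devmap-flagged pfn (note the new BUG_ON(!pfn_t_devmap(pfn))). A hedged sketch
+of how a driver's ->pmd_fault handler of this kernel generation might call it;
+it assumes the <linux/pfn_t.h> helpers from the companion pfn_t series
+(phys_to_pfn_t(), PFN_DEV, PFN_MAP), and mydev_phys_for() is a hypothetical
+driver-internal lookup, not a real API:
+
+	#include <linux/mm.h>
+	#include <linux/pfn_t.h>
+	#include <linux/huge_mm.h>
+
+	/* Hypothetical: map the faulting huge-page-aligned address to the
+	 * physical address of the device memory backing it. */
+	static phys_addr_t mydev_phys_for(struct vm_area_struct *vma,
+					  unsigned long haddr);
+
+	static int mydev_pmd_fault(struct vm_area_struct *vma,
+				   unsigned long addr, pmd_t *pmd,
+				   unsigned int flags)
+	{
+		phys_addr_t phys = mydev_phys_for(vma, addr & HPAGE_PMD_MASK);
+		/* PFN_DEV | PFN_MAP makes pfn_t_devmap() true, which
+		 * vmf_insert_pfn_pmd() now insists on. */
+		pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);
+
+		return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
+					  flags & FAULT_FLAG_WRITE);
+	}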
@@ -964,7 +995,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
        ret = -EAGAIN;
        pmd = *src_pmd;
-       if (unlikely(!pmd_trans_huge(pmd))) {
+       if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
                pte_free(dst_mm, pgtable);
                goto out_unlock;
        }
@@ -987,17 +1018,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                goto out_unlock;
        }
 
-       src_page = pmd_page(pmd);
-       VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
-       get_page(src_page);
-       page_dup_rmap(src_page, true);
-       add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+       if (pmd_trans_huge(pmd)) {
+               /* thp accounting separate from pmd_devmap accounting */
+               src_page = pmd_page(pmd);
+               VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+               get_page(src_page);
+               page_dup_rmap(src_page, true);
+               add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+               atomic_long_inc(&dst_mm->nr_ptes);
+               pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+       }
 
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
-       pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-       atomic_long_inc(&dst_mm->nr_ptes);
 
        ret = 0;
 out_unlock:
@@ -1192,7 +1226,9 @@ alloc:
        } else
                new_page = NULL;
 
-       if (unlikely(!new_page)) {
+       if (likely(new_page)) {
+               prep_transhuge_page(new_page);
+       } else {
                if (!page) {
                        split_huge_pmd(vma, pmd, address);
                        ret |= VM_FAULT_FALLBACK;
@@ -1311,7 +1347,20 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                        update_mmu_cache_pmd(vma, addr, pmd);
        }
        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
-               if (page->mapping && trylock_page(page)) {
+               /*
+                * We don't mlock() pte-mapped THPs. This way we can avoid
+                * leaking mlocked pages into non-VM_LOCKED VMAs.
+                *
+                * In most cases the pmd is the only mapping of the page as we
+                * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+                * writable private mappings in populate_vma_page_range().
+                *
+                * The only scenario where the page is shared here is when we
+                * are mlocking a read-only mapping shared over fork(). We skip
+                * mlocking such pages.
+                */
+               if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+                               page->mapping && trylock_page(page)) {
                        lru_add_drain();
                        if (page->mapping)
                                mlock_vma_page(page);
@@ -1456,6 +1505,77 @@ out:
        return 0;
 }
 
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+               pmd_t *pmd, unsigned long addr, unsigned long next)
+
+{
+       spinlock_t *ptl;
+       pmd_t orig_pmd;
+       struct page *page;
+       struct mm_struct *mm = tlb->mm;
+       int ret = 0;
+
+       if (!pmd_trans_huge_lock(pmd, vma, &ptl))
+               goto out;
+
+       orig_pmd = *pmd;
+       if (is_huge_zero_pmd(orig_pmd)) {
+               ret = 1;
+               goto out;
+       }
+
+       page = pmd_page(orig_pmd);
+       /*
+        * If other processes are mapping this page, we can't discard
+        * the page unless they all do MADV_FREE, so skip it.
+        */
+       if (page_mapcount(page) != 1)
+               goto out;
+
+       if (!trylock_page(page))
+               goto out;
+
+       /*
+        * If the user wants to discard only part of the THP's pages, split it
+        * so MADV_FREE will deactivate just those pages.
+        */
+       if (next - addr != HPAGE_PMD_SIZE) {
+               get_page(page);
+               spin_unlock(ptl);
+               if (split_huge_page(page)) {
+                       put_page(page);
+                       unlock_page(page);
+                       goto out_unlocked;
+               }
+               put_page(page);
+               unlock_page(page);
+               ret = 1;
+               goto out_unlocked;
+       }
+
+       if (PageDirty(page))
+               ClearPageDirty(page);
+       unlock_page(page);
+
+       if (PageActive(page))
+               deactivate_page(page);
+
+       if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
+               orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+                       tlb->fullmm);
+               orig_pmd = pmd_mkold(orig_pmd);
+               orig_pmd = pmd_mkclean(orig_pmd);
+
+               set_pmd_at(mm, addr, pmd, orig_pmd);
+               tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+       }
+       ret = 1;
+out:
+       spin_unlock(ptl);
+out_unlocked:
+       return ret;
+}
+
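+
+madvise_free_huge_pmd() above is what lets MADV_FREE operate at PMD
+granularity: a range covering the whole THP just gets its PMD marked old and
+clean, while a partial range forces a split first. A userspace sketch to
+exercise it, assuming an x86-64-style 2 MiB THP and that MADV_FREE (value 8 on
+Linux) may not yet be in the libc headers:
+
+	#include <stdio.h>
+	#include <stdlib.h>
+	#include <string.h>
+	#include <sys/mman.h>
+
+	#ifndef MADV_FREE
+	#define MADV_FREE 8             /* not yet in older libc headers */
+	#endif
+
+	#define SZ_2M (2UL << 20)
+
+	int main(void)
+	{
+		char *buf;
+
+		/* 2 MiB aligned so a single THP can back the whole range. */
+		if (posix_memalign((void **)&buf, SZ_2M, SZ_2M)) {
+			fprintf(stderr, "posix_memalign failed\n");
+			return 1;
+		}
+		memset(buf, 0xa5, SZ_2M);  /* dirty the (possibly huge) page */
+
+		/* Whole-PMD range: the kernel only clears the dirty/young
+		 * bits. Passing a smaller range would split the THP first. */
+		if (madvise(buf, SZ_2M, MADV_FREE))
+			perror("madvise(MADV_FREE)");
+
+		/* Contents may now be dropped lazily under memory pressure;
+		 * writing again cancels MADV_FREE for the touched pages. */
+		buf[0] = 1;
+		free(buf);
+		return 0;
+	}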
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
 {
@@ -1599,52 +1719,12 @@ bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
 {
        *ptl = pmd_lock(vma->vm_mm, pmd);
-       if (likely(pmd_trans_huge(*pmd)))
+       if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
                return true;
        spin_unlock(*ptl);
        return false;
 }
 
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
-pmd_t *page_check_address_pmd(struct page *page,
-                             struct mm_struct *mm,
-                             unsigned long address,
-                             spinlock_t **ptl)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-
-       if (address & ~HPAGE_PMD_MASK)
-               return NULL;
-
-       pgd = pgd_offset(mm, address);
-       if (!pgd_present(*pgd))
-               return NULL;
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
-               return NULL;
-       pmd = pmd_offset(pud, address);
-
-       *ptl = pmd_lock(mm, pmd);
-       if (!pmd_present(*pmd))
-               goto unlock;
-       if (pmd_page(*pmd) != page)
-               goto unlock;
-       if (pmd_trans_huge(*pmd))
-               return pmd;
-unlock:
-       spin_unlock(*ptl);
-       return NULL;
-}
-
 #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
@@ -2109,6 +2189,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
                return NULL;
        }
 
+       prep_transhuge_page(*hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return *hpage;
 }
@@ -2120,8 +2201,12 @@ static int khugepaged_find_target_node(void)
 
 static inline struct page *alloc_hugepage(int defrag)
 {
-       return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-                          HPAGE_PMD_ORDER);
+       struct page *page;
+
+       page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+       if (page)
+               prep_transhuge_page(page);
+       return page;
 }
 
 static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2171,8 +2256,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
        if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
            (vma->vm_flags & VM_NOHUGEPAGE))
                return false;
-       if (vma->vm_flags & VM_LOCKED)
-               return false;
        if (!vma->anon_vma || vma->vm_ops)
                return false;
        if (is_vma_temporary_stack(vma))
@@ -2702,13 +2785,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        struct page *page;
        pgtable_t pgtable;
        pmd_t _pmd;
-       bool young, write;
+       bool young, write, dirty;
        int i;
 
        VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-       VM_BUG_ON(!pmd_trans_huge(*pmd));
+       VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
 
        count_vm_event(THP_SPLIT_PMD);
 
@@ -2726,6 +2809,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        atomic_add(HPAGE_PMD_NR - 1, &page->_count);
        write = pmd_write(*pmd);
        young = pmd_young(*pmd);
+       dirty = pmd_dirty(*pmd);
 
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
@@ -2743,12 +2827,14 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        entry = swp_entry_to_pte(swp_entry);
                } else {
                        entry = mk_pte(page + i, vma->vm_page_prot);
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                       entry = maybe_mkwrite(entry, vma);
                        if (!write)
                                entry = pte_wrprotect(entry);
                        if (!young)
                                entry = pte_mkold(entry);
                }
+               if (dirty)
+                       SetPageDirty(page + i);
                pte = pte_offset_map(&_pmd, haddr);
                BUG_ON(!pte_none(*pte));
                set_pte_at(mm, haddr, pte, entry);
@@ -2813,14 +2899,29 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
        spinlock_t *ptl;
        struct mm_struct *mm = vma->vm_mm;
+       struct page *page = NULL;
        unsigned long haddr = address & HPAGE_PMD_MASK;
 
        mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
        ptl = pmd_lock(mm, pmd);
-       if (likely(pmd_trans_huge(*pmd)))
-               __split_huge_pmd_locked(vma, pmd, haddr, false);
+       if (pmd_trans_huge(*pmd)) {
+               page = pmd_page(*pmd);
+               if (PageMlocked(page))
+                       get_page(page);
+               else
+                       page = NULL;
+       } else if (!pmd_devmap(*pmd))
+               goto out;
+       __split_huge_pmd_locked(vma, pmd, haddr, false);
+out:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       if (page) {
+               lock_page(page);
+               munlock_vma_page(page);
+               unlock_page(page);
+               put_page(page);
+       }
 }
 
 static void split_huge_pmd_address(struct vm_area_struct *vma,
@@ -2841,7 +2942,7 @@ static void split_huge_pmd_address(struct vm_area_struct *vma,
                return;
 
        pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
+       if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
                return;
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
@@ -2894,6 +2995,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
                unsigned long address)
 {
+       unsigned long haddr = address & HPAGE_PMD_MASK;
        spinlock_t *ptl;
        pgd_t *pgd;
        pud_t *pud;
@@ -2923,32 +3025,47 @@ static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
        }
        if (pmd_trans_huge(*pmd)) {
                if (page == pmd_page(*pmd))
-                       __split_huge_pmd_locked(vma, pmd, address, true);
+                       __split_huge_pmd_locked(vma, pmd, haddr, true);
                spin_unlock(ptl);
                return;
        }
        spin_unlock(ptl);
 
        pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
+       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
                pte_t entry, swp_pte;
                swp_entry_t swp_entry;
 
-               if (!pte_present(pte[i]))
+               /*
+                * We've just crossed page table boundary: need to map next one.
+                * It can happen if THP was mremapped to non-PMD-aligned address.
+                */
+               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+                       pte_unmap_unlock(pte - 1, ptl);
+                       pmd = mm_find_pmd(vma->vm_mm, address);
+                       if (!pmd)
+                               return;
+                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
+                                       address, &ptl);
+               }
+
+               if (!pte_present(*pte))
                        continue;
-               if (page_to_pfn(page) != pte_pfn(pte[i]))
+               if (page_to_pfn(page) != pte_pfn(*pte))
                        continue;
                flush_cache_page(vma, address, page_to_pfn(page));
-               entry = ptep_clear_flush(vma, address, pte + i);
+               entry = ptep_clear_flush(vma, address, pte);
+               if (pte_dirty(entry))
+                       SetPageDirty(page);
                swp_entry = make_migration_entry(page, pte_write(entry));
                swp_pte = swp_entry_to_pte(swp_entry);
                if (pte_soft_dirty(entry))
                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
-               set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+               set_pte_at(vma->vm_mm, address, pte, swp_pte);
                page_remove_rmap(page, false);
                put_page(page);
        }
-       pte_unmap_unlock(pte, ptl);
+       pte_unmap_unlock(pte - 1, ptl);
 }
 
 static void freeze_page(struct anon_vma *anon_vma, struct page *page)
@@ -2960,14 +3077,13 @@ static void freeze_page(struct anon_vma *anon_vma, struct page *page)
 
        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
                        pgoff + HPAGE_PMD_NR - 1) {
-               unsigned long haddr;
+               unsigned long address = __vma_address(page, avc->vma);
 
-               haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
                mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
-                               haddr, haddr + HPAGE_PMD_SIZE);
-               freeze_page_vma(avc->vma, page, haddr);
+                               address, address + HPAGE_PMD_SIZE);
+               freeze_page_vma(avc->vma, page, address);
                mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
-                               haddr, haddr + HPAGE_PMD_SIZE);
+                               address, address + HPAGE_PMD_SIZE);
        }
 }
 
@@ -2978,6 +3094,7 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
        pmd_t *pmd;
        pte_t *pte, entry;
        swp_entry_t swp_entry;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
        int i, nr = HPAGE_PMD_NR;
 
        /* Skip pages which doesn't belong to the VMA */
@@ -2991,12 +3108,26 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
        pmd = mm_find_pmd(vma->vm_mm, address);
        if (!pmd)
                return;
+
        pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
-               if (!is_swap_pte(pte[i]))
+       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+               /*
+                * We've just crossed page table boundary: need to map next one.
+                * It can happen if THP was mremapped to non-PMD-aligned address.
+                */
+               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+                       pte_unmap_unlock(pte - 1, ptl);
+                       pmd = mm_find_pmd(vma->vm_mm, address);
+                       if (!pmd)
+                               return;
+                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
+                                       address, &ptl);
+               }
+
+               if (!is_swap_pte(*pte))
                        continue;
 
-               swp_entry = pte_to_swp_entry(pte[i]);
+               swp_entry = pte_to_swp_entry(*pte);
                if (!is_migration_entry(swp_entry))
                        continue;
                if (migration_entry_to_page(swp_entry) != page)
@@ -3006,17 +3137,18 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
                page_add_anon_rmap(page, vma, address, false);
 
                entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
-               entry = pte_mkdirty(entry);
+               if (PageDirty(page))
+                       entry = pte_mkdirty(entry);
                if (is_write_migration_entry(swp_entry))
                        entry = maybe_mkwrite(entry, vma);
 
                flush_dcache_page(page);
-               set_pte_at(vma->vm_mm, address, pte + i, entry);
+               set_pte_at(vma->vm_mm, address, pte, entry);
 
                /* No need to invalidate - it was non-present before */
-               update_mmu_cache(vma, address, pte + i);
+               update_mmu_cache(vma, address, pte);
        }
-       pte_unmap_unlock(pte, ptl);
+       pte_unmap_unlock(pte - 1, ptl);
 }
 
 static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
@@ -3036,20 +3168,6 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
        }
 }
 
-static int total_mapcount(struct page *page)
-{
-       int i, ret;
-
-       ret = compound_mapcount(page);
-       for (i = 0; i < HPAGE_PMD_NR; i++)
-               ret += atomic_read(&page[i]._mapcount) + 1;
-
-       if (PageDoubleMap(page))
-               ret -= HPAGE_PMD_NR;
-
-       return ret;
-}
-
 static int __split_huge_page_tail(struct page *head, int tail,
                struct lruvec *lruvec, struct list_head *list)
 {
@@ -3081,8 +3199,8 @@ static int __split_huge_page_tail(struct page *head, int tail,
                         (1L << PG_uptodate) |
                         (1L << PG_active) |
                         (1L << PG_locked) |
-                        (1L << PG_unevictable)));
-       page_tail->flags |= (1L << PG_dirty);
+                        (1L << PG_unevictable) |
+                        (1L << PG_dirty)));
 
        /*
         * After clearing PageTail the gup refcount can be released.
@@ -3098,7 +3216,7 @@ static int __split_huge_page_tail(struct page *head, int tail,
                set_page_idle(page_tail);
 
        /* ->mapping in first tail page is compound_mapcount */
-       VM_BUG_ON_PAGE(tail != 1 && page_tail->mapping != TAIL_MAPPING,
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
                        page_tail);
        page_tail->mapping = head->mapping;
 
@@ -3150,6 +3268,25 @@ static void __split_huge_page(struct page *page, struct list_head *list)
        }
 }
 
+int total_mapcount(struct page *page)
+{
+       int i, ret;
+
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
+       if (likely(!PageCompound(page)))
+               return atomic_read(&page->_mapcount) + 1;
+
+       ret = compound_mapcount(page);
+       if (PageHuge(page))
+               return ret;
+       for (i = 0; i < HPAGE_PMD_NR; i++)
+               ret += atomic_read(&page[i]._mapcount) + 1;
+       if (PageDoubleMap(page))
+               ret -= HPAGE_PMD_NR;
+       return ret;
+}
+
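+
+total_mapcount() above combines three conventions: each subpage's _mapcount is
+stored biased by -1, PMD mappings are tracked in compound_mapcount, and a
+PageDoubleMap THP carries one extra per-subpage count that is subtracted once.
+A userspace model of just the THP branch of that arithmetic (HPAGE_PMD_NR and
+the scenarios in main() are illustrative assumptions):
+
+	#include <stdio.h>
+
+	#define HPAGE_PMD_NR 512        /* 2 MiB THP with 4 KiB base pages */
+
+	/* Mirrors the arithmetic of total_mapcount() for a THP. */
+	static int model_total_mapcount(int compound_mapcount,
+					const int subpage_mapcount[HPAGE_PMD_NR],
+					int double_map)
+	{
+		int i, ret = compound_mapcount;
+
+		for (i = 0; i < HPAGE_PMD_NR; i++)
+			ret += subpage_mapcount[i] + 1; /* undo the -1 bias */
+		if (double_map)
+			ret -= HPAGE_PMD_NR;    /* PageDoubleMap compensation */
+		return ret;
+	}
+
+	int main(void)
+	{
+		int sub[HPAGE_PMD_NR];
+		int i;
+
+		/* One process maps the THP with a single PMD: every subpage
+		 * sits at the biased "unmapped" value of -1. Expect 1. */
+		for (i = 0; i < HPAGE_PMD_NR; i++)
+			sub[i] = -1;
+		printf("PMD-only mapping: %d\n",
+		       model_total_mapcount(1, sub, 0));
+
+		/* Same THP also PTE-mapped once per subpage with PageDoubleMap
+		 * set: each subpage holds one PTE mapping plus the DoubleMap
+		 * accounting, a biased value of 1. Expect 1 + 512 = 513. */
+		for (i = 0; i < HPAGE_PMD_NR; i++)
+			sub[i] = 1;
+		printf("PMD and PTE mapped: %d\n",
+		       model_total_mapcount(1, sub, 1));
+		return 0;
+	}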
 /*
  * This function splits huge page into normal pages. @page can point to any
  * subpage of huge page to split. Split doesn't change the position of @page.
@@ -3174,6 +3311,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        struct page *head = compound_head(page);
        struct anon_vma *anon_vma;
        int count, mapcount, ret;
+       bool mlocked;
 
        VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
        VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -3204,22 +3342,36 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                goto out_unlock;
        }
 
+       mlocked = PageMlocked(page);
        freeze_page(anon_vma, head);
        VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
+       /* Make sure the page is not on a per-CPU pagevec, which holds a pin */
+       if (mlocked)
+               lru_add_drain();
+
+       /* Prevent deferred_split_scan() touching ->_count */
+       spin_lock(&split_queue_lock);
        count = page_count(head);
        mapcount = total_mapcount(head);
-       if (mapcount == count - 1) {
+       if (!mapcount && count == 1) {
+               if (!list_empty(page_deferred_list(head))) {
+                       split_queue_len--;
+                       list_del(page_deferred_list(head));
+               }
+               spin_unlock(&split_queue_lock);
                __split_huge_page(page, list);
                ret = 0;
-       } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+       } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+               spin_unlock(&split_queue_lock);
                pr_alert("total_mapcount: %u, page_count(): %u\n",
                                mapcount, count);
                if (PageTail(page))
                        dump_page(head, NULL);
-               dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+               dump_page(page, "total_mapcount(head) > 0");
                BUG();
        } else {
+               spin_unlock(&split_queue_lock);
                unfreeze_page(anon_vma, head);
                ret = -EBUSY;
        }
@@ -3231,3 +3383,145 @@ out:
        count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
        return ret;
 }
+
+void free_transhuge_page(struct page *page)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (!list_empty(page_deferred_list(page))) {
+               split_queue_len--;
+               list_del(page_deferred_list(page));
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+       unsigned long flags;
+
+       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (list_empty(page_deferred_list(page))) {
+               list_add_tail(page_deferred_list(page), &split_queue);
+               split_queue_len++;
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+               struct shrink_control *sc)
+{
+       /*
+        * Splitting a page from split_queue frees at least one page,
+        * at most HPAGE_PMD_NR - 1. We don't track the exact number,
+        * so use HPAGE_PMD_NR / 2 as a ballpark.
+        */
+       return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+               struct shrink_control *sc)
+{
+       unsigned long flags;
+       LIST_HEAD(list), *pos, *next;
+       struct page *page;
+       int split = 0;
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_init(&split_queue, &list);
+
+       /* Take pin on all head pages to avoid freeing them under us */
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               page = compound_head(page);
+               /* race with put_compound_page() */
+               if (!get_page_unless_zero(page)) {
+                       list_del_init(page_deferred_list(page));
+                       split_queue_len--;
+               }
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               lock_page(page);
+               /* split_huge_page() removes page from list on success */
+               if (!split_huge_page(page))
+                       split++;
+               unlock_page(page);
+               put_page(page);
+       }
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_tail(&list, &split_queue);
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+
+       return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+       .count_objects = deferred_split_count,
+       .scan_objects = deferred_split_scan,
+       .seeks = DEFAULT_SEEKS,
+};
+
+#ifdef CONFIG_DEBUG_FS
+static int split_huge_pages_set(void *data, u64 val)
+{
+       struct zone *zone;
+       struct page *page;
+       unsigned long pfn, max_zone_pfn;
+       unsigned long total = 0, split = 0;
+
+       if (val != 1)
+               return -EINVAL;
+
+       for_each_populated_zone(zone) {
+               max_zone_pfn = zone_end_pfn(zone);
+               for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+                       if (!pfn_valid(pfn))
+                               continue;
+
+                       page = pfn_to_page(pfn);
+                       if (!get_page_unless_zero(page))
+                               continue;
+
+                       if (zone != page_zone(page))
+                               goto next;
+
+                       if (!PageHead(page) || !PageAnon(page) ||
+                                       PageHuge(page))
+                               goto next;
+
+                       total++;
+                       lock_page(page);
+                       if (!split_huge_page(page))
+                               split++;
+                       unlock_page(page);
+next:
+                       put_page(page);
+               }
+       }
+
+       pr_info("%lu of %lu THP split\n", split, total);
+
+       return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+               "%llu\n");
+
+static int __init split_huge_pages_debugfs(void)
+{
+       void *ret;
+
+       ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
+                       &split_huge_pages_fops);
+       if (!ret)
+               pr_warn("Failed to create split_huge_pages in debugfs\n");
+       return 0;
+}
+late_initcall(split_huge_pages_debugfs);
+#endif
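+
+The debugfs knob added above is driven by writing 1 to split_huge_pages; a
+minimal userspace sketch (assumes debugfs is mounted at /sys/kernel/debug and
+the caller has the write permission the 0644 mode implies, in practice root):
+
+	#include <stdio.h>
+
+	int main(void)
+	{
+		/* Path assumes the conventional debugfs mount point. */
+		FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");
+
+		if (!f) {
+			perror("split_huge_pages");
+			return 1;
+		}
+		/* split_huge_pages_set() accepts only 1; anything else is
+		 * rejected with -EINVAL. */
+		if (fputs("1\n", f) == EOF)
+			perror("write");
+		fclose(f);
+		return 0;
+	}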