mm, dax: dax-pmd vs thp-pmd vs hugetlbfs-pmd
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 882b04449904a2f819fc51db609de9491dd209a3..82bed2bec3ed8afdf787d78b0ad14f7b70df6cb1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -21,6 +21,7 @@
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
+#include <linux/pfn_t.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/debugfs.h>
@@ -931,27 +932,27 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-               pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+               pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
 {
        struct mm_struct *mm = vma->vm_mm;
        pmd_t entry;
        spinlock_t *ptl;
 
        ptl = pmd_lock(mm, pmd);
-       if (pmd_none(*pmd)) {
-               entry = pmd_mkhuge(pfn_pmd(pfn, prot));
-               if (write) {
-                       entry = pmd_mkyoung(pmd_mkdirty(entry));
-                       entry = maybe_pmd_mkwrite(entry, vma);
-               }
-               set_pmd_at(mm, addr, pmd, entry);
-               update_mmu_cache_pmd(vma, addr, pmd);
-       }
+       entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
+       if (pfn_t_devmap(pfn))
+               entry = pmd_mkdevmap(entry);
+       if (write) {
+               entry = pmd_mkyoung(pmd_mkdirty(entry));
+               entry = maybe_pmd_mkwrite(entry, vma);
+       }
+       set_pmd_at(mm, addr, pmd, entry);
+       update_mmu_cache_pmd(vma, addr, pmd);
        spin_unlock(ptl);
 }
 
 int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-                       pmd_t *pmd, unsigned long pfn, bool write)
+                       pmd_t *pmd, pfn_t pfn, bool write)
 {
        pgprot_t pgprot = vma->vm_page_prot;
        /*
@@ -963,7 +964,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+       BUG_ON(!pfn_t_devmap(pfn));
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;
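
With the pfn_t conversion, vmf_insert_pfn_pmd() now refuses anything that is not devmap-backed, so a DAX or device-driver caller is expected to hand in a pfn_t built with the PFN_DEV and PFN_MAP flags. A minimal sketch of such a caller follows; the helper name and the assumption that the driver already knows a PMD-aligned physical address are for illustration only and are not part of this patch.

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/pfn_t.h>

/*
 * Illustrative only: map one PMD-sized chunk of device memory at 'phys'.
 * 'phys' is assumed to be HPAGE_PMD_SIZE-aligned and backed by a devmap
 * range; otherwise the pfn_t_devmap() BUG_ON above would trip.
 */
static int example_insert_dev_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmd, phys_addr_t phys, bool write)
{
        pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

        return vmf_insert_pfn_pmd(vma, addr & HPAGE_PMD_MASK, pmd, pfn, write);
}
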
@@ -994,7 +995,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
        ret = -EAGAIN;
        pmd = *src_pmd;
-       if (unlikely(!pmd_trans_huge(pmd))) {
+       if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
                pte_free(dst_mm, pgtable);
                goto out_unlock;
        }
@@ -1017,17 +1018,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                goto out_unlock;
        }
 
-       src_page = pmd_page(pmd);
-       VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
-       get_page(src_page);
-       page_dup_rmap(src_page, true);
-       add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+       if (pmd_trans_huge(pmd)) {
+               /* thp accounting separate from pmd_devmap accounting */
+               src_page = pmd_page(pmd);
+               VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+               get_page(src_page);
+               page_dup_rmap(src_page, true);
+               add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+               atomic_long_inc(&dst_mm->nr_ptes);
+               pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+       }
 
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
-       pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-       atomic_long_inc(&dst_mm->nr_ptes);
 
        ret = 0;
 out_unlock:
@@ -1501,6 +1505,77 @@ out:
        return 0;
 }
 
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+               pmd_t *pmd, unsigned long addr, unsigned long next)
+
+{
+       spinlock_t *ptl;
+       pmd_t orig_pmd;
+       struct page *page;
+       struct mm_struct *mm = tlb->mm;
+       int ret = 0;
+
+       if (!pmd_trans_huge_lock(pmd, vma, &ptl))
+               goto out;
+
+       orig_pmd = *pmd;
+       if (is_huge_zero_pmd(orig_pmd)) {
+               ret = 1;
+               goto out;
+       }
+
+       page = pmd_page(orig_pmd);
+       /*
+        * If other processes are mapping this page, we can't discard
+        * the page unless they all do MADV_FREE, so skip it.
+        */
+       if (page_mapcount(page) != 1)
+               goto out;
+
+       if (!trylock_page(page))
+               goto out;
+
+       /*
+        * If the user wants to discard only part of the THP, split it so
+        * that MADV_FREE deactivates just those base pages.
+        */
+       if (next - addr != HPAGE_PMD_SIZE) {
+               get_page(page);
+               spin_unlock(ptl);
+               if (split_huge_page(page)) {
+                       put_page(page);
+                       unlock_page(page);
+                       goto out_unlocked;
+               }
+               put_page(page);
+               unlock_page(page);
+               ret = 1;
+               goto out_unlocked;
+       }
+
+       if (PageDirty(page))
+               ClearPageDirty(page);
+       unlock_page(page);
+
+       if (PageActive(page))
+               deactivate_page(page);
+
+       if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
+               orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+                       tlb->fullmm);
+               orig_pmd = pmd_mkold(orig_pmd);
+               orig_pmd = pmd_mkclean(orig_pmd);
+
+               set_pmd_at(mm, addr, pmd, orig_pmd);
+               tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+       }
+       ret = 1;
+out:
+       spin_unlock(ptl);
+out_unlocked:
+       return ret;
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
 {
@@ -1644,7 +1719,7 @@ bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
 {
        *ptl = pmd_lock(vma->vm_mm, pmd);
-       if (likely(pmd_trans_huge(*pmd)))
+       if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
                return true;
        spin_unlock(*ptl);
        return false;
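
Since __pmd_trans_huge_lock() now also succeeds for devmap pmds, callers that go through the pmd_trans_huge_lock() wrapper (as madvise_free_huge_pmd() does above) pick up devmap coverage automatically. A minimal sketch of that calling convention, with the work under the lock left as a placeholder:

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/spinlock.h>

/* Illustrative only: the usual pattern for callers of pmd_trans_huge_lock(). */
static bool example_with_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd)
{
        spinlock_t *ptl;

        if (!pmd_trans_huge_lock(pmd, vma, &ptl))
                return false;   /* not (or no longer) a huge pmd */

        /* *pmd is stable here: either a THP pmd or a devmap huge pmd. */
        /* ... read flags, clear, split, etc. ... */

        spin_unlock(ptl);
        return true;
}
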
@@ -2710,13 +2785,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        struct page *page;
        pgtable_t pgtable;
        pmd_t _pmd;
-       bool young, write;
+       bool young, write, dirty;
        int i;
 
        VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-       VM_BUG_ON(!pmd_trans_huge(*pmd));
+       VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
 
        count_vm_event(THP_SPLIT_PMD);
 
@@ -2734,6 +2809,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        atomic_add(HPAGE_PMD_NR - 1, &page->_count);
        write = pmd_write(*pmd);
        young = pmd_young(*pmd);
+       dirty = pmd_dirty(*pmd);
 
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
@@ -2751,12 +2827,14 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        entry = swp_entry_to_pte(swp_entry);
                } else {
                        entry = mk_pte(page + i, vma->vm_page_prot);
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                       entry = maybe_mkwrite(entry, vma);
                        if (!write)
                                entry = pte_wrprotect(entry);
                        if (!young)
                                entry = pte_mkold(entry);
                }
+               if (dirty)
+                       SetPageDirty(page + i);
                pte = pte_offset_map(&_pmd, haddr);
                BUG_ON(!pte_none(*pte));
                set_pte_at(mm, haddr, pte, entry);
@@ -2826,14 +2904,15 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
        mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
        ptl = pmd_lock(mm, pmd);
-       if (unlikely(!pmd_trans_huge(*pmd)))
+       if (pmd_trans_huge(*pmd)) {
+               page = pmd_page(*pmd);
+               if (PageMlocked(page))
+                       get_page(page);
+               else
+                       page = NULL;
+       } else if (!pmd_devmap(*pmd))
                goto out;
-       page = pmd_page(*pmd);
        __split_huge_pmd_locked(vma, pmd, haddr, false);
-       if (PageMlocked(page))
-               get_page(page);
-       else
-               page = NULL;
 out:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
@@ -2863,7 +2942,7 @@ static void split_huge_pmd_address(struct vm_area_struct *vma,
                return;
 
        pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
+       if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
                return;
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
@@ -2916,6 +2995,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
                unsigned long address)
 {
+       unsigned long haddr = address & HPAGE_PMD_MASK;
        spinlock_t *ptl;
        pgd_t *pgd;
        pud_t *pud;
@@ -2945,32 +3025,47 @@ static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
        }
        if (pmd_trans_huge(*pmd)) {
                if (page == pmd_page(*pmd))
-                       __split_huge_pmd_locked(vma, pmd, address, true);
+                       __split_huge_pmd_locked(vma, pmd, haddr, true);
                spin_unlock(ptl);
                return;
        }
        spin_unlock(ptl);
 
        pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
+       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
                pte_t entry, swp_pte;
                swp_entry_t swp_entry;
 
-               if (!pte_present(pte[i]))
+               /*
+                * We've just crossed a page table boundary: map the next one.
+                * Can happen if the THP was mremapped to a non-PMD-aligned address.
+                */
+               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+                       pte_unmap_unlock(pte - 1, ptl);
+                       pmd = mm_find_pmd(vma->vm_mm, address);
+                       if (!pmd)
+                               return;
+                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
+                                       address, &ptl);
+               }
+
+               if (!pte_present(*pte))
                        continue;
-               if (page_to_pfn(page) != pte_pfn(pte[i]))
+               if (page_to_pfn(page) != pte_pfn(*pte))
                        continue;
                flush_cache_page(vma, address, page_to_pfn(page));
-               entry = ptep_clear_flush(vma, address, pte + i);
+               entry = ptep_clear_flush(vma, address, pte);
+               if (pte_dirty(entry))
+                       SetPageDirty(page);
                swp_entry = make_migration_entry(page, pte_write(entry));
                swp_pte = swp_entry_to_pte(swp_entry);
                if (pte_soft_dirty(entry))
                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
-               set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+               set_pte_at(vma->vm_mm, address, pte, swp_pte);
                page_remove_rmap(page, false);
                put_page(page);
        }
-       pte_unmap_unlock(pte, ptl);
+       pte_unmap_unlock(pte - 1, ptl);
 }
 
 static void freeze_page(struct anon_vma *anon_vma, struct page *page)
@@ -2982,14 +3077,13 @@ static void freeze_page(struct anon_vma *anon_vma, struct page *page)
 
        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
                        pgoff + HPAGE_PMD_NR - 1) {
-               unsigned long haddr;
+               unsigned long address = __vma_address(page, avc->vma);
 
-               haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
                mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
-                               haddr, haddr + HPAGE_PMD_SIZE);
-               freeze_page_vma(avc->vma, page, haddr);
+                               address, address + HPAGE_PMD_SIZE);
+               freeze_page_vma(avc->vma, page, address);
                mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
-                               haddr, haddr + HPAGE_PMD_SIZE);
+                               address, address + HPAGE_PMD_SIZE);
        }
 }
 
@@ -3000,6 +3094,7 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
        pmd_t *pmd;
        pte_t *pte, entry;
        swp_entry_t swp_entry;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
        int i, nr = HPAGE_PMD_NR;
 
        /* Skip pages which doesn't belong to the VMA */
@@ -3013,12 +3108,26 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
        pmd = mm_find_pmd(vma->vm_mm, address);
        if (!pmd)
                return;
+
        pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
-               if (!is_swap_pte(pte[i]))
+       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+               /*
+                * We've just crossed a page table boundary: map the next one.
+                * Can happen if the THP was mremapped to a non-PMD-aligned address.
+                */
+               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+                       pte_unmap_unlock(pte - 1, ptl);
+                       pmd = mm_find_pmd(vma->vm_mm, address);
+                       if (!pmd)
+                               return;
+                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
+                                       address, &ptl);
+               }
+
+               if (!is_swap_pte(*pte))
                        continue;
 
-               swp_entry = pte_to_swp_entry(pte[i]);
+               swp_entry = pte_to_swp_entry(*pte);
                if (!is_migration_entry(swp_entry))
                        continue;
                if (migration_entry_to_page(swp_entry) != page)
@@ -3028,17 +3137,18 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
                page_add_anon_rmap(page, vma, address, false);
 
                entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
-               entry = pte_mkdirty(entry);
+               if (PageDirty(page))
+                       entry = pte_mkdirty(entry);
                if (is_write_migration_entry(swp_entry))
                        entry = maybe_mkwrite(entry, vma);
 
                flush_dcache_page(page);
-               set_pte_at(vma->vm_mm, address, pte + i, entry);
+               set_pte_at(vma->vm_mm, address, pte, entry);
 
                /* No need to invalidate - it was non-present before */
-               update_mmu_cache(vma, address, pte + i);
+               update_mmu_cache(vma, address, pte);
        }
-       pte_unmap_unlock(pte, ptl);
+       pte_unmap_unlock(pte - 1, ptl);
 }
 
 static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
@@ -3089,8 +3199,8 @@ static int __split_huge_page_tail(struct page *head, int tail,
                         (1L << PG_uptodate) |
                         (1L << PG_active) |
                         (1L << PG_locked) |
-                        (1L << PG_unevictable)));
-       page_tail->flags |= (1L << PG_dirty);
+                        (1L << PG_unevictable) |
+                        (1L << PG_dirty)));
 
        /*
         * After clearing PageTail the gup refcount can be released.
@@ -3244,7 +3354,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        spin_lock(&split_queue_lock);
        count = page_count(head);
        mapcount = total_mapcount(head);
-       if (mapcount == count - 1) {
+       if (!mapcount && count == 1) {
                if (!list_empty(page_deferred_list(head))) {
                        split_queue_len--;
                        list_del(page_deferred_list(head));
@@ -3252,13 +3362,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                spin_unlock(&split_queue_lock);
                __split_huge_page(page, list);
                ret = 0;
-       } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+       } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
                spin_unlock(&split_queue_lock);
                pr_alert("total_mapcount: %u, page_count(): %u\n",
                                mapcount, count);
                if (PageTail(page))
                        dump_page(head, NULL);
-               dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+               dump_page(page, "total_mapcount(head) > 0");
                BUG();
        } else {
                spin_unlock(&split_queue_lock);