X-Git-Url: https://git.karo-electronics.de/?a=blobdiff_plain;f=mm%2Fhuge_memory.c;h=c107094f79bae9ee895bd6bf30976d900f16c141;hb=546fac60739ef8d7cbf8ce0b8251a519f68b2804;hp=6817b0350c71c43b0f89a4972ce19c51a2408a2b;hpb=c102cb097d9371c2c60049d041ab1f8bdca5ccc2;p=karo-tx-linux.git

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6817b0350c71..c107094f79ba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
 
 static int khugepaged(void *none);
 static int khugepaged_slab_init(void);
+static void khugepaged_slab_exit(void);
 
 #define MM_SLOTS_HASH_BITS 10
 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void)
 	int nr_zones = 0;
 	unsigned long recommended_min;
 
-	if (!khugepaged_enabled())
-		return 0;
-
 	for_each_populated_zone(zone)
 		nr_zones++;
 
@@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
 	setup_per_zone_wmarks();
 	return 0;
 }
-late_initcall(set_recommended_min_free_kbytes);
 
-static int start_khugepaged(void)
+static int start_stop_khugepaged(void)
 {
 	int err = 0;
 	if (khugepaged_enabled()) {
@@ -156,6 +153,7 @@ static int start_khugepaged(void)
 			pr_err("khugepaged: kthread_run(khugepaged) failed\n");
 			err = PTR_ERR(khugepaged_thread);
 			khugepaged_thread = NULL;
+			goto fail;
 		}
 
 		if (!list_empty(&khugepaged_scan.mm_head))
@@ -166,7 +164,7 @@ static int start_khugepaged(void)
 		kthread_stop(khugepaged_thread);
 		khugepaged_thread = NULL;
 	}
-
+fail:
 	return err;
 }
 
@@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void)
 	struct page *zero_page;
 retry:
 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
-		return ACCESS_ONCE(huge_zero_page);
+		return READ_ONCE(huge_zero_page);
 
 	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 			HPAGE_PMD_ORDER);
@@ -202,7 +200,7 @@ retry:
 	/* We take additional reference here. It will be put back by shrinker */
 	atomic_set(&huge_zero_refcount, 2);
 	preempt_enable();
-	return ACCESS_ONCE(huge_zero_page);
+	return READ_ONCE(huge_zero_page);
 }
 
 static void put_huge_zero_page(void)
@@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
 		int err;
 
 		mutex_lock(&khugepaged_mutex);
-		err = start_khugepaged();
+		err = start_stop_khugepaged();
 		mutex_unlock(&khugepaged_mutex);
 
 		if (err)
@@ -634,27 +632,38 @@ static int __init hugepage_init(void)
 
 	err = hugepage_init_sysfs(&hugepage_kobj);
 	if (err)
-		return err;
+		goto err_sysfs;
 
 	err = khugepaged_slab_init();
 	if (err)
-		goto out;
+		goto err_slab;
 
-	register_shrinker(&huge_zero_page_shrinker);
+	err = register_shrinker(&huge_zero_page_shrinker);
+	if (err)
+		goto err_hzp_shrinker;
 
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
 	 * is likely to save.  The admin can still enable it through /sys.
 	 */
-	if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+	if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
 		transparent_hugepage_flags = 0;
+		return 0;
+	}
 
-	start_khugepaged();
+	err = start_stop_khugepaged();
+	if (err)
+		goto err_khugepaged;
 
 	return 0;
-out:
+err_khugepaged:
+	unregister_shrinker(&huge_zero_page_shrinker);
+err_hzp_shrinker:
+	khugepaged_slab_exit();
+err_slab:
 	hugepage_exit_sysfs(hugepage_kobj);
+err_sysfs:
 	return err;
 }
 subsys_initcall(hugepage_init);
@@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long haddr, pmd_t *pmd,
-					struct page *page)
+					struct page *page, gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
@@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+	if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
 		return VM_FAULT_OOM;
 
 	pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
-	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1022,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 		goto out_free_pages;
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
-	pmdp_clear_flush_notify(vma, haddr, pmd);
+	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t huge_gfp;			/* for allocation and charge */
 
 	ptl = pmd_lockptr(mm, pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		gfp_t gfp;
-
-		gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
-		new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
 
@@ -1130,8 +1138,7 @@ alloc:
 		goto out;
 	}
 
-	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg))) {
+	if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
@@ -1167,7 +1174,7 @@ alloc:
 		pmd_t entry;
 		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		pmdp_clear_flush_notify(vma, haddr, pmd);
+		pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		mem_cgroup_commit_charge(new_page, memcg, false);
 		lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1231,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 					  pmd, _pmd,  1))
 			update_mmu_cache_pmd(vma, addr, pmd);
 	}
-	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+	if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
 		if (page->mapping && trylock_page(page)) {
 			lru_add_drain();
 			if (page->mapping)
@@ -1389,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		pmd_t orig_pmd;
 		/*
 		 * For architectures like ppc64 we look at deposited pgtable
-		 * when calling pmdp_get_and_clear. So do the
+		 * when calling pmdp_huge_get_and_clear. So do the
 		 * pgtable_trans_huge_withdraw after finishing pmdp related
		 * operations.
 		 */
-		orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
-						   tlb->fullmm);
+		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+							tlb->fullmm);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
@@ -1452,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 		new_ptl = pmd_lockptr(mm, new_pmd);
 		if (new_ptl != old_ptl)
 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
-		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
 
 		if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1498,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		}
 
 		if (!prot_numa || !pmd_protnone(*pmd)) {
-			entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
 			entry = pmd_modify(entry, newprot);
 			if (preserve_write)
 				entry = pmd_mkwrite(entry);
@@ -1976,6 +1983,11 @@ static int __init khugepaged_slab_init(void)
 	return 0;
 }
 
+static void __init khugepaged_slab_exit(void)
+{
+	kmem_cache_destroy(mm_slot_cache);
+}
+
 static inline struct mm_slot *alloc_mm_slot(void)
 {
 	if (!mm_slot_cache)	/* initialization failed */
@@ -2109,7 +2121,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
 {
 	while (--_pte >= pte) {
 		pte_t pteval = *_pte;
-		if (!pte_none(pteval))
+		if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
 			release_pte_page(pte_page(pteval));
 	}
 }
@@ -2120,13 +2132,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 {
 	struct page *page;
 	pte_t *_pte;
-	int none = 0;
+	int none_or_zero = 0;
 	bool referenced = false, writable = false;
 
 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
-		if (pte_none(pteval)) {
-			if (++none <= khugepaged_max_ptes_none)
+		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+			if (++none_or_zero <= khugepaged_max_ptes_none)
 				continue;
 			else
 				goto out;
@@ -2207,9 +2219,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 		pte_t pteval = *_pte;
 		struct page *src_page;
 
-		if (pte_none(pteval)) {
+		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
 			clear_user_highpage(page, address);
 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+			if (is_zero_pfn(pte_pfn(pteval))) {
+				/*
+				 * ptl mostly unnecessary.
+				 */
+				spin_lock(ptl);
+				/*
+				 * paravirt calls inside pte_clear here are
+				 * superfluous.
+				 */
+				pte_clear(vma->vm_mm, address, _pte);
+				spin_unlock(ptl);
+			}
 		} else {
 			src_page = pte_page(pteval);
 			copy_user_highpage(page, src_page, address, vma);
@@ -2311,8 +2335,8 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 	return true;
 }
 
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
@@ -2326,8 +2350,7 @@ static struct page
 	 */
 	up_read(&mm->mmap_sem);
 
-	*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
-		khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+	*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
 	if (unlikely(!*hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -2380,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 	return true;
 }
 
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		       struct vm_area_struct *vma, unsigned long address,
 		       int node)
 {
 	up_read(&mm->mmap_sem);
 	VM_BUG_ON(!*hpage);
+
 	return *hpage;
 }
 #endif
@@ -2421,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm,
 	struct mem_cgroup *memcg;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
+	gfp_t gfp;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
+	/* Only allocate from the target node */
+	gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+		__GFP_THISNODE;
+
 	/* release the mmap_sem read lock. */
-	new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+	new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
 	if (!new_page)
 		return;
 
 	if (unlikely(mem_cgroup_try_charge(new_page, mm,
-					   GFP_TRANSHUGE, &memcg)))
+					   gfp, &memcg)))
 		return;
 
 	/*
@@ -2470,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * huge and small TLB entries for the same virtual address
 	 * to avoid the risk of CPU bugs in that area.
 	 */
-	_pmd = pmdp_clear_flush(vma, address, pmd);
+	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
@@ -2543,7 +2572,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
-	int ret = 0, none = 0;
+	int ret = 0, none_or_zero = 0;
 	struct page *page;
 	unsigned long _address;
 	spinlock_t *ptl;
@@ -2561,8 +2590,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, _address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
-		if (pte_none(pteval)) {
-			if (++none <= khugepaged_max_ptes_none)
+		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+			if (++none_or_zero <= khugepaged_max_ptes_none)
 				continue;
 			else
 				goto out_unmap;
@@ -2770,7 +2799,7 @@ static void khugepaged_do_scan(void)
 
 		cond_resched();
 
-		if (unlikely(kthread_should_stop() || freezing(current)))
+		if (unlikely(kthread_should_stop() || try_to_freeze()))
 			break;
 
 		spin_lock(&khugepaged_mm_lock);
@@ -2791,8 +2820,6 @@ static void khugepaged_do_scan(void)
 
 static void khugepaged_wait_work(void)
 {
-	try_to_freeze();
-
 	if (khugepaged_has_work()) {
 		if (!khugepaged_scan_sleep_millisecs)
 			return;
@@ -2836,7 +2863,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmd_t _pmd;
 	int i;
 
-	pmdp_clear_flush_notify(vma, haddr, pmd);
+	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);