X-Git-Url: https://git.karo-electronics.de/?a=blobdiff_plain;f=mm%2Fhuge_memory.c;h=3cda32c5bdb372c29278f1b9be334d65629eeb2f;hb=74256136ec779b263d93a0566c2dc5ca3598d2e5;hp=aea8f7a42df97d7185f626d5bbc445c64f376eb1;hpb=e48a45aa2f3a810531610e2377e6f782c4f922d6;p=karo-tx-linux.git diff --git a/mm/huge_memory.c b/mm/huge_memory.c index aea8f7a42df9..3cda32c5bdb3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -57,7 +58,8 @@ enum scan_result { SCAN_SWAP_CACHE_PAGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, - SCAN_CGROUP_CHARGE_FAIL + SCAN_CGROUP_CHARGE_FAIL, + SCAN_EXCEED_SWAP_PTE }; #define CREATE_TRACE_POINTS @@ -99,6 +101,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * fault. */ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; +static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8; static int khugepaged(void *none); static int khugepaged_slab_init(void); @@ -586,6 +589,33 @@ static struct kobj_attribute khugepaged_max_ptes_none_attr = __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, khugepaged_max_ptes_none_store); +static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_swap); +} + +static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_swap; + + err = kstrtoul(buf, 10, &max_ptes_swap); + if (err || max_ptes_swap > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_swap = max_ptes_swap; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_swap_attr = + __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, + khugepaged_max_ptes_swap_store); + static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, @@ -594,6 +624,7 @@ static struct attribute *khugepaged_attr[] = { &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, NULL, }; @@ -2313,6 +2344,44 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) return true; } +/* + * Bring missing pages in from swap, to complete THP collapse. + * Only done if khugepaged_scan_pmd believes it is worthwhile. + * + * Called and returns without pte mapped or spinlocks held, + * but with mmap_sem held to protect against vma changes. + */ + +static void __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd) +{ + unsigned long _address; + pte_t *pte, pteval; + int swapped_in = 0, ret = 0; + + pte = pte_offset_map(pmd, address); + for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE; + pte++, _address += PAGE_SIZE) { + pteval = *pte; + if (!is_swap_pte(pteval)) + continue; + swapped_in++; + ret = do_swap_page(mm, vma, _address, pte, pmd, + FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT, + pteval); + if (ret & VM_FAULT_ERROR) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0); + return; + } + /* pte is unmapped now, we need to map it */ + pte = pte_offset_map(pmd, _address); + } + pte--; + pte_unmap(pte); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1); +} + static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, @@ -2381,6 +2450,8 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; } + __collapse_huge_page_swapin(mm, vma, address, pmd); + anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); @@ -2457,9 +2528,6 @@ static void collapse_huge_page(struct mm_struct *mm, result = SCAN_SUCCEED; out_up_write: up_write(&mm->mmap_sem); - trace_mm_collapse_huge_page(mm, isolated, result); - return; - out_nolock: trace_mm_collapse_huge_page(mm, isolated, result); return; @@ -2479,7 +2547,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct page *page = NULL; unsigned long _address; spinlock_t *ptl; - int node = NUMA_NO_NODE; + int node = NUMA_NO_NODE, unmapped = 0; bool writable = false, referenced = false; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -2495,6 +2563,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; + if (is_swap_pte(pteval)) { + if (++unmapped <= khugepaged_max_ptes_swap) { + continue; + } else { + result = SCAN_EXCEED_SWAP_PTE; + goto out_unmap; + } + } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (!userfaultfd_armed(vma) && ++none_or_zero <= khugepaged_max_ptes_none) { @@ -2581,7 +2657,7 @@ out_unmap: } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, - none_or_zero, result); + none_or_zero, result, unmapped); return ret; } @@ -3218,28 +3294,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) } } -static int __split_huge_page_tail(struct page *head, int tail, +static void __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { - int mapcount; struct page *page_tail = head + tail; - mapcount = atomic_read(&page_tail->_mapcount) + 1; + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_add(), we + * tail_page. If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with * get_page_unless_zero(), and atomic_set() is implemented in C not * using locked ops. spin_unlock on x86 sometime uses locked ops * because of PPro errata 66, 92, so unless somebody can guarantee * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_add(). + * it's safer to use atomic_inc(). */ - atomic_add(mapcount + 1, &page_tail->_count); - + atomic_inc(&page_tail->_count); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3273,8 +3347,6 @@ static int __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); lru_add_page_tail(head, page_tail, lruvec, list); - - return mapcount; } static void __split_huge_page(struct page *page, struct list_head *list) @@ -3282,7 +3354,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; - int i, tail_mapcount; + int i; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -3291,10 +3363,8 @@ static void __split_huge_page(struct page *page, struct list_head *list) /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - tail_mapcount = 0; for (i = HPAGE_PMD_NR - 1; i >= 1; i--) - tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); - atomic_sub(tail_mapcount, &head->_count); + __split_huge_page_tail(head, i, lruvec, list); ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); @@ -3459,6 +3529,7 @@ void deferred_split_huge_page(struct page *page) spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &pgdata->split_queue); pgdata->split_queue_len++; }