Merge branch 'akpm-current/current'

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index aea8f7a42df97d7185f626d5bbc445c64f376eb1..3cda32c5bdb372c29278f1b9be334d65629eeb2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -30,6 +30,7 @@
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
+#include <linux/swapops.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
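
The new include pulls in is_swap_pte(), which the scan and collapse paths below use to recognize PTEs that encode swap entries. For reference, the helper in <linux/swapops.h> amounts to the following (quoted from memory as a sketch; check the header for the authoritative definition):

    /* check whether a pte points to a swap entry */
    static inline int is_swap_pte(pte_t pte)
    {
            return !pte_none(pte) && !pte_present(pte);
    }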
@@ -57,7 +58,8 @@ enum scan_result {
        SCAN_SWAP_CACHE_PAGE,
        SCAN_DEL_PAGE_LRU,
        SCAN_ALLOC_HUGE_PAGE_FAIL,
-       SCAN_CGROUP_CHARGE_FAIL
+       SCAN_CGROUP_CHARGE_FAIL,
+       SCAN_EXCEED_SWAP_PTE
 };
 
 #define CREATE_TRACE_POINTS
@@ -99,6 +101,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  * fault.
  */
 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8;
 
 static int khugepaged(void *none);
 static int khugepaged_slab_init(void);
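
The default budget is one eighth of a PMD's worth of PTEs. Assuming the usual x86-64 configuration of 4 KiB base pages and 2 MiB PMD-mapped huge pages (an assumption, not something this hunk states), the arithmetic via the definitions in <linux/huge_mm.h> works out as:

    #define HPAGE_PMD_SHIFT PMD_SHIFT                       /* 21 on x86-64     */
    #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT - PAGE_SHIFT)  /* 21 - 12 = 9      */
    #define HPAGE_PMD_NR    (1 << HPAGE_PMD_ORDER)          /* 512 PTEs per PMD */
    /* default khugepaged_max_ptes_swap = 512 / 8 = 64 swapped-out PTEs */

so khugepaged will tolerate at most 64 swapped-out PTEs in a candidate 2 MiB range before declining to collapse it.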
@@ -586,6 +589,33 @@ static struct kobj_attribute khugepaged_max_ptes_none_attr =
        __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
               khugepaged_max_ptes_none_store);
 
+static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+                                            struct kobj_attribute *attr,
+                                            char *buf)
+{
+       return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+}
+
+static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+                                             struct kobj_attribute *attr,
+                                             const char *buf, size_t count)
+{
+       int err;
+       unsigned long max_ptes_swap;
+
+       err  = kstrtoul(buf, 10, &max_ptes_swap);
+       if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+               return -EINVAL;
+
+       khugepaged_max_ptes_swap = max_ptes_swap;
+
+       return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+       __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+              khugepaged_max_ptes_swap_store);
+
 static struct attribute *khugepaged_attr[] = {
        &khugepaged_defrag_attr.attr,
        &khugepaged_max_ptes_none_attr.attr,
@@ -594,6 +624,7 @@ static struct attribute *khugepaged_attr[] = {
        &full_scans_attr.attr,
        &scan_sleep_millisecs_attr.attr,
        &alloc_sleep_millisecs_attr.attr,
+       &khugepaged_max_ptes_swap_attr.attr,
        NULL,
 };
 
@@ -2313,6 +2344,44 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
        return true;
 }
 
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_sem held to protect against vma changes.
+ */
+
+static void __collapse_huge_page_swapin(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       unsigned long address, pmd_t *pmd)
+{
+       unsigned long _address;
+       pte_t *pte, pteval;
+       int swapped_in = 0, ret = 0;
+
+       pte = pte_offset_map(pmd, address);
+       for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+            pte++, _address += PAGE_SIZE) {
+               pteval = *pte;
+               if (!is_swap_pte(pteval))
+                       continue;
+               swapped_in++;
+               ret = do_swap_page(mm, vma, _address, pte, pmd,
+                                  FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+                                  pteval);
+               if (ret & VM_FAULT_ERROR) {
+                       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
+                       return;
+               }
+               /* pte is unmapped now, we need to map it */
+               pte = pte_offset_map(pmd, _address);
+       }
+       pte--;
+       pte_unmap(pte);
+       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+}
+
 static void collapse_huge_page(struct mm_struct *mm,
                                   unsigned long address,
                                   struct page **hpage,
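
Two details of the new helper are easy to miss. The fault is raised with FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT, so do_swap_page() may start swapin readahead and return VM_FAULT_RETRY rather than sleeping on the I/O while mmap_sem is held; and VM_FAULT_RETRY is not part of VM_FAULT_ERROR, so such a fault does not abort the loop: the page should simply be in the swap cache by the time khugepaged revisits this range. For reference, the error mask in <linux/mm.h> is roughly (quoted from memory, treat as a sketch):

    #define VM_FAULT_ERROR  (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
                             VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
                             VM_FAULT_FALLBACK)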
@@ -2381,6 +2450,8 @@ static void collapse_huge_page(struct mm_struct *mm,
                goto out;
        }
 
+       __collapse_huge_page_swapin(mm, vma, address, pmd);
+
        anon_vma_lock_write(vma->anon_vma);
 
        pte = pte_offset_map(pmd, address);
@@ -2457,9 +2528,6 @@ static void collapse_huge_page(struct mm_struct *mm,
        result = SCAN_SUCCEED;
 out_up_write:
        up_write(&mm->mmap_sem);
-       trace_mm_collapse_huge_page(mm, isolated, result);
-       return;
-
 out_nolock:
        trace_mm_collapse_huge_page(mm, isolated, result);
        return;
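
The three removed lines duplicated the out_nolock epilogue; with them gone the success path falls through, and the tracepoint fires exactly once on every exit. The resulting tail of collapse_huge_page() reads:

            result = SCAN_SUCCEED;
    out_up_write:
            up_write(&mm->mmap_sem);
    out_nolock:
            trace_mm_collapse_huge_page(mm, isolated, result);
            return;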
@@ -2479,7 +2547,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        struct page *page = NULL;
        unsigned long _address;
        spinlock_t *ptl;
-       int node = NUMA_NO_NODE;
+       int node = NUMA_NO_NODE, unmapped = 0;
        bool writable = false, referenced = false;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -2495,6 +2563,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
+               if (is_swap_pte(pteval)) {
+                       if (++unmapped <= khugepaged_max_ptes_swap) {
+                               continue;
+                       } else {
+                               result = SCAN_EXCEED_SWAP_PTE;
+                               goto out_unmap;
+                       }
+               }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        if (!userfaultfd_armed(vma) &&
                            ++none_or_zero <= khugepaged_max_ptes_none) {
@@ -2581,7 +2657,7 @@ out_unmap:
        }
 out:
        trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
-                                    none_or_zero, result);
+                                    none_or_zero, result, unmapped);
        return ret;
 }
 
@@ -3218,28 +3294,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
        }
 }
 
-static int __split_huge_page_tail(struct page *head, int tail,
+static void __split_huge_page_tail(struct page *head, int tail,
                struct lruvec *lruvec, struct list_head *list)
 {
-       int mapcount;
        struct page *page_tail = head + tail;
 
-       mapcount = atomic_read(&page_tail->_mapcount) + 1;
+       VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
        VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
 
        /*
         * tail_page->_count is zero and not changing from under us. But
         * get_page_unless_zero() may be running from under us on the
-        * tail_page. If we used atomic_set() below instead of atomic_add(), we
+        * tail_page. If we used atomic_set() below instead of atomic_inc(), we
         * would then run atomic_set() concurrently with
         * get_page_unless_zero(), and atomic_set() is implemented in C not
        * using locked ops. spin_unlock on x86 sometimes uses locked ops
         * because of PPro errata 66, 92, so unless somebody can guarantee
         * atomic_set() here would be safe on all archs (and not only on x86),
-        * it's safer to use atomic_add().
+        * it's safer to use atomic_inc().
         */
-       atomic_add(mapcount + 1, &page_tail->_count);
-
+       atomic_inc(&page_tail->_count);
 
        page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
        page_tail->flags |= (head->flags &
@@ -3273,8 +3347,6 @@ static int __split_huge_page_tail(struct page *head, int tail,
        page_tail->index = head->index + tail;
        page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
        lru_add_page_tail(head, page_tail, lruvec, list);
-
-       return mapcount;
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list)
@@ -3282,7 +3354,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
        struct page *head = compound_head(page);
        struct zone *zone = page_zone(head);
        struct lruvec *lruvec;
-       int i, tail_mapcount;
+       int i;
 
        /* prevent PageLRU to go away from under us, and freeze lru stats */
        spin_lock_irq(&zone->lru_lock);
@@ -3291,10 +3363,8 @@ static void __split_huge_page(struct page *page, struct list_head *list)
        /* complete memcg works before add pages to LRU */
        mem_cgroup_split_huge_fixup(head);
 
-       tail_mapcount = 0;
        for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
-               tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
-       atomic_sub(tail_mapcount, &head->_count);
+               __split_huge_page_tail(head, i, lruvec, list);
 
        ClearPageCompound(head);
        spin_unlock_irq(&zone->lru_lock);
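
Taken together, the four __split_huge_page*() hunks above drop the per-tail mapcount bookkeeping from the split path: __split_huge_page_tail() now returns void and gives each tail page exactly one reference via atomic_inc(), and __split_huge_page() no longer accumulates tail_mapcount or subtracts it from head->_count, leaving the split loop as simply:

            for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
                    __split_huge_page_tail(head, i, lruvec, list);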
@@ -3459,6 +3529,7 @@ void deferred_split_huge_page(struct page *page)
 
        spin_lock_irqsave(&pgdata->split_queue_lock, flags);
        if (list_empty(page_deferred_list(page))) {
+               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                list_add_tail(page_deferred_list(page), &pgdata->split_queue);
                pgdata->split_queue_len++;
        }
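
THP_DEFERRED_SPLIT_PAGE is a new vm_event counter, so the matching vm_event_item.h/vmstat.c change lives outside this file. Once wired up it is readable from /proc/vmstat; a minimal reader, assuming the conventional lower-case name thp_deferred_split_page (an assumption here, since the vmstat hunk is not part of this diff):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[128];
            FILE *f = fopen("/proc/vmstat", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f))
                    if (!strncmp(line, "thp_deferred_split_page ", 24))
                            fputs(line, stdout);    /* e.g. "thp_deferred_split_page 42" */
            fclose(f);
            return 0;
    }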