#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
+#include <linux/freezer.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
* allocations.
*/
unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
(1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
int err;
#ifdef CONFIG_SYSFS
static struct kobject *hugepage_kobj;
+#endif
+
+ err = -EINVAL;
+ if (!has_transparent_hugepage()) {
+ transparent_hugepage_flags = 0;
+ goto out;
+ }
+#ifdef CONFIG_SYSFS
err = -ENOMEM;
hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
if (unlikely(!hugepage_kobj)) {
return ret;
}
+static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+{
+ return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+ struct vm_area_struct *vma,
+ unsigned long haddr)
+{
+ return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
+ HPAGE_PMD_ORDER, vma, haddr);
+}
+
+#ifndef CONFIG_NUMA
static inline struct page *alloc_hugepage(int defrag)
{
- return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT),
+ return alloc_pages(alloc_hugepage_gfpmask(defrag),
HPAGE_PMD_ORDER);
}
+#endif
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
- page = alloc_hugepage(transparent_hugepage_defrag(vma));
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr);
if (unlikely(!page))
goto out;
if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
- new_page = alloc_hugepage(transparent_hugepage_defrag(vma));
+ new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr);
else
new_page = NULL;
goto out;
if (pmd_page(*pmd) != page)
goto out;
- VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
- pmd_trans_splitting(*pmd));
+ /*
+ * split_vma() may create temporary aliased mappings. There is
+ * no risk as long as all huge pmd are found and have their
+ * splitting bit set before __split_huge_page_refcount
+ * runs. Finding the same huge pmd more than once during the
+ * same rmap walk is not a problem.
+ */
+ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+ pmd_trans_splitting(*pmd))
+ goto out;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
VM_BUG_ON(PageLRU(page));
/* If there is no mapped pte young don't collapse the page */
- if (pte_young(pteval))
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
referenced = 1;
}
if (unlikely(!referenced))
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
- struct page **hpage)
+ struct page **hpage,
+ struct vm_area_struct *vma)
{
- struct vm_area_struct *vma;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd, _pmd;
unsigned long hstart, hend;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
VM_BUG_ON(!*hpage);
+ new_page = *hpage;
+#else
+ VM_BUG_ON(*hpage);
+ /*
+ * Allocate the page while the vma is still valid and under
+ * the mmap_sem read mode so there is no memory allocation
+ * later when we take the mmap_sem in write mode. This is more
+ * friendly behavior (OTOH it may actually hide bugs) to
+ * filesystems in userland with daemons allocating memory in
+ * the userland I/O paths. Allocating memory with the
+ * mmap_sem in read mode is good idea also to allow greater
+ * scalability.
+ */
+ new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+ if (unlikely(!new_page)) {
+ up_read(&mm->mmap_sem);
+ *hpage = ERR_PTR(-ENOMEM);
+ return;
+ }
+#endif
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ up_read(&mm->mmap_sem);
+ put_page(new_page);
+ return;
+ }
+
+ /* after allocating the hugepage upgrade to mmap_sem write mode */
+ up_read(&mm->mmap_sem);
/*
* Prevent all access to pagetables with the exception of
if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
goto out;
- new_page = *hpage;
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
- goto out;
-
anon_vma_lock(vma->anon_vma);
pte = pte_offset_map(pmd, address);
mm->nr_ptes--;
spin_unlock(&mm->page_table_lock);
+#ifndef CONFIG_NUMA
*hpage = NULL;
+#endif
khugepaged_pages_collapsed++;
-out:
+out_up_write:
up_write(&mm->mmap_sem);
+ return;
+
+out:
+#ifdef CONFIG_NUMA
+ put_page(new_page);
+#endif
+ goto out_up_write;
}
static int khugepaged_scan_pmd(struct mm_struct *mm,
/* cannot use mapcount: can't collapse if there's a gup pin */
if (page_count(page) != 1)
goto out_unmap;
- if (pte_young(pteval))
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
referenced = 1;
}
if (referenced)
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
- up_read(&mm->mmap_sem);
- collapse_huge_page(mm, address, hpage);
- }
+ if (ret)
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma);
out:
return ret;
}
while (progress < pages) {
cond_resched();
+#ifndef CONFIG_NUMA
if (!*hpage) {
*hpage = alloc_hugepage(khugepaged_defrag());
if (unlikely(!*hpage))
break;
}
+#else
+ if (IS_ERR(*hpage))
+ break;
+#endif
+
+ if (unlikely(kthread_should_stop() || freezing(current)))
+ break;
spin_lock(&khugepaged_mm_lock);
if (!khugepaged_scan.mm_slot)
}
}
+static void khugepaged_alloc_sleep(void)
+{
+ DEFINE_WAIT(wait);
+ add_wait_queue(&khugepaged_wait, &wait);
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(
+ khugepaged_alloc_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
static struct page *khugepaged_alloc_hugepage(void)
{
struct page *hpage;
do {
hpage = alloc_hugepage(khugepaged_defrag());
- if (!hpage) {
- DEFINE_WAIT(wait);
- add_wait_queue(&khugepaged_wait, &wait);
- schedule_timeout_interruptible(
- msecs_to_jiffies(
- khugepaged_alloc_sleep_millisecs));
- remove_wait_queue(&khugepaged_wait, &wait);
- }
+ if (!hpage)
+ khugepaged_alloc_sleep();
} while (unlikely(!hpage) &&
likely(khugepaged_enabled()));
return hpage;
}
+#endif
static void khugepaged_loop(void)
{
struct page *hpage;
+#ifdef CONFIG_NUMA
+ hpage = NULL;
+#endif
while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
hpage = khugepaged_alloc_hugepage();
if (unlikely(!hpage))
break;
+#else
+ if (IS_ERR(hpage)) {
+ khugepaged_alloc_sleep();
+ hpage = NULL;
+ }
+#endif
khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
if (hpage)
put_page(hpage);
+#endif
+ try_to_freeze();
+ if (unlikely(kthread_should_stop()))
+ break;
if (khugepaged_has_work()) {
DEFINE_WAIT(wait);
if (!khugepaged_scan_sleep_millisecs)
khugepaged_scan_sleep_millisecs));
remove_wait_queue(&khugepaged_wait, &wait);
} else if (khugepaged_enabled())
- wait_event_interruptible(khugepaged_wait,
- khugepaged_wait_event());
+ wait_event_freezable(khugepaged_wait,
+ khugepaged_wait_event());
}
}
{
struct mm_slot *mm_slot;
+ set_freezable();
set_user_nice(current, 19);
/* serialize with start_khugepaged() */
mutex_lock(&khugepaged_mutex);
if (!khugepaged_enabled())
break;
+ if (unlikely(kthread_should_stop()))
+ break;
}
spin_lock(&khugepaged_mm_lock);
put_page(page);
BUG_ON(pmd_trans_huge(*pmd));
}
+
+static void split_huge_page_address(struct mm_struct *mm,
+ unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return;
+ /*
+ * Caller holds the mmap_sem write mode, so a huge pmd cannot
+ * materialize from under us.
+ */
+ split_huge_page_pmd(mm, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+ /*
+ * If the new start address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (start & ~HPAGE_PMD_MASK &&
+ (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, start);
+
+ /*
+ * If the new end address isn't hpage aligned and it could
+ * previously contain an hugepage: check if we need to split
+ * an huge pmd.
+ */
+ if (end & ~HPAGE_PMD_MASK &&
+ (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, end);
+
+ /*
+ * If we're also updating the vma->vm_next->vm_start, if the new
+ * vm_next->vm_start isn't page aligned and it could previously
+ * contain an hugepage: check if we need to split an huge pmd.
+ */
+ if (adjust_next > 0) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long nstart = next->vm_start;
+ nstart += adjust_next << PAGE_SHIFT;
+ if (nstart & ~HPAGE_PMD_MASK &&
+ (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+ (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+ split_huge_page_address(next->vm_mm, nstart);
+ }
+}