remap_pfn_range(vma, vaddr, pfn, size, prot)
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+extern int has_transparent_hugepage(void);
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_HUGE);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ pmd_val(pmd) |= _PAGE_HUGE;
+
+ return pmd;
+}
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_SPLITTING);
+}
+
+static inline pmd_t pmd_mksplitting(pmd_t pmd)
+{
+ pmd_val(pmd) |= _PAGE_SPLITTING;
+
+ return pmd;
+}
+
+extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd);
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+/* Extern to avoid header file madness */
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_WRITE);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ pmd_val(pmd) &= ~(_PAGE_WRITE | _PAGE_SILENT_WRITE);
+ return pmd;
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+ pmd_val(pmd) |= _PAGE_WRITE;
+ if (pmd_val(pmd) & _PAGE_MODIFIED)
+ pmd_val(pmd) |= _PAGE_SILENT_WRITE;
+
+ return pmd;
+}
+
+static inline int pmd_dirty(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_MODIFIED);
+}
+
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ pmd_val(pmd) &= ~(_PAGE_MODIFIED | _PAGE_SILENT_WRITE);
+ return pmd;
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ pmd_val(pmd) |= _PAGE_MODIFIED;
+ if (pmd_val(pmd) & _PAGE_WRITE)
+ pmd_val(pmd) |= _PAGE_SILENT_WRITE;
+
+ return pmd;
+}
+
+static inline int pmd_young(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ pmd_val(pmd) &= ~(_PAGE_ACCESSED|_PAGE_SILENT_READ);
+
+ return pmd;
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ pmd_val(pmd) |= _PAGE_ACCESSED;
+
+ if (cpu_has_rixi) {
+ if (!(pmd_val(pmd) & _PAGE_NO_READ))
+ pmd_val(pmd) |= _PAGE_SILENT_READ;
+ } else {
+ if (pmd_val(pmd) & _PAGE_READ)
+ pmd_val(pmd) |= _PAGE_SILENT_READ;
+ }
+
+ return pmd;
+}
+
+/* Extern to avoid header file madness */
+extern pmd_t mk_pmd(struct page *page, pgprot_t prot);
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ return pmd_val(pmd) >> _PFN_SHIFT;
+}
+
+static inline struct page *pmd_page(pmd_t pmd)
+{
+ if (pmd_trans_huge(pmd))
+ return pfn_to_page(pmd_pfn(pmd));
+
+ return pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT);
+}
+
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmd_val(pmd) = (pmd_val(pmd) & _PAGE_CHG_MASK) | pgprot_val(newprot);
+ return pmd;
+}
+
++#define pmd_pgprot(x) __pgprot(pmd_val(x) & ~_PAGE_CHG_MASK)
++
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ pmd_val(pmd) &= ~(_PAGE_PRESENT | _PAGE_VALID | _PAGE_DIRTY);
+
+ return pmd;
+}
+
+/*
+ * The generic version pmdp_get_and_clear uses a version of pmd_clear() with a
+ * different prototype.
+ */
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmdp)
+{
+ pmd_t old = *pmdp;
+
+ pmd_clear(pmdp);
+
+ return old;
+}
+
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
#include <asm-generic/pgtable.h>
/*
#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
#define pte_unmap(pte) do { } while (0)
+static inline void __pmd_idte(unsigned long address, pmd_t *pmdp)
+{
+ unsigned long sto = (unsigned long) pmdp -
+ pmd_index(address) * sizeof(pmd_t);
+
+ if (!(pmd_val(*pmdp) & _SEGMENT_ENTRY_INV)) {
+ asm volatile(
+ " .insn rrf,0xb98e0000,%2,%3,0,0"
+ : "=m" (*pmdp)
+ : "m" (*pmdp), "a" (sto),
+ "a" ((address & HPAGE_MASK))
+ : "cc"
+ );
+ }
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define __HAVE_ARCH_PGTABLE_DEPOSIT
+extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable);
+
+#define __HAVE_ARCH_PGTABLE_WITHDRAW
+extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm);
+
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT;
+}
+
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t entry)
+{
+ *pmdp = entry;
+}
+
++static inline pgprot_t pmd_pgprot(pmd_t pmd)
++{
++ pgprot_t prot = PAGE_RW;
++
++ if (pmd_val(pmd) & _SEGMENT_ENTRY_RO) {
++ if (pmd_val(pmd) & _SEGMENT_ENTRY_INV)
++ prot = PAGE_NONE;
++ else
++ prot = PAGE_RO;
++ }
++ return prot;
++}
++
+static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
+{
+ unsigned long pgprot_pmd = 0;
+
+ if (pgprot_val(pgprot) & _PAGE_INVALID) {
+ if (pgprot_val(pgprot) & _PAGE_SWT)
+ pgprot_pmd |= _HPAGE_TYPE_NONE;
+ pgprot_pmd |= _SEGMENT_ENTRY_INV;
+ }
+ if (pgprot_val(pgprot) & _PAGE_RO)
+ pgprot_pmd |= _SEGMENT_ENTRY_RO;
+ return pgprot_pmd;
+}
+
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmd_val(pmd) &= _SEGMENT_CHG_MASK;
+ pmd_val(pmd) |= massage_pgprot_pmd(newprot);
+ return pmd;
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
+ return pmd;
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+ pmd_val(pmd) &= ~_SEGMENT_ENTRY_RO;
+ return pmd;
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ pmd_val(pmd) |= _SEGMENT_ENTRY_RO;
+ return pmd;
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ /* No dirty bit in the segment table entry. */
+ return pmd;
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ /* No referenced bit in the segment table entry. */
+ return pmd;
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ /* No referenced bit in the segment table entry. */
+ return pmd;
+}
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ unsigned long pmd_addr = pmd_val(*pmdp) & HPAGE_MASK;
+ long tmp, rc;
+ int counter;
+
+ rc = 0;
+ if (MACHINE_HAS_RRBM) {
+ counter = PTRS_PER_PTE >> 6;
+ asm volatile(
+ "0: .insn rre,0xb9ae0000,%0,%3\n" /* rrbm */
+ " ogr %1,%0\n"
+ " la %3,0(%4,%3)\n"
+ " brct %2,0b\n"
+ : "=&d" (tmp), "+&d" (rc), "+d" (counter),
+ "+a" (pmd_addr)
+ : "a" (64 * 4096UL) : "cc");
+ rc = !!rc;
+ } else {
+ counter = PTRS_PER_PTE;
+ asm volatile(
+ "0: rrbe 0,%2\n"
+ " la %2,0(%3,%2)\n"
+ " brc 12,1f\n"
+ " lhi %0,1\n"
+ "1: brct %1,0b\n"
+ : "+d" (rc), "+d" (counter), "+a" (pmd_addr)
+ : "a" (4096UL) : "cc");
+ }
+ return rc;
+}
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmdp)
+{
+ pmd_t pmd = *pmdp;
+
+ __pmd_idte(address, pmdp);
+ pmd_clear(pmdp);
+ return pmd;
+}
+
+#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
+static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ return pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+}
+
+#define __HAVE_ARCH_PMDP_INVALIDATE
+static inline void pmdp_invalidate(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ __pmd_idte(address, pmdp);
+}
+
+static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot)
+{
+ pmd_t __pmd;
+ pmd_val(__pmd) = physpage + massage_pgprot_pmd(pgprot);
+ return __pmd;
+}
+
+#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
+#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE;
+}
+
+static inline int has_transparent_hugepage(void)
+{
+ return MACHINE_HAS_HPAGE ? 1 : 0;
+}
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ if (pmd_trans_huge(pmd))
+ return pmd_val(pmd) >> HPAGE_SHIFT;
+ else
+ return pmd_val(pmd) >> PAGE_SHIFT;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/*
* 31 bit swap entry format:
* A page-table entry has some bits we have to treat in a special way.
* NUMA memory policies for Linux.
* Copyright 2003,2004 Andi Kleen SuSE Labs
*/
-
-/*
- * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
- * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
- * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
- */
-
-/* Policies */
-enum {
- MPOL_DEFAULT,
- MPOL_PREFERRED,
- MPOL_BIND,
- MPOL_INTERLEAVE,
- MPOL_LOCAL,
- MPOL_NOOP, /* retain existing policy for range */
- MPOL_MAX, /* always last member of enum */
-};
-
-enum mpol_rebind_step {
- MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
- MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
- MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
- MPOL_REBIND_NSTEP,
-};
-
-/* Flags for set_mempolicy */
-#define MPOL_F_STATIC_NODES (1 << 15)
-#define MPOL_F_RELATIVE_NODES (1 << 14)
-
-/*
- * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
- * either set_mempolicy() or mbind().
- */
-#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
-
-/* Flags for get_mempolicy */
-#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
-#define MPOL_F_ADDR (1<<1) /* look up vma using address */
-#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
-
-/* Flags for mbind */
-#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
-#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
- to policy */
-#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
-#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
-#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
-
-#define MPOL_MF_VALID (MPOL_MF_STRICT | \
- MPOL_MF_MOVE | \
- MPOL_MF_MOVE_ALL | \
- MPOL_MF_LAZY)
-
-/*
- * Internal flags that share the struct mempolicy flags word with
- * "mode flags". These flags are allocated from bit 0 up, as they
- * are never OR'ed into the mode in mempolicy API arguments.
- */
-#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
-#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
-#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
-#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
-#define MPOL_F_HOME (1 << 4) /* this is the home-node policy */
-
-#ifdef __KERNEL__
+#ifndef _LINUX_MEMPOLICY_H
+#define _LINUX_MEMPOLICY_H 1
-
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
return 0;
}
+ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, int multi)
+ {
+ return -1; /* no node preference */
+ }
+
#endif /* CONFIG_NUMA */
-#endif /* __KERNEL__ */
-
#endif
#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */
#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */
#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */
-
-/*
- * This interface is used by x86 PAT code to identify a pfn mapping that is
- * linear over entire vma. This is to optimize PAT code that deals with
- * marking the physical region with a particular prot. This is not for generic
- * mm use. Note also that this check will not work if the pfn mapping is
- * linear for a vma starting at physical address 0. In which case PAT code
- * falls back to slow path of reserving physical range page by page.
- */
-static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
-{
- return !!(vma->vm_flags & VM_PFN_AT_MMAP);
-}
-
-static inline int is_pfn_mapping(struct vm_area_struct *vma)
-{
- return !!(vma->vm_flags & VM_PFNMAP);
-}
+#define FAULT_FLAG_TRIED 0x40 /* second try */
+ /*
+ * Some architectures (such as x86) may need to preserve certain pgprot
+ * bits, without complicating generic pgprot code.
+ *
+ * Most architectures don't care:
+ */
+ #ifndef pgprot_modify
+ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+ {
+ return newprot;
+ }
+ #endif
+
/*
* vm_fault is filled by the the pagefault handler and passed to the vma's
* ->fault function. The vma's ->fault is responsible for returning a bitmask
--- /dev/null
- #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */
- #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
- #define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */
+/*
+ * NUMA memory policies for Linux.
+ * Copyright 2003,2004 Andi Kleen SuSE Labs
+ */
+#ifndef _UAPI_LINUX_MEMPOLICY_H
+#define _UAPI_LINUX_MEMPOLICY_H
+
+#include <linux/errno.h>
+
+
+/*
+ * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
+ * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
+ * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
+ */
+
+/* Policies */
+enum {
+ MPOL_DEFAULT,
+ MPOL_PREFERRED,
+ MPOL_BIND,
+ MPOL_INTERLEAVE,
++ MPOL_LOCAL,
++ MPOL_NOOP, /* retain existing policy for range */
+ MPOL_MAX, /* always last member of enum */
+};
+
+enum mpol_rebind_step {
+ MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
+ MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
+ MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
+ MPOL_REBIND_NSTEP,
+};
+
+/* Flags for set_mempolicy */
+#define MPOL_F_STATIC_NODES (1 << 15)
+#define MPOL_F_RELATIVE_NODES (1 << 14)
+
+/*
+ * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
+ * either set_mempolicy() or mbind().
+ */
+#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
+
+/* Flags for get_mempolicy */
+#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
+#define MPOL_F_ADDR (1<<1) /* look up vma using address */
+#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
+
+/* Flags for mbind */
+#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
-
++#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
++ to policy */
++#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
++#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
++#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
++
++#define MPOL_MF_VALID (MPOL_MF_STRICT | \
++ MPOL_MF_MOVE | \
++ MPOL_MF_MOVE_ALL | \
++ MPOL_MF_LAZY)
+
+/*
+ * Internal flags that share the struct mempolicy flags word with
+ * "mode flags". These flags are allocated from bit 0 up, as they
+ * are never OR'ed into the mode in mempolicy API arguments.
+ */
+#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
+#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
+#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
++#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
++#define MPOL_F_HOME (1 << 4) /* this is the home-node policy */
+
+#endif /* _UAPI_LINUX_MEMPOLICY_H */
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
- idr.o int_sqrt.o extable.o prio_tree.o \
+ idr.o int_sqrt.o extable.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
- is_single_threaded.o plist.o decompress.o
+ is_single_threaded.o plist.o decompress.o earlycpio.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
+#include <linux/pagemap.h>
+ #include <linux/migrate.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
- if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm);
- pmd_populate(mm, &_pmd, pgtable);
-
- haddr = address;
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- BUG_ON(PageCompound(page+i));
- entry = mk_pte(page + i, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pmd_write(*pmd))
- entry = pte_wrprotect(entry);
- else
- BUG_ON(page_mapcount(page) != 1);
- if (!pmd_young(*pmd))
- entry = pte_mkold(entry);
- pte = pte_offset_map(&_pmd, haddr);
- BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
- }
+ if (!pmd)
+ goto unlock;
- smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and
- * userland has the whole access to the hugepage
- * during the split (which happens in place). If we
- * overwrite the pmd with the not-huge version
- * pointing to the pte here (which of course we could
- * if all CPUs were bug free), userland could trigger
- * a small page size TLB miss on the small sized TLB
- * while the hugepage TLB entry is still established
- * in the huge TLB. Some CPU doesn't like that. See
- * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
- * Erratum 383 on page 93. Intel should be safe but is
- * also warns that it's only safe if the permission
- * and cache attributes of the two entries loaded in
- * the two TLB is identical (which should be the case
- * here). But it is generally safer to never allow
- * small and huge TLB entries for the same virtual
- * address to be loaded simultaneously. So instead of
- * doing "pmd_populate(); flush_tlb_range();" we first
- * mark the current pmd notpresent (atomically because
- * here the pmd_trans_huge and pmd_trans_splitting
- * must remain set at all times on the pmd until the
- * split is complete for this pmd), then we flush the
- * SMP TLB and finally we write the non-huge version
- * of the pmd entry with pmd_populate.
- */
- pmdp_invalidate(vma, address, pmd);
- pmd_populate(mm, pmd, pgtable);
- ret = 1;
+ prot = pmd_pgprot(*pmd);
- pgtable = get_pmd_huge_pte(mm);
++ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0, haddr = address; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+
+ BUG_ON(PageCompound(page+i));
+ entry = mk_pte(page + i, prot);
+ entry = pte_mkdirty(entry);
+ if (!pmd_young(*pmd))
+ entry = pte_mkold(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
}
- set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ smp_wmb(); /* make ptes visible before pmd, see __pte_alloc */
+ /*
+ * Up to this point the pmd is present and huge.
+ *
+ * If we overwrite the pmd with the not-huge version, we could trigger
+ * a small page size TLB miss on the small sized TLB while the hugepage
+ * TLB entry is still established in the huge TLB.
+ *
+ * Some CPUs don't like that. See
+ * http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum 383
+ * on page 93.
+ *
+ * Thus it is generally safer to never allow small and huge TLB entries
+ * for overlapping virtual addresses to be loaded. So we first mark the
+ * current pmd not present, then we flush the TLB and finally we write
+ * the non-huge version of the pmd entry with pmd_populate.
+ *
+ * The above needs to be done under the ptl because pmd_trans_huge and
+ * pmd_trans_splitting must remain set on the pmd until the split is
+ * complete. The ptl also protects against concurrent faults due to
+ * making the pmd not-present.
+ */
++ pmdp_invalidate(vma, address, pmd);
+ pmd_populate(mm, pmd, pgtable);
+ ret = 1;
+
+ unlock:
spin_unlock(&mm->page_table_lock);
return ret;
static int khugepaged_wait_event(void)
{
return !list_empty(&khugepaged_scan.mm_head) ||
- !khugepaged_enabled();
+ kthread_should_stop();
}
-static void khugepaged_do_scan(struct page **hpage)
+static void khugepaged_do_scan(void)
{
+ struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
- unsigned int pages = khugepaged_pages_to_scan;
+ unsigned int pages = ACCESS_ONCE(khugepaged_pages_to_scan);
+ bool wait = true;
- barrier(); /* write khugepaged_pages_to_scan to local stack */
-
while (progress < pages) {
- cond_resched();
-
-#ifndef CONFIG_NUMA
- if (!*hpage) {
- *hpage = alloc_hugepage(khugepaged_defrag());
- if (unlikely(!*hpage)) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- break;
- }
- count_vm_event(THP_COLLAPSE_ALLOC);
- }
-#else
- if (IS_ERR(*hpage))
+ if (!khugepaged_prealloc_page(&hpage, &wait))
break;
-#endif
+
+ cond_resched();
if (unlikely(kthread_should_stop() || freezing(current)))
break;
return pol;
}
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
+ /**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page - page to be checked
+ * @vma - vm area where page mapped
+ * @addr - virtual address where page mapped
+ * @multi - use multi-stage node binding
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+ int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int multi)
+ {
+ struct mempolicy *pol;
+ struct zone *zone;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int polnid = -1;
+ int ret = -1;
+
+ BUG_ON(!vma);
+
+ pol = get_vma_policy(current, vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, vma, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+ /*
+ * allows binding to multiple nodes.
+ * use current page if in policy nodemask,
+ * else select nearest allowed node, if any.
+ * If no allowed nodes, use current [!misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ (void)first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes, &zone);
+ polnid = zone->node;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By
+ * using a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can
+ * equate a task's usage of a particular page (n_p) per total usage
+ * of this page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will then sample this probability and getting
+ * the same result twice in a row, given these samples are fully
+ * independent, is then given by P(n)^2, provided our sample period
+ * is sufficiently short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making it less likely
+ * we act on an unlikely task<->page relation.
+ *
+ * NOTE: effectively we're using task-home-node<->page-node relations
+ * since those are the only thing we can affect.
+ *
+ * NOTE: we're using task-home-node as opposed to the current node
+ * the task might be running on, since the task-home-node is the
+ * long-term node of this task, further reducing noise. Also see
+ * task_tick_numa().
+ */
+ if (multi && (pol->flags & MPOL_F_HOME)) {
+ int last_nid = page_xchg_last_nid(page, polnid);
+ if (last_nid != polnid)
+ goto out;
+ }
+
+ if (curnid != polnid)
+ ret = polnid;
+ out:
+ mpol_cond_put(pol);
+
+ return ret;
+ }
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);