* NUMA memory policies for Linux.
* Copyright 2003,2004 Andi Kleen SuSE Labs
*/
-
-/*
- * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
- * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
- * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
- */
-
-/* Policies */
-enum {
- MPOL_DEFAULT,
- MPOL_PREFERRED,
- MPOL_BIND,
- MPOL_INTERLEAVE,
- MPOL_LOCAL,
- MPOL_NOOP, /* retain existing policy for range */
- MPOL_MAX, /* always last member of enum */
-};
-
-enum mpol_rebind_step {
- MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
- MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
- MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
- MPOL_REBIND_NSTEP,
-};
-
-/* Flags for set_mempolicy */
-#define MPOL_F_STATIC_NODES (1 << 15)
-#define MPOL_F_RELATIVE_NODES (1 << 14)
-
-/*
- * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
- * either set_mempolicy() or mbind().
- */
-#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
-
-/* Flags for get_mempolicy */
-#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
-#define MPOL_F_ADDR (1<<1) /* look up vma using address */
-#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
-
-/* Flags for mbind */
-#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
-#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
- to policy */
-#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
-#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
-#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
-
-#define MPOL_MF_VALID (MPOL_MF_STRICT | \
- MPOL_MF_MOVE | \
- MPOL_MF_MOVE_ALL | \
- MPOL_MF_LAZY)
-
-/*
- * Internal flags that share the struct mempolicy flags word with
- * "mode flags". These flags are allocated from bit 0 up, as they
- * are never OR'ed into the mode in mempolicy API arguments.
- */
-#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
-#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
-#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
-#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
-#define MPOL_F_HOME (1 << 4) /* this is the home-node policy */
-
-#ifdef __KERNEL__
+#ifndef _LINUX_MEMPOLICY_H
+#define _LINUX_MEMPOLICY_H 1
-
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
return 0;
}
+ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+ unsigned long address)
+ {
+ return -1; /* no node preference */
+ }
+
#endif /* CONFIG_NUMA */
-#endif /* __KERNEL__ */
-
#endif
#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */
#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */
#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */
-
-/*
- * This interface is used by x86 PAT code to identify a pfn mapping that is
- * linear over entire vma. This is to optimize PAT code that deals with
- * marking the physical region with a particular prot. This is not for generic
- * mm use. Note also that this check will not work if the pfn mapping is
- * linear for a vma starting at physical address 0. In which case PAT code
- * falls back to slow path of reserving physical range page by page.
- */
-static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
-{
- return !!(vma->vm_flags & VM_PFN_AT_MMAP);
-}
-
-static inline int is_pfn_mapping(struct vm_area_struct *vma)
-{
- return !!(vma->vm_flags & VM_PFNMAP);
-}
+#define FAULT_FLAG_TRIED 0x40 /* second try */
+ /*
+ * Some architectures (such as x86) may need to preserve certain pgprot
+ * bits, without complicating generic pgprot code.
+ *
+ * Most architectures don't care:
+ */
+ #ifndef pgprot_modify
+ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+ {
+ return newprot;
+ }
+ #endif
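
An aside, not part of the patch: an architecture that does need to preserve
certain protection bits across a protection change would supply its own
pgprot_modify() ahead of this generic fallback, roughly as sketched below.
ARCH_PGPROT_PRESERVE_MASK is a hypothetical name for the set of bits to carry
over from the old pgprot.

	#define pgprot_modify pgprot_modify
	static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
	{
		/* keep the arch-private bits of the old mapping, take the rest from newprot */
		return __pgprot((pgprot_val(oldprot) & ARCH_PGPROT_PRESERVE_MASK) |
				pgprot_val(newprot));
	}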
+
/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
* ->fault function. The vma's ->fault is responsible for returning a bitmask
--- /dev/null
+/*
+ * NUMA memory policies for Linux.
+ * Copyright 2003,2004 Andi Kleen SuSE Labs
+ */
+#ifndef _UAPI_LINUX_MEMPOLICY_H
+#define _UAPI_LINUX_MEMPOLICY_H
+
+#include <linux/errno.h>
+
+
+/*
+ * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
+ * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
+ * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
+ */
+
+/* Policies */
+enum {
+ MPOL_DEFAULT,
+ MPOL_PREFERRED,
+ MPOL_BIND,
+ MPOL_INTERLEAVE,
+ MPOL_LOCAL,
+ MPOL_NOOP, /* retain existing policy for range */
+ MPOL_MAX, /* always last member of enum */
+};
+
+enum mpol_rebind_step {
+ MPOL_REBIND_ONCE, /* do rebind work at once (not in two steps) */
+ MPOL_REBIND_STEP1, /* first step (set all the new nodes) */
+ MPOL_REBIND_STEP2, /* second step (clean all the disallowed nodes) */
+ MPOL_REBIND_NSTEP,
+};
+
+/* Flags for set_mempolicy */
+#define MPOL_F_STATIC_NODES (1 << 15)
+#define MPOL_F_RELATIVE_NODES (1 << 14)
+
+/*
+ * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
+ * either set_mempolicy() or mbind().
+ */
+#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
+
+/* Flags for get_mempolicy */
+#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
+#define MPOL_F_ADDR (1<<1) /* look up vma using address */
+#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
+
+/* Flags for mbind */
+#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
+ to policy */
+#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
+#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE': lazy migrate on fault */
+#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
+
+#define MPOL_MF_VALID (MPOL_MF_STRICT | \
+ MPOL_MF_MOVE | \
+ MPOL_MF_MOVE_ALL | \
+ MPOL_MF_LAZY)
+
+/*
+ * Internal flags that share the struct mempolicy flags word with
+ * "mode flags". These flags are allocated from bit 0 up, as they
+ * are never OR'ed into the mode in mempolicy API arguments.
+ */
+#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
+#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
+#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
+#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
+#define MPOL_F_HOME (1 << 4) /* this is the home-node policy */
+
+#endif /* _UAPI_LINUX_MEMPOLICY_H */
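
A hedged userspace sketch, not part of the patch, showing how the definitions
above are meant to be combined: the MPOL_* mode and the optional MPOL_F_* mode
flags share a single 'int' argument, while the MPOL_MF_* flags go in mbind()'s
separate flags argument. It assumes the libnuma <numaif.h> syscall wrappers and
a kernel carrying this series (for MPOL_MF_LAZY).

	#include <numaif.h>

	/* Interleave this task's future allocations over nodes 0 and 1;
	 * the mode flag rides in the high bits of the same int as the mode. */
	static long interleave_nodes_0_1(void)
	{
		unsigned long nodemask = (1UL << 0) | (1UL << 1);

		return set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
				     &nodemask, 8 * sizeof(nodemask));
	}

	/* Bind an existing range to node 0 and request lazy migration on fault. */
	static long bind_range_lazily(void *addr, unsigned long len)
	{
		unsigned long nodemask = 1UL << 0;

		return mbind(addr, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
			     MPOL_MF_MOVE | MPOL_MF_LAZY);
	}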
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
- idr.o int_sqrt.o extable.o prio_tree.o \
+ idr.o int_sqrt.o extable.o \
sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
- is_single_threaded.o plist.o decompress.o
+ is_single_threaded.o plist.o decompress.o earlycpio.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
+#include <linux/pagemap.h>
+ #include <linux/migrate.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
- if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm);
- pmd_populate(mm, &_pmd, pgtable);
-
- haddr = address;
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- BUG_ON(PageCompound(page+i));
- entry = mk_pte(page + i, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pmd_write(*pmd))
- entry = pte_wrprotect(entry);
- else
- BUG_ON(page_mapcount(page) != 1);
- if (!pmd_young(*pmd))
- entry = pte_mkold(entry);
- pte = pte_offset_map(&_pmd, haddr);
- BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
- }
+ if (!pmd)
+ goto unlock;
- smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and
- * userland has the whole access to the hugepage
- * during the split (which happens in place). If we
- * overwrite the pmd with the not-huge version
- * pointing to the pte here (which of course we could
- * if all CPUs were bug free), userland could trigger
- * a small page size TLB miss on the small sized TLB
- * while the hugepage TLB entry is still established
- * in the huge TLB. Some CPU doesn't like that. See
- * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
- * Erratum 383 on page 93. Intel should be safe but is
- * also warns that it's only safe if the permission
- * and cache attributes of the two entries loaded in
- * the two TLB is identical (which should be the case
- * here). But it is generally safer to never allow
- * small and huge TLB entries for the same virtual
- * address to be loaded simultaneously. So instead of
- * doing "pmd_populate(); flush_tlb_range();" we first
- * mark the current pmd notpresent (atomically because
- * here the pmd_trans_huge and pmd_trans_splitting
- * must remain set at all times on the pmd until the
- * split is complete for this pmd), then we flush the
- * SMP TLB and finally we write the non-huge version
- * of the pmd entry with pmd_populate.
- */
- pmdp_invalidate(vma, address, pmd);
- pmd_populate(mm, pmd, pgtable);
- ret = 1;
+ prot = pmd_pgprot(*pmd);
- pgtable = get_pmd_huge_pte(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0, haddr = address; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+
+ BUG_ON(PageCompound(page+i));
+ entry = mk_pte(page + i, prot);
+ entry = pte_mkdirty(entry);
+ if (!pmd_young(*pmd))
+ entry = pte_mkold(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
}
- set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ smp_wmb(); /* make ptes visible before pmd, see __pte_alloc */
+ /*
+ * Up to this point the pmd is present and huge.
+ *
+ * If we overwrite the pmd with the not-huge version, we could trigger
+ * a small page size TLB miss on the small sized TLB while the hugepage
+ * TLB entry is still established in the huge TLB.
+ *
+ * Some CPUs don't like that. See
+ * http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum 383
+ * on page 93.
+ *
+ * Thus it is generally safer to never allow small and huge TLB entries
+ * for overlapping virtual addresses to be loaded. So we first mark the
+ * current pmd not present, then we flush the TLB and finally we write
+ * the non-huge version of the pmd entry with pmd_populate.
+ *
+ * The above needs to be done under the ptl because pmd_trans_huge and
+ * pmd_trans_splitting must remain set on the pmd until the split is
+ * complete. The ptl also protects against concurrent faults due to
+ * making the pmd not-present.
+ */
+ pmdp_invalidate(vma, address, pmd);
+ pmd_populate(mm, pmd, pgtable);
+ ret = 1;
+
+ unlock:
spin_unlock(&mm->page_table_lock);
return ret;
static int khugepaged_wait_event(void)
{
return !list_empty(&khugepaged_scan.mm_head) ||
- !khugepaged_enabled();
+ kthread_should_stop();
}
-static void khugepaged_do_scan(struct page **hpage)
+static void khugepaged_do_scan(void)
{
+ struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
- unsigned int pages = khugepaged_pages_to_scan;
+ unsigned int pages = ACCESS_ONCE(khugepaged_pages_to_scan);
+ bool wait = true;
- barrier(); /* write khugepaged_pages_to_scan to local stack */
-
while (progress < pages) {
- cond_resched();
-
-#ifndef CONFIG_NUMA
- if (!*hpage) {
- *hpage = alloc_hugepage(khugepaged_defrag());
- if (unlikely(!*hpage)) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- break;
- }
- count_vm_event(THP_COLLAPSE_ALLOC);
- }
-#else
- if (IS_ERR(*hpage))
+ if (!khugepaged_prealloc_page(&hpage, &wait))
break;
-#endif
+
+ cond_resched();
if (unlikely(kthread_should_stop() || freezing(current)))
break;
return pol;
}
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
+ /**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page: page to be checked
+ * @vma: vm area where page mapped
+ * @addr: virtual address where page mapped
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+ int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr)
+ {
+ struct mempolicy *pol;
+ struct zone *zone;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int polnid = -1;
+ int ret = -1;
+
+ BUG_ON(!vma);
+
+ pol = get_vma_policy(current, vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, vma, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+ /*
+ * BIND allows binding to multiple nodes.
+ * Use the current node if it is in the policy nodemask,
+ * else select the nearest allowed node, if any.
+ * If no node is allowed, keep the current node [not misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ (void)first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes, &zone);
+ polnid = zone->node;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By
+ * using a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can
+ * equate a task's usage of a particular page (n_p) per total usage
+ * of this page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults sample this probability; the chance of getting
+ * the same result twice in a row, given the samples are fully
+ * independent, is P(p)^2, provided our sample period is sufficiently
+ * short compared to the usage pattern. For example, a task that
+ * accounts for 10% of a page's accesses passes both samples only
+ * about 1% of the time.
+ *
+ * This squaring squishes small probabilities, making it less likely
+ * we act on an unlikely task<->page relation.
+ */
+ if (pol->flags & MPOL_F_HOME) {
+ int last_nid;
+
+ /*
+ * Migrate towards the current node; this depends on
+ * task_numa_placement() details.
+ */
+ polnid = numa_node_id();
+ last_nid = page_xchg_last_nid(page, polnid);
+ if (last_nid != polnid)
+ goto out;
+ }
+
+ if (curnid != polnid)
+ ret = polnid;
+ out:
+ mpol_cond_put(pol);
+
+ return ret;
+ }
+
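A hedged usage sketch, not part of the patch: how a NUMA hinting fault handler
might consume mpol_misplaced(). migrate_misplaced_page() is used here only as
an illustrative name for the follow-up migration step.

	static void handle_numa_hint_fault(struct page *page,
					   struct vm_area_struct *vma,
					   unsigned long addr)
	{
		int target_nid = mpol_misplaced(page, vma, addr);

		if (target_nid == -1)
			return;		/* already on an acceptable node */

		/* try to move the page towards the node the policy prefers */
		migrate_misplaced_page(page, target_nid);
	}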
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
 pr_debug("deleting %lx-%lx\n", n->start, n->end);