#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */
#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */
#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */
-
-/*
- * This interface is used by x86 PAT code to identify a pfn mapping that is
- * linear over entire vma. This is to optimize PAT code that deals with
- * marking the physical region with a particular prot. This is not for generic
- * mm use. Note also that this check will not work if the pfn mapping is
- * linear for a vma starting at physical address 0. In which case PAT code
- * falls back to slow path of reserving physical range page by page.
- */
-static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
-{
- return !!(vma->vm_flags & VM_PFN_AT_MMAP);
-}
-
-static inline int is_pfn_mapping(struct vm_area_struct *vma)
-{
- return !!(vma->vm_flags & VM_PFNMAP);
-}
+#define FAULT_FLAG_TRIED 0x40 /* second try */
+ /*
+ * Some architectures (such as x86) may need to preserve certain pgprot
+ * bits, without complicating generic pgprot code.
+ *
+ * Most architectures don't care:
+ */
+ #ifndef pgprot_modify
+ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+ {
+ return newprot;
+ }
+ #endif
+
/*
* vm_fault is filled by the the pagefault handler and passed to the vma's
* ->fault function. The vma's ->fault is responsible for returning a bitmask
spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
- if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm);
- pmd_populate(mm, &_pmd, pgtable);
-
- haddr = address;
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- BUG_ON(PageCompound(page+i));
- entry = mk_pte(page + i, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pmd_write(*pmd))
- entry = pte_wrprotect(entry);
- else
- BUG_ON(page_mapcount(page) != 1);
- if (!pmd_young(*pmd))
- entry = pte_mkold(entry);
- pte = pte_offset_map(&_pmd, haddr);
- BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
- }
+ if (!pmd)
+ goto unlock;
- smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and
- * userland has the whole access to the hugepage
- * during the split (which happens in place). If we
- * overwrite the pmd with the not-huge version
- * pointing to the pte here (which of course we could
- * if all CPUs were bug free), userland could trigger
- * a small page size TLB miss on the small sized TLB
- * while the hugepage TLB entry is still established
- * in the huge TLB. Some CPU doesn't like that. See
- * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
- * Erratum 383 on page 93. Intel should be safe but is
- * also warns that it's only safe if the permission
- * and cache attributes of the two entries loaded in
- * the two TLB is identical (which should be the case
- * here). But it is generally safer to never allow
- * small and huge TLB entries for the same virtual
- * address to be loaded simultaneously. So instead of
- * doing "pmd_populate(); flush_tlb_range();" we first
- * mark the current pmd notpresent (atomically because
- * here the pmd_trans_huge and pmd_trans_splitting
- * must remain set at all times on the pmd until the
- * split is complete for this pmd), then we flush the
- * SMP TLB and finally we write the non-huge version
- * of the pmd entry with pmd_populate.
- */
- pmdp_invalidate(vma, address, pmd);
- pmd_populate(mm, pmd, pgtable);
- ret = 1;
+ prot = pmd_pgprot(*pmd);
- pgtable = get_pmd_huge_pte(mm);
++ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0, haddr = address; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+
+ BUG_ON(PageCompound(page+i));
+ entry = mk_pte(page + i, prot);
+ entry = pte_mkdirty(entry);
+ if (!pmd_young(*pmd))
+ entry = pte_mkold(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
}
- set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ smp_wmb(); /* make ptes visible before pmd, see __pte_alloc */
+ /*
+ * Up to this point the pmd is present and huge.
+ *
+ * If we overwrite the pmd with the not-huge version, we could trigger
+ * a small page size TLB miss on the small sized TLB while the hugepage
+ * TLB entry is still established in the huge TLB.
+ *
+ * Some CPUs don't like that. See
+ * http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum 383
+ * on page 93.
+ *
+ * Thus it is generally safer to never allow small and huge TLB entries
+ * for overlapping virtual addresses to be loaded. So we first mark the
+ * current pmd not present, then we flush the TLB and finally we write
+ * the non-huge version of the pmd entry with pmd_populate.
+ *
+ * The above needs to be done under the ptl because pmd_trans_huge and
+ * pmd_trans_splitting must remain set on the pmd until the split is
+ * complete. The ptl also protects against concurrent faults due to
+ * making the pmd not-present.
+ */
++ pmdp_invalidate(vma, address, pmd);
+ pmd_populate(mm, pmd, pgtable);
+ ret = 1;
+
+ unlock:
spin_unlock(&mm->page_table_lock);
return ret;
static int khugepaged_wait_event(void)
{
return !list_empty(&khugepaged_scan.mm_head) ||
- !khugepaged_enabled();
+ kthread_should_stop();
}
-static void khugepaged_do_scan(struct page **hpage)
+static void khugepaged_do_scan(void)
{
+ struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
- unsigned int pages = khugepaged_pages_to_scan;
+ unsigned int pages = ACCESS_ONCE(khugepaged_pages_to_scan);
+ bool wait = true;
- barrier(); /* write khugepaged_pages_to_scan to local stack */
-
while (progress < pages) {
- cond_resched();
-
-#ifndef CONFIG_NUMA
- if (!*hpage) {
- *hpage = alloc_hugepage(khugepaged_defrag());
- if (unlikely(!*hpage)) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- break;
- }
- count_vm_event(THP_COLLAPSE_ALLOC);
- }
-#else
- if (IS_ERR(*hpage))
+ if (!khugepaged_prealloc_page(&hpage, &wait))
break;
-#endif
+
+ cond_resched();
if (unlikely(kthread_should_stop() || freezing(current)))
break;
return pol;
}
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
+ /**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page - page to be checked
+ * @vma - vm area where page mapped
+ * @addr - virtual address where page mapped
+ * @multi - use multi-stage node binding
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+ int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int multi)
+ {
+ struct mempolicy *pol;
+ struct zone *zone;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int polnid = -1;
+ int ret = -1;
+
+ BUG_ON(!vma);
+
+ pol = get_vma_policy(current, vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, vma, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+ /*
+ * allows binding to multiple nodes.
+ * use current page if in policy nodemask,
+ * else select nearest allowed node, if any.
+ * If no allowed nodes, use current [!misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ (void)first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes, &zone);
+ polnid = zone->node;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By
+ * using a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can
+ * equate a task's usage of a particular page (n_p) per total usage
+ * of this page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will then sample this probability and getting
+ * the same result twice in a row, given these samples are fully
+ * independent, is then given by P(n)^2, provided our sample period
+ * is sufficiently short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making it less likely
+ * we act on an unlikely task<->page relation.
+ *
+ * NOTE: effectively we're using task-home-node<->page-node relations
+ * since those are the only thing we can affect.
+ *
+ * NOTE: we're using task-home-node as opposed to the current node
+ * the task might be running on, since the task-home-node is the
+ * long-term node of this task, further reducing noise. Also see
+ * task_tick_numa().
+ */
+ if (multi && (pol->flags & MPOL_F_HOME)) {
+ int last_nid = page_xchg_last_nid(page, polnid);
+ if (last_nid != polnid)
+ goto out;
+ }
+
+ if (curnid != polnid)
+ ret = polnid;
+ out:
+ mpol_cond_put(pol);
+
+ return ret;
+ }
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);