Merge remote-tracking branch 'tip/auto-latest'

author Stephen Rothwell <sfr@canb.auug.org.au>
Fri, 12 Oct 2012 01:08:11 +0000 (12:08 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Fri, 12 Oct 2012 01:08:11 +0000 (12:08 +1100)
diff --cc MAINTAINERS
Simple merge
diff --cc arch/x86/Kconfig
Simple merge
diff --cc arch/x86/include/asm/msr-index.h
Simple merge
diff --cc arch/x86/include/asm/pgtable.h
Simple merge
diff --cc arch/x86/kernel/setup.c
Simple merge
diff --cc include/linux/acpi.h
Simple merge
diff --cc include/linux/huge_mm.h
Simple merge
diff --cc include/linux/init_task.h
Simple merge
diff --cc include/linux/mempolicy.h
Simple merge
diff --cc include/linux/mm.h

index fa068040273893c27d71461e459e2eb268739396,02e8c1f28bf84bab11fdb6b6694efdbfe46d5fd7..423464bb332c5ad3d517f38105205213e8b977a2
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -161,8 -157,38 +161,21 @@@ extern pgprot_t protection_map[16]
   #define FAULT_FLAG_ALLOW_RETRY        0x08    /* Retry fault if blocking */
   #define FAULT_FLAG_RETRY_NOWAIT       0x10    /* Don't drop mmap_sem and wait when retrying */
   #define FAULT_FLAG_KILLABLE   0x20    /* The fault task is in SIGKILL killable region */
- -
- -/*
- - * This interface is used by x86 PAT code to identify a pfn mapping that is
- - * linear over entire vma. This is to optimize PAT code that deals with
- - * marking the physical region with a particular prot. This is not for generic
- - * mm use. Note also that this check will not work if the pfn mapping is
- - * linear for a vma starting at physical address 0. In which case PAT code
- - * falls back to slow path of reserving physical range page by page.
- - */
- -static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
- -{
- -      return !!(vma->vm_flags & VM_PFN_AT_MMAP);
- -}
- -
- -static inline int is_pfn_mapping(struct vm_area_struct *vma)
- -{
- -      return !!(vma->vm_flags & VM_PFNMAP);
- -}
+ +#define FAULT_FLAG_TRIED      0x40    /* second try */
   
+ /*
+  * Some architectures (such as x86) may need to preserve certain pgprot
+  * bits, without complicating generic pgprot code.
+  *
+  * Most architectures don't care:
+  */
+ #ifndef pgprot_modify
+ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+ {
+       return newprot;
+ }
+ #endif
+ 
   /*
    * vm_fault is filled by the the pagefault handler and passed to the vma's
    * ->fault function. The vma's ->fault is responsible for returning a bitmask
diff --cc include/linux/mm_types.h
Simple merge
diff --cc include/linux/mmzone.h
Simple merge
diff --cc include/linux/sched.h
Simple merge
diff --cc init/Kconfig
Simple merge
diff --cc kernel/events/core.c
Simple merge
diff --cc kernel/sysctl.c
Simple merge
diff --cc lib/Makefile

index 821a16229111eba69f189ae9be683f02eeb70862,0924041b6959188b074b3240b51ce5e31aa2c70c..e91b9dffc853e95009c44e647e0cb9e6a8de374d
--- 1/lib/Makefile
--- 2/lib/Makefile
+++ b/lib/Makefile
@@@ -9,10 -9,10 +9,10 @@@ endi
   
   lib-y := ctype.o string.o vsprintf.o cmdline.o \
          rbtree.o radix-tree.o dump_stack.o timerqueue.o\
- -       idr.o int_sqrt.o extable.o prio_tree.o \
+ +       idr.o int_sqrt.o extable.o \
          sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
          proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
-        is_single_threaded.o plist.o decompress.o
+        is_single_threaded.o plist.o decompress.o earlycpio.o
   
   lib-$(CONFIG_MMU) += ioremap.o
   lib-$(CONFIG_SMP) += cpumask.o
diff --cc mm/huge_memory.c

index a863af26c79c0190f378c36ac4e680add06f700a,5b9ab256910c387f853593eedf6156d2581cf70e..0eb1b3342247b99f8076f6bcb9a781755751813a
--- 1/mm/huge_memory.c
--- 2/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@@ -1346,59 -1428,55 +1417,54 @@@ static int __split_huge_page_map(struc
         spin_lock(&mm->page_table_lock);
         pmd = page_check_address_pmd(page, mm, address,
                                      PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
-       if (pmd) {
-               pgtable = pgtable_trans_huge_withdraw(mm);
-               pmd_populate(mm, &_pmd, pgtable);
- 
-               haddr = address;
-               for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-                       pte_t *pte, entry;
-                       BUG_ON(PageCompound(page+i));
-                       entry = mk_pte(page + i, vma->vm_page_prot);
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-                       if (!pmd_write(*pmd))
-                               entry = pte_wrprotect(entry);
-                       else
-                               BUG_ON(page_mapcount(page) != 1);
-                       if (!pmd_young(*pmd))
-                               entry = pte_mkold(entry);
-                       pte = pte_offset_map(&_pmd, haddr);
-                       BUG_ON(!pte_none(*pte));
-                       set_pte_at(mm, haddr, pte, entry);
-                       pte_unmap(pte);
-               }
+       if (!pmd)
+               goto unlock;
   
-               smp_wmb(); /* make pte visible before pmd */
-               /*
-                * Up to this point the pmd is present and huge and
-                * userland has the whole access to the hugepage
-                * during the split (which happens in place). If we
-                * overwrite the pmd with the not-huge version
-                * pointing to the pte here (which of course we could
-                * if all CPUs were bug free), userland could trigger
-                * a small page size TLB miss on the small sized TLB
-                * while the hugepage TLB entry is still established
-                * in the huge TLB. Some CPU doesn't like that. See
-                * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
-                * Erratum 383 on page 93. Intel should be safe but is
-                * also warns that it's only safe if the permission
-                * and cache attributes of the two entries loaded in
-                * the two TLB is identical (which should be the case
-                * here). But it is generally safer to never allow
-                * small and huge TLB entries for the same virtual
-                * address to be loaded simultaneously. So instead of
-                * doing "pmd_populate(); flush_tlb_range();" we first
-                * mark the current pmd notpresent (atomically because
-                * here the pmd_trans_huge and pmd_trans_splitting
-                * must remain set at all times on the pmd until the
-                * split is complete for this pmd), then we flush the
-                * SMP TLB and finally we write the non-huge version
-                * of the pmd entry with pmd_populate.
-                */
-               pmdp_invalidate(vma, address, pmd);
-               pmd_populate(mm, pmd, pgtable);
-               ret = 1;
+       prot = pmd_pgprot(*pmd);
- -      pgtable = get_pmd_huge_pte(mm);
++      pgtable = pgtable_trans_huge_withdraw(mm);
+       pmd_populate(mm, &_pmd, pgtable);
+ 
+       for (i = 0, haddr = address; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+               pte_t *pte, entry;
+ 
+               BUG_ON(PageCompound(page+i));
+               entry = mk_pte(page + i, prot);
+               entry = pte_mkdirty(entry);
+               if (!pmd_young(*pmd))
+                       entry = pte_mkold(entry);
+               pte = pte_offset_map(&_pmd, haddr);
+               BUG_ON(!pte_none(*pte));
+               set_pte_at(mm, haddr, pte, entry);
+               pte_unmap(pte);
         }
- -      set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
- -      flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ 
+       smp_wmb(); /* make ptes visible before pmd, see __pte_alloc */
+       /*
+        * Up to this point the pmd is present and huge.
+        *
+        * If we overwrite the pmd with the not-huge version, we could trigger
+        * a small page size TLB miss on the small sized TLB while the hugepage
+        * TLB entry is still established in the huge TLB.
+        *
+        * Some CPUs don't like that. See
+        * http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum 383
+        * on page 93.
+        *
+        * Thus it is generally safer to never allow small and huge TLB entries
+        * for overlapping virtual addresses to be loaded. So we first mark the
+        * current pmd not present, then we flush the TLB and finally we write
+        * the non-huge version of the pmd entry with pmd_populate.
+        *
+        * The above needs to be done under the ptl because pmd_trans_huge and
+        * pmd_trans_splitting must remain set on the pmd until the split is
+        * complete. The ptl also protects against concurrent faults due to
+        * making the pmd not-present.
+        */
++      pmdp_invalidate(vma, address, pmd);
+       pmd_populate(mm, pmd, pgtable);
+       ret = 1;
+ 
+ unlock:
         spin_unlock(&mm->page_table_lock);
   
         return ret;
@@@ -2279,23 -2300,30 +2345,21 @@@ static int khugepaged_has_work(void
   static int khugepaged_wait_event(void)
   {
         return !list_empty(&khugepaged_scan.mm_head) ||
- -              !khugepaged_enabled();
+ +              kthread_should_stop();
   }
   
- -static void khugepaged_do_scan(struct page **hpage)
+ +static void khugepaged_do_scan(void)
   {
+ +      struct page *hpage = NULL;
         unsigned int progress = 0, pass_through_head = 0;
-       unsigned int pages = khugepaged_pages_to_scan;
+       unsigned int pages = ACCESS_ONCE(khugepaged_pages_to_scan);
+ +      bool wait = true;
   
-       barrier(); /* write khugepaged_pages_to_scan to local stack */
- 
         while (progress < pages) {
- -              cond_resched();
- -
- -#ifndef CONFIG_NUMA
- -              if (!*hpage) {
- -                      *hpage = alloc_hugepage(khugepaged_defrag());
- -                      if (unlikely(!*hpage)) {
- -                              count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- -                              break;
- -                      }
- -                      count_vm_event(THP_COLLAPSE_ALLOC);
- -              }
- -#else
- -              if (IS_ERR(*hpage))
+ +              if (!khugepaged_prealloc_page(&hpage, &wait))
                         break;
- -#endif
+ +
+ +              cond_resched();
   
                 if (unlikely(kthread_should_stop() || freezing(current)))
                         break;
diff --cc mm/memory.c
Simple merge
diff --cc mm/mempolicy.c

index 0b78fb9ea65b60afa08287f0eff1d797d75c9a7e,3360a8dee5c549172ad30ae8319f08f769133aca..1dcb9fdc9889e249ea1ad0789127e848bc5f5909
--- 1/mm/mempolicy.c
--- 2/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@@ -2170,12 -2168,116 +2203,122 @@@ mpol_shared_policy_lookup(struct shared
         return pol;
   }
   
+ +static void sp_free(struct sp_node *n)
+ +{
+ +      mpol_put(n->policy);
+ +      kmem_cache_free(sn_cache, n);
+ +}
+ +
+ /**
+  * mpol_misplaced - check whether current page node is valid in policy
+  *
+  * @page   - page to be checked
+  * @vma    - vm area where page mapped
+  * @addr   - virtual address where page mapped
+  * @multi  - use multi-stage node binding
+  *
+  * Lookup current policy node id for vma,addr and "compare to" page's
+  * node id.
+  *
+  * Returns:
+  *    -1      - not misplaced, page is in the right node
+  *    node    - node id where the page should be
+  *
+  * Policy determination "mimics" alloc_page_vma().
+  * Called from fault path where we know the vma and faulting address.
+  */
+ int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+                  unsigned long addr, int multi)
+ {
+       struct mempolicy *pol;
+       struct zone *zone;
+       int curnid = page_to_nid(page);
+       unsigned long pgoff;
+       int polnid = -1;
+       int ret = -1;
+ 
+       BUG_ON(!vma);
+ 
+       pol = get_vma_policy(current, vma, addr);
+       if (!(pol->flags & MPOL_F_MOF))
+               goto out;
+ 
+       switch (pol->mode) {
+       case MPOL_INTERLEAVE:
+               BUG_ON(addr >= vma->vm_end);
+               BUG_ON(addr < vma->vm_start);
+ 
+               pgoff = vma->vm_pgoff;
+               pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+               polnid = offset_il_node(pol, vma, pgoff);
+               break;
+ 
+       case MPOL_PREFERRED:
+               if (pol->flags & MPOL_F_LOCAL)
+                       polnid = numa_node_id();
+               else
+                       polnid = pol->v.preferred_node;
+               break;
+ 
+       case MPOL_BIND:
+               /*
+                * allows binding to multiple nodes.
+                * use current page if in policy nodemask,
+                * else select nearest allowed node, if any.
+                * If no allowed nodes, use current [!misplaced].
+                */
+               if (node_isset(curnid, pol->v.nodes))
+                       goto out;
+               (void)first_zones_zonelist(
+                               node_zonelist(numa_node_id(), GFP_HIGHUSER),
+                               gfp_zone(GFP_HIGHUSER),
+                               &pol->v.nodes, &zone);
+               polnid = zone->node;
+               break;
+ 
+       default:
+               BUG();
+       }
+ 
+       /*
+        * Multi-stage node selection is used in conjunction with a periodic
+        * migration fault to build a temporal task<->page relation. By
+        * using a two-stage filter we remove short/unlikely relations.
+        *
+        * Using P(p) ~ n_p / n_t as per frequentist probability, we can
+        * equate a task's usage of a particular page (n_p) per total usage
+        * of this page (n_t) (in a given time-span) to a probability.
+        *
+        * Our periodic faults will then sample this probability and getting
+        * the same result twice in a row, given these samples are fully
+        * independent, is then given by P(n)^2, provided our sample period
+        * is sufficiently short compared to the usage pattern.
+        *
+        * This quadric squishes small probabilities, making it less likely
+        * we act on an unlikely task<->page relation.
+        *
+        * NOTE: effectively we're using task-home-node<->page-node relations
+        * since those are the only thing we can affect.
+        *
+        * NOTE: we're using task-home-node as opposed to the current node
+        * the task might be running on, since the task-home-node is the
+        * long-term node of this task, further reducing noise. Also see
+        * task_tick_numa().
+        */
+       if (multi && (pol->flags & MPOL_F_HOME)) {
+               int last_nid = page_xchg_last_nid(page, polnid);
+               if (last_nid != polnid)
+                       goto out;
+       }
+ 
+       if (curnid != polnid)
+               ret = polnid;
+ out:
+       mpol_cond_put(pol);
+ 
+       return ret;
+ }
+ 
   static void sp_delete(struct shared_policy *sp, struct sp_node *n)
   {
         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
diff --cc mm/vmstat.c
Simple merge
diff --cc scripts/kconfig/Makefile
Simple merge
diff --cc tools/perf/Makefile
Simple merge
author	Stephen Rothwell <sfr@canb.auug.org.au>
	Fri, 12 Oct 2012 01:08:11 +0000 (12:08 +1100)
committer	Stephen Rothwell <sfr@canb.auug.org.au>
	Fri, 12 Oct 2012 01:08:11 +0000 (12:08 +1100)
		1	2
MAINTAINERS	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/msr-index.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/pgtable.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/setup.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/acpi.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/huge_mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/init_task.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mempolicy.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm_types.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mmzone.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
lib/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
mm/huge_memory.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memory.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/mempolicy.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/vmstat.c	patch \|	diff1 \|	diff2 \|	blob \| history
scripts/kconfig/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
tools/perf/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history