diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9cc773483624e4cbb1592ddde74f9c8faa21ef87..371aa737722add1bc38a8117772424dee1f1e001 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -994,23 +994,22 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 
 #if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
 static void destroy_compound_gigantic_page(struct page *page,
-                                       unsigned long order)
+                                       unsigned int order)
 {
        int i;
        int nr_pages = 1 << order;
        struct page *p = page + 1;
 
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
-               __ClearPageTail(p);
+               clear_compound_head(p);
                set_page_refcounted(p);
-               p->first_page = NULL;
        }
 
        set_compound_order(page, 0);
        __ClearPageHead(page);
 }
 
-static void free_gigantic_page(struct page *page, unsigned order)
+static void free_gigantic_page(struct page *page, unsigned int order)
 {
        free_contig_range(page_to_pfn(page), 1 << order);
 }
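
Note: clear_compound_head() here (and set_compound_head() in prep_compound_gigantic_page() further down) replace the old __ClearPageTail()/first_page bookkeeping; with the compound-head rework the head pointer is kept in page->compound_head, with bit 0 doubling as the PageTail marker. A rough sketch of the page-flags helpers this hunk assumes (they live outside this file):

	static inline void set_compound_head(struct page *page, struct page *head)
	{
		/* bit 0 set tags the page as a tail page */
		WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
	}

	static inline void clear_compound_head(struct page *page)
	{
		WRITE_ONCE(page->compound_head, 0);
	}
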
@@ -1054,7 +1053,7 @@ static bool zone_spans_last_pfn(const struct zone *zone,
        return zone_spans_pfn(zone, last_pfn);
 }
 
-static struct page *alloc_gigantic_page(int nid, unsigned order)
+static struct page *alloc_gigantic_page(int nid, unsigned int order)
 {
        unsigned long nr_pages = 1 << order;
        unsigned long ret, pfn, flags;
@@ -1090,7 +1089,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order)
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+static void prep_compound_gigantic_page(struct page *page, unsigned int order);
 
 static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
 {
@@ -1123,9 +1122,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
 static inline bool gigantic_page_supported(void) { return true; }
 #else
 static inline bool gigantic_page_supported(void) { return false; }
-static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
-                                               unsigned long order) { }
+                                               unsigned int order) { }
 static inline int alloc_fresh_gigantic_page(struct hstate *h,
                                        nodemask_t *nodes_allowed) { return 0; }
 #endif
@@ -1146,7 +1145,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                1 << PG_writeback);
        }
        VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
-       set_compound_page_dtor(page, NULL);
+       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
        set_page_refcounted(page);
        if (hstate_is_gigantic(h)) {
                destroy_compound_gigantic_page(page, huge_page_order(h));
@@ -1242,7 +1241,7 @@ void free_huge_page(struct page *page)
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
        INIT_LIST_HEAD(&page->lru);
-       set_compound_page_dtor(page, free_huge_page);
+       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        spin_lock(&hugetlb_lock);
        set_hugetlb_cgroup(page, NULL);
        h->nr_huge_pages++;
@@ -1251,7 +1250,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
        put_page(page); /* free it into the hugepage allocator */
 }
 
-static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned int order)
 {
        int i;
        int nr_pages = 1 << order;
@@ -1259,8 +1258,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 
        /* we rely on prep_new_huge_page to set the destructor */
        set_compound_order(page, order);
-       __SetPageHead(page);
        __ClearPageReserved(page);
+       __SetPageHead(page);
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                /*
                 * For gigantic hugepages allocated through bootmem at
@@ -1276,10 +1275,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
                 */
                __ClearPageReserved(p);
                set_page_count(p, 0);
-               p->first_page = page;
-               /* Make sure p->first_page is always valid for PageTail() */
-               smp_wmb();
-               __SetPageTail(p);
+               set_compound_head(p, page);
        }
 }
 
@@ -1294,7 +1290,7 @@ int PageHuge(struct page *page)
                return 0;
 
        page = compound_head(page);
-       return get_compound_page_dtor(page) == free_huge_page;
+       return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
@@ -1437,7 +1433,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
                dissolve_free_huge_page(pfn_to_page(pfn));
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
+/*
+ * There are 3 ways this can get called:
+ * 1. With vma+addr: we use the VMA's memory policy
+ * 2. With !vma, but nid=NUMA_NO_NODE:  We try to allocate a huge
+ *    page from any node, and let the buddy allocator itself figure
+ *    it out.
+ * 3. With !vma, but nid!=NUMA_NO_NODE.  We allocate a huge page
+ *    strictly from 'nid'
+ */
+static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
+               struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+       int order = huge_page_order(h);
+       gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
+       unsigned int cpuset_mems_cookie;
+
+       /*
+        * We need a VMA to get a memory policy.  If we do not
+        * have one, we use the 'nid' argument.
+        *
+        * The mempolicy stuff below has some non-inlined bits
+        * and calls ->vm_ops.  That makes it hard to optimize at
+        * compile-time, even when NUMA is off and it does
+        * nothing.  This helps the compiler optimize it out.
+        */
+       if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
+               /*
+                * If a specific node is requested, make sure to
+                * get memory from there, but only when a node
+                * is explicitly specified.
+                */
+               if (nid != NUMA_NO_NODE)
+                       gfp |= __GFP_THISNODE;
+               /*
+                * Make sure to call something that can handle
+                * nid=NUMA_NO_NODE
+                */
+               return alloc_pages_node(nid, gfp, order);
+       }
+
+       /*
+        * OK, so we have a VMA.  Fetch the mempolicy and try to
+        * allocate a huge page with it.  We will only reach this
+        * when CONFIG_NUMA=y.
+        */
+       do {
+               struct page *page;
+               struct mempolicy *mpol;
+               struct zonelist *zl;
+               nodemask_t *nodemask;
+
+               cpuset_mems_cookie = read_mems_allowed_begin();
+               zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
+               mpol_cond_put(mpol);
+               page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
+               if (page)
+                       return page;
+       } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+       return NULL;
+}
+
+/*
+ * There are two ways to allocate a huge page:
+ * 1. When you have a VMA and an address (like a fault)
+ * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
+ *
+ * 'vma' and 'addr' are only for (1).  'nid' is always NUMA_NO_NODE in
+ * this case which signifies that the allocation should be done with
+ * respect for the VMA's memory policy.
+ *
+ * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
+ * implies that memory policies will not be taken into account.
+ */
+static struct page *__alloc_buddy_huge_page(struct hstate *h,
+               struct vm_area_struct *vma, unsigned long addr, int nid)
 {
        struct page *page;
        unsigned int r_nid;
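
Note: the read_mems_allowed_begin()/read_mems_allowed_retry() pair in __hugetlb_alloc_buddy_huge_page() is the standard cpuset seqcount retry: if the task's mems_allowed set is rewritten while the allocation is in flight, a failed attempt is retried rather than reported. Stripped to its skeleton, with try_to_allocate() as a placeholder for the huge_zonelist() + __alloc_pages_nodemask() sequence above:

	unsigned int cookie;
	struct page *page;

	do {
		cookie = read_mems_allowed_begin();
		page = try_to_allocate();	/* any path that consults mems_allowed */
		if (page)
			return page;
	} while (read_mems_allowed_retry(cookie));	/* retry only if mems_allowed changed */
	return NULL;
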
@@ -1445,6 +1516,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
        if (hstate_is_gigantic(h))
                return NULL;
 
+       if (vma || addr) {
+               VM_WARN_ON_ONCE(!addr || addr == -1);
+               VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
+       }
        /*
         * Assume we will successfully allocate the surplus page to
         * prevent racing processes from causing the surplus to exceed
@@ -1478,20 +1553,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
        }
        spin_unlock(&hugetlb_lock);
 
-       if (nid == NUMA_NO_NODE)
-               page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
-                                  __GFP_REPEAT|__GFP_NOWARN,
-                                  huge_page_order(h));
-       else
-               page = __alloc_pages_node(nid,
-                       htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-                       __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
+       page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
 
        spin_lock(&hugetlb_lock);
        if (page) {
                INIT_LIST_HEAD(&page->lru);
                r_nid = page_to_nid(page);
-               set_compound_page_dtor(page, free_huge_page);
+               set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
                set_hugetlb_cgroup(page, NULL);
                /*
                 * We incremented the global counters already
@@ -1509,6 +1577,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
        return page;
 }
 
+/*
+ * Allocate a huge page from 'nid'.  Note, 'nid' may be
+ * NUMA_NO_NODE, which means that it may be allocated
+ * anywhere.
+ */
+static
+struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
+{
+       unsigned long addr = -1;
+
+       return __alloc_buddy_huge_page(h, NULL, addr, nid);
+}
+
+/*
+ * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ */
+static
+struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+               struct vm_area_struct *vma, unsigned long addr)
+{
+       return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+}
+
 /*
  * This allocation function is useful in the context where vma is irrelevant.
  * E.g. soft-offlining uses this function because it only cares physical
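
Note: callers are expected to pick exactly one of the two wrappers above, which is what the VM_WARN_ON_ONCE() checks in __alloc_buddy_huge_page() enforce. Condensed from the call sites later in this file:

	/* no VMA context (nr_hugepages, surplus pool, soft-offline) */
	page = __alloc_buddy_huge_page_no_mpol(h, nid);		/* nid may be NUMA_NO_NODE */

	/* fault path with a VMA: honour the VMA/task memory policy */
	page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
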
@@ -1524,7 +1615,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
        spin_unlock(&hugetlb_lock);
 
        if (!page)
-               page = alloc_buddy_huge_page(h, nid);
+               page = __alloc_buddy_huge_page_no_mpol(h, nid);
 
        return page;
 }
@@ -1554,7 +1645,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
-               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+               page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
                if (!page) {
                        alloc_ok = false;
                        break;
@@ -1787,7 +1878,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
        if (!page) {
                spin_unlock(&hugetlb_lock);
-               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+               page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
 
@@ -1872,7 +1963,8 @@ found:
        return 1;
 }
 
-static void __init prep_compound_huge_page(struct page *page, int order)
+static void __init prep_compound_huge_page(struct page *page,
+               unsigned int order)
 {
        if (unlikely(order > (MAX_ORDER - 1)))
                prep_compound_gigantic_page(page, order);
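
Note, for scale: with 4 KiB base pages and the default MAX_ORDER of 11 on x86-64, a 2 MiB hugepage is order 9 and is set up by the regular compound-page path, while a 1 GiB page is order 18, exceeds MAX_ORDER - 1, and takes prep_compound_gigantic_page() above.
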
@@ -2376,7 +2468,7 @@ struct node_hstate {
        struct kobject          *hugepages_kobj;
        struct kobject          *hstate_kobjs[HUGE_MAX_HSTATE];
 };
-struct node_hstate node_hstates[MAX_NUMNODES];
+static struct node_hstate node_hstates[MAX_NUMNODES];
 
 /*
  * A subset of global hstate attributes for node devices
@@ -2583,7 +2675,7 @@ static int __init hugetlb_init(void)
 module_init(hugetlb_init);
 
 /* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_add_hstate(unsigned order)
+void __init hugetlb_add_hstate(unsigned int order)
 {
        struct hstate *h;
        unsigned long i;
@@ -2790,6 +2882,12 @@ void hugetlb_show_meminfo(void)
                                1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
 }
 
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
+{
+       seq_printf(m, "HugetlbPages:\t%8lu kB\n",
+                  atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+}
+
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
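
Note: hugetlb_report_usage() reads a per-mm counter (mm->hugetlb_usage) that the fork, fault and unmap hunks below keep current through hugetlb_count_add()/hugetlb_count_sub(). Those helpers are not in this file; presumably they are thin atomic wrappers along these lines (sketch only):

	static inline void hugetlb_count_add(long l, struct mm_struct *mm)
	{
		atomic_long_add(l, &mm->hugetlb_usage);
	}

	static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
	{
		atomic_long_sub(l, &mm->hugetlb_usage);
	}
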
@@ -3023,8 +3121,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
                        get_page(ptepage);
-                       page_dup_rmap(ptepage);
+                       page_dup_rmap(ptepage, true);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
+                       hugetlb_count_add(pages_per_huge_page(h), dst);
                }
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
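
Note: page_dup_rmap() and page_remove_rmap() now take a 'compound' flag; hugetlb passes true so the mapping count is taken on the head page's compound mapcount rather than on a tail page's _mapcount. Roughly, as a sketch of the rmap helper assumed by this hunk:

	static inline void page_dup_rmap(struct page *page, bool compound)
	{
		atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
	}
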
@@ -3105,7 +3204,8 @@ again:
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
 
-               page_remove_rmap(page);
+               hugetlb_count_sub(pages_per_huge_page(h), mm);
+               page_remove_rmap(page, true);
                force_flush = !__tlb_remove_page(tlb, page);
                if (force_flush) {
                        address += sz;
@@ -3334,7 +3434,7 @@ retry_avoidcopy:
                mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
                set_huge_pte_at(mm, address, ptep,
                                make_huge_pte(vma, new_page, 1));
-               page_remove_rmap(old_page);
+               page_remove_rmap(old_page, true);
                hugepage_add_new_anon_rmap(new_page, vma, address);
                /* Make the old page be freed below */
                new_page = old_page;
@@ -3504,11 +3604,12 @@ retry:
                ClearPagePrivate(page);
                hugepage_add_new_anon_rmap(page, vma, address);
        } else
-               page_dup_rmap(page);
+               page_dup_rmap(page, true);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, address, ptep, new_pte);
 
+       hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
@@ -3574,6 +3675,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        struct page *pagecache_page = NULL;
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping;
+       struct inode *inode = file_inode(vma->vm_file);
        int need_wait_lock = 0;
 
        address &= huge_page_mask(h);
@@ -3596,6 +3698,44 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        mapping = vma->vm_file->f_mapping;
        idx = vma_hugecache_offset(h, vma, address);
 
+       /*
+        * page faults could race with fallocate hole punch.  If a page
+        * is faulted between unmap and deallocation, it will still remain
+        * in the punched hole.  During hole punch operations, a hugetlb_falloc
+        * structure will be pointed to by i_private.  If this fault is for
+        * a page in a hole being punched, wait for the operation to finish
+        * before proceeding.
+        *
+        * Even with this strategy, it is still possible for a page fault to
+        * race with hole punch.  In this case, remove_inode_hugepages() will
+        * unmap the page and then remove.  Checking i_private as below should
+        * catch most of these races as we want to minimize unmapping a page
+        * multiple times.
+        */
+       if (unlikely(inode->i_private)) {
+               struct hugetlb_falloc *hugetlb_falloc;
+
+               spin_lock(&inode->i_lock);
+               hugetlb_falloc = inode->i_private;
+               if (hugetlb_falloc && hugetlb_falloc->waitq &&
+                   idx >= hugetlb_falloc->start &&
+                   idx <= hugetlb_falloc->end) {
+                       wait_queue_head_t *hugetlb_falloc_waitq;
+                       DEFINE_WAIT(hugetlb_fault_wait);
+
+                       hugetlb_falloc_waitq = hugetlb_falloc->waitq;
+                       prepare_to_wait(hugetlb_falloc_waitq,
+                                       &hugetlb_fault_wait,
+                                       TASK_UNINTERRUPTIBLE);
+                       spin_unlock(&inode->i_lock);
+                       schedule();
+
+                       spin_lock(&inode->i_lock);
+                       finish_wait(hugetlb_falloc_waitq, &hugetlb_fault_wait);
+               }
+               spin_unlock(&inode->i_lock);
+       }
+
        /*
         * Serialize hugepage allocation and instantiation, so that we don't
         * get spurious allocation failures if two CPUs race to instantiate
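
Note: the wait above depends on the hole-punch side publishing its range through inode->i_private for the duration of the operation. The structure is not defined in this file; judging from the fields dereferenced here (waitq, start, end) it presumably looks something like this, with fallocate() pointing i_private at an on-stack instance while it unmaps and frees the range:

	struct hugetlb_falloc {
		wait_queue_head_t *waitq;	/* faulters in the punched range wait here */
		pgoff_t start;			/* first page index being punched */
		pgoff_t end;			/* last page index being punched */
	};
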
@@ -3783,7 +3923,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 same_page:
                if (pages) {
                        pages[i] = mem_map_offset(page, pfn_offset);
-                       get_page_foll(pages[i]);
+                       get_page(pages[i]);
                }
 
                if (vmas)
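
Note: get_page_foll() belonged to the old tail-page refcounting scheme; now that the head pointer is kept in page->compound_head, a plain get_page() on a tail page already takes the reference on the head, so follow_hugetlb_page() can use it directly. Roughly, as a sketch of the helper for this era of the code:

	static inline void get_page(struct page *page)
	{
		page = compound_head(page);
		/* the followed page must already hold an elevated refcount */
		VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
		atomic_inc(&page->_count);
	}
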
@@ -4028,8 +4168,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
        unsigned long s_end = sbase + PUD_SIZE;
 
        /* Allow segments to share if only one is marked locked */
-       unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
-       unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+       unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+       unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
 
        /*
         * match the virtual addresses, permission and the alignment of the
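
Note: VM_LOCKED_CLEAR_MASK widens the old ~VM_LOCKED test so that VM_LOCKONFAULT is ignored as well when deciding whether two mappings may share a page table. Presumably it is defined next to the other VM_* flags, roughly as:

	#define VM_LOCKED_CLEAR_MASK	(~(VM_LOCKED | VM_LOCKONFAULT))
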