Merge branch 'linux-mfg' into tx6-mfg-devel

[karo-tx-linux.git] / mm / slub.c
diff --git a/mm/slub.c b/mm/slub.c

index f614b5dc396bc17b43cebacd97383243bbb03b99..438ebf8bbab1ddaba0e25db37a8f2b1913859b96 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
   */
  static __always_inline void slab_lock(struct page *page)
  {
+       VM_BUG_ON_PAGE(PageTail(page), page);
         bit_spin_lock(PG_locked, &page->flags);
  }
  
  static __always_inline void slab_unlock(struct page *page)
  {
+       VM_BUG_ON_PAGE(PageTail(page), page);
         __bit_spin_unlock(PG_locked, &page->flags);
  }
  
@@ -459,8 +461,10 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
  /*
   * Debug settings:
   */
-#ifdef CONFIG_SLUB_DEBUG_ON
+#if defined(CONFIG_SLUB_DEBUG_ON)
  static int slub_debug = DEBUG_DEFAULT_FLAGS;
+#elif defined(CONFIG_KASAN)
+static int slub_debug = SLAB_STORE_USER;
  #else
  static int slub_debug;
  #endif
@@ -1063,11 +1067,15 @@ bad:
         return 0;
  }
  
+/* Supports checking bulk free of a constructed freelist */
  static noinline struct kmem_cache_node *free_debug_processing(
-       struct kmem_cache *s, struct page *page, void *object,
+       struct kmem_cache *s, struct page *page,
+       void *head, void *tail, int bulk_cnt,
         unsigned long addr, unsigned long *flags)
  {
         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+       void *object = head;
+       int cnt = 0;
  
         spin_lock_irqsave(&n->list_lock, *flags);
         slab_lock(page);
@@ -1075,6 +1083,9 @@ static noinline struct kmem_cache_node *free_debug_processing(
         if (!check_slab(s, page))
                 goto fail;
  
+next_object:
+       cnt++;
+
         if (!check_valid_pointer(s, page, object)) {
                 slab_err(s, page, "Invalid object pointer 0x%p", object);
                 goto fail;
@@ -1105,8 +1116,19 @@ static noinline struct kmem_cache_node *free_debug_processing(
         if (s->flags & SLAB_STORE_USER)
                 set_track(s, object, TRACK_FREE, addr);
         trace(s, page, object, 0);
+       /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
         init_object(s, object, SLUB_RED_INACTIVE);
+
+       /* Reached end of constructed freelist yet? */
+       if (object != tail) {
+               object = get_freepointer(s, object);
+               goto next_object;
+       }
  out:
+       if (cnt != bulk_cnt)
+               slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
+                        bulk_cnt, cnt);
+
         slab_unlock(page);
         /*
          * Keep node_lock to preserve integrity
@@ -1202,7 +1224,7 @@ unsigned long kmem_cache_flags(unsigned long object_size,
  
         return flags;
  }
-#else
+#else /* !CONFIG_SLUB_DEBUG */
  static inline void setup_object_debug(struct kmem_cache *s,
                         struct page *page, void *object) {}
  
@@ -1210,7 +1232,8 @@ static inline int alloc_debug_processing(struct kmem_cache *s,
         struct page *page, void *object, unsigned long addr) { return 0; }
  
  static inline struct kmem_cache_node *free_debug_processing(
-       struct kmem_cache *s, struct page *page, void *object,
+       struct kmem_cache *s, struct page *page,
+       void *head, void *tail, int bulk_cnt,
         unsigned long addr, unsigned long *flags) { return NULL; }
  
  static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
@@ -1263,7 +1286,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
  {
         flags &= gfp_allowed_mask;
         lockdep_trace_alloc(flags);
-       might_sleep_if(flags & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(flags));
  
         if (should_failslab(s->object_size, flags, s->flags))
                 return NULL;
@@ -1306,6 +1329,29 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
         kasan_slab_free(s, x);
  }
  
+static inline void slab_free_freelist_hook(struct kmem_cache *s,
+                                          void *head, void *tail)
+{
+/*
+ * Compiler cannot detect this function can be removed if slab_free_hook()
+ * evaluates to nothing.  Thus, catch all relevant config debug options here.
+ */
+#if defined(CONFIG_KMEMCHECK) ||               \
+       defined(CONFIG_LOCKDEP) ||              \
+       defined(CONFIG_DEBUG_KMEMLEAK) ||       \
+       defined(CONFIG_DEBUG_OBJECTS_FREE) ||   \
+       defined(CONFIG_KASAN)
+
+       void *object = head;
+       void *tail_obj = tail ? : head;
+
+       do {
+               slab_free_hook(s, object);
+       } while ((object != tail_obj) &&
+                (object = get_freepointer(s, object)));
+#endif
+}
+
  static void setup_object(struct kmem_cache *s, struct page *page,
                                 void *object)
  {
@@ -1328,16 +1374,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
  
         flags |= __GFP_NOTRACK;
  
-       if (memcg_charge_slab(s, flags, order))
-               return NULL;
-
         if (node == NUMA_NO_NODE)
                 page = alloc_pages(flags, order);
         else
                 page = __alloc_pages_node(node, flags, order);
  
-       if (!page)
-               memcg_uncharge_slab(s, order);
+       if (page && memcg_charge_slab(page, flags, order, s)) {
+               __free_pages(page, order);
+               page = NULL;
+       }
  
         return page;
  }
@@ -1352,7 +1397,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
  
         flags &= gfp_allowed_mask;
  
-       if (flags & __GFP_WAIT)
+       if (gfpflags_allow_blocking(flags))
                 local_irq_enable();
  
         flags |= s->allocflags;
@@ -1362,8 +1407,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
          * so we fall-back to the minimum order allocation.
          */
         alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
-       if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min))
-               alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT;
+       if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
+               alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
  
         page = alloc_slab_page(s, alloc_gfp, node, oo);
         if (unlikely(!page)) {
@@ -1423,7 +1468,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
         page->frozen = 1;
  
  out:
-       if (flags & __GFP_WAIT)
+       if (gfpflags_allow_blocking(flags))
                 local_irq_disable();
         if (!page)
                 return NULL;
@@ -1476,8 +1521,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
         page_mapcount_reset(page);
         if (current->reclaim_state)
                 current->reclaim_state->reclaimed_slab += pages;
-       __free_pages(page, order);
-       memcg_uncharge_slab(s, order);
+       __free_kmem_pages(page, order);
  }
  
  #define need_reserve_slab_rcu                                          \
@@ -1507,10 +1551,7 @@ static void free_slab(struct kmem_cache *s, struct page *page)
                         VM_BUG_ON(s->reserved != sizeof(*head));
                         head = page_address(page) + offset;
                 } else {
-                       /*
-                        * RCU free overloads the RCU head over the LRU
-                        */
-                       head = (void *)&page->lru;
+                       head = &page->rcu_head;
                 }
  
                 call_rcu(head, rcu_free_slab);
@@ -2298,23 +2339,15 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
   * And if we were unable to get a new slab from the partial slab lists then
   * we need to allocate a new slab. This is the slowest path since it involves
   * a call to the page allocator and the setup of a new slab.
+ *
+ * Version of __slab_alloc to use when we know that interrupts are
+ * already disabled (which is the case for bulk allocation).
   */
-static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                           unsigned long addr, struct kmem_cache_cpu *c)
  {
         void *freelist;
         struct page *page;
-       unsigned long flags;
-
-       local_irq_save(flags);
-#ifdef CONFIG_PREEMPT
-       /*
-        * We may have been preempted and rescheduled on a different
-        * cpu before disabling interrupts. Need to reload cpu area
-        * pointer.
-        */
-       c = this_cpu_ptr(s->cpu_slab);
-#endif
  
         page = c->page;
         if (!page)
@@ -2372,7 +2405,6 @@ load_freelist:
         VM_BUG_ON(!c->page->frozen);
         c->freelist = get_freepointer(s, freelist);
         c->tid = next_tid(c->tid);
-       local_irq_restore(flags);
         return freelist;
  
  new_slab:
@@ -2389,7 +2421,6 @@ new_slab:
  
         if (unlikely(!freelist)) {
                 slab_out_of_memory(s, gfpflags, node);
-               local_irq_restore(flags);
                 return NULL;
         }
  
@@ -2405,10 +2436,34 @@ new_slab:
         deactivate_slab(s, page, get_freepointer(s, freelist));
         c->page = NULL;
         c->freelist = NULL;
-       local_irq_restore(flags);
         return freelist;
  }
  
+/*
+ * Wrapper around ___slab_alloc() that disables interrupts and compensates
+ * for possible cpu changes by refetching the per cpu area pointer.
+ */
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+                         unsigned long addr, struct kmem_cache_cpu *c)
+{
+       void *p;
+       unsigned long flags;
+
+       local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+       /*
+        * We may have been preempted and rescheduled on a different
+        * cpu before disabling interrupts. Need to reload cpu area
+        * pointer.
+        */
+       c = this_cpu_ptr(s->cpu_slab);
+#endif
+
+       p = ___slab_alloc(s, gfpflags, node, addr, c);
+       local_irq_restore(flags);
+       return p;
+}
+
  /*
   * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
   * have the fastpath folded into their functions. So no function call
@@ -2572,10 +2627,11 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
   * handling required then we can return immediately.
   */
  static void __slab_free(struct kmem_cache *s, struct page *page,
-                       void *x, unsigned long addr)
+                       void *head, void *tail, int cnt,
+                       unsigned long addr)
+
  {
         void *prior;
-       void **object = (void *)x;
         int was_frozen;
         struct page new;
         unsigned long counters;
@@ -2585,7 +2641,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
         stat(s, FREE_SLOWPATH);
  
         if (kmem_cache_debug(s) &&
-               !(n = free_debug_processing(s, page, x, addr, &flags)))
+           !(n = free_debug_processing(s, page, head, tail, cnt,
+                                       addr, &flags)))
                 return;
  
         do {
@@ -2595,10 +2652,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
                 }
                 prior = page->freelist;
                 counters = page->counters;
-               set_freepointer(s, object, prior);
+               set_freepointer(s, tail, prior);
                 new.counters = counters;
                 was_frozen = new.frozen;
-               new.inuse--;
+               new.inuse -= cnt;
                 if ((!new.inuse || !prior) && !was_frozen) {
  
                         if (kmem_cache_has_cpu_partial(s) && !prior) {
@@ -2629,7 +2686,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
  
         } while (!cmpxchg_double_slab(s, page,
                 prior, counters,
-               object, new.counters,
+               head, new.counters,
                 "__slab_free"));
  
         if (likely(!n)) {
@@ -2694,15 +2751,20 @@ slab_empty:
   *
   * If fastpath is not possible then fall back to __slab_free where we deal
   * with all sorts of special processing.
+ *
+ * Bulk free of a freelist with several objects (all belonging to the
+ * same page) is possible by specifying head and tail ptr, plus object
+ * count (cnt). Bulk free is indicated by the tail pointer being set.
   */
-static __always_inline void slab_free(struct kmem_cache *s,
-                       struct page *page, void *x, unsigned long addr)
+static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+                                     void *head, void *tail, int cnt,
+                                     unsigned long addr)
  {
-       void **object = (void *)x;
+       void *tail_obj = tail ? : head;
         struct kmem_cache_cpu *c;
         unsigned long tid;
  
-       slab_free_hook(s, x);
+       slab_free_freelist_hook(s, head, tail);
  
  redo:
         /*
@@ -2721,19 +2783,19 @@ redo:
         barrier();
  
         if (likely(page == c->page)) {
-               set_freepointer(s, object, c->freelist);
+               set_freepointer(s, tail_obj, c->freelist);
  
                 if (unlikely(!this_cpu_cmpxchg_double(
                                 s->cpu_slab->freelist, s->cpu_slab->tid,
                                 c->freelist, tid,
-                               object, next_tid(tid)))) {
+                               head, next_tid(tid)))) {
  
                         note_cmpxchg_failure("slab_free", s, tid);
                         goto redo;
                 }
                 stat(s, FREE_FASTPATH);
         } else
-               __slab_free(s, page, x, addr);
+               __slab_free(s, page, head, tail_obj, cnt, addr);
  
  }
  
@@ -2742,49 +2804,98 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
         s = cache_from_obj(s, x);
         if (!s)
                 return;
-       slab_free(s, virt_to_head_page(x), x, _RET_IP_);
+       slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
         trace_kmem_cache_free(_RET_IP_, x);
  }
  EXPORT_SYMBOL(kmem_cache_free);
  
-/* Note that interrupts must be enabled when calling this function. */
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
-{
-       struct kmem_cache_cpu *c;
+struct detached_freelist {
         struct page *page;
-       int i;
+       void *tail;
+       void *freelist;
+       int cnt;
+};
  
-       local_irq_disable();
-       c = this_cpu_ptr(s->cpu_slab);
+/*
+ * This function progressively scans the array of objects to be freed
+ * (with a limited look ahead) and extracts objects belonging to the same
+ * page.  It builds a detached freelist directly within the given
+ * page/objects.  This can happen without any need for
+ * synchronization, because the objects are owned by the running process.
+ * The freelist is built up as a singly linked list in the objects.
+ * The idea is that this detached freelist can then be bulk
+ * transferred to the real freelist(s), while requiring only a single
+ * synchronization primitive.  Look ahead in the array is limited for
+ * performance reasons.
+ */
+static int build_detached_freelist(struct kmem_cache *s, size_t size,
+                                  void **p, struct detached_freelist *df)
+{
+       size_t first_skipped_index = 0;
+       int lookahead = 3;
+       void *object;
  
-       for (i = 0; i < size; i++) {
-               void *object = p[i];
+       /* Always re-init detached_freelist */
+       df->page = NULL;
  
-               BUG_ON(!object);
-               /* kmem cache debug support */
-               s = cache_from_obj(s, object);
-               if (unlikely(!s))
-                       goto exit;
-               slab_free_hook(s, object);
+       do {
+               object = p[--size];
+       } while (!object && size);
+
+       if (!object)
+               return 0;
  
-               page = virt_to_head_page(object);
+       /* Start new detached freelist */
+       set_freepointer(s, object, NULL);
+       df->page = virt_to_head_page(object);
+       df->tail = object;
+       df->freelist = object;
+       p[size] = NULL; /* mark object processed */
+       df->cnt = 1;
+
+       while (size) {
+               object = p[--size];
+               if (!object)
+                       continue; /* Skip processed objects */
+
+               /* df->page is always set at this point */
+               if (df->page == virt_to_head_page(object)) {
+                       /* Opportunity to build the freelist */
+                       set_freepointer(s, object, df->freelist);
+                       df->freelist = object;
+                       df->cnt++;
+                       p[size] = NULL; /* mark object processed */
  
-               if (c->page == page) {
-                       /* Fastpath: local CPU free */
-                       set_freepointer(s, object, c->freelist);
-                       c->freelist = object;
-               } else {
-                       c->tid = next_tid(c->tid);
-                       local_irq_enable();
-                       /* Slowpath: overhead locked cmpxchg_double_slab */
-                       __slab_free(s, page, object, _RET_IP_);
-                       local_irq_disable();
-                       c = this_cpu_ptr(s->cpu_slab);
+                       continue;
                 }
+
+               /* Limit look ahead search */
+               if (!--lookahead)
+                       break;
+
+               if (!first_skipped_index)
+                       first_skipped_index = size + 1;
         }
-exit:
-       c->tid = next_tid(c->tid);
-       local_irq_enable();
+
+       return first_skipped_index;
+}
+
+
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+       if (WARN_ON(!size))
+               return;
+
+       do {
+               struct detached_freelist df;
+
+               size = build_detached_freelist(s, size, p, &df);
+               if (unlikely(!df.page))
+                       continue;
+
+               slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
+       } while (likely(size));
  }
  EXPORT_SYMBOL(kmem_cache_free_bulk);
  
@@ -2807,30 +2918,23 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                 void *object = c->freelist;
  
                 if (unlikely(!object)) {
-                       local_irq_enable();
                         /*
                          * Invoking slow path likely have side-effect
                          * of re-populating per CPU c->freelist
                          */
-                       p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
+                       p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
                                             _RET_IP_, c);
-                       if (unlikely(!p[i])) {
-                               __kmem_cache_free_bulk(s, i, p);
-                               return false;
-                       }
-                       local_irq_disable();
+                       if (unlikely(!p[i]))
+                               goto error;
+
                         c = this_cpu_ptr(s->cpu_slab);
                         continue; /* goto for-loop */
                 }
  
                 /* kmem_cache debug support */
                 s = slab_pre_alloc_hook(s, flags);
-               if (unlikely(!s)) {
-                       __kmem_cache_free_bulk(s, i, p);
-                       c->tid = next_tid(c->tid);
-                       local_irq_enable();
-                       return false;
-               }
+               if (unlikely(!s))
+                       goto error;
  
                 c->freelist = get_freepointer(s, object);
                 p[i] = object;
@@ -2850,6 +2954,11 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
         }
  
         return true;
+
+error:
+       __kmem_cache_free_bulk(s, i, p);
+       local_irq_enable();
+       return false;
  }
  EXPORT_SYMBOL(kmem_cache_alloc_bulk);
  
@@ -2912,20 +3021,15 @@ static inline int slab_order(int size, int min_objects,
         if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
                 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
  
-       for (order = max(min_order,
-                               fls(min_objects * size - 1) - PAGE_SHIFT);
+       for (order = max(min_order, get_order(min_objects * size + reserved));
                         order <= max_order; order++) {
  
                 unsigned long slab_size = PAGE_SIZE << order;
  
-               if (slab_size < min_objects * size + reserved)
-                       continue;
-
                 rem = (slab_size - reserved) % size;
  
                 if (rem <= slab_size / fract_leftover)
                         break;
-
         }
  
         return order;
@@ -2943,7 +3047,7 @@ static inline int calculate_order(int size, int reserved)
          * works by first attempting to generate a layout with
          * the best configuration and backing off gradually.
          *
-        * First we reduce the acceptable waste in a slab. Then
+        * First we increase the acceptable waste in a slab. Then
          * we reduce the minimum objects required in a slab.
          */
         min_objects = slub_min_objects;
@@ -3519,7 +3623,7 @@ void kfree(const void *x)
                 __free_kmem_pages(page, compound_order(page));
                 return;
         }
-       slab_free(page->slab_cache, page, object, _RET_IP_);
+       slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
  }
  EXPORT_SYMBOL(kfree);