git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - net/rds/ib_recv.c
Merge git://git.kernel.org/pub/scm/linux/kernel/git/wim/linux-watchdog
[karo-tx-linux.git] / net / rds / ib_recv.c
index f6dbf16e07410e2aa11420d36da2e23f86e9b353..e29e0ca32f740d978aeccf23a4fba5453a4e8aa4 100644 (file)
@@ -43,32 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t        rds_ib_allocation = ATOMIC_INIT(0);
 
-/* Free frag and attached recv buffer f_sg */
-static void rds_ib_frag_free(struct rds_page_frag *frag)
-{
-       rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
-       __free_page(sg_page(&frag->f_sg));
-       kmem_cache_free(rds_ib_frag_slab, frag);
-}
-
-/*
- * We map a page at a time.  Its fragments are posted in order.  This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
-                                  struct rds_ib_recv_work *recv)
-{
-       struct rds_page_frag *frag = recv->r_frag;
-
-       rdsdebug("recv %p frag %p page %p\n", recv, frag, sg_page(&frag->f_sg));
-       ib_dma_unmap_sg(ic->i_cm_id->device, &frag->f_sg, 1, DMA_FROM_DEVICE);
-}
-
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
        struct rds_ib_recv_work *recv;
@@ -97,6 +71,151 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
        }
 }
 
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+                                   struct list_head *to)
+{
+       struct list_head *from_last = from->prev;
+
+       list_splice_tail(from_last, to);
+       list_add_tail(from_last, to);
+}
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+       struct list_head *tmp;
+
+       tmp = xchg(&cache->xfer, NULL);
+       if (tmp) {
+               if (cache->ready)
+                       list_splice_entire_tail(tmp, cache->ready);
+               else
+                       cache->ready = tmp;
+       }
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+       struct rds_ib_cache_head *head;
+       int cpu;
+
+       cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+       if (!cache->percpu)
+              return -ENOMEM;
+
+       for_each_possible_cpu(cpu) {
+               head = per_cpu_ptr(cache->percpu, cpu);
+               head->first = NULL;
+               head->count = 0;
+       }
+       cache->xfer = NULL;
+       cache->ready = NULL;
+
+       return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+       int ret;
+
+       ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+       if (!ret) {
+               ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+               if (ret)
+                       free_percpu(ic->i_cache_incs.percpu);
+       }
+
+       return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+                                         struct list_head *caller_list)
+{
+       struct rds_ib_cache_head *head;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               head = per_cpu_ptr(cache->percpu, cpu);
+               if (head->first) {
+                       list_splice_entire_tail(head->first, caller_list);
+                       head->first = NULL;
+               }
+       }
+
+       if (cache->ready) {
+               list_splice_entire_tail(cache->ready, caller_list);
+               cache->ready = NULL;
+       }
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+       struct rds_ib_incoming *inc;
+       struct rds_ib_incoming *inc_tmp;
+       struct rds_page_frag *frag;
+       struct rds_page_frag *frag_tmp;
+       LIST_HEAD(list);
+
+       rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+       rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+       free_percpu(ic->i_cache_incs.percpu);
+
+       list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+               list_del(&inc->ii_cache_entry);
+               WARN_ON(!list_empty(&inc->ii_frags));
+               kmem_cache_free(rds_ib_incoming_slab, inc);
+       }
+
+       rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+       rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+       free_percpu(ic->i_cache_frags.percpu);
+
+       list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+               list_del(&frag->f_cache_entry);
+               WARN_ON(!list_empty(&frag->f_item));
+               kmem_cache_free(rds_ib_frag_slab, frag);
+       }
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+                                 struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+                            struct rds_page_frag *frag)
+{
+       rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+       rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+       struct rds_ib_incoming *ibinc;
+       struct rds_page_frag *frag;
+       struct rds_page_frag *pos;
+       struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+       ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+       /* Free attached frags */
+       list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+               list_del_init(&frag->f_item);
+               rds_ib_frag_free(ic, frag);
+       }
+       BUG_ON(!list_empty(&ibinc->ii_frags));
+
+       rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+       rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
+}
+
 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
                                  struct rds_ib_recv_work *recv)
 {
@@ -105,8 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
                recv->r_ibinc = NULL;
        }
        if (recv->r_frag) {
-               rds_ib_recv_unmap_page(ic, recv);
-               rds_ib_frag_free(recv->r_frag);
+               ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+               rds_ib_frag_free(ic, recv->r_frag);
                recv->r_frag = NULL;
        }
 }
@@ -119,42 +238,98 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
                rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
 }
 
-static int rds_ib_recv_refill_one(struct rds_connection *conn,
-                                 struct rds_ib_recv_work *recv)
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+                                                    gfp_t slab_mask)
 {
-       struct rds_ib_connection *ic = conn->c_transport_data;
-       struct ib_sge *sge;
-       int ret = -ENOMEM;
+       struct rds_ib_incoming *ibinc;
+       struct list_head *cache_item;
+       int avail_allocs;
 
-       if (!recv->r_ibinc) {
-               if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
+       cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+       if (cache_item) {
+               ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+       } else {
+               avail_allocs = atomic_add_unless(&rds_ib_allocation,
+                                                1, rds_ib_sysctl_max_recv_allocation);
+               if (!avail_allocs) {
                        rds_ib_stats_inc(s_ib_rx_alloc_limit);
-                       goto out;
+                       return NULL;
                }
-               recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, GFP_NOWAIT);
-               if (!recv->r_ibinc) {
+               ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+               if (!ibinc) {
                        atomic_dec(&rds_ib_allocation);
-                       goto out;
+                       return NULL;
                }
-               INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
-               rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
        }
+       INIT_LIST_HEAD(&ibinc->ii_frags);
+       rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
 
-       if (!recv->r_frag) {
-               recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT);
-               if (!recv->r_frag)
-                       goto out;
-               INIT_LIST_HEAD(&recv->r_frag->f_item);
-               sg_init_table(&recv->r_frag->f_sg, 1);
-               ret = rds_page_remainder_alloc(&recv->r_frag->f_sg,
-                                              RDS_FRAG_SIZE, GFP_NOWAIT);
+       return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+                                                   gfp_t slab_mask, gfp_t page_mask)
+{
+       struct rds_page_frag *frag;
+       struct list_head *cache_item;
+       int ret;
+
+       cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+       if (cache_item) {
+               frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+       } else {
+               frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+               if (!frag)
+                       return NULL;
+
+               sg_init_table(&frag->f_sg, 1);
+               ret = rds_page_remainder_alloc(&frag->f_sg,
+                                              RDS_FRAG_SIZE, page_mask);
                if (ret) {
-                       kmem_cache_free(rds_ib_frag_slab, recv->r_frag);
-                       recv->r_frag = NULL;
-                       goto out;
+                       kmem_cache_free(rds_ib_frag_slab, frag);
+                       return NULL;
                }
        }
 
+       INIT_LIST_HEAD(&frag->f_item);
+
+       return frag;
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+                                 struct rds_ib_recv_work *recv, int prefill)
+{
+       struct rds_ib_connection *ic = conn->c_transport_data;
+       struct ib_sge *sge;
+       int ret = -ENOMEM;
+       gfp_t slab_mask = GFP_NOWAIT;
+       gfp_t page_mask = GFP_NOWAIT;
+
+       if (prefill) {
+               slab_mask = GFP_KERNEL;
+               page_mask = GFP_HIGHUSER;
+       }
+
+       if (!ic->i_cache_incs.ready)
+               rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+       if (!ic->i_cache_frags.ready)
+               rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+
+       /*
+        * ibinc was taken from recv if recv contained the start of a message.
+        * recvs that were continuations will still have this allocated.
+        */
+       if (!recv->r_ibinc) {
+               recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+               if (!recv->r_ibinc)
+                       goto out;
+       }
+
+       WARN_ON(recv->r_frag); /* leak! */
+       recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+       if (!recv->r_frag)
+               goto out;
+
        ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
                            1, DMA_FROM_DEVICE);
        WARN_ON(ret != 1);
@@ -175,12 +350,11 @@ out:
 /*
  * This tries to allocate and post unused work requests after making sure that
  * they have all the allocations they need to queue received fragments into
- * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
+ * sockets.
  *
  * -1 is returned if posting fails due to temporary resource exhaustion.
  */
-int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_recv_work *recv;
@@ -194,14 +368,12 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
                if (pos >= ic->i_recv_ring.w_nr) {
                        printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
                                        pos);
-                       ret = -EINVAL;
                        break;
                }
 
                recv = &ic->i_recvs[pos];
-               ret = rds_ib_recv_refill_one(conn, recv);
+               ret = rds_ib_recv_refill_one(conn, recv, prefill);
                if (ret) {
-                       ret = -1;
                        break;
                }
 
@@ -215,7 +387,6 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
                               "%pI4 returned %d, disconnecting and "
                               "reconnecting\n", &conn->c_faddr,
                               ret);
-                       ret = -1;
                        break;
                }
 
@@ -228,36 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 
        if (ret)
                rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
-       return ret;
 }
 
-static void rds_ib_inc_purge(struct rds_incoming *inc)
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * we move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/cmpxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true when one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+                                struct rds_ib_refill_cache *cache)
 {
-       struct rds_ib_incoming *ibinc;
-       struct rds_page_frag *frag;
-       struct rds_page_frag *pos;
+       unsigned long flags;
+       struct rds_ib_cache_head *chp;
+       struct list_head *old;
 
-       ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-       rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+       local_irq_save(flags);
 
-       list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
-               list_del_init(&frag->f_item);
-               rds_ib_frag_free(frag);
-       }
+       chp = per_cpu_ptr(cache->percpu, smp_processor_id());
+       if (!chp->first)
+               INIT_LIST_HEAD(new_item);
+       else /* put on front */
+               list_add_tail(new_item, chp->first);
+       chp->first = new_item;
+       chp->count++;
+
+       if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
+               goto end;
+
+       /*
+        * Return our per-cpu first list to the cache's xfer by atomically
+        * grabbing the current xfer list, appending it to our per-cpu list,
+        * and then atomically returning that entire list back to the
+        * cache's xfer list as long as it's still empty.
+        */
+       do {
+               old = xchg(&cache->xfer, NULL);
+               if (old)
+                       list_splice_entire_tail(old, chp->first);
+               old = cmpxchg(&cache->xfer, NULL, chp->first);
+       } while (old);
+
+       chp->first = NULL;
+       chp->count = 0;
+end:
+       local_irq_restore(flags);
 }
 
-void rds_ib_inc_free(struct rds_incoming *inc)
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
 {
-       struct rds_ib_incoming *ibinc;
-
-       ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+       struct list_head *head = cache->ready;
+
+       if (head) {
+               if (!list_empty(head)) {
+                       cache->ready = head->next;
+                       list_del_init(head);
+               } else
+                       cache->ready = NULL;
+       }
 
-       rds_ib_inc_purge(inc);
-       rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
-       BUG_ON(!list_empty(&ibinc->ii_frags));
-       kmem_cache_free(rds_ib_incoming_slab, ibinc);
-       atomic_dec(&rds_ib_allocation);
-       BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+       return head;
 }
 
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
@@ -662,7 +870,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
                 *
                 * FIXME: Fold this into the code path below.
                 */
-               rds_ib_frag_free(recv->r_frag);
+               rds_ib_frag_free(ic, recv->r_frag);
                recv->r_frag = NULL;
                return;
        }
@@ -758,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
        struct rds_ib_recv_work *recv;
 
        while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
-               rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-                        (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+               rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+                        (unsigned long long)wc.wr_id, wc.status,
+                        rds_ib_wc_status_str(wc.status), wc.byte_len,
                         be32_to_cpu(wc.ex.imm_data));
                rds_ib_stats_inc(s_ib_rx_cq_event);
 
                recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 
-               rds_ib_recv_unmap_page(ic, recv);
+               ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 
                /*
                 * Also process recvs in connecting state because it is possible
                 * to get a recv completion _before_ the rdmacm ESTABLISHED
                 * event is processed.
                 */
-               if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+               if (wc.status == IB_WC_SUCCESS) {
+                       rds_ib_process_recv(conn, recv, wc.byte_len, state);
+               } else {
                        /* We expect errors as the qp is drained during shutdown */
-                       if (wc.status == IB_WC_SUCCESS) {
-                               rds_ib_process_recv(conn, recv, wc.byte_len, state);
-                       } else {
-                               rds_ib_conn_error(conn, "recv completion on "
-                                      "%pI4 had status %u, disconnecting and "
-                                      "reconnecting\n", &conn->c_faddr,
-                                      wc.status);
-                       }
+                       if (rds_conn_up(conn) || rds_conn_connecting(conn))
+                               rds_ib_conn_error(conn, "recv completion on %pI4 had "
+                                                 "status %u (%s), disconnecting and "
+                                                 "reconnecting\n", &conn->c_faddr,
+                                                 wc.status,
+                                                 rds_ib_wc_status_str(wc.status));
                }
 
+               /*
+                * It's very important that we only free this ring entry if we've truly
+                * freed the resources allocated to the entry.  The refilling path can
+                * leak if we don't.
+                */
                rds_ib_ring_free(&ic->i_recv_ring, 1);
        }
 }
@@ -829,7 +1043,7 @@ int rds_ib_recv(struct rds_connection *conn)
        return ret;
 }
 
-int __init rds_ib_recv_init(void)
+int rds_ib_recv_init(void)
 {
        struct sysinfo si;
        int ret = -ENOMEM;
@@ -840,13 +1054,13 @@ int __init rds_ib_recv_init(void)
 
        rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
                                        sizeof(struct rds_ib_incoming),
-                                       0, 0, NULL);
+                                       0, SLAB_HWCACHE_ALIGN, NULL);
        if (!rds_ib_incoming_slab)
                goto out;
 
        rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
                                        sizeof(struct rds_page_frag),
-                                       0, 0, NULL);
+                                       0, SLAB_HWCACHE_ALIGN, NULL);
        if (!rds_ib_frag_slab)
                kmem_cache_destroy(rds_ib_incoming_slab);
        else