xprtrdma: Properly handle exhaustion of the rb_mws list
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 54edf2ac48a1d685447a7ae3ee512a42cc73e263..017f0abb2a8675e98a3c3e1adee655ba304b5b6e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -48,7 +48,6 @@
  */
 
 #include <linux/interrupt.h>
-#include <linux/pci.h> /* for Tavor hack below */
 #include <linux/slab.h>
 #include <asm/bitops.h>
 
@@ -157,9 +156,9 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
                return;
 
        if (wc->opcode == IB_WC_FAST_REG_MR)
-               frmr->r.frmr.state = FRMR_IS_VALID;
+               frmr->r.frmr.fr_state = FRMR_IS_VALID;
        else if (wc->opcode == IB_WC_LOCAL_INV)
-               frmr->r.frmr.state = FRMR_IS_INVALID;
+               frmr->r.frmr.fr_state = FRMR_IS_INVALID;
 }
 
 static int
@@ -311,6 +310,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
        rpcrdma_recvcq_poll(cq, ep);
 }
 
+static void
+rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
+{
+       rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
+       rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
+}
+
 #ifdef RPC_DEBUG
 static const char * const conn[] = {
        "address resolved",
@@ -614,6 +620,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        /* Else will do memory reg/dereg for each chunk */
        ia->ri_memreg_strategy = memreg;
 
+       rwlock_init(&ia->ri_qplock);
        return 0;
 out2:
        rdma_destroy_id(ia->ri_id);
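[Annotation: the rwlock initialized above guards ia->ri_id across reconnects. The connect path below swaps in a fresh cm_id under the write lock, while the deregister/unmap paths in later hunks hold the read lock whenever they touch ia->ri_id->qp. A consolidated sketch of the pattern, assembled from the hunks in this patch; this is annotation only, not additional patch content:

	/* Reader side: pin ia->ri_id while posting on its QP. */
	read_lock(&ia->ri_qplock);
	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	read_unlock(&ia->ri_qplock);

	/* Writer side (reconnect): publish the new id first, then
	 * destroy the old one once no reader can still see it. */
	write_lock(&ia->ri_qplock);
	old = ia->ri_id;
	ia->ri_id = id;
	write_unlock(&ia->ri_qplock);
	rdma_destroy_qp(old);
	rdma_destroy_id(old);
]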
@@ -860,7 +867,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 int
 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
-       struct rdma_cm_id *id;
+       struct rdma_cm_id *id, *old;
        int rc = 0;
        int retry_count = 0;
 
@@ -872,9 +879,7 @@ retry:
                if (rc && rc != -ENOTCONN)
                        dprintk("RPC:       %s: rpcrdma_ep_disconnect"
                                " status %i\n", __func__, rc);
-
-               rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-               rpcrdma_clean_cq(ep->rep_attr.send_cq);
+               rpcrdma_flush_cqs(ep);
 
                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
                id = rpcrdma_create_id(xprt, ia,
@@ -906,9 +911,14 @@ retry:
                        rc = -ENETUNREACH;
                        goto out;
                }
-               rdma_destroy_qp(ia->ri_id);
-               rdma_destroy_id(ia->ri_id);
+
+               write_lock(&ia->ri_qplock);
+               old = ia->ri_id;
                ia->ri_id = id;
+               write_unlock(&ia->ri_qplock);
+
+               rdma_destroy_qp(old);
+               rdma_destroy_id(old);
        } else {
                dprintk("RPC:       %s: connecting...\n", __func__);
                rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -920,19 +930,6 @@ retry:
                }
        }
 
-/* XXX Tavor device performs badly with 2K MTU! */
-if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
-       struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
-       if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
-           (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
-            pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
-               struct ib_qp_attr attr = {
-                       .path_mtu = IB_MTU_1024
-               };
-               rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
-       }
-}
-
        ep->rep_connected = 0;
 
        rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
@@ -993,8 +990,7 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
        int rc;
 
-       rpcrdma_clean_cq(ep->rep_attr.recv_cq);
-       rpcrdma_clean_cq(ep->rep_attr.send_cq);
+       rpcrdma_flush_cqs(ep);
        rc = rdma_disconnect(ia->ri_id);
        if (!rc) {
                /* returns without wait if not connected */
@@ -1078,6 +1074,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
        p += cdata->padding;
 
        INIT_LIST_HEAD(&buf->rb_mws);
+       INIT_LIST_HEAD(&buf->rb_all);
        r = (struct rpcrdma_mw *)p;
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
@@ -1102,6 +1099,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
                                ib_dereg_mr(r->r.frmr.fr_mr);
                                goto out;
                        }
+                       list_add(&r->mw_all, &buf->rb_all);
                        list_add(&r->mw_list, &buf->rb_mws);
                        ++r;
                }
@@ -1120,6 +1118,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
                                        " failed %i\n", __func__, rc);
                                goto out;
                        }
+                       list_add(&r->mw_all, &buf->rb_all);
                        list_add(&r->mw_list, &buf->rb_mws);
                        ++r;
                }
@@ -1229,6 +1228,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
        while (!list_empty(&buf->rb_mws)) {
                r = list_entry(buf->rb_mws.next,
                        struct rpcrdma_mw, mw_list);
+               list_del(&r->mw_all);
                list_del(&r->mw_list);
                switch (ia->ri_memreg_strategy) {
                case RPCRDMA_FRMR:
@@ -1256,6 +1256,67 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
        kfree(buf->rb_pool);
 }
 
+/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
+ * some req segments uninitialized.
+ */
+static void
+rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+{
+       if (*mw) {
+               list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
+               *mw = NULL;
+       }
+}
+
+/* Cycle mw's back in reverse order, and "spin" them.
+ * This delays and scrambles reuse as much as possible.
+ */
+static void
+rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mr_seg *seg = req->rl_segments;
+       struct rpcrdma_mr_seg *seg1 = seg;
+       int i;
+
+       for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
+               rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
+       rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
+}
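[Annotation: to make the "spin" concrete (RPCRDMA_MAX_SEGS == 8 is an assumed value for the example):

	/* put_mrs appends seg[1] ... seg[7], then seg[0], to the tail of
	 * rb_mws; get_mrs consumes from the head (rb_mws.next).  A freshly
	 * released MW therefore queues behind every MW already on the
	 * list, so its reuse is delayed as long as possible. */
]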
+
+static void
+rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+       buf->rb_send_bufs[--buf->rb_send_index] = req;
+       req->rl_niovs = 0;
+       if (req->rl_reply) {
+               buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
+               req->rl_reply->rr_func = NULL;
+               req->rl_reply = NULL;
+       }
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_mw *r;
+       int i;
+
+       i = RPCRDMA_MAX_SEGS - 1;
+       while (!list_empty(&buf->rb_mws)) {
+               r = list_entry(buf->rb_mws.next,
+                              struct rpcrdma_mw, mw_list);
+               list_del(&r->mw_list);
+               req->rl_segments[i].mr_chunk.rl_mw = r;
+               if (unlikely(i-- == 0))
+                       return req;     /* Success */
+       }
+
+       /* Not enough entries on rb_mws for this req */
+       rpcrdma_buffer_put_sendbuf(req, buf);
+       rpcrdma_buffer_put_mrs(req, buf);
+       return NULL;
+}
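[Annotation: when rb_mws runs dry mid-loop, everything claimed so far is handed back and NULL propagates out of rpcrdma_buffer_get(). A hedged sketch of the caller-side contract; the caller shown is illustrative, not part of this patch:

	req = rpcrdma_buffer_get(buffers);
	if (req == NULL)
		return NULL;	/* no MWs left: give up this slot and
				 * let the RPC layer retry later */
]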
+
 /*
  * Get a set of request/reply buffers.
  *
@@ -1268,10 +1329,9 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 struct rpcrdma_req *
 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 {
+       struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
        struct rpcrdma_req *req;
        unsigned long flags;
-       int i;
-       struct rpcrdma_mw *r;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
        if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1291,14 +1351,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
                buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
        }
        buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
-       if (!list_empty(&buffers->rb_mws)) {
-               i = RPCRDMA_MAX_SEGS - 1;
-               do {
-                       r = list_entry(buffers->rb_mws.next,
-                                       struct rpcrdma_mw, mw_list);
-                       list_del(&r->mw_list);
-                       req->rl_segments[i].mr_chunk.rl_mw = r;
-               } while (--i >= 0);
+       switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
+       case RPCRDMA_MTHCAFMR:
+               req = rpcrdma_buffer_get_mrs(req, buffers);
+               break;
+       default:
+               break;
        }
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
        return req;
@@ -1313,35 +1372,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
 {
        struct rpcrdma_buffer *buffers = req->rl_buffer;
        struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
-       int i;
        unsigned long flags;
 
-       BUG_ON(req->rl_nchunks != 0);
        spin_lock_irqsave(&buffers->rb_lock, flags);
-       buffers->rb_send_bufs[--buffers->rb_send_index] = req;
-       req->rl_niovs = 0;
-       if (req->rl_reply) {
-               buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
-               req->rl_reply->rr_func = NULL;
-               req->rl_reply = NULL;
-       }
+       rpcrdma_buffer_put_sendbuf(req, buffers);
        switch (ia->ri_memreg_strategy) {
        case RPCRDMA_FRMR:
        case RPCRDMA_MTHCAFMR:
-               /*
-                * Cycle mw's back in reverse order, and "spin" them.
-                * This delays and scrambles reuse as much as possible.
-                */
-               i = 1;
-               do {
-                       struct rpcrdma_mw **mw;
-                       mw = &req->rl_segments[i].mr_chunk.rl_mw;
-                       list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
-                       *mw = NULL;
-               } while (++i < RPCRDMA_MAX_SEGS);
-               list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
-                                       &buffers->rb_mws);
-               req->rl_segments[0].mr_chunk.rl_mw = NULL;
+               rpcrdma_buffer_put_mrs(req, buffers);
                break;
        default:
                break;
@@ -1403,6 +1441,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
         */
        iov->addr = ib_dma_map_single(ia->ri_id->device,
                        va, len, DMA_BIDIRECTIONAL);
+       if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+               return -ENOMEM;
+
        iov->length = len;
 
        if (ia->ri_have_dma_lkey) {
@@ -1498,6 +1539,9 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                        struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_mr_seg *seg1 = seg;
+       struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
+       struct rpcrdma_frmr *frmr = &mw->r.frmr;
+       struct ib_mr *mr = frmr->fr_mr;
        struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
 
        u8 key;
@@ -1517,8 +1561,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                rpcrdma_map_one(ia, seg, writing);
                pa = seg->mr_dma;
                for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
-                       seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
-                               page_list[page_no++] = pa;
+                       frmr->fr_pgl->page_list[page_no++] = pa;
                        pa += PAGE_SIZE;
                }
                len += seg->mr_len;
@@ -1530,44 +1573,46 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
                        break;
        }
        dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
-               __func__, seg1->mr_chunk.rl_mw, i);
+               __func__, mw, i);
 
-       if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
+       if (unlikely(frmr->fr_state == FRMR_IS_VALID)) {
                dprintk("RPC:       %s: frmr %x left valid, posting invalidate.\n",
-                       __func__,
-                       seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
+                       __func__, mr->rkey);
                /* Invalidate before using. */
                memset(&invalidate_wr, 0, sizeof invalidate_wr);
-               invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+               invalidate_wr.wr_id = (unsigned long)(void *)mw;
                invalidate_wr.next = &frmr_wr;
                invalidate_wr.opcode = IB_WR_LOCAL_INV;
                invalidate_wr.send_flags = IB_SEND_SIGNALED;
-               invalidate_wr.ex.invalidate_rkey =
-                       seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+               invalidate_wr.ex.invalidate_rkey = mr->rkey;
                DECR_CQCOUNT(&r_xprt->rx_ep);
                post_wr = &invalidate_wr;
        } else
                post_wr = &frmr_wr;
 
-       /* Bump the key */
-       key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
-       ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
-
        /* Prepare FRMR WR */
        memset(&frmr_wr, 0, sizeof frmr_wr);
-       frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+       frmr_wr.wr_id = (unsigned long)(void *)mw;
        frmr_wr.opcode = IB_WR_FAST_REG_MR;
        frmr_wr.send_flags = IB_SEND_SIGNALED;
        frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
-       frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
+       frmr_wr.wr.fast_reg.page_list = frmr->fr_pgl;
        frmr_wr.wr.fast_reg.page_list_len = page_no;
        frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
        frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
-       BUG_ON(frmr_wr.wr.fast_reg.length < len);
+       if (frmr_wr.wr.fast_reg.length < len) {
+               rc = -EIO;
+               goto out_err;
+       }
+
+       /* Bump the key */
+       key = (u8)(mr->rkey & 0x000000FF);
+       ib_update_fast_reg_key(mr, ++key);
+
        frmr_wr.wr.fast_reg.access_flags = (writing ?
                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
                                IB_ACCESS_REMOTE_READ);
-       frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+       frmr_wr.wr.fast_reg.rkey = mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);
 
        rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
@@ -1575,15 +1620,19 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
        if (rc) {
                dprintk("RPC:       %s: failed ib_post_send for register,"
                        " status %i\n", __func__, rc);
-               while (i--)
-                       rpcrdma_unmap_one(ia, --seg);
+               ib_update_fast_reg_key(mr, --key);
+               goto out_err;
        } else {
-               seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+               seg1->mr_rkey = mr->rkey;
                seg1->mr_base = seg1->mr_dma + pageoff;
                seg1->mr_nsegs = i;
                seg1->mr_len = len;
        }
        *nsegs = i;
+       return 0;
+out_err:
+       while (i--)
+               rpcrdma_unmap_one(ia, --seg);
        return rc;
 }
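[Annotation: two details of the rkey handling above deserve a note, restated as standalone C mirroring the hunks above (annotation only):

	/* The rkey's low octet is a generation count.  Bumping it before
	 * each FAST_REG_MR invalidates any stale rkey a peer may hold. */
	key = (u8)(mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(mr, ++key);

	/* On ib_post_send() failure the WR never reached the HCA, so the
	 * bump is rolled back instead of burning a generation. */
	ib_update_fast_reg_key(mr, --key);

Because the bump now sits below the length check, an -EIO exit leaves the key untouched as well.]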
 
@@ -1595,9 +1644,6 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
        struct ib_send_wr invalidate_wr, *bad_wr;
        int rc;
 
-       while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(ia, seg++);
-
        memset(&invalidate_wr, 0, sizeof invalidate_wr);
        invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
        invalidate_wr.opcode = IB_WR_LOCAL_INV;
@@ -1605,7 +1651,11 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
        invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);
 
+       read_lock(&ia->ri_qplock);
+       while (seg1->mr_nsegs--)
+               rpcrdma_unmap_one(ia, seg++);
        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+       read_unlock(&ia->ri_qplock);
        if (rc)
                dprintk("RPC:       %s: failed ib_post_send for invalidate,"
                        " status %i\n", __func__, rc);
@@ -1666,8 +1716,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
 
        list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
        rc = ib_unmap_fmr(&l);
+       read_lock(&ia->ri_qplock);
        while (seg1->mr_nsegs--)
                rpcrdma_unmap_one(ia, seg++);
+       read_unlock(&ia->ri_qplock);
        if (rc)
                dprintk("RPC:       %s: failed ib_unmap_fmr,"
                        " status %i\n", __func__, rc);
@@ -1723,9 +1775,9 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
 
 #if RPCRDMA_PERSISTENT_REGISTRATION
        case RPCRDMA_ALLPHYSICAL:
-               BUG_ON(nsegs != 1);
+               read_lock(&ia->ri_qplock);
                rpcrdma_unmap_one(ia, seg);
-               rc = 0;
+               read_unlock(&ia->ri_qplock);
                break;
 #endif
 
@@ -1821,3 +1873,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
                        rc);
        return rc;
 }
+
+/* Physical mapping means one Read/Write list entry per page.
+ * All list entries must fit within an inline buffer.
+ *
+ * NB: The server must return a Write list for NFS READ,
+ *     which has the same constraint. Factor in the inline
+ *     rsize as well.
+ */
+static size_t
+rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+       unsigned int inline_size, pages;
+
+       inline_size = min_t(unsigned int,
+                           cdata->inline_wsize, cdata->inline_rsize);
+       inline_size -= RPCRDMA_HDRLEN_MIN;
+       pages = inline_size / sizeof(struct rpcrdma_segment);
+       return pages << PAGE_SHIFT;
+}
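[Annotation: a worked example of the arithmetic above, under assumed values (inline sizes come from mount options; none of these numbers are fixed by this patch):

	/* Assume inline_wsize == inline_rsize == 1024,
	 * RPCRDMA_HDRLEN_MIN == 28, sizeof(struct rpcrdma_segment) == 16,
	 * and PAGE_SHIFT == 12 (4 KiB pages):
	 *
	 *   inline_size = 1024 - 28 = 996
	 *   pages       = 996 / 16  = 62
	 *   max payload = 62 << 12  = 253952 bytes
	 */
]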
+
+static size_t
+rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+       return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+}
+
+size_t
+rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+       size_t result;
+
+       switch (r_xprt->rx_ia.ri_memreg_strategy) {
+       case RPCRDMA_ALLPHYSICAL:
+               result = rpcrdma_physical_max_payload(r_xprt);
+               break;
+       default:
+               result = rpcrdma_mr_max_payload(r_xprt);
+       }
+       return result;
+}