diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f68832798db224d6abffcc08f4ad494fa2b3bc17..ee369d201a65891b8f8e3b562bf738067665df2f 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
 #include "rds.h"
 #include "ib.h"
 
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+               [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+       RDS_IB_EVENT_STRING(CQ_ERR),
+       RDS_IB_EVENT_STRING(QP_FATAL),
+       RDS_IB_EVENT_STRING(QP_REQ_ERR),
+       RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+       RDS_IB_EVENT_STRING(COMM_EST),
+       RDS_IB_EVENT_STRING(SQ_DRAINED),
+       RDS_IB_EVENT_STRING(PATH_MIG),
+       RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+       RDS_IB_EVENT_STRING(DEVICE_FATAL),
+       RDS_IB_EVENT_STRING(PORT_ACTIVE),
+       RDS_IB_EVENT_STRING(PORT_ERR),
+       RDS_IB_EVENT_STRING(LID_CHANGE),
+       RDS_IB_EVENT_STRING(PKEY_CHANGE),
+       RDS_IB_EVENT_STRING(SM_CHANGE),
+       RDS_IB_EVENT_STRING(SRQ_ERR),
+       RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+       RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+       RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+       return rds_str_array(rds_ib_event_type_strings,
+                            ARRAY_SIZE(rds_ib_event_type_strings), type);
+};
+
 /*
  * Set the selected protocol version
  */
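
The new lookup table above pairs designated array initializers with __stringify() so that each IB event number indexes its own name, and rds_ib_event_str() wraps a bounds-checked lookup. rds_str_array() lives in the RDS core and its exact behavior is assumed here; the following minimal userspace sketch only illustrates the pattern:

#include <stdio.h>

#define DEMO_STRINGIFY(x) #x    /* stand-in for the kernel's __stringify() */

enum demo_event { DEMO_CQ_ERR, DEMO_QP_FATAL, DEMO_PORT_ACTIVE, DEMO_NR_EVENTS };

/* Designated initializers: unlisted enum values simply stay NULL. */
static const char *demo_event_strings[] = {
        [DEMO_CQ_ERR]      = DEMO_STRINGIFY(DEMO_CQ_ERR),
        [DEMO_QP_FATAL]    = DEMO_STRINGIFY(DEMO_QP_FATAL),
        [DEMO_PORT_ACTIVE] = DEMO_STRINGIFY(DEMO_PORT_ACTIVE),
};

/* Roughly what rds_str_array() is assumed to do: bounds check, then
 * fall back to "unknown" for gaps or out-of-range values. */
static const char *demo_event_str(unsigned int type)
{
        size_t n = sizeof(demo_event_strings) / sizeof(demo_event_strings[0]);

        return (type < n && demo_event_strings[type]) ?
                demo_event_strings[type] : "unknown";
}

int main(void)
{
        printf("%s\n", demo_event_str(DEMO_QP_FATAL)); /* DEMO_QP_FATAL */
        printf("%s\n", demo_event_str(42));            /* unknown */
        return 0;
}
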
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 {
        const struct rds_ib_connect_private *dp = NULL;
        struct rds_ib_connection *ic = conn->c_transport_data;
-       struct rds_ib_device *rds_ibdev;
        struct ib_qp_attr qp_attr;
        int err;
 
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
                }
        }
 
-       printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-                       &conn->c_faddr,
-                       RDS_PROTOCOL_MAJOR(conn->c_version),
-                       RDS_PROTOCOL_MINOR(conn->c_version),
-                       ic->i_flowctl ? ", flow control" : "");
+       if (conn->c_version < RDS_PROTOCOL(3,1)) {
+               printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+                      " no longer supported\n",
+                      &conn->c_faddr,
+                      RDS_PROTOCOL_MAJOR(conn->c_version),
+                      RDS_PROTOCOL_MINOR(conn->c_version));
+               rds_conn_destroy(conn);
+               return;
+       } else {
+               printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+                      &conn->c_faddr,
+                      RDS_PROTOCOL_MAJOR(conn->c_version),
+                      RDS_PROTOCOL_MINOR(conn->c_version),
+                      ic->i_flowctl ? ", flow control" : "");
+       }
 
        /*
         * Init rings and fill recv. this needs to wait until protocol negotiation
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
        rds_ib_recv_init_ring(ic);
        /* Post receive buffers - as a side effect, this will update
         * the posted credit count. */
-       rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+       rds_ib_recv_refill(conn, 1);
 
        /* Tune RNR behavior */
        rds_ib_tune_rnr(ic, &qp_attr);
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
        if (err)
                printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
 
-       /* update ib_device with this local ipaddr & conn */
-       rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-       err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+       /* update ib_device with this local ipaddr */
+       err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
        if (err)
-               printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
-       rds_ib_add_conn(rds_ibdev, conn);
+               printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+                       err);
 
        /* If the peer gave us the last packet it saw, process this as if
         * we had received a regular ACK. */
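
The version gate above compares packed protocol numbers. Assuming the usual RDS layout with the major version in the high byte and the minor in the low byte (the real RDS_PROTOCOL* macros are defined in rds.h), the comparison works out as follows:

/* Assumed to mirror RDS_PROTOCOL{,_MAJOR,_MINOR}() from rds.h. */
#define DEMO_PROTOCOL(maj, min)  (((maj) << 8) | (min))
#define DEMO_PROTOCOL_MAJOR(v)   ((v) >> 8)
#define DEMO_PROTOCOL_MINOR(v)   ((v) & 255)

/* DEMO_PROTOCOL(3, 0) == 0x0300 < DEMO_PROTOCOL(3, 1) == 0x0301, so a
 * 3.0 peer takes the "no longer supported" branch and the connection is
 * destroyed; 3.1 and newer fall through to the "connected" printk. */
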
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
                        struct rdma_conn_param *conn_param,
                        struct rds_ib_connect_private *dp,
-                       u32 protocol_version)
+                       u32 protocol_version,
+                       u32 max_responder_resources,
+                       u32 max_initiator_depth)
 {
+       struct rds_ib_connection *ic = conn->c_transport_data;
+       struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
        memset(conn_param, 0, sizeof(struct rdma_conn_param));
-       /* XXX tune these? */
-       conn_param->responder_resources = 1;
-       conn_param->initiator_depth = 1;
+
+       conn_param->responder_resources =
+               min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+       conn_param->initiator_depth =
+               min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
        conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
        conn_param->rnr_retry_count = 7;
 
        if (dp) {
-               struct rds_ib_connection *ic = conn->c_transport_data;
-
                memset(dp, 0, sizeof(*dp));
                dp->dp_saddr = conn->c_laddr;
                dp->dp_daddr = conn->c_faddr;
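
rds_ib_cm_fill_conn_param() now negotiates the RDMA read credits instead of hard-coding 1: each value is clamped to the limits cached on the rds_ib_device and to whatever the caller passes in. The passive side (further down) passes the peer's advertised responder_resources/initiator_depth from the connect request, while the active side passes UINT_MAX so only the local device caps apply. A tiny runnable illustration of that clamping, with made-up limits:

#include <limits.h>
#include <stdio.h>

/* Same shape as min_t(u32, rds_ibdev->max_..., max_...). */
static unsigned int clamp_credit(unsigned int dev_max, unsigned int requested)
{
        return dev_max < requested ? dev_max : requested;
}

int main(void)
{
        /* Hypothetical HCA limit of 16 outstanding RDMA reads. */
        printf("passive side, peer asked for 4: %u\n", clamp_credit(16, 4));
        printf("active side, UINT_MAX sentinel: %u\n", clamp_credit(16, UINT_MAX));
        return 0;
}
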
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 
 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
 {
-       rdsdebug("event %u data %p\n", event->event, data);
+       rdsdebug("event %u (%s) data %p\n",
+                event->event, rds_ib_event_str(event->event), data);
 }
 
 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
        struct rds_connection *conn = data;
        struct rds_ib_connection *ic = conn->c_transport_data;
 
-       rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+       rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+                rds_ib_event_str(event->event));
 
        switch (event->event) {
        case IB_EVENT_COMM_EST:
                rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
                break;
        default:
-               rdsdebug("Fatal QP Event %u "
+               rdsdebug("Fatal QP Event %u (%s) "
                        "- connection %pI4->%pI4, reconnecting\n",
-                       event->event, &conn->c_laddr, &conn->c_faddr);
+                       event->event, rds_ib_event_str(event->event),
+                       &conn->c_laddr, &conn->c_faddr);
                rds_conn_drop(conn);
                break;
        }
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
        struct rds_ib_device *rds_ibdev;
        int ret;
 
-       /* rds_ib_add_one creates a rds_ib_device object per IB device,
-        * and allocates a protection domain, memory range and FMR pool
-        * for each.  If that fails for any reason, it will not register
-        * the rds_ibdev at all.
+       /*
+        * It's normal to see a null device if an incoming connection races
+        * with device removal, so we don't print a warning.
         */
-       rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
-       if (rds_ibdev == NULL) {
-               if (printk_ratelimit())
-                       printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
-                                       dev->name);
+       rds_ibdev = rds_ib_get_client_data(dev);
+       if (!rds_ibdev)
                return -EOPNOTSUPP;
-       }
+
+       /* add the conn now so that connection establishment has the dev */
+       rds_ib_add_conn(rds_ibdev, conn);
 
        if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
                rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                           ic->i_send_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_send_hdrs_dma, GFP_KERNEL);
-       if (ic->i_send_hdrs == NULL) {
+       if (!ic->i_send_hdrs) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent send failed\n");
                goto out;
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                           ic->i_recv_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_recv_hdrs_dma, GFP_KERNEL);
-       if (ic->i_recv_hdrs == NULL) {
+       if (!ic->i_recv_hdrs) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent recv failed\n");
                goto out;
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
        ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
                                       &ic->i_ack_dma, GFP_KERNEL);
-       if (ic->i_ack == NULL) {
+       if (!ic->i_ack) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent ack failed\n");
                goto out;
        }
 
-       ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
-       if (ic->i_sends == NULL) {
+       ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+                                  ibdev_to_node(dev));
+       if (!ic->i_sends) {
                ret = -ENOMEM;
                rdsdebug("send allocation failed\n");
                goto out;
        }
        memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
 
-       ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
-       if (ic->i_recvs == NULL) {
+       ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+                                  ibdev_to_node(dev));
+       if (!ic->i_recvs) {
                ret = -ENOMEM;
                rdsdebug("recv allocation failed\n");
                goto out;
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                 ic->i_send_cq, ic->i_recv_cq);
 
 out:
+       rds_ib_dev_put(rds_ibdev);
        return ret;
 }
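
rds_ib_setup_qp() now pairs rds_ib_get_client_data() with the rds_ib_dev_put() at out:, which suggests the lookup takes a reference on the rds_ib_device so it cannot disappear mid-setup (the exact helpers live in ib.c and their semantics are an assumption here). The discipline is the usual get/put pattern, sketched below in plain C with toy names:

#include <stdio.h>

/* Toy refcounted object: one successful "get" is balanced by exactly one
 * "put" on success and error paths alike, hence the single put after the
 * shared out: label above. */
struct demo_dev {
        int refcount;
};

static struct demo_dev *demo_dev_get(struct demo_dev *dev)
{
        if (dev)
                dev->refcount++;
        return dev;
}

static void demo_dev_put(struct demo_dev *dev)
{
        if (dev && --dev->refcount == 0)
                printf("last reference dropped\n");
}

static int demo_setup(struct demo_dev *registered)
{
        struct demo_dev *dev = demo_dev_get(registered);
        int ret = 0;

        if (!dev)
                return -1;      /* raced with device removal */

        /* ... CQ/QP/ring setup; any failure sets ret and falls through ... */

        demo_dev_put(dev);      /* balances demo_dev_get() */
        return ret;
}

int main(void)
{
        struct demo_dev dev = { .refcount = 1 };

        return demo_setup(&dev);
}
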
 
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
        struct rds_ib_connection *ic = NULL;
        struct rdma_conn_param conn_param;
        u32 version;
-       int err, destroy = 1;
+       int err = 1, destroy = 1;
 
        /* Check whether the remote protocol version matches ours. */
        version = rds_ib_protocol_compatible(event);
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                        /* Wait and see - our connect may still be succeeding */
                        rds_ib_stats_inc(s_ib_connect_raced);
                }
-               mutex_unlock(&conn->c_cm_lock);
                goto out;
        }
 
@@ -475,24 +521,23 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
        err = rds_ib_setup_qp(conn);
        if (err) {
                rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
-               mutex_unlock(&conn->c_cm_lock);
                goto out;
        }
 
-       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+               event->param.conn.responder_resources,
+               event->param.conn.initiator_depth);
 
        /* rdma_accept() calls rdma_reject() internally if it fails */
        err = rdma_accept(cm_id, &conn_param);
-       mutex_unlock(&conn->c_cm_lock);
-       if (err) {
+       if (err)
                rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
-               goto out;
-       }
-
-       return 0;
 
 out:
-       rdma_reject(cm_id, NULL, 0);
+       if (conn)
+               mutex_unlock(&conn->c_cm_lock);
+       if (err)
+               rdma_reject(cm_id, NULL, 0);
        return destroy;
 }
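
The accept path above now funnels every outcome through a single exit: err starts at 1 so any early goto still rejects, c_cm_lock is dropped exactly once (and only if a connection was actually looked up), and only a successful rdma_accept() leaves err clear. A compact sketch of that control flow, with hypothetical stand-in names and the lock handling simplified:

#include <stdio.h>

/* Stand-ins for the real primitives; all names here are hypothetical. */
static int demo_accept(void)  { return 0; }
static void demo_unlock(void) { puts("unlock"); }
static void demo_reject(void) { puts("reject"); }

static int demo_handle_connect(int setup_fails)
{
        int err = 1;            /* "reject" until accept succeeds */

        if (setup_fails)
                goto out;       /* err still nonzero -> reject at out: */

        err = demo_accept();    /* 0 on success, nonzero on failure */
out:
        demo_unlock();          /* released exactly once */
        if (err)
                demo_reject();
        return 0;
}

int main(void)
{
        demo_handle_connect(1); /* unlock + reject */
        demo_handle_connect(0); /* unlock only */
        return 0;
}
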
 
@@ -516,8 +561,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
                goto out;
        }
 
-       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
+       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+               UINT_MAX, UINT_MAX);
        ret = rdma_connect(cm_id, &conn_param);
        if (ret)
                rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -601,9 +646,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
                                ic->i_cm_id, err);
                }
 
+               /*
+                * We want to wait for tx and rx completion to finish
+                * before we tear down the connection, but we have to be
+                * careful not to get stuck waiting on a send ring that
+                * only has unsignaled sends in it.  We've shutdown new
+                * sends before getting here so by waiting for signaled
+                * sends to complete we're ensured that there will be no
+                * more tx processing.
+                */
                wait_event(rds_ib_ring_empty_wait,
-                       rds_ib_ring_empty(&ic->i_send_ring) &&
-                       rds_ib_ring_empty(&ic->i_recv_ring));
+                          rds_ib_ring_empty(&ic->i_recv_ring) &&
+                          (atomic_read(&ic->i_signaled_sends) == 0));
+               tasklet_kill(&ic->i_recv_tasklet);
 
                if (ic->i_send_hdrs)
                        ib_dma_free_coherent(dev,
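
The shutdown path now waits for an empty receive ring and for i_signaled_sends to reach zero, exactly as the comment above describes, and only then kills the receive tasklet. The underlying pattern is an in-flight atomic counter paired with wait_event(): increment before posting a signaled send, decrement (and wake the waiter) in the completion handler. A hedged userspace sketch of the same idea, with C11 atomics and a busy-wait standing in for the kernel primitives:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* In-flight counter standing in for ic->i_signaled_sends. */
static atomic_int signaled_sends;

/* "Completion handler": one signaled send finished. */
static void *completion_thread(void *arg)
{
        (void)arg;
        sleep(1);                               /* pretend the HCA is working */
        atomic_fetch_sub(&signaled_sends, 1);   /* kernel: atomic_dec + wake_up */
        return NULL;
}

int main(void)
{
        pthread_t t;

        atomic_fetch_add(&signaled_sends, 1);   /* before posting a signaled send */
        pthread_create(&t, NULL, completion_thread, NULL);

        /* Crude stand-in for wait_event(..., atomic_read(...) == 0). */
        while (atomic_load(&signaled_sends) != 0)
                usleep(1000);

        puts("all signaled sends completed; safe to tear down");
        pthread_join(t, NULL);
        return 0;
}
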
@@ -654,9 +709,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
        BUG_ON(ic->rds_ibdev);
 
        /* Clear pending transmit */
-       if (ic->i_rm) {
-               rds_message_put(ic->i_rm);
-               ic->i_rm = NULL;
+       if (ic->i_data_op) {
+               struct rds_message *rm;
+
+               rm = container_of(ic->i_data_op, struct rds_message, data);
+               rds_message_put(rm);
+               ic->i_data_op = NULL;
        }
 
        /* Clear the ACK state */
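
Clearing a pending transmit now starts from the data op rather than an rds_message pointer: container_of() walks back from the embedded rm_data_op to its containing rds_message before dropping the reference (the exact member layout of struct rds_message is assumed here). A small standalone illustration of container_of() with toy types:

#include <stddef.h>
#include <stdio.h>

/* Userspace container_of(): recover the outer struct from a member pointer. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_data_op { int nents; };

struct demo_message {
        int refcount;
        struct demo_data_op data;       /* embedded op, like rm->data */
};

int main(void)
{
        struct demo_message msg = { .refcount = 1, .data = { .nents = 3 } };
        struct demo_data_op *op = &msg.data;

        /* Same shape as: rm = container_of(ic->i_data_op, struct rds_message, data); */
        struct demo_message *rm = container_of(op, struct demo_message, data);

        printf("refcount %d, nents %d\n", rm->refcount, rm->data.nents);
        return 0;
}
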
@@ -690,12 +748,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 {
        struct rds_ib_connection *ic;
        unsigned long flags;
+       int ret;
 
        /* XXX too lazy? */
        ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
-       if (ic == NULL)
+       if (!ic)
                return -ENOMEM;
 
+       ret = rds_ib_recv_alloc_caches(ic);
+       if (ret) {
+               kfree(ic);
+               return ret;
+       }
+
        INIT_LIST_HEAD(&ic->ib_node);
        tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
                     (unsigned long) ic);
@@ -703,6 +768,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 #ifndef KERNEL_HAS_ATOMIC64
        spin_lock_init(&ic->i_ack_lock);
 #endif
+       atomic_set(&ic->i_signaled_sends, 0);
 
        /*
         * rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -744,6 +810,8 @@ void rds_ib_conn_free(void *arg)
        list_del(&ic->ib_node);
        spin_unlock_irq(lock_ptr);
 
+       rds_ib_recv_free_caches(ic);
+
        kfree(ic);
 }