nvme-rdma: default MR page size to 4k
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 6d4119dfbdaacf5df18c41bf92d0711c2e074bbf..a03299d779229de271eb28704a73515f1020ffe1 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -86,7 +86,7 @@ enum nvme_rdma_queue_flags {
 
 struct nvme_rdma_queue {
        struct nvme_rdma_qe     *rsp_ring;
-       u8                      sig_count;
+       atomic_t                sig_count;
        int                     queue_size;
        size_t                  cmnd_capsule_len;
        struct nvme_rdma_ctrl   *ctrl;
@@ -103,7 +103,6 @@ struct nvme_rdma_queue {
 struct nvme_rdma_ctrl {
        /* read only in the hot path */
        struct nvme_rdma_queue  *queues;
-       u32                     queue_count;
 
        /* other member variables */
        struct blk_mq_tag_set   tag_set;
@@ -119,7 +118,6 @@ struct nvme_rdma_ctrl {
        struct blk_mq_tag_set   admin_tag_set;
        struct nvme_rdma_device *device;
 
-       u64                     cap;
        u32                     max_fr_pages;
 
        struct sockaddr_storage addr;
@@ -274,9 +272,6 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq)
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
        int ret = 0;
 
-       if (!req->mr->need_inval)
-               goto out;
-
        ib_dereg_mr(req->mr);
 
        req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
@@ -349,7 +344,7 @@ static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
        struct nvme_rdma_ctrl *ctrl = data;
        struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
 
-       BUG_ON(hctx_idx >= ctrl->queue_count);
+       BUG_ON(hctx_idx >= ctrl->ctrl.queue_count);
 
        hctx->driver_data = queue;
        return 0;
@@ -525,6 +520,7 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
                queue->cmnd_capsule_len = sizeof(struct nvme_command);
 
        queue->queue_size = queue_size;
+       atomic_set(&queue->sig_count, 0);
 
        queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
                        RDMA_PS_TCP, IB_QPT_RC);
@@ -587,7 +583,7 @@ static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
 {
        int i;
 
-       for (i = 1; i < ctrl->queue_count; i++)
+       for (i = 1; i < ctrl->ctrl.queue_count; i++)
                nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
 }
 
@@ -595,7 +591,7 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl)
 {
        int i, ret = 0;
 
-       for (i = 1; i < ctrl->queue_count; i++) {
+       for (i = 1; i < ctrl->ctrl.queue_count; i++) {
                ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
                if (ret) {
                        dev_info(ctrl->ctrl.device,
@@ -623,14 +619,14 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
        if (ret)
                return ret;
 
-       ctrl->queue_count = nr_io_queues + 1;
-       if (ctrl->queue_count < 2)
+       ctrl->ctrl.queue_count = nr_io_queues + 1;
+       if (ctrl->ctrl.queue_count < 2)
                return 0;
 
        dev_info(ctrl->ctrl.device,
                "creating %d I/O queues.\n", nr_io_queues);
 
-       for (i = 1; i < ctrl->queue_count; i++) {
+       for (i = 1; i < ctrl->ctrl.queue_count; i++) {
                ret = nvme_rdma_init_queue(ctrl, i,
                                           ctrl->ctrl.opts->queue_size);
                if (ret) {
@@ -705,7 +701,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 
        ++ctrl->ctrl.nr_reconnects;
 
-       if (ctrl->queue_count > 1) {
+       if (ctrl->ctrl.queue_count > 1) {
                nvme_rdma_free_io_queues(ctrl);
 
                ret = blk_mq_reinit_tagset(&ctrl->tag_set);
@@ -729,13 +725,11 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 
        set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
 
-       ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
+       ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
        if (ret)
                goto requeue;
 
-       nvme_start_keep_alive(&ctrl->ctrl);
-
-       if (ctrl->queue_count > 1) {
+       if (ctrl->ctrl.queue_count > 1) {
                ret = nvme_rdma_init_io_queues(ctrl);
                if (ret)
                        goto requeue;
@@ -743,16 +737,16 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
                ret = nvme_rdma_connect_io_queues(ctrl);
                if (ret)
                        goto requeue;
+
+               blk_mq_update_nr_hw_queues(&ctrl->tag_set,
+                               ctrl->ctrl.queue_count - 1);
        }
 
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);
        ctrl->ctrl.nr_reconnects = 0;
 
-       if (ctrl->queue_count > 1) {
-               nvme_queue_scan(&ctrl->ctrl);
-               nvme_queue_async_events(&ctrl->ctrl);
-       }
+       nvme_start_ctrl(&ctrl->ctrl);
 
        dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
 
@@ -770,17 +764,17 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
                        struct nvme_rdma_ctrl, err_work);
        int i;
 
-       nvme_stop_keep_alive(&ctrl->ctrl);
+       nvme_stop_ctrl(&ctrl->ctrl);
 
-       for (i = 0; i < ctrl->queue_count; i++)
+       for (i = 0; i < ctrl->ctrl.queue_count; i++)
                clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags);
 
-       if (ctrl->queue_count > 1)
+       if (ctrl->ctrl.queue_count > 1)
                nvme_stop_queues(&ctrl->ctrl);
-       blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
 
        /* We must take care of fastfail/requeue all our inflight requests */
-       if (ctrl->queue_count > 1)
+       if (ctrl->ctrl.queue_count > 1)
                blk_mq_tagset_busy_iter(&ctrl->tag_set,
                                        nvme_cancel_request, &ctrl->ctrl);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
@@ -790,7 +784,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
         * queues are not alive anymore, so restart the queues to fail fast
         * new IO
         */
-       blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
+       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
        nvme_start_queues(&ctrl->ctrl);
 
        nvme_rdma_reconnect_or_remove(ctrl);
@@ -926,7 +920,11 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
        int nr;
 
-       nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
+       /*
+        * Align the MR to a 4K page size to match the ctrl page size and
+        * the block virtual boundary.
+        */
+       nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
        if (nr < count) {
                if (nr < 0)
                        return nr;
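
For reference, a minimal standalone C sketch of the 4k arithmetic used in this hunk and in the max_hw_sectors hunk further down; ilog2_u32() is a userspace stand-in for the kernel's ilog2(), SZ_4K is defined locally, and the max_fr_pages value is only an example:

#include <stdio.h>

/* Standalone stand-ins for the kernel's SZ_4K and ilog2() helpers. */
#define SZ_4K 4096u

static unsigned int ilog2_u32(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;	/* floor(log2(v)) for v >= 1 */
}

int main(void)
{
	unsigned int max_fr_pages = 256;	/* example MR page capacity */

	/*
	 * With the MR mapped at a fixed 4k granularity, each fast-reg MR
	 * page covers 4096 / 512 = 8 sectors, i.e. a shift of
	 * ilog2(SZ_4K) - 9 = 3.  One page is typically kept as slack for
	 * a buffer that does not start page-aligned, hence the
	 * (max_fr_pages - 1) in the driver.
	 */
	unsigned int max_hw_sectors = (max_fr_pages - 1) << (ilog2_u32(SZ_4K) - 9);

	printf("max_hw_sectors = %u (%u KiB per I/O)\n",
	       max_hw_sectors, max_hw_sectors / 2);
	return 0;
}
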
@@ -1008,17 +1006,16 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
                nvme_rdma_wr_error(cq, wc, "SEND");
 }
 
-static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
+/*
+ * We want to signal completion at least every queue depth/2.  This returns the
+ * largest power of two that is not above half of (queue size + 1) to optimize
+ * (avoid divisions).
+ */
+static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue)
 {
-       int sig_limit;
+       int limit = 1 << ilog2((queue->queue_size + 1) / 2);
 
-       /*
-        * We signal completion every queue depth/2 and also handle the
-        * degenerated case of a  device with queue_depth=1, where we
-        * would need to signal every message.
-        */
-       sig_limit = max(queue->queue_size / 2, 1);
-       return (++queue->sig_count % sig_limit) == 0;
+       return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0;
 }
 
 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
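
The comment above spells out the division-free signalling heuristic; below is a minimal standalone C sketch of the same mask-based logic, with sig_limit(), ilog2_u32() and a C11 atomic standing in for the kernel's nvme_rdma_queue_sig_limit(), ilog2() and atomic_inc_return():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned int ilog2_u32(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

/*
 * Signal roughly every queue_size/2 submissions: "limit" is the largest
 * power of two not above (queue_size + 1) / 2, so the modulo in the old
 * code becomes a simple mask of the running counter.
 */
static bool sig_limit(atomic_uint *sig_count, int queue_size)
{
	unsigned int limit = 1u << ilog2_u32((queue_size + 1) / 2);

	/* atomic_fetch_add returns the old value; +1 mirrors atomic_inc_return. */
	return ((atomic_fetch_add(sig_count, 1) + 1) & (limit - 1)) == 0;
}

int main(void)
{
	atomic_uint count = 0;
	int queue_size = 128, signalled = 0;

	/* With queue_size = 128, limit = 64: every 64th send is signalled. */
	for (int i = 0; i < 1024; i++)
		signalled += sig_limit(&count, queue_size);
	printf("signalled %d of 1024 sends\n", signalled);

	/*
	 * queue_size = 1 gives limit = 1 (mask 0), so every send is
	 * signalled, covering the degenerate case the old modulo code
	 * handled explicitly.
	 */
	return 0;
}
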
@@ -1574,7 +1571,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
 
        set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
 
-       error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
+       error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP,
+                       &ctrl->ctrl.cap);
        if (error) {
                dev_err(ctrl->ctrl.device,
                        "prop_get NVME_REG_CAP failed\n");
@@ -1582,14 +1580,14 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
        }
 
        ctrl->ctrl.sqsize =
-               min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize);
+               min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
 
-       error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
+       error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
        if (error)
                goto out_cleanup_queue;
 
        ctrl->ctrl.max_hw_sectors =
-               (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
+               (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
 
        error = nvme_init_identify(&ctrl->ctrl);
        if (error)
@@ -1601,8 +1599,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
        if (error)
                goto out_cleanup_queue;
 
-       nvme_start_keep_alive(&ctrl->ctrl);
-
        return 0;
 
 out_cleanup_queue:
@@ -1620,11 +1616,10 @@ out_free_queue:
 
 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
 {
-       nvme_stop_keep_alive(&ctrl->ctrl);
        cancel_work_sync(&ctrl->err_work);
        cancel_delayed_work_sync(&ctrl->reconnect_work);
 
-       if (ctrl->queue_count > 1) {
+       if (ctrl->ctrl.queue_count > 1) {
                nvme_stop_queues(&ctrl->ctrl);
                blk_mq_tagset_busy_iter(&ctrl->tag_set,
                                        nvme_cancel_request, &ctrl->ctrl);
@@ -1634,18 +1629,21 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
        if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags))
                nvme_shutdown_ctrl(&ctrl->ctrl);
 
-       blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+       blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_cancel_request, &ctrl->ctrl);
+       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
        nvme_rdma_destroy_admin_queue(ctrl);
 }
 
 static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 {
-       nvme_uninit_ctrl(&ctrl->ctrl);
+       nvme_stop_ctrl(&ctrl->ctrl);
+       nvme_remove_namespaces(&ctrl->ctrl);
        if (shutdown)
                nvme_rdma_shutdown_ctrl(ctrl);
 
+       nvme_uninit_ctrl(&ctrl->ctrl);
        if (ctrl->ctrl.tagset) {
                blk_cleanup_queue(ctrl->ctrl.connect_q);
                blk_mq_free_tag_set(&ctrl->tag_set);
@@ -1707,6 +1705,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
        int ret;
        bool changed;
 
+       nvme_stop_ctrl(&ctrl->ctrl);
        nvme_rdma_shutdown_ctrl(ctrl);
 
        ret = nvme_rdma_configure_admin_queue(ctrl);
@@ -1716,7 +1715,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
                goto del_dead_ctrl;
        }
 
-       if (ctrl->queue_count > 1) {
+       if (ctrl->ctrl.queue_count > 1) {
                ret = blk_mq_reinit_tagset(&ctrl->tag_set);
                if (ret)
                        goto del_dead_ctrl;
@@ -1728,16 +1727,15 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
                ret = nvme_rdma_connect_io_queues(ctrl);
                if (ret)
                        goto del_dead_ctrl;
+
+               blk_mq_update_nr_hw_queues(&ctrl->tag_set,
+                               ctrl->ctrl.queue_count - 1);
        }
 
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);
 
-       if (ctrl->queue_count > 1) {
-               nvme_start_queues(&ctrl->ctrl);
-               nvme_queue_scan(&ctrl->ctrl);
-               nvme_queue_async_events(&ctrl->ctrl);
-       }
+       nvme_start_ctrl(&ctrl->ctrl);
 
        return;
 
@@ -1785,7 +1783,7 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
        ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
                SG_CHUNK_SIZE * sizeof(struct scatterlist);
        ctrl->tag_set.driver_data = ctrl;
-       ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
+       ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1;
        ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
 
        ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
@@ -1863,12 +1861,12 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
        INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
        INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
 
-       ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
+       ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
        ctrl->ctrl.sqsize = opts->queue_size - 1;
        ctrl->ctrl.kato = opts->kato;
 
        ret = -ENOMEM;
-       ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues),
+       ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
                                GFP_KERNEL);
        if (!ctrl->queues)
                goto out_uninit_ctrl;
@@ -1925,15 +1923,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
        mutex_unlock(&nvme_rdma_ctrl_mutex);
 
-       if (opts->nr_io_queues) {
-               nvme_queue_scan(&ctrl->ctrl);
-               nvme_queue_async_events(&ctrl->ctrl);
-       }
+       nvme_start_ctrl(&ctrl->ctrl);
 
        return &ctrl->ctrl;
 
 out_remove_admin_queue:
-       nvme_stop_keep_alive(&ctrl->ctrl);
        nvme_rdma_destroy_admin_queue(ctrl);
 out_kfree_queues:
        kfree(ctrl->queues);