Merge branch 'nvme-4.12' of git://git.infradead.org/nvme into for-linus
author     Jens Axboe <axboe@fb.com>
           Thu, 8 Jun 2017 14:33:45 +0000 (08:33 -0600)
committer  Jens Axboe <axboe@fb.com>
           Thu, 8 Jun 2017 14:33:45 +0000 (08:33 -0600)
Christoph writes:

"A few NVMe fixes for 4.12-rc, PCIe reset fixes and APST fixes, a
 RDMA reconnect fix, two FC fixes and a general controller removal fix."

drivers/nvme/host/core.c
drivers/nvme/host/fc.c
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a60926410438b98c2e414de081f7c8093bac5862..903d5813023a93588c08857ff0db1339bbb99c86 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
 static int nvme_char_major;
 module_param(nvme_char_major, int, 0);
 
-static unsigned long default_ps_max_latency_us = 25000;
+static unsigned long default_ps_max_latency_us = 100000;
 module_param(default_ps_max_latency_us, ulong, 0644);
 MODULE_PARM_DESC(default_ps_max_latency_us,
                 "max power saving latency for new devices; use PM QOS to change per device");
@@ -1342,7 +1342,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
         * transitioning between power states.  Therefore, when running
         * in any given state, we will enter the next lower-power
         * non-operational state after waiting 50 * (enlat + exlat)
-        * microseconds, as long as that state's total latency is under
+        * microseconds, as long as that state's exit latency is under
         * the requested maximum latency.
         *
         * We will not autonomously enter any non-operational state for
@@ -1387,7 +1387,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
                 * lowest-power state, not the number of states.
                 */
                for (state = (int)ctrl->npss; state >= 0; state--) {
-                       u64 total_latency_us, transition_ms;
+                       u64 total_latency_us, exit_latency_us, transition_ms;
 
                        if (target)
                                table->entries[state] = target;
@@ -1408,12 +1408,15 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
                              NVME_PS_FLAGS_NON_OP_STATE))
                                continue;
 
-                       total_latency_us =
-                               (u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
-                               + le32_to_cpu(ctrl->psd[state].exit_lat);
-                       if (total_latency_us > ctrl->ps_max_latency_us)
+                       exit_latency_us =
+                               (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
+                       if (exit_latency_us > ctrl->ps_max_latency_us)
                                continue;
 
+                       total_latency_us =
+                               exit_latency_us +
+                               le32_to_cpu(ctrl->psd[state].entry_lat);
+
                        /*
                         * This state is good.  Use it as the APST idle
                         * target for higher power states.
@@ -2438,6 +2441,10 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
        struct nvme_ns *ns;
 
        mutex_lock(&ctrl->namespaces_mutex);
+
+       /* Forcibly start all queues to avoid having stuck requests */
+       blk_mq_start_hw_queues(ctrl->admin_q);
+
        list_for_each_entry(ns, &ctrl->namespaces, list) {
                /*
                 * Revalidating a dead namespace sets capacity to 0. This will
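
The forcible queue start above matters because a stopped queue never dispatches: requests parked on it cannot even reach the driver to be failed. A toy model of that behavior, using an illustrative queue struct rather than blk-mq:

/*
 * Toy model of why a dead controller's queues must be started: only a
 * started queue dispatches, and dispatch is where requests get failed.
 * The queue struct here is illustrative, not blk-mq.
 */
#include <stdio.h>
#include <stdbool.h>

struct queue {
	bool started;
	int pending;			/* requests waiting for dispatch */
};

static void dispatch(struct queue *q, bool ctrl_dead)
{
	if (!q->started)
		return;			/* stopped: nothing ever completes */
	while (q->pending) {
		q->pending--;
		printf(ctrl_dead ? "request failed fast\n"
				 : "request completed\n");
	}
}

int main(void)
{
	struct queue admin = { .started = false, .pending = 2 };

	dispatch(&admin, true);		/* requests stay stuck forever */
	admin.started = true;		/* the forcible start in the fix */
	dispatch(&admin, true);		/* now they fail instead of hanging */
	return 0;
}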
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 5b14cbefb7240d5e7d50bb1ade8fd958417282e8..92964cef0f4be5795bed3e874407c74a3e3cc725 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1139,6 +1139,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
 /* *********************** NVME Ctrl Routines **************************** */
 
 static void __nvme_fc_final_op_cleanup(struct request *rq);
+static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
 
 static int
 nvme_fc_reinit_request(void *data, struct request *rq)
@@ -1265,7 +1266,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
        struct nvme_command *sqe = &op->cmd_iu.sqe;
        __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
        union nvme_result result;
-       bool complete_rq;
+       bool complete_rq, terminate_assoc = true;
 
        /*
         * WARNING:
@@ -1294,6 +1295,14 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
         * fabricate a CQE, the following fields will not be set as they
         * are not referenced:
         *      cqe.sqid,  cqe.sqhd,  cqe.command_id
+        *
+        * Failure or error of an individual i/o, in a transport-
+        * detected fashion unrelated to the nvme completion status,
+        * can potentially cause the initiator and target sides to get out
+        * of sync on SQ head/tail (aka outstanding io count allowed).
+        * Per FC-NVME spec, failure of an individual command requires
+        * the connection to be terminated, which in turn requires the
+        * association to be terminated.
         */
 
        fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
@@ -1359,6 +1368,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
                goto done;
        }
 
+       terminate_assoc = false;
+
 done:
        if (op->flags & FCOP_FLAGS_AEN) {
                nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
@@ -1366,7 +1377,7 @@ done:
                atomic_set(&op->state, FCPOP_STATE_IDLE);
                op->flags = FCOP_FLAGS_AEN;     /* clear other flags */
                nvme_fc_ctrl_put(ctrl);
-               return;
+               goto check_error;
        }
 
        complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
@@ -1379,6 +1390,10 @@ done:
                nvme_end_request(rq, status, result);
        } else
                __nvme_fc_final_op_cleanup(rq);
+
+check_error:
+       if (terminate_assoc)
+               nvme_fc_error_recovery(ctrl, "transport detected io error");
 }
 
 static int
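
The completion path above is restructured around a pessimistic flag: terminate_assoc starts out true, is cleared only on the one fully-successful path, and is checked once all completion paths have converged on the common teardown. A compact sketch of that control-flow pattern, with illustrative names rather than the driver's:

/*
 * Sketch of the "assume failure, clear on the good path" pattern the
 * hunks above add; names and the error source are illustrative.
 */
#include <stdio.h>
#include <stdbool.h>

static void error_recovery(const char *errmsg)
{
	printf("recovery: %s\n", errmsg);	/* tear down the association */
}

static void fcpio_done(bool transport_error)
{
	bool terminate_assoc = true;	/* pessimistic default */

	if (transport_error)
		goto done;		/* error paths leave the flag set */

	/* ... validate the response, fabricate or copy the CQE ... */

	terminate_assoc = false;	/* only the good path clears it */

done:
	/* ... AEN handling and per-op teardown common to all paths ... */

	if (terminate_assoc)
		error_recovery("transport detected io error");
}

int main(void)
{
	fcpio_done(false);	/* clean completion: nothing to recover */
	fcpio_done(true);	/* transport error: association torn down */
	return 0;
}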
@@ -2791,6 +2806,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
                ctrl->ctrl.opts = NULL;
                /* initiate nvme ctrl ref counting teardown */
                nvme_uninit_ctrl(&ctrl->ctrl);
+               nvme_put_ctrl(&ctrl->ctrl);
 
                /* as we're past the point where we transition to the ref
                 * counting teardown path, if we return a bad pointer here,
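
The added nvme_put_ctrl() addresses a leak on this failure path: once initialization has passed the point where the controller is torn down by reference counting rather than by plain freeing, the failure path must drop the initial reference itself. A userspace sketch of the pattern, with kref-style names that are illustrative only:

/*
 * Sketch of the refcount fix: after teardown becomes refcount-driven,
 * the init-failure path owns the initial reference and must put it.
 */
#include <stdio.h>
#include <stdlib.h>

struct ctrl {
	int refs;
};

static void ctrl_put(struct ctrl *c)
{
	if (--c->refs == 0) {
		printf("ctrl freed\n");	/* the release callback */
		free(c);
	}
}

static struct ctrl *init_ctrl(void)
{
	struct ctrl *c = calloc(1, sizeof(*c));

	if (!c)
		return NULL;
	c->refs = 1;		/* initial reference taken at creation */

	/* ... setup fails after teardown becomes refcount-driven ... */

	/* uninit detaches the ctrl but does not drop the initial ref */
	ctrl_put(c);		/* the put the fix adds: last ref, frees */
	return NULL;		/* caller sees failure; nothing leaked */
}

int main(void)
{
	init_ctrl();
	return 0;
}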
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d52701df72457d0fa2b85a168c500fd022b8b717..951042a375d6b22dbd34988e38fef7114593c366 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1367,7 +1367,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
        bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
 
        /* If there is a reset ongoing, we shouldn't reset again. */
-       if (work_busy(&dev->reset_work))
+       if (dev->ctrl.state == NVME_CTRL_RESETTING)
                return false;
 
        /* We shouldn't reset unless the controller is on fatal error state
@@ -1903,7 +1903,7 @@ static void nvme_reset_work(struct work_struct *work)
        bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
        int result = -ENODEV;
 
-       if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
+       if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
                goto out;
 
        /*
@@ -1913,9 +1913,6 @@ static void nvme_reset_work(struct work_struct *work)
        if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
                nvme_dev_disable(dev, false);
 
-       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
-               goto out;
-
        result = nvme_pci_enable(dev);
        if (result)
                goto out;
@@ -2009,8 +2006,8 @@ static int nvme_reset(struct nvme_dev *dev)
 {
        if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
                return -ENODEV;
-       if (work_busy(&dev->reset_work))
-               return -ENODEV;
+       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
+               return -EBUSY;
        if (!queue_work(nvme_workq, &dev->reset_work))
                return -EBUSY;
        return 0;
@@ -2136,6 +2133,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (result)
                goto release_pools;
 
+       nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
        dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
        queue_work(nvme_workq, &dev->reset_work);
@@ -2179,6 +2177,7 @@ static void nvme_remove(struct pci_dev *pdev)
 
        nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 
+       cancel_work_sync(&dev->reset_work);
        pci_set_drvdata(pdev, NULL);
 
        if (!pci_device_is_present(pdev)) {
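
Taken together, the pci.c hunks replace the work_busy() checks with a single admission gate: winning the transition to NVME_CTRL_RESETTING is what authorizes queueing reset_work, the work itself now warns if it runs in any other state, probe pre-sets the state before queueing the first reset, and remove cancels the work synchronously. work_busy() races with a just-finished work item; an atomic state transition cannot. A standalone sketch of the gating idea, with hypothetical names rather than the driver's nvme_change_ctrl_state():

/*
 * Sketch of state-transition-as-gate reset serialization; names are
 * hypothetical and the state machine is reduced to two states.
 */
#include <stdio.h>
#include <errno.h>
#include <stdatomic.h>

enum ctrl_state { CTRL_LIVE, CTRL_RESETTING };

static _Atomic enum ctrl_state state = CTRL_LIVE;

/* only the caller that wins LIVE -> RESETTING may queue the work */
static int start_reset(void)
{
	enum ctrl_state old = CTRL_LIVE;

	if (!atomic_compare_exchange_strong(&state, &old, CTRL_RESETTING))
		return -EBUSY;
	/* ... queue the reset work here ... */
	return 0;
}

static void reset_work(void)
{
	/* the work now expects RESETTING (hence the inverted WARN_ON) */
	if (atomic_load(&state) != CTRL_RESETTING)
		return;
	/* ... disable, re-enable and reinitialize the device ... */
	atomic_store(&state, CTRL_LIVE);
}

int main(void)
{
	printf("first:  %d\n", start_reset());	/* 0: wins the transition */
	printf("second: %d\n", start_reset());	/* -EBUSY: reset in flight */
	reset_work();
	printf("third:  %d\n", start_reset());	/* 0: controller live again */
	return 0;
}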
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 28bd255c144dcca10aa60cede2c9a51cd101426a..24397d306d532213cf66e1ca0de9aa43bf12d3d5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -753,28 +753,26 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
        if (ret)
                goto requeue;
 
-       blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
-
        ret = nvmf_connect_admin_queue(&ctrl->ctrl);
        if (ret)
-               goto stop_admin_q;
+               goto requeue;
 
        set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
 
        ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
        if (ret)
-               goto stop_admin_q;
+               goto requeue;
 
        nvme_start_keep_alive(&ctrl->ctrl);
 
        if (ctrl->queue_count > 1) {
                ret = nvme_rdma_init_io_queues(ctrl);
                if (ret)
-                       goto stop_admin_q;
+                       goto requeue;
 
                ret = nvme_rdma_connect_io_queues(ctrl);
                if (ret)
-                       goto stop_admin_q;
+                       goto requeue;
        }
 
        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
@@ -782,7 +780,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
        ctrl->ctrl.opts->nr_reconnects = 0;
 
        if (ctrl->queue_count > 1) {
-               nvme_start_queues(&ctrl->ctrl);
                nvme_queue_scan(&ctrl->ctrl);
                nvme_queue_async_events(&ctrl->ctrl);
        }
@@ -791,8 +788,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 
        return;
 
-stop_admin_q:
-       blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
 requeue:
        dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
                        ctrl->ctrl.opts->nr_reconnects);
@@ -823,6 +818,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_cancel_request, &ctrl->ctrl);
 
+       /*
+        * The queues aren't live anymore, so restart them to fail new
+        * IO fast.
+        */
+       blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
+       nvme_start_queues(&ctrl->ctrl);
+
        nvme_rdma_reconnect_or_remove(ctrl);
 }
 
@@ -1433,7 +1435,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
 /*
  * We cannot accept any other command until the Connect command has completed.
  */
-static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
+static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
                struct request *rq)
 {
        if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
@@ -1441,11 +1443,22 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
 
                if (!blk_rq_is_passthrough(rq) ||
                    cmd->common.opcode != nvme_fabrics_command ||
-                   cmd->fabrics.fctype != nvme_fabrics_type_connect)
-                       return false;
+                   cmd->fabrics.fctype != nvme_fabrics_type_connect) {
+                       /*
+                        * reconnecting state means transport disruption, which
+                        * can take a long time and might even fail permanently,
+                        * so we can't let incoming I/O be requeued forever.
+                        * Fail it fast to give upper layers a chance to
+                        * fail over.
+                        */
+                       if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING)
+                               return -EIO;
+                       else
+                               return -EAGAIN;
+               }
        }
 
-       return true;
+       return 0;
 }
 
 static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1463,8 +1476,9 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 
        WARN_ON_ONCE(rq->tag < 0);
 
-       if (!nvme_rdma_queue_is_ready(queue, rq))
-               return BLK_MQ_RQ_QUEUE_BUSY;
+       ret = nvme_rdma_queue_is_ready(queue, rq);
+       if (unlikely(ret))
+               goto err;
 
        dev = queue->device->dev;
        ib_dma_sync_single_for_cpu(dev, sqe->dma,
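
The rdma.c changes convert the readiness check from a bool into an errno so the submission path can distinguish "requeue and retry later" from "fail now so upper layers can fail over". A compact sketch of that policy; the dispatch mapping in queue_rq() below is illustrative, since the diff truncates the driver's actual error path:

/*
 * Sketch of the fail-fast policy: the readiness check returns an errno
 * and the submission path maps it to requeue vs. hard failure.  All
 * names and the dispatch enum are illustrative, not the driver's.
 */
#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

enum dispatch { QUEUE_OK, QUEUE_BUSY, QUEUE_ERROR };

static int queue_is_ready(bool live, bool connect_cmd, bool reconnecting)
{
	if (live || connect_cmd)
		return 0;	/* Connect is allowed before the queue is live */
	/*
	 * Reconnecting can take a long time and may fail permanently,
	 * so don't requeue forever: fail fast to allow failover.
	 */
	return reconnecting ? -EIO : -EAGAIN;
}

static enum dispatch queue_rq(bool live, bool connect_cmd, bool reconnecting)
{
	int ret = queue_is_ready(live, connect_cmd, reconnecting);

	if (!ret)
		return QUEUE_OK;		/* map and post the command */
	return ret == -EAGAIN ? QUEUE_BUSY : QUEUE_ERROR;
}

int main(void)
{
	printf("%d\n", queue_rq(true,  false, false));	/* 0: dispatched */
	printf("%d\n", queue_rq(false, false, false));	/* 1: requeued */
	printf("%d\n", queue_rq(false, false, true));	/* 2: failed fast */
	return 0;
}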