NVMe: check for integer overflow in nvme_map_user_pages()
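
The guard added to nvme_map_user_pages() (see the hunk around old line 1079 below) rejects user buffer lengths large enough that the offset-plus-length page count, computed in a signed int, could overflow. A minimal standalone sketch of the same bound check follows; sketch_validate_length() and SKETCH_PAGE_SIZE are hypothetical names used only for illustration, assuming a 4 KiB page size.

#include <limits.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL	/* assumed page size for this sketch */

/*
 * Hypothetical helper mirroring the bound check added to
 * nvme_map_user_pages(): a zero length, or one so large that adding the
 * intra-page offset and rounding up to whole pages could overflow the
 * driver's signed page count, is rejected with -EINVAL (-22).
 */
static int sketch_validate_length(unsigned long length)
{
	if (!length || length > INT_MAX - SKETCH_PAGE_SIZE)
		return -22;
	return 0;
}

int main(void)
{
	printf("%d\n", sketch_validate_length(0));			/* -22 */
	printf("%d\n", sketch_validate_length(8192));			/* 0   */
	printf("%d\n", sketch_validate_length((unsigned long)INT_MAX));	/* -22 */
	return 0;
}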
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 391a874e41330bb6a2cbf347070c787b671f0f43..437637551d1e0eb0152398ec5157752cda305603 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -308,16 +308,6 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
        kfree(iod);
 }
 
-static void requeue_bio(struct nvme_dev *dev, struct bio *bio)
-{
-       struct nvme_queue *nvmeq = get_nvmeq(dev);
-       if (bio_list_empty(&nvmeq->sq_cong))
-               add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
-       bio_list_add(&nvmeq->sq_cong, bio);
-       put_nvmeq(nvmeq);
-       wake_up_process(nvme_thread);
-}
-
 static void bio_completion(struct nvme_dev *dev, void *ctx,
                                                struct nvme_completion *cqe)
 {
@@ -329,13 +319,10 @@ static void bio_completion(struct nvme_dev *dev, void *ctx,
                dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
                        bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
        nvme_free_iod(dev, iod);
-       if (status) {
+       if (status)
                bio_endio(bio, -EIO);
-       } else if (bio->bi_vcnt > bio->bi_idx) {
-               requeue_bio(dev, bio);
-       } else {
+       else
                bio_endio(bio, 0);
-       }
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
@@ -419,40 +406,152 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
        return total_len;
 }
 
+struct nvme_bio_pair {
+       struct bio b1, b2, *parent;
+       struct bio_vec *bv1, *bv2;
+       int err;
+       atomic_t cnt;
+};
+
+static void nvme_bio_pair_endio(struct bio *bio, int err)
+{
+       struct nvme_bio_pair *bp = bio->bi_private;
+
+       if (err)
+               bp->err = err;
+
+       if (atomic_dec_and_test(&bp->cnt)) {
+               bio_endio(bp->parent, bp->err);
+               if (bp->bv1)
+                       kfree(bp->bv1);
+               if (bp->bv2)
+                       kfree(bp->bv2);
+               kfree(bp);
+       }
+}
+
+static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx,
+                                                       int len, int offset)
+{
+       struct nvme_bio_pair *bp;
+
+       BUG_ON(len > bio->bi_size);
+       BUG_ON(idx > bio->bi_vcnt);
+
+       bp = kmalloc(sizeof(*bp), GFP_ATOMIC);
+       if (!bp)
+               return NULL;
+       bp->err = 0;
+
+       bp->b1 = *bio;
+       bp->b2 = *bio;
+
+       bp->b1.bi_size = len;
+       bp->b2.bi_size -= len;
+       bp->b1.bi_vcnt = idx;
+       bp->b2.bi_idx = idx;
+       bp->b2.bi_sector += len >> 9;
+
+       if (offset) {
+               bp->bv1 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec),
+                                                               GFP_ATOMIC);
+               if (!bp->bv1)
+                       goto split_fail_1;
+
+               bp->bv2 = kmalloc(bio->bi_max_vecs * sizeof(struct bio_vec),
+                                                               GFP_ATOMIC);
+               if (!bp->bv2)
+                       goto split_fail_2;
+
+               memcpy(bp->bv1, bio->bi_io_vec,
+                       bio->bi_max_vecs * sizeof(struct bio_vec));
+               memcpy(bp->bv2, bio->bi_io_vec,
+                       bio->bi_max_vecs * sizeof(struct bio_vec));
+
+               bp->b1.bi_io_vec = bp->bv1;
+               bp->b2.bi_io_vec = bp->bv2;
+               bp->b2.bi_io_vec[idx].bv_offset += offset;
+               bp->b2.bi_io_vec[idx].bv_len -= offset;
+               bp->b1.bi_io_vec[idx].bv_len = offset;
+               bp->b1.bi_vcnt++;
+       } else
+               bp->bv1 = bp->bv2 = NULL;
+
+       bp->b1.bi_private = bp;
+       bp->b2.bi_private = bp;
+
+       bp->b1.bi_end_io = nvme_bio_pair_endio;
+       bp->b2.bi_end_io = nvme_bio_pair_endio;
+
+       bp->parent = bio;
+       atomic_set(&bp->cnt, 2);
+
+       return bp;
+
+ split_fail_2:
+       kfree(bp->bv1);
+ split_fail_1:
+       kfree(bp);
+       return NULL;
+}
+
+static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
+                                               int idx, int len, int offset)
+{
+       struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset);
+       if (!bp)
+               return -ENOMEM;
+
+       if (bio_list_empty(&nvmeq->sq_cong))
+               add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
+       bio_list_add(&nvmeq->sq_cong, &bp->b1);
+       bio_list_add(&nvmeq->sq_cong, &bp->b2);
+
+       return 0;
+}
+
 /* NVMe scatterlists require no holes in the virtual address */
 #define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)  ((vec2)->bv_offset || \
                        (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
 
-static int nvme_map_bio(struct device *dev, struct nvme_iod *iod,
+static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
                struct bio *bio, enum dma_data_direction dma_dir, int psegs)
 {
        struct bio_vec *bvec, *bvprv = NULL;
        struct scatterlist *sg = NULL;
-       int i, old_idx, length = 0, nsegs = 0;
+       int i, length = 0, nsegs = 0, split_len = bio->bi_size;
+
+       if (nvmeq->dev->stripe_size)
+               split_len = nvmeq->dev->stripe_size -
+                       ((bio->bi_sector << 9) & (nvmeq->dev->stripe_size - 1));
 
        sg_init_table(iod->sg, psegs);
-       old_idx = bio->bi_idx;
        bio_for_each_segment(bvec, bio, i) {
                if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) {
                        sg->length += bvec->bv_len;
                } else {
                        if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec))
-                               break;
+                               return nvme_split_and_submit(bio, nvmeq, i,
+                                                               length, 0);
+
                        sg = sg ? sg + 1 : iod->sg;
                        sg_set_page(sg, bvec->bv_page, bvec->bv_len,
                                                        bvec->bv_offset);
                        nsegs++;
                }
+
+               if (split_len - length < bvec->bv_len)
+                       return nvme_split_and_submit(bio, nvmeq, i, split_len,
+                                                       split_len - length);
                length += bvec->bv_len;
                bvprv = bvec;
        }
-       bio->bi_idx = i;
        iod->nents = nsegs;
        sg_mark_end(sg);
-       if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) {
-               bio->bi_idx = old_idx;
+       if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
                return -ENOMEM;
-       }
+
+       BUG_ON(length != bio->bi_size);
        return length;
 }
 
@@ -581,8 +680,8 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                dma_dir = DMA_FROM_DEVICE;
        }
 
-       result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs);
-       if (result < 0)
+       result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
+       if (result <= 0)
                goto free_cmdid;
        length = result;
 
@@ -595,8 +694,6 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
        cmnd->rw.control = cpu_to_le16(control);
        cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
 
-       bio->bi_sector += length >> 9;
-
        if (++nvmeq->sq_tail == nvmeq->q_depth)
                nvmeq->sq_tail = 0;
        writel(nvmeq->sq_tail, nvmeq->q_db);
@@ -731,7 +828,7 @@ int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
 
        set_current_state(TASK_KILLABLE);
        nvme_submit_cmd(nvmeq, cmd);
-       schedule();
+       schedule_timeout(timeout);
 
        if (cmdinfo.status == -EINTR) {
                nvme_abort_command(nvmeq, cmdid);
@@ -956,7 +1053,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
        return nvmeq;
 
  free_cqdma:
-       dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes,
+       dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
                                                        nvmeq->cq_dma_addr);
  free_nvmeq:
        kfree(nvmeq);
@@ -1011,15 +1108,60 @@ static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid,
        return ERR_PTR(result);
 }
 
+static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
+{
+       unsigned long timeout;
+       u32 bit = enabled ? NVME_CSTS_RDY : 0;
+
+       timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+
+       while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
+               msleep(100);
+               if (fatal_signal_pending(current))
+                       return -EINTR;
+               if (time_after(jiffies, timeout)) {
+                       dev_err(&dev->pci_dev->dev,
+                               "Device not ready; aborting initialisation\n");
+                       return -ENODEV;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * If the device has been passed off to us in an enabled state, just clear
+ * the enabled bit.  The spec says we should set the 'shutdown notification
+ * bits', but doing so may cause the device to complete commands to the
+ * admin queue ... and we don't know what memory that might be pointing at!
+ */
+static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
+{
+       u32 cc = readl(&dev->bar->cc);
+
+       if (cc & NVME_CC_ENABLE)
+               writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc);
+       return nvme_wait_ready(dev, cap, false);
+}
+
+static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
+{
+       return nvme_wait_ready(dev, cap, true);
+}
+
 static int nvme_configure_admin_queue(struct nvme_dev *dev)
 {
-       int result = 0;
+       int result;
        u32 aqa;
-       u64 cap;
-       unsigned long timeout;
+       u64 cap = readq(&dev->bar->cap);
        struct nvme_queue *nvmeq;
 
        dev->dbs = ((void __iomem *)dev->bar) + 4096;
+       dev->db_stride = NVME_CAP_STRIDE(cap);
+
+       result = nvme_disable_ctrl(dev, cap);
+       if (result < 0)
+               return result;
 
        nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
        if (!nvmeq)
@@ -1033,27 +1175,12 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
        dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
        dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
 
-       writel(0, &dev->bar->cc);
        writel(aqa, &dev->bar->aqa);
        writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
        writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
        writel(dev->ctrl_config, &dev->bar->cc);
 
-       cap = readq(&dev->bar->cap);
-       timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
-       dev->db_stride = NVME_CAP_STRIDE(cap);
-
-       while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
-               msleep(100);
-               if (fatal_signal_pending(current))
-                       result = -EINTR;
-               if (time_after(jiffies, timeout)) {
-                       dev_err(&dev->pci_dev->dev,
-                               "Device not ready; aborting initialisation\n");
-                       result = -ENODEV;
-               }
-       }
-
+       result = nvme_enable_ctrl(dev, cap);
        if (result)
                goto free_q;
 
@@ -1079,7 +1206,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 
        if (addr & 3)
                return ERR_PTR(-EINVAL);
-       if (!length)
+       if (!length || length > INT_MAX - PAGE_SIZE)
                return ERR_PTR(-EINVAL);
 
        offset = offset_in_page(addr);
@@ -1100,7 +1227,8 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
        sg_init_table(sg, count);
        for (i = 0; i < count; i++) {
                sg_set_page(&sg[i], pages[i],
-                               min_t(int, length, PAGE_SIZE - offset), offset);
+                           min_t(unsigned, length, PAGE_SIZE - offset),
+                           offset);
                length -= (PAGE_SIZE - offset);
                offset = 0;
        }
@@ -1143,13 +1271,19 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        struct nvme_queue *nvmeq;
        struct nvme_user_io io;
        struct nvme_command c;
-       unsigned length;
-       int status;
-       struct nvme_iod *iod;
+       unsigned length, meta_len;
+       int status, i;
+       struct nvme_iod *iod, *meta_iod = NULL;
+       dma_addr_t meta_dma_addr;
+       void *meta, *uninitialized_var(meta_mem);
 
        if (copy_from_user(&io, uio, sizeof(io)))
                return -EFAULT;
        length = (io.nblocks + 1) << ns->lba_shift;
+       meta_len = (io.nblocks + 1) * ns->ms;
+
+       if (meta_len && ((io.metadata & 3) || !io.metadata))
+               return -EINVAL;
 
        switch (io.opcode) {
        case nvme_cmd_write:
@@ -1175,7 +1309,38 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        c.rw.reftag = cpu_to_le32(io.reftag);
        c.rw.apptag = cpu_to_le16(io.apptag);
        c.rw.appmask = cpu_to_le16(io.appmask);
-       /* XXX: metadata */
+
+       if (meta_len) {
+               meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, meta_len);
+               if (IS_ERR(meta_iod)) {
+                       status = PTR_ERR(meta_iod);
+                       meta_iod = NULL;
+                       goto unmap;
+               }
+
+               meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
+                                               &meta_dma_addr, GFP_KERNEL);
+               if (!meta_mem) {
+                       status = -ENOMEM;
+                       goto unmap;
+               }
+
+               if (io.opcode & 1) {
+                       int meta_offset = 0;
+
+                       for (i = 0; i < meta_iod->nents; i++) {
+                               meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
+                                               meta_iod->sg[i].offset;
+                               memcpy(meta_mem + meta_offset, meta,
+                                               meta_iod->sg[i].length);
+                               kunmap_atomic(meta);
+                               meta_offset += meta_iod->sg[i].length;
+                       }
+               }
+
+               c.rw.metadata = cpu_to_le64(meta_dma_addr);
+       }
+
        length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
 
        nvmeq = get_nvmeq(dev);
@@ -1191,8 +1356,33 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
        else
                status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
 
+       if (meta_len) {
+               if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
+                       int meta_offset = 0;
+
+                       for (i = 0; i < meta_iod->nents; i++) {
+                               meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
+                                               meta_iod->sg[i].offset;
+                               memcpy(meta, meta_mem + meta_offset,
+                                               meta_iod->sg[i].length);
+                               kunmap_atomic(meta);
+                               meta_offset += meta_iod->sg[i].length;
+                       }
+               }
+
+               dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem,
+                                                               meta_dma_addr);
+       }
+
+ unmap:
        nvme_unmap_user_pages(dev, io.opcode & 1, iod);
        nvme_free_iod(dev, iod);
+
+       if (meta_iod) {
+               nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod);
+               nvme_free_iod(dev, meta_iod);
+       }
+
        return status;
 }
 
@@ -1203,6 +1393,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
        struct nvme_command c;
        int status, length;
        struct nvme_iod *uninitialized_var(iod);
+       unsigned timeout;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
@@ -1232,10 +1423,13 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
                                                                GFP_KERNEL);
        }
 
+       timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
+                                                               ADMIN_TIMEOUT;
        if (length != cmd.data_len)
                status = -ENOMEM;
        else
-               status = nvme_submit_admin_cmd(dev, &c, &cmd.result);
+               status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result,
+                                                               timeout);
 
        if (cmd.data_len) {
                nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
@@ -1281,13 +1475,17 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
        while (bio_list_peek(&nvmeq->sq_cong)) {
                struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
                struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+
+               if (bio_list_empty(&nvmeq->sq_cong))
+                       remove_wait_queue(&nvmeq->sq_full,
+                                                       &nvmeq->sq_cong_wait);
                if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
+                       if (bio_list_empty(&nvmeq->sq_cong))
+                               add_wait_queue(&nvmeq->sq_full,
+                                                       &nvmeq->sq_cong_wait);
                        bio_list_add_head(&nvmeq->sq_cong, bio);
                        break;
                }
-               if (bio_list_empty(&nvmeq->sq_cong))
-                       remove_wait_queue(&nvmeq->sq_full,
-                                                       &nvmeq->sq_cong_wait);
        }
 }
 
@@ -1385,6 +1583,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
        ns->disk = disk;
        lbaf = id->flbas & 0xf;
        ns->lba_shift = id->lbaf[lbaf].ds;
+       ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
        blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
        if (dev->max_hw_sectors)
                blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
@@ -1515,11 +1714,12 @@ static void nvme_free_queues(struct nvme_dev *dev)
 static int nvme_dev_add(struct nvme_dev *dev)
 {
        int res, nn, i;
-       struct nvme_ns *ns, *next;
+       struct nvme_ns *ns;
        struct nvme_id_ctrl *ctrl;
        struct nvme_id_ns *id_ns;
        void *mem;
        dma_addr_t dma_addr;
+       int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
 
        res = nvme_setup_io_queues(dev);
        if (res)
@@ -1527,11 +1727,13 @@ static int nvme_dev_add(struct nvme_dev *dev)
 
        mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
                                                                GFP_KERNEL);
+       if (!mem)
+               return -ENOMEM;
 
        res = nvme_identify(dev, 0, 1, dma_addr);
        if (res) {
                res = -EIO;
-               goto out_free;
+               goto out;
        }
 
        ctrl = mem;
@@ -1540,10 +1742,11 @@ static int nvme_dev_add(struct nvme_dev *dev)
        memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
        memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
        memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
-       if (ctrl->mdts) {
-               int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
+       if (ctrl->mdts)
                dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
-       }
+       if ((dev->pci_dev->vendor == PCI_VENDOR_ID_INTEL) &&
+                       (dev->pci_dev->device == 0x0953) && ctrl->vs[3])
+               dev->stripe_size = 1 << (ctrl->vs[3] + shift);
 
        id_ns = mem;
        for (i = 1; i <= nn; i++) {
@@ -1566,13 +1769,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
        list_for_each_entry(ns, &dev->namespaces, list)
                add_disk(ns->disk);
        res = 0;
-       goto out;
-
- out_free:
-       list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
-               list_del(&ns->list);
-               nvme_ns_free(ns);
-       }
 
  out:
        dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);