Merge branch 'for-3.16/core' of git://git.kernel.dk/linux-block into next
author		Linus Torvalds <torvalds@linux-foundation.org>
Mon, 2 Jun 2014 16:29:34 +0000 (09:29 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Mon, 2 Jun 2014 16:29:34 +0000 (09:29 -0700)
Pull block core updates from Jens Axboe:
 "It's a big(ish) round this time, lots of development effort has gone
  into blk-mq in the last 3 months.  Generally we're heading to where
  3.16 will be a feature-complete and performant blk-mq.  scsi-mq is
  progressing nicely and will hopefully be in 3.17.  An nvme port is in
  progress, and the Micron pci-e flash driver, mtip32xx, is converted
  and will be sent in with the driver pull request for 3.16.

  This pull request contains:

   - Lots of prep and support patches for scsi-mq have been integrated.
     All from Christoph.

   - API and code cleanups for blk-mq from Christoph.

   - Lots of good corner case and error handling cleanup fixes for
     blk-mq from Ming Lei.

   - A slew of blk-mq updates from me:

     * Provide strict mappings so that the driver can rely on the
       CPU-to-queue mapping.  This enables optimizations in the driver.
       [Sketched after this message.]

     * Provide bitmap tagging instead of percpu_ida, which never
       really worked well for blk-mq.  percpu_ida relies on the fact
       that we have a lot more tags available than we really need; it
       fails miserably for cases where we exhaust (or are close to
       exhausting) the tag space.  [See the allocation sketch below.]

     * Provide sane support for shared tag maps, as utilized by scsi-mq

     * Various fixes for IO timeouts.

     * API cleanups, and lots of perf tweaks and optimizations.

   - Remove 'buffer' from struct request.  This is ancient code, from
     when requests were always virtually mapped.  Kill it, to reclaim
     some space in struct request.  From me.

   - Remove 'magic' from blk_plug.  Since we store these on the stack
     and since we've never caught any actual bugs with this, let's just
     get rid of it.  From me.

   - Only call part_in_flight() once for IO completion, as it includes
     two atomic reads.  Hopefully we'll get a better implementation
     soon, as the part IO stats are now one of the more expensive parts
     of doing IO on blk-mq.  From me.  [See the sketch below.]

   - File migration of block code from {mm,fs}/ to block/.  This
     includes bio.c, bio-integrity.c, bounce.c, and ioprio.c.  From me,
     from a discussion on lkml.

  That should describe the meat of the pull request.  Also has various
  little fixes and cleanups from Dave Jones, Shaohua Li, Duan Jiong,
  Fengguang Wu, Fabian Frederick, Randy Dunlap, Robert Elliott, and Sam
  Bradshaw"
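
The "strict mappings" item refers to blk-mq building a per-CPU map in which
every possible CPU is tied to exactly one hardware queue, so a driver can
rely on requests submitted from CPU c always arriving on queue map[c].  A
minimal sketch of the idea, assuming a simple round-robin spread (the helper
name example_build_queue_map is hypothetical; the real code is in
block/blk-mq-cpumap.c below):

	/* Assign each of nr_cpus CPUs to one of nr_hw_queues queues. */
	static void example_build_queue_map(unsigned int *map,
					    unsigned int nr_cpus,
					    unsigned int nr_hw_queues)
	{
		unsigned int cpu;

		for (cpu = 0; cpu < nr_cpus; cpu++)
			map[cpu] = cpu % nr_hw_queues;
	}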
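
The bitmap tagging mentioned above replaces percpu_ida with word-sized
bitmaps spread over separate cachelines (the new block/blk-mq-tag.c is in
the diff below).  A minimal sketch of the core allocate/free idea, reduced
to a single bitmap word; the example_* helpers are hypothetical:

	#include <linux/bitops.h>

	/* Claim a free tag: find a zero bit and set it atomically. */
	static int example_get_tag(unsigned long *word, unsigned int depth)
	{
		unsigned int tag;

		do {
			tag = find_first_zero_bit(word, depth);
			if (tag >= depth)
				return -1;	/* tag space exhausted */
		} while (test_and_set_bit_lock(tag, word));

		return tag;
	}

	/* Release a tag; the unlock pairs with test_and_set_bit_lock(). */
	static void example_put_tag(unsigned long *word, unsigned int tag)
	{
		clear_bit_unlock(tag, word);
	}

The real implementation adds per-cpu last-tag caches, multiple words in
separate cachelines, rolling wakeups for sleeping submitters, and fairness
for shared tag maps, as described in the file comment of blk-mq-tag.c.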
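
The part_in_flight() item boils down to reading the in-flight count once per
completion and reusing the cached value, instead of calling the helper (two
atomic reads) twice.  A minimal sketch of the pattern, mirroring the
part_round_stats_single() hunk in block/blk-core.c below:

	int inflight = part_in_flight(part);	/* two atomic reads, done once */

	if (inflight) {
		__part_stat_add(cpu, part, time_in_queue,
				inflight * (now - part->stamp));
		__part_stat_add(cpu, part, io_ticks, now - part->stamp);
	}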

* 'for-3.16/core' of git://git.kernel.dk/linux-block: (100 commits)
  blk-mq: push IPI or local end_io decision to __blk_mq_complete_request()
  blk-mq: remember to start timeout handler for direct queue
  block: ensure that the timer is always added
  blk-mq: blk_mq_unregister_hctx() can be static
  blk-mq: make the sysfs mq/ layout reflect current mappings
  blk-mq: blk_mq_tag_to_rq should handle flush request
  block: remove dead code in scsi_ioctl:blk_verify_command
  blk-mq: request initialization optimizations
  block: add queue flag for disabling SG merging
  block: remove 'magic' from struct blk_plug
  blk-mq: remove alloc_hctx and free_hctx methods
  blk-mq: add file comments and update copyright notices
  blk-mq: remove blk_mq_alloc_request_pinned
  blk-mq: do not use blk_mq_alloc_request_pinned in blk_mq_map_request
  blk-mq: remove blk_mq_wait_for_tags
  blk-mq: initialize request in __blk_mq_alloc_request
  blk-mq: merge blk_mq_alloc_reserved_request into blk_mq_alloc_request
  blk-mq: add helper to insert requests from irq context
  blk-mq: remove stale comment for blk_mq_complete_request()
  blk-mq: allow non-softirq completions
  ...

56 files changed:
Documentation/DocBook/filesystems.tmpl
block/Makefile
block/bio-integrity.c [moved from fs/bio-integrity.c with 99% similarity]
block/bio.c [moved from fs/bio.c with 99% similarity]
block/blk-core.c
block/blk-flush.c
block/blk-iopoll.c
block/blk-lib.c
block/blk-map.c
block/blk-merge.c
block/blk-mq-cpu.c
block/blk-mq-cpumap.c
block/blk-mq-sysfs.c
block/blk-mq-tag.c
block/blk-mq-tag.h
block/blk-mq.c
block/blk-mq.h
block/blk-sysfs.c
block/blk-throttle.c
block/blk-timeout.c
block/blk.h
block/bounce.c [moved from mm/bounce.c with 100% similarity]
block/cfq-iosched.c
block/ioprio.c [moved from fs/ioprio.c with 100% similarity]
block/scsi_ioctl.c
drivers/block/amiflop.c
drivers/block/ataflop.c
drivers/block/floppy.c
drivers/block/hd.c
drivers/block/mg_disk.c
drivers/block/null_blk.c
drivers/block/paride/pcd.c
drivers/block/paride/pd.c
drivers/block/paride/pf.c
drivers/block/skd_main.c
drivers/block/swim.c
drivers/block/swim3.c
drivers/block/virtio_blk.c
drivers/block/xen-blkfront.c
drivers/block/xsysace.c
drivers/block/z2ram.c
drivers/cdrom/gdrom.c
drivers/char/random.c
drivers/ide/ide-disk.c
drivers/md/dm.c
drivers/mtd/mtd_blkdevs.c
drivers/mtd/ubi/block.c
drivers/sbus/char/jsflash.c
drivers/scsi/scsi_lib.c
drivers/scsi/sd.c
fs/Makefile
include/linux/bio.h
include/linux/blk-mq.h
include/linux/blk_types.h
include/linux/blkdev.h
mm/Makefile

index 4f676838da06a61aed3e26243185a0a7fa2f0311..bcdfdb9a927700477f9d559db2ba20b02a23ebd1 100644 (file)
@@ -62,7 +62,7 @@
 !Efs/mpage.c
 !Efs/namei.c
 !Efs/buffer.c
-!Efs/bio.c
+!Eblock/bio.c
 !Efs/seq_file.c
 !Efs/filesystems.c
 !Efs/fs-writeback.c
index 20645e88fb572edc926a79ffdc764a41cfab3f36..a2ce6ac935ecf566ec8e956f666f76feec4e9682 100644 (file)
@@ -2,13 +2,15 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
                        blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
                        blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
-                       genhd.o scsi_ioctl.o partition-generic.o partitions/
+                       genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
+                       partitions/
 
+obj-$(CONFIG_BOUNCE)   += bounce.o
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)   += bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)       += blk-cgroup.o
@@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ)     += cfq-iosched.o
 obj-$(CONFIG_BLOCK_COMPAT)     += compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)        += blk-integrity.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)       += cmdline-parser.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
similarity index 99%
rename from fs/bio-integrity.c
rename to block/bio-integrity.c
index 1c2ce0c8771133194ecb9d91517a7dd67c571765..9e241063a616f2c4ad023a08c8b6b06d676adb56 100644 (file)
@@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
        if (!bs->bio_integrity_pool)
                return -1;
 
-       bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
+       bs->bvec_integrity_pool = biovec_create_pool(pool_size);
        if (!bs->bvec_integrity_pool) {
                mempool_destroy(bs->bio_integrity_pool);
                return -1;
similarity index 99%
rename from fs/bio.c
rename to block/bio.c
index 6f0362b77806c61909aa37433a9e77eb77476cff..96d28eee8a1eeb425f0025f4c0ccaa61e184f8ab 100644 (file)
--- a/fs/bio.c
@@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error)
 
 /**
  * bio_chain - chain bio completions
+ * @bio: the target bio
+ * @parent: the @bio's parent bio
  *
  * The caller won't have a bi_end_io called when @bio completes - instead,
  * @parent's bi_end_io won't be called until both @parent and @bio have
@@ -1011,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
        bio->bi_private = bmd;
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs,
-                                              unsigned int iov_count,
+static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
                                               gfp_t gfp_mask)
 {
        if (iov_count > UIO_MAXIOV)
@@ -1154,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
        if (offset)
                nr_pages++;
 
-       bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
+       bmd = bio_alloc_map_data(iov_count, gfp_mask);
        if (!bmd)
                return ERR_PTR(-ENOMEM);
 
@@ -1859,7 +1860,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
+mempool_t *biovec_create_pool(int pool_entries)
 {
        struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
 
@@ -1922,7 +1923,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
        if (!bs->bio_pool)
                goto bad;
 
-       bs->bvec_pool = biovec_create_pool(bs, pool_size);
+       bs->bvec_pool = biovec_create_pool(pool_size);
        if (!bs->bvec_pool)
                goto bad;
 
index a0e3096c4bb53a48c129d3df0337ad66731c417a..40d654861c33e545dd79da5ce5a6dff40057e703 100644 (file)
@@ -146,8 +146,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
        printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
               (unsigned long long)blk_rq_pos(rq),
               blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
-       printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
-              rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));
+       printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
+              rq->bio, rq->biotail, blk_rq_bytes(rq));
 
        if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
                printk(KERN_INFO "  cdb: ");
@@ -251,8 +251,10 @@ void blk_sync_queue(struct request_queue *q)
                struct blk_mq_hw_ctx *hctx;
                int i;
 
-               queue_for_each_hw_ctx(q, hctx, i)
-                       cancel_delayed_work_sync(&hctx->delayed_work);
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       cancel_delayed_work_sync(&hctx->run_work);
+                       cancel_delayed_work_sync(&hctx->delay_work);
+               }
        } else {
                cancel_delayed_work_sync(&q->delay_work);
        }
@@ -574,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
        if (!q)
                return NULL;
 
-       if (percpu_counter_init(&q->mq_usage_counter, 0))
-               goto fail_q;
-
        q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
        if (q->id < 0)
-               goto fail_c;
+               goto fail_q;
 
        q->backing_dev_info.ra_pages =
                        (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -637,8 +636,6 @@ fail_bdi:
        bdi_destroy(&q->backing_dev_info);
 fail_id:
        ida_simple_remove(&blk_queue_ida, q->id);
-fail_c:
-       percpu_counter_destroy(&q->mq_usage_counter);
 fail_q:
        kmem_cache_free(blk_requestq_cachep, q);
        return NULL;
@@ -846,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags)
                __freed_request(rl, sync ^ 1);
 }
 
+int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
+{
+       struct request_list *rl;
+
+       spin_lock_irq(q->queue_lock);
+       q->nr_requests = nr;
+       blk_queue_congestion_threshold(q);
+
+       /* congestion isn't cgroup aware and follows root blkcg for now */
+       rl = &q->root_rl;
+
+       if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
+               blk_set_queue_congested(q, BLK_RW_SYNC);
+       else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
+               blk_clear_queue_congested(q, BLK_RW_SYNC);
+
+       if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
+               blk_set_queue_congested(q, BLK_RW_ASYNC);
+       else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
+               blk_clear_queue_congested(q, BLK_RW_ASYNC);
+
+       blk_queue_for_each_rl(rl, q) {
+               if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+                       blk_set_rl_full(rl, BLK_RW_SYNC);
+               } else {
+                       blk_clear_rl_full(rl, BLK_RW_SYNC);
+                       wake_up(&rl->wait[BLK_RW_SYNC]);
+               }
+
+               if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+                       blk_set_rl_full(rl, BLK_RW_ASYNC);
+               } else {
+                       blk_clear_rl_full(rl, BLK_RW_ASYNC);
+                       wake_up(&rl->wait[BLK_RW_ASYNC]);
+               }
+       }
+
+       spin_unlock_irq(q->queue_lock);
+       return 0;
+}
+
 /*
  * Determine if elevator data should be initialized when allocating the
  * request associated with @bio.
@@ -1135,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
        if (q->mq_ops)
-               return blk_mq_alloc_request(q, rw, gfp_mask);
+               return blk_mq_alloc_request(q, rw, gfp_mask, false);
        else
                return blk_old_get_request(q, rw, gfp_mask);
 }
@@ -1231,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
 static void part_round_stats_single(int cpu, struct hd_struct *part,
                                    unsigned long now)
 {
+       int inflight;
+
        if (now == part->stamp)
                return;
 
-       if (part_in_flight(part)) {
+       inflight = part_in_flight(part);
+       if (inflight) {
                __part_stat_add(cpu, part, time_in_queue,
-                               part_in_flight(part) * (now - part->stamp));
+                               inflight * (now - part->stamp));
                __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
        }
        part->stamp = now;
@@ -1360,7 +1401,6 @@ void blk_add_request_payload(struct request *rq, struct page *page,
 
        rq->__data_len = rq->resid_len = len;
        rq->nr_phys_segments = 1;
-       rq->buffer = bio_data(bio);
 }
 EXPORT_SYMBOL_GPL(blk_add_request_payload);
 
@@ -1402,12 +1442,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
        bio->bi_next = req->bio;
        req->bio = bio;
 
-       /*
-        * may not be valid. if the low level driver said
-        * it didn't need a bounce buffer then it better
-        * not touch req->buffer either...
-        */
-       req->buffer = bio_data(bio);
        req->__sector = bio->bi_iter.bi_sector;
        req->__data_len += bio->bi_iter.bi_size;
        req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
@@ -1432,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
  * added on the elevator at this point.  In addition, we don't have
  * reliable access to the elevator outside queue lock.  Only check basic
  * merging parameters without querying the elevator.
+ *
+ * Caller must ensure !blk_queue_nomerges(q) beforehand.
  */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
                            unsigned int *request_count)
@@ -1441,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
        bool ret = false;
        struct list_head *plug_list;
 
-       if (blk_queue_nomerges(q))
-               goto out;
-
        plug = current->plug;
        if (!plug)
                goto out;
@@ -1522,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
         * Check if we can merge with the plugged list before grabbing
         * any locks.
         */
-       if (blk_attempt_plug_merge(q, bio, &request_count))
+       if (!blk_queue_nomerges(q) &&
+           blk_attempt_plug_merge(q, bio, &request_count))
                return;
 
        spin_lock_irq(q->queue_lock);
@@ -1654,7 +1688,7 @@ static int __init fail_make_request_debugfs(void)
        struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
                                                NULL, &fail_make_request);
 
-       return IS_ERR(dir) ? PTR_ERR(dir) : 0;
+       return PTR_ERR_OR_ZERO(dir);
 }
 
 late_initcall(fail_make_request_debugfs);
@@ -2434,7 +2468,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
        }
 
        req->__data_len -= total_bytes;
-       req->buffer = bio_data(req->bio);
 
        /* update sector only for requests with clear definition of sector */
        if (req->cmd_type == REQ_TYPE_FS)
@@ -2503,7 +2536,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
 /*
  * queue lock must be held
  */
-static void blk_finish_request(struct request *req, int error)
+void blk_finish_request(struct request *req, int error)
 {
        if (blk_rq_tagged(req))
                blk_queue_end_tag(req->q, req);
@@ -2529,6 +2562,7 @@ static void blk_finish_request(struct request *req, int error)
                __blk_put_request(req->q, req);
        }
 }
+EXPORT_SYMBOL(blk_finish_request);
 
 /**
  * blk_end_bidi_request - Complete a bidi request
@@ -2752,10 +2786,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
        /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
        rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
 
-       if (bio_has_data(bio)) {
+       if (bio_has_data(bio))
                rq->nr_phys_segments = bio_phys_segments(q, bio);
-               rq->buffer = bio_data(bio);
-       }
+
        rq->__data_len = bio->bi_iter.bi_size;
        rq->bio = rq->biotail = bio;
 
@@ -2831,7 +2864,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
 
 /*
  * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied.
+ * The actual data parts (e.g. ->cmd, ->sense) are not copied.
  */
 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
@@ -2857,7 +2890,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
  *
  * Description:
  *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense)
+ *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
  *     are not copied, and copying such parts is the caller's responsibility.
  *     Also, pages which the original bios are pointing to are not copied
  *     and the cloned bios just point same pages.
@@ -2904,20 +2937,25 @@ free_and_out:
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
-int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
+int kblockd_schedule_work(struct work_struct *work)
 {
        return queue_work(kblockd_workqueue, work);
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
-int kblockd_schedule_delayed_work(struct request_queue *q,
-                       struct delayed_work *dwork, unsigned long delay)
+int kblockd_schedule_delayed_work(struct delayed_work *dwork,
+                                 unsigned long delay)
 {
        return queue_delayed_work(kblockd_workqueue, dwork, delay);
 }
 EXPORT_SYMBOL(kblockd_schedule_delayed_work);
 
-#define PLUG_MAGIC     0x91827364
+int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
+                                    unsigned long delay)
+{
+       return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
 
 /**
  * blk_start_plug - initialize blk_plug and track it inside the task_struct
@@ -2937,7 +2975,6 @@ void blk_start_plug(struct blk_plug *plug)
 {
        struct task_struct *tsk = current;
 
-       plug->magic = PLUG_MAGIC;
        INIT_LIST_HEAD(&plug->list);
        INIT_LIST_HEAD(&plug->mq_list);
        INIT_LIST_HEAD(&plug->cb_list);
@@ -3034,8 +3071,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
        LIST_HEAD(list);
        unsigned int depth;
 
-       BUG_ON(plug->magic != PLUG_MAGIC);
-
        flush_plug_callbacks(plug, from_schedule);
 
        if (!list_empty(&plug->mq_list))
index 43e6b4755e9a7e74e05479a83d42fbd88762e9f4..ff87c664b7df63e62710faad323f8eae8fcd082c 100644 (file)
@@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq)
        blk_clear_rq_complete(rq);
 }
 
-static void mq_flush_run(struct work_struct *work)
-{
-       struct request *rq;
-
-       rq = container_of(work, struct request, mq_flush_work);
-
-       memset(&rq->csd, 0, sizeof(rq->csd));
-       blk_mq_insert_request(rq, false, true, false);
-}
-
 static bool blk_flush_queue_rq(struct request *rq, bool add_front)
 {
        if (rq->q->mq_ops) {
-               INIT_WORK(&rq->mq_flush_work, mq_flush_run);
-               kblockd_schedule_work(rq->q, &rq->mq_flush_work);
+               struct request_queue *q = rq->q;
+
+               blk_mq_add_to_requeue_list(rq, add_front);
+               blk_mq_kick_requeue_list(q);
                return false;
        } else {
                if (add_front)
@@ -231,8 +223,10 @@ static void flush_end_io(struct request *flush_rq, int error)
        struct request *rq, *n;
        unsigned long flags = 0;
 
-       if (q->mq_ops)
+       if (q->mq_ops) {
                spin_lock_irqsave(&q->mq_flush_lock, flags);
+               q->flush_rq->cmd_flags = 0;
+       }
 
        running = &q->flush_queue[q->flush_running_idx];
        BUG_ON(q->flush_pending_idx == q->flush_running_idx);
@@ -306,23 +300,9 @@ static bool blk_kick_flush(struct request_queue *q)
         */
        q->flush_pending_idx ^= 1;
 
-       if (q->mq_ops) {
-               struct blk_mq_ctx *ctx = first_rq->mq_ctx;
-               struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
-
-               blk_mq_rq_init(hctx, q->flush_rq);
-               q->flush_rq->mq_ctx = ctx;
-
-               /*
-                * Reuse the tag value from the fist waiting request,
-                * with blk-mq the tag is generated during request
-                * allocation and drivers can rely on it being inside
-                * the range they asked for.
-                */
-               q->flush_rq->tag = first_rq->tag;
-       } else {
-               blk_rq_init(q, q->flush_rq);
-       }
+       blk_rq_init(q, q->flush_rq);
+       if (q->mq_ops)
+               blk_mq_clone_flush_request(q->flush_rq, first_rq);
 
        q->flush_rq->cmd_type = REQ_TYPE_FS;
        q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
index c11d24e379e2a4b89caf1936d4433773caef1bdc..d828b44a404b4f5abc35d7760346870a3902a911 100644 (file)
@@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete);
  *     iopoll handler will not be invoked again before blk_iopoll_sched_prep()
  *     is called.
  **/
-void blk_iopoll_complete(struct blk_iopoll *iopoll)
+void blk_iopoll_complete(struct blk_iopoll *iop)
 {
        unsigned long flags;
 
        local_irq_save(flags);
-       __blk_iopoll_complete(iopoll);
+       __blk_iopoll_complete(iop);
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL(blk_iopoll_complete);
index 97a733cf3d5f925d1eede00d0dbca63f3565fd38..8411be3c19d30c8f4c4b745c56ad415d8ac4a126 100644 (file)
@@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same);
  *  Generate and issue number of bios with zerofiled pages.
  */
 
-int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-                       sector_t nr_sects, gfp_t gfp_mask)
+static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+                                 sector_t nr_sects, gfp_t gfp_mask)
 {
        int ret;
        struct bio *bio;
index f7b22bc215180d4b7f467135faeaf52975a77013..f890d4345b0cb63f9faa88e70d466a3cec3e6b3f 100644 (file)
@@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
        if (!bio_flagged(bio, BIO_USER_MAPPED))
                rq->cmd_flags |= REQ_COPY_USER;
 
-       rq->buffer = NULL;
        return 0;
 unmap_rq:
        blk_rq_unmap_user(bio);
@@ -238,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
        blk_queue_bounce(q, &bio);
        bio_get(bio);
        blk_rq_bio_prep(q, rq, bio);
-       rq->buffer = NULL;
        return 0;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);
@@ -325,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
        }
 
        blk_queue_bounce(q, &rq->bio);
-       rq->buffer = NULL;
        return 0;
 }
 EXPORT_SYMBOL(blk_rq_map_kern);
index 6c583f9c5b65d002a6ca357c53b19799f822f512..b3bf0df0f4c2743aa99ef7ac179db324eb5e4657 100644 (file)
@@ -13,7 +13,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
                                             struct bio *bio)
 {
        struct bio_vec bv, bvprv = { NULL };
-       int cluster, high, highprv = 1;
+       int cluster, high, highprv = 1, no_sg_merge;
        unsigned int seg_size, nr_phys_segs;
        struct bio *fbio, *bbio;
        struct bvec_iter iter;
@@ -35,12 +35,21 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
        cluster = blk_queue_cluster(q);
        seg_size = 0;
        nr_phys_segs = 0;
+       no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
+       high = 0;
        for_each_bio(bio) {
                bio_for_each_segment(bv, bio, iter) {
+                       /*
+                        * If SG merging is disabled, each bio vector is
+                        * a segment
+                        */
+                       if (no_sg_merge)
+                               goto new_segment;
+
                        /*
                         * the trick here is making sure that a high page is
-                        * never considered part of another segment, since that
-                        * might change with the bounce page.
+                        * never considered part of another segment, since
+                        * that might change with the bounce page.
                         */
                        high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
                        if (!high && !highprv && cluster) {
@@ -84,11 +93,16 @@ void blk_recalc_rq_segments(struct request *rq)
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-       struct bio *nxt = bio->bi_next;
+       if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags))
+               bio->bi_phys_segments = bio->bi_vcnt;
+       else {
+               struct bio *nxt = bio->bi_next;
+
+               bio->bi_next = NULL;
+               bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+               bio->bi_next = nxt;
+       }
 
-       bio->bi_next = NULL;
-       bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
-       bio->bi_next = nxt;
        bio->bi_flags |= (1 << BIO_SEG_VALID);
 }
 EXPORT_SYMBOL(blk_recount_segments);
index 136ef8643bbade3dd2d5d84e35183c749bc00399..bb3ed488f7b5b26053e91dd3a5f02a0f2c472a70 100644 (file)
@@ -1,3 +1,8 @@
+/*
+ * CPU notifier helper code for blk-mq
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ */
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -18,14 +23,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
 {
        unsigned int cpu = (unsigned long) hcpu;
        struct blk_mq_cpu_notifier *notify;
+       int ret = NOTIFY_OK;
 
        raw_spin_lock(&blk_mq_cpu_notify_lock);
 
-       list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
-               notify->notify(notify->data, action, cpu);
+       list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
+               ret = notify->notify(notify->data, action, cpu);
+               if (ret != NOTIFY_OK)
+                       break;
+       }
 
        raw_spin_unlock(&blk_mq_cpu_notify_lock);
-       return NOTIFY_OK;
+       return ret;
 }
 
 void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
@@ -45,7 +54,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
 }
 
 void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
-                             void (*fn)(void *, unsigned long, unsigned int),
+                             int (*fn)(void *, unsigned long, unsigned int),
                              void *data)
 {
        notifier->notify = fn;
index 09792132961991e8168c2dee283a7b0cb0b967d0..1065d7c65fa15b60b1104944322533033df69df7 100644 (file)
@@ -1,3 +1,8 @@
+/*
+ * CPU <-> hardware queue mapping helpers
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ */
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/module.h>
@@ -80,19 +85,35 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
        return 0;
 }
 
-unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
+unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
 {
        unsigned int *map;
 
        /* If cpus are offline, map them to first hctx */
        map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
-                               reg->numa_node);
+                               set->numa_node);
        if (!map)
                return NULL;
 
-       if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
+       if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
                return map;
 
        kfree(map);
        return NULL;
 }
+
+/*
+ * We have no quick way of doing reverse lookups. This is only used at
+ * queue init time, so runtime isn't important.
+ */
+int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
+{
+       int i;
+
+       for_each_possible_cpu(i) {
+               if (index == mq_map[i])
+                       return cpu_to_node(i);
+       }
+
+       return NUMA_NO_NODE;
+}
index b0ba264b05225ca2432a9a86878cf9501efa2265..ed521786755570989dad96a14613be5ae80e7d3f 100644 (file)
@@ -203,59 +203,24 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
        return ret;
 }
 
-static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
-       ssize_t ret;
-
-       spin_lock(&hctx->lock);
-       ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
-       spin_unlock(&hctx->lock);
-
-       return ret;
-}
-
-static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
-                                        const char *page, size_t len)
+static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
-       struct blk_mq_ctx *ctx;
-       unsigned long ret;
-       unsigned int i;
-
-       if (kstrtoul(page, 10, &ret)) {
-               pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
-               return -EINVAL;
-       }
-
-       spin_lock(&hctx->lock);
-       if (ret)
-               hctx->flags |= BLK_MQ_F_SHOULD_IPI;
-       else
-               hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
-       spin_unlock(&hctx->lock);
-
-       hctx_for_each_ctx(hctx, ctx, i)
-               ctx->ipi_redirect = !!ret;
-
-       return len;
+       return blk_mq_tag_sysfs_show(hctx->tags, page);
 }
 
-static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
+static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
-       return blk_mq_tag_sysfs_show(hctx->tags, page);
+       return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
 }
 
 static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
-       unsigned int i, queue_num, first = 1;
+       unsigned int i, first = 1;
        ssize_t ret = 0;
 
        blk_mq_disable_hotplug();
 
-       for_each_online_cpu(i) {
-               queue_num = hctx->queue->mq_map[i];
-               if (queue_num != hctx->queue_num)
-                       continue;
-
+       for_each_cpu(i, hctx->cpumask) {
                if (first)
                        ret += sprintf(ret + page, "%u", i);
                else
@@ -307,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
        .attr = {.name = "dispatched", .mode = S_IRUGO },
        .show = blk_mq_hw_sysfs_dispatched_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
+       .attr = {.name = "active", .mode = S_IRUGO },
+       .show = blk_mq_hw_sysfs_active_show,
+};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
        .attr = {.name = "pending", .mode = S_IRUGO },
        .show = blk_mq_hw_sysfs_rq_list_show,
 };
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
-       .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
-       .show = blk_mq_hw_sysfs_ipi_show,
-       .store = blk_mq_hw_sysfs_ipi_store,
-};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
        .attr = {.name = "tags", .mode = S_IRUGO },
        .show = blk_mq_hw_sysfs_tags_show,
@@ -330,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = {
        &blk_mq_hw_sysfs_run.attr,
        &blk_mq_hw_sysfs_dispatched.attr,
        &blk_mq_hw_sysfs_pending.attr,
-       &blk_mq_hw_sysfs_ipi.attr,
        &blk_mq_hw_sysfs_tags.attr,
        &blk_mq_hw_sysfs_cpus.attr,
+       &blk_mq_hw_sysfs_active.attr,
        NULL,
 };
 
@@ -363,6 +327,42 @@ static struct kobj_type blk_mq_hw_ktype = {
        .release        = blk_mq_sysfs_release,
 };
 
+static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
+{
+       struct blk_mq_ctx *ctx;
+       int i;
+
+       if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+               return;
+
+       hctx_for_each_ctx(hctx, ctx, i)
+               kobject_del(&ctx->kobj);
+
+       kobject_del(&hctx->kobj);
+}
+
+static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
+{
+       struct request_queue *q = hctx->queue;
+       struct blk_mq_ctx *ctx;
+       int i, ret;
+
+       if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
+               return 0;
+
+       ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
+       if (ret)
+               return ret;
+
+       hctx_for_each_ctx(hctx, ctx, i) {
+               ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+
 void blk_mq_unregister_disk(struct gendisk *disk)
 {
        struct request_queue *q = disk->queue;
@@ -371,11 +371,11 @@ void blk_mq_unregister_disk(struct gendisk *disk)
        int i, j;
 
        queue_for_each_hw_ctx(q, hctx, i) {
-               hctx_for_each_ctx(hctx, ctx, j) {
-                       kobject_del(&ctx->kobj);
+               blk_mq_unregister_hctx(hctx);
+
+               hctx_for_each_ctx(hctx, ctx, j)
                        kobject_put(&ctx->kobj);
-               }
-               kobject_del(&hctx->kobj);
+
                kobject_put(&hctx->kobj);
        }
 
@@ -386,15 +386,30 @@ void blk_mq_unregister_disk(struct gendisk *disk)
        kobject_put(&disk_to_dev(disk)->kobj);
 }
 
+static void blk_mq_sysfs_init(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       struct blk_mq_ctx *ctx;
+       int i, j;
+
+       kobject_init(&q->mq_kobj, &blk_mq_ktype);
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+
+               hctx_for_each_ctx(hctx, ctx, j)
+                       kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
+       }
+}
+
 int blk_mq_register_disk(struct gendisk *disk)
 {
        struct device *dev = disk_to_dev(disk);
        struct request_queue *q = disk->queue;
        struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx;
-       int ret, i, j;
+       int ret, i;
 
-       kobject_init(&q->mq_kobj, &blk_mq_ktype);
+       blk_mq_sysfs_init(q);
 
        ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
        if (ret < 0)
@@ -403,20 +418,10 @@ int blk_mq_register_disk(struct gendisk *disk)
        kobject_uevent(&q->mq_kobj, KOBJ_ADD);
 
        queue_for_each_hw_ctx(q, hctx, i) {
-               kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
-               ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i);
+               hctx->flags |= BLK_MQ_F_SYSFS_UP;
+               ret = blk_mq_register_hctx(hctx);
                if (ret)
                        break;
-
-               if (!hctx->nr_ctx)
-                       continue;
-
-               hctx_for_each_ctx(hctx, ctx, j) {
-                       kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
-                       ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
-                       if (ret)
-                               break;
-               }
        }
 
        if (ret) {
@@ -426,3 +431,26 @@ int blk_mq_register_disk(struct gendisk *disk)
 
        return 0;
 }
+
+void blk_mq_sysfs_unregister(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               blk_mq_unregister_hctx(hctx);
+}
+
+int blk_mq_sysfs_register(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i, ret = 0;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               ret = blk_mq_register_hctx(hctx);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
index 83ae96c51a2762cf7386f096e348eace58525e37..d90c4aeb7dd38c02582d8f8e88f0e176dd9118be 100644 (file)
+/*
+ * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
+ * over multiple cachelines to avoid ping-pong between multiple submitters
+ * or submitter and completer. Uses rolling wakeups to avoid falling of
+ * the scaling cliff when we run out of tags and have to start putting
+ * submitters to sleep.
+ *
+ * Uses active queue tracking to support fairer distribution of tags
+ * between multiple submitters when a shared tag map is used.
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ */
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/percpu_ida.h>
+#include <linux/random.h>
 
 #include <linux/blk-mq.h>
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
+static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
+{
+       int i;
+
+       for (i = 0; i < bt->map_nr; i++) {
+               struct blk_align_bitmap *bm = &bt->map[i];
+               int ret;
+
+               ret = find_first_zero_bit(&bm->word, bm->depth);
+               if (ret < bm->depth)
+                       return true;
+       }
+
+       return false;
+}
+
+bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
+{
+       if (!tags)
+               return true;
+
+       return bt_has_free_tags(&tags->bitmap_tags);
+}
+
+static inline void bt_index_inc(unsigned int *index)
+{
+       *index = (*index + 1) & (BT_WAIT_QUEUES - 1);
+}
+
 /*
- * Per tagged queue (tag address space) map
+ * If a previously inactive queue goes active, bump the active user count.
  */
-struct blk_mq_tags {
-       unsigned int nr_tags;
-       unsigned int nr_reserved_tags;
-       unsigned int nr_batch_move;
-       unsigned int nr_max_cache;
+bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+       if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
+           !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+               atomic_inc(&hctx->tags->active_queues);
 
-       struct percpu_ida free_tags;
-       struct percpu_ida reserved_tags;
-};
+       return true;
+}
 
-void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
+/*
+ * Wakeup all potentially sleeping on normal (non-reserved) tags
+ */
+static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
 {
-       int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
-       blk_mq_put_tag(tags, tag);
+       struct blk_mq_bitmap_tags *bt;
+       int i, wake_index;
+
+       bt = &tags->bitmap_tags;
+       wake_index = bt->wake_index;
+       for (i = 0; i < BT_WAIT_QUEUES; i++) {
+               struct bt_wait_state *bs = &bt->bs[wake_index];
+
+               if (waitqueue_active(&bs->wait))
+                       wake_up(&bs->wait);
+
+               bt_index_inc(&wake_index);
+       }
 }
 
-bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
+/*
+ * If a previously busy queue goes inactive, potential waiters could now
+ * be allowed to queue. Wake them up and check.
+ */
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+       struct blk_mq_tags *tags = hctx->tags;
+
+       if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+               return;
+
+       atomic_dec(&tags->active_queues);
+
+       blk_mq_tag_wakeup_all(tags);
+}
+
+/*
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
+ */
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+                                 struct blk_mq_bitmap_tags *bt)
+{
+       unsigned int depth, users;
+
+       if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
+               return true;
+       if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+               return true;
+
+       /*
+        * Don't try dividing an ant
+        */
+       if (bt->depth == 1)
+               return true;
+
+       users = atomic_read(&hctx->tags->active_queues);
+       if (!users)
+               return true;
+
+       /*
+        * Allow at least some tags
+        */
+       depth = max((bt->depth + users - 1) / users, 4U);
+       return atomic_read(&hctx->nr_active) < depth;
+}
+
+static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
 {
-       return !tags ||
-               percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
+       int tag, org_last_tag, end;
+
+       org_last_tag = last_tag;
+       end = bm->depth;
+       do {
+restart:
+               tag = find_next_zero_bit(&bm->word, end, last_tag);
+               if (unlikely(tag >= end)) {
+                       /*
+                        * We started with an offset, start from 0 to
+                        * exhaust the map.
+                        */
+                       if (org_last_tag && last_tag) {
+                               end = last_tag;
+                               last_tag = 0;
+                               goto restart;
+                       }
+                       return -1;
+               }
+               last_tag = tag + 1;
+       } while (test_and_set_bit_lock(tag, &bm->word));
+
+       return tag;
 }
 
-static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
+/*
+ * Straight forward bitmap tag implementation, where each bit is a tag
+ * (cleared == free, and set == busy). The small twist is using per-cpu
+ * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
+ * contexts. This enables us to drastically limit the space searched,
+ * without dirtying an extra shared cacheline like we would if we stored
+ * the cache value inside the shared blk_mq_bitmap_tags structure. On top
+ * of that, each word of tags is in a separate cacheline. This means that
+ * multiple users will tend to stick to different cachelines, at least
+ * until the map is exhausted.
+ */
+static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
+                   unsigned int *tag_cache)
 {
+       unsigned int last_tag, org_last_tag;
+       int index, i, tag;
+
+       if (!hctx_may_queue(hctx, bt))
+               return -1;
+
+       last_tag = org_last_tag = *tag_cache;
+       index = TAG_TO_INDEX(bt, last_tag);
+
+       for (i = 0; i < bt->map_nr; i++) {
+               tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
+               if (tag != -1) {
+                       tag += (index << bt->bits_per_word);
+                       goto done;
+               }
+
+               last_tag = 0;
+               if (++index >= bt->map_nr)
+                       index = 0;
+       }
+
+       *tag_cache = 0;
+       return -1;
+
+       /*
+        * Only update the cache from the allocation path, if we ended
+        * up using the specific cached tag.
+        */
+done:
+       if (tag == org_last_tag) {
+               last_tag = tag + 1;
+               if (last_tag >= bt->depth - 1)
+                       last_tag = 0;
+
+               *tag_cache = last_tag;
+       }
+
+       return tag;
+}
+
+static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
+                                        struct blk_mq_hw_ctx *hctx)
+{
+       struct bt_wait_state *bs;
+
+       if (!hctx)
+               return &bt->bs[0];
+
+       bs = &bt->bs[hctx->wait_index];
+       bt_index_inc(&hctx->wait_index);
+       return bs;
+}
+
+static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
+                 unsigned int *last_tag, gfp_t gfp)
+{
+       struct bt_wait_state *bs;
+       DEFINE_WAIT(wait);
        int tag;
 
-       tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ?
-                              TASK_UNINTERRUPTIBLE : TASK_RUNNING);
-       if (tag < 0)
-               return BLK_MQ_TAG_FAIL;
-       return tag + tags->nr_reserved_tags;
+       tag = __bt_get(hctx, bt, last_tag);
+       if (tag != -1)
+               return tag;
+
+       if (!(gfp & __GFP_WAIT))
+               return -1;
+
+       bs = bt_wait_ptr(bt, hctx);
+       do {
+               bool was_empty;
+
+               was_empty = list_empty(&wait.task_list);
+               prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+               tag = __bt_get(hctx, bt, last_tag);
+               if (tag != -1)
+                       break;
+
+               if (was_empty)
+                       atomic_set(&bs->wait_cnt, bt->wake_cnt);
+
+               io_schedule();
+       } while (1);
+
+       finish_wait(&bs->wait, &wait);
+       return tag;
+}
+
+static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags,
+                                    struct blk_mq_hw_ctx *hctx,
+                                    unsigned int *last_tag, gfp_t gfp)
+{
+       int tag;
+
+       tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp);
+       if (tag >= 0)
+               return tag + tags->nr_reserved_tags;
+
+       return BLK_MQ_TAG_FAIL;
 }
 
 static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
                                              gfp_t gfp)
 {
-       int tag;
+       int tag, zero = 0;
 
        if (unlikely(!tags->nr_reserved_tags)) {
                WARN_ON_ONCE(1);
                return BLK_MQ_TAG_FAIL;
        }
 
-       tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ?
-                              TASK_UNINTERRUPTIBLE : TASK_RUNNING);
+       tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp);
        if (tag < 0)
                return BLK_MQ_TAG_FAIL;
+
        return tag;
 }
 
-unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved)
+unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
+                           gfp_t gfp, bool reserved)
 {
        if (!reserved)
-               return __blk_mq_get_tag(tags, gfp);
+               return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
 
-       return __blk_mq_get_reserved_tag(tags, gfp);
+       return __blk_mq_get_reserved_tag(hctx->tags, gfp);
+}
+
+static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
+{
+       int i, wake_index;
+
+       wake_index = bt->wake_index;
+       for (i = 0; i < BT_WAIT_QUEUES; i++) {
+               struct bt_wait_state *bs = &bt->bs[wake_index];
+
+               if (waitqueue_active(&bs->wait)) {
+                       if (wake_index != bt->wake_index)
+                               bt->wake_index = wake_index;
+
+                       return bs;
+               }
+
+               bt_index_inc(&wake_index);
+       }
+
+       return NULL;
+}
+
+static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
+{
+       const int index = TAG_TO_INDEX(bt, tag);
+       struct bt_wait_state *bs;
+
+       /*
+        * The unlock memory barrier need to order access to req in free
+        * path and clearing tag bit
+        */
+       clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
+
+       bs = bt_wake_ptr(bt);
+       if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
+               atomic_set(&bs->wait_cnt, bt->wake_cnt);
+               bt_index_inc(&bt->wake_index);
+               wake_up(&bs->wait);
+       }
 }
 
 static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
 {
        BUG_ON(tag >= tags->nr_tags);
 
-       percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
+       bt_clear_tag(&tags->bitmap_tags, tag);
 }
 
 static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
@@ -80,22 +347,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
 {
        BUG_ON(tag >= tags->nr_reserved_tags);
 
-       percpu_ida_free(&tags->reserved_tags, tag);
+       bt_clear_tag(&tags->breserved_tags, tag);
 }
 
-void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
+                   unsigned int *last_tag)
 {
-       if (tag >= tags->nr_reserved_tags)
-               __blk_mq_put_tag(tags, tag);
-       else
+       struct blk_mq_tags *tags = hctx->tags;
+
+       if (tag >= tags->nr_reserved_tags) {
+               const int real_tag = tag - tags->nr_reserved_tags;
+
+               __blk_mq_put_tag(tags, real_tag);
+               *last_tag = real_tag;
+       } else
                __blk_mq_put_reserved_tag(tags, tag);
 }
 
-static int __blk_mq_tag_iter(unsigned id, void *data)
+static void bt_for_each_free(struct blk_mq_bitmap_tags *bt,
+                            unsigned long *free_map, unsigned int off)
 {
-       unsigned long *tag_map = data;
-       __set_bit(id, tag_map);
-       return 0;
+       int i;
+
+       for (i = 0; i < bt->map_nr; i++) {
+               struct blk_align_bitmap *bm = &bt->map[i];
+               int bit = 0;
+
+               do {
+                       bit = find_next_zero_bit(&bm->word, bm->depth, bit);
+                       if (bit >= bm->depth)
+                               break;
+
+                       __set_bit(bit + off, free_map);
+                       bit++;
+               } while (1);
+
+               off += (1 << bt->bits_per_word);
+       }
 }
 
 void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
@@ -109,21 +397,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
        if (!tag_map)
                return;
 
-       percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
+       bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags);
        if (tags->nr_reserved_tags)
-               percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
-                       tag_map);
+               bt_for_each_free(&tags->breserved_tags, tag_map, 0);
 
        fn(data, tag_map);
        kfree(tag_map);
 }
+EXPORT_SYMBOL(blk_mq_tag_busy_iter);
+
+static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
+{
+       unsigned int i, used;
+
+       for (i = 0, used = 0; i < bt->map_nr; i++) {
+               struct blk_align_bitmap *bm = &bt->map[i];
+
+               used += bitmap_weight(&bm->word, bm->depth);
+       }
+
+       return bt->depth - used;
+}
+
+static void bt_update_count(struct blk_mq_bitmap_tags *bt,
+                           unsigned int depth)
+{
+       unsigned int tags_per_word = 1U << bt->bits_per_word;
+       unsigned int map_depth = depth;
+
+       if (depth) {
+               int i;
+
+               for (i = 0; i < bt->map_nr; i++) {
+                       bt->map[i].depth = min(map_depth, tags_per_word);
+                       map_depth -= bt->map[i].depth;
+               }
+       }
+
+       bt->wake_cnt = BT_WAIT_BATCH;
+       if (bt->wake_cnt > depth / 4)
+               bt->wake_cnt = max(1U, depth / 4);
+
+       bt->depth = depth;
+}
+
+static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
+                       int node, bool reserved)
+{
+       int i;
+
+       bt->bits_per_word = ilog2(BITS_PER_LONG);
+
+       /*
+        * Depth can be zero for reserved tags, that's not a failure
+        * condition.
+        */
+       if (depth) {
+               unsigned int nr, tags_per_word;
+
+               tags_per_word = (1 << bt->bits_per_word);
+
+               /*
+                * If the tag space is small, shrink the number of tags
+                * per word so we spread over a few cachelines, at least.
+                * If less than 4 tags, just forget about it, it's not
+                * going to work optimally anyway.
+                */
+               if (depth >= 4) {
+                       while (tags_per_word * 4 > depth) {
+                               bt->bits_per_word--;
+                               tags_per_word = (1 << bt->bits_per_word);
+                       }
+               }
+
+               nr = ALIGN(depth, tags_per_word) / tags_per_word;
+               bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
+                                               GFP_KERNEL, node);
+               if (!bt->map)
+                       return -ENOMEM;
+
+               bt->map_nr = nr;
+       }
+
+       bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
+       if (!bt->bs) {
+               kfree(bt->map);
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < BT_WAIT_QUEUES; i++)
+               init_waitqueue_head(&bt->bs[i].wait);
+
+       bt_update_count(bt, depth);
+       return 0;
+}
+
+static void bt_free(struct blk_mq_bitmap_tags *bt)
+{
+       kfree(bt->map);
+       kfree(bt->bs);
+}
+
+static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
+                                                  int node)
+{
+       unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+
+       if (bt_alloc(&tags->bitmap_tags, depth, node, false))
+               goto enomem;
+       if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
+               goto enomem;
+
+       return tags;
+enomem:
+       bt_free(&tags->bitmap_tags);
+       kfree(tags);
+       return NULL;
+}
 
 struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                                     unsigned int reserved_tags, int node)
 {
-       unsigned int nr_tags, nr_cache;
        struct blk_mq_tags *tags;
-       int ret;
 
        if (total_tags > BLK_MQ_TAG_MAX) {
                pr_err("blk-mq: tag depth too large\n");
@@ -134,73 +529,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
        if (!tags)
                return NULL;
 
-       nr_tags = total_tags - reserved_tags;
-       nr_cache = nr_tags / num_possible_cpus();
-
-       if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
-               nr_cache = BLK_MQ_TAG_CACHE_MIN;
-       else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
-               nr_cache = BLK_MQ_TAG_CACHE_MAX;
-
        tags->nr_tags = total_tags;
        tags->nr_reserved_tags = reserved_tags;
-       tags->nr_max_cache = nr_cache;
-       tags->nr_batch_move = max(1u, nr_cache / 2);
 
-       ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
-                               tags->nr_reserved_tags,
-                               tags->nr_max_cache,
-                               tags->nr_batch_move);
-       if (ret)
-               goto err_free_tags;
+       return blk_mq_init_bitmap_tags(tags, node);
+}
 
-       if (reserved_tags) {
-               /*
-                * With max_cahe and batch set to 1, the allocator fallbacks to
-                * no cached. It's fine reserved tags allocation is slow.
-                */
-               ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
-                               1, 1);
-               if (ret)
-                       goto err_reserved_tags;
-       }
+void blk_mq_free_tags(struct blk_mq_tags *tags)
+{
+       bt_free(&tags->bitmap_tags);
+       bt_free(&tags->breserved_tags);
+       kfree(tags);
+}
 
-       return tags;
+void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
+{
+       unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
 
-err_reserved_tags:
-       percpu_ida_destroy(&tags->free_tags);
-err_free_tags:
-       kfree(tags);
-       return NULL;
+       *tag = prandom_u32() % depth;
 }
 
-void blk_mq_free_tags(struct blk_mq_tags *tags)
+int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
 {
-       percpu_ida_destroy(&tags->free_tags);
-       percpu_ida_destroy(&tags->reserved_tags);
-       kfree(tags);
+       tdepth -= tags->nr_reserved_tags;
+       if (tdepth > tags->nr_tags)
+               return -EINVAL;
+
+       /*
+        * We don't need to (and can't) update reserved tags here; they
+        * remain static and should never need resizing.
+        */
+       bt_update_count(&tags->bitmap_tags, tdepth);
+       blk_mq_tag_wakeup_all(tags);
+       return 0;
 }
 
 ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
 {
        char *orig_page = page;
-       unsigned int cpu;
+       unsigned int free, res;
 
        if (!tags)
                return 0;
 
-       page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u,"
-                       " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
-                       tags->nr_batch_move, tags->nr_max_cache);
+       page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
+                       "bits_per_word=%u\n",
+                       tags->nr_tags, tags->nr_reserved_tags,
+                       tags->bitmap_tags.bits_per_word);
 
-       page += sprintf(page, "nr_free=%u, nr_reserved=%u\n",
-                       percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
-                       percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
+       free = bt_unused_tags(&tags->bitmap_tags);
+       res = bt_unused_tags(&tags->breserved_tags);
 
-       for_each_possible_cpu(cpu) {
-               page += sprintf(page, "  cpu%02u: nr_free=%u\n", cpu,
-                               percpu_ida_free_tags(&tags->free_tags, cpu));
-       }
+       page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
+       page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
 
        return page - orig_page;
 }
index 947ba2c6148e0b24e53ecaf9eceb7efff15472d0..c959de58d2a55fba4c9c248d871cfa3fd3e66f16 100644 (file)
@@ -1,17 +1,59 @@
 #ifndef INT_BLK_MQ_TAG_H
 #define INT_BLK_MQ_TAG_H
 
-struct blk_mq_tags;
+#include "blk-mq.h"
+
+enum {
+       BT_WAIT_QUEUES  = 8,
+       BT_WAIT_BATCH   = 8,
+};
+
+struct bt_wait_state {
+       atomic_t wait_cnt;
+       wait_queue_head_t wait;
+} ____cacheline_aligned_in_smp;
+
+#define TAG_TO_INDEX(bt, tag)  ((tag) >> (bt)->bits_per_word)
+#define TAG_TO_BIT(bt, tag)    ((tag) & ((1 << (bt)->bits_per_word) - 1))
+
+struct blk_mq_bitmap_tags {
+       unsigned int depth;
+       unsigned int wake_cnt;
+       unsigned int bits_per_word;
+
+       unsigned int map_nr;
+       struct blk_align_bitmap *map;
+
+       unsigned int wake_index;
+       struct bt_wait_state *bs;
+};
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+       unsigned int nr_tags;
+       unsigned int nr_reserved_tags;
+
+       atomic_t active_queues;
+
+       struct blk_mq_bitmap_tags bitmap_tags;
+       struct blk_mq_bitmap_tags breserved_tags;
+
+       struct request **rqs;
+       struct list_head page_list;
+};
+
 
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
-extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved);
-extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags);
-extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
-extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
+extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
+extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
+extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
 
 enum {
        BLK_MQ_TAG_CACHE_MIN    = 1,
@@ -24,4 +66,23 @@ enum {
        BLK_MQ_TAG_MAX          = BLK_MQ_TAG_FAIL - 1,
 };
 
+extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+
+static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+       if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+               return false;
+
+       return __blk_mq_tag_busy(hctx);
+}
+
+static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+       if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+               return;
+
+       __blk_mq_tag_idle(hctx);
+}
+
 #endif
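
The TAG_TO_INDEX()/TAG_TO_BIT() macros above split a tag into a word index and a bit offset within that word. A tiny userspace sketch of that split, with a hypothetical bits_per_word and tag value:

#include <stdio.h>

int main(void)
{
        unsigned int bits_per_word = 5;                          /* hypothetical value */
        unsigned int tag = 42;

        unsigned int index = tag >> bits_per_word;               /* which bitmap word */
        unsigned int bit   = tag & ((1U << bits_per_word) - 1);  /* which bit in that word */

        printf("tag %u -> word %u, bit %u\n", tag, index, bit);  /* tag 42 -> word 1, bit 10 */
        return 0;
}
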
index 1d2a9bdbee57f100faacf91ab3a9aef6b7b2a944..0f5879c42dcd3e489688bf26f263d5e2ef614ed4 100644 (file)
@@ -1,3 +1,9 @@
+/*
+ * Block multiqueue core code
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ * Copyright (C) 2013-2014 Christoph Hellwig
+ */
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/backing-dev.h>
@@ -56,38 +62,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
        unsigned int i;
 
-       for (i = 0; i < hctx->nr_ctx_map; i++)
-               if (hctx->ctx_map[i])
+       for (i = 0; i < hctx->ctx_map.map_size; i++)
+               if (hctx->ctx_map.map[i].word)
                        return true;
 
        return false;
 }
 
+static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
+                                             struct blk_mq_ctx *ctx)
+{
+       return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
+}
+
+#define CTX_TO_BIT(hctx, ctx)  \
+       ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
+
 /*
  * Mark this ctx as having pending work in this hardware queue
  */
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
                                     struct blk_mq_ctx *ctx)
 {
-       if (!test_bit(ctx->index_hw, hctx->ctx_map))
-               set_bit(ctx->index_hw, hctx->ctx_map);
+       struct blk_align_bitmap *bm = get_bm(hctx, ctx);
+
+       if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
+               set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
-static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
-                                             gfp_t gfp, bool reserved)
+static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
+                                     struct blk_mq_ctx *ctx)
 {
-       struct request *rq;
-       unsigned int tag;
-
-       tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
-       if (tag != BLK_MQ_TAG_FAIL) {
-               rq = hctx->rqs[tag];
-               rq->tag = tag;
-
-               return rq;
-       }
+       struct blk_align_bitmap *bm = get_bm(hctx, ctx);
 
-       return NULL;
+       clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
 }
 
 static int blk_mq_queue_enter(struct request_queue *q)
@@ -186,78 +194,95 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
        if (blk_queue_io_stat(q))
                rw_flags |= REQ_IO_STAT;
 
+       INIT_LIST_HEAD(&rq->queuelist);
+       /* csd/requeue_work/fifo_time is initialized before use */
+       rq->q = q;
        rq->mq_ctx = ctx;
-       rq->cmd_flags = rw_flags;
-       rq->start_time = jiffies;
+       rq->cmd_flags |= rw_flags;
+       /* do not touch atomic flags, it needs atomic ops against the timer */
+       rq->cpu = -1;
+       INIT_HLIST_NODE(&rq->hash);
+       RB_CLEAR_NODE(&rq->rb_node);
+       rq->rq_disk = NULL;
+       rq->part = NULL;
+#ifdef CONFIG_BLK_CGROUP
+       rq->rl = NULL;
        set_start_time_ns(rq);
+       rq->io_start_time_ns = 0;
+#endif
+       rq->nr_phys_segments = 0;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+       rq->nr_integrity_segments = 0;
+#endif
+       rq->special = NULL;
+       /* tag was already set */
+       rq->errors = 0;
+
+       rq->extra_len = 0;
+       rq->sense_len = 0;
+       rq->resid_len = 0;
+       rq->sense = NULL;
+
+       INIT_LIST_HEAD(&rq->timeout_list);
+       rq->end_io = NULL;
+       rq->end_io_data = NULL;
+       rq->next_rq = NULL;
+
        ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
 }
 
-static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
-                                                  int rw, gfp_t gfp,
-                                                  bool reserved)
+static struct request *
+__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+               struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved)
 {
        struct request *rq;
+       unsigned int tag;
 
-       do {
-               struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
-               struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
+       tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
+       if (tag != BLK_MQ_TAG_FAIL) {
+               rq = hctx->tags->rqs[tag];
 
-               rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
-               if (rq) {
-                       blk_mq_rq_ctx_init(q, ctx, rq, rw);
-                       break;
+               rq->cmd_flags = 0;
+               if (blk_mq_tag_busy(hctx)) {
+                       rq->cmd_flags = REQ_MQ_INFLIGHT;
+                       atomic_inc(&hctx->nr_active);
                }
 
-               blk_mq_put_ctx(ctx);
-               if (!(gfp & __GFP_WAIT))
-                       break;
-
-               __blk_mq_run_hw_queue(hctx);
-               blk_mq_wait_for_tags(hctx->tags);
-       } while (1);
+               rq->tag = tag;
+               blk_mq_rq_ctx_init(q, ctx, rq, rw);
+               return rq;
+       }
 
-       return rq;
+       return NULL;
 }
 
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
+               bool reserved)
 {
+       struct blk_mq_ctx *ctx;
+       struct blk_mq_hw_ctx *hctx;
        struct request *rq;
 
        if (blk_mq_queue_enter(q))
                return NULL;
 
-       rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
-       if (rq)
-               blk_mq_put_ctx(rq->mq_ctx);
-       return rq;
-}
-
-struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
-                                             gfp_t gfp)
-{
-       struct request *rq;
+       ctx = blk_mq_get_ctx(q);
+       hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
-       if (blk_mq_queue_enter(q))
-               return NULL;
+       rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT,
+                                   reserved);
+       if (!rq && (gfp & __GFP_WAIT)) {
+               __blk_mq_run_hw_queue(hctx);
+               blk_mq_put_ctx(ctx);
 
-       rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
-       if (rq)
-               blk_mq_put_ctx(rq->mq_ctx);
+               ctx = blk_mq_get_ctx(q);
+               hctx = q->mq_ops->map_queue(q, ctx->cpu);
+               rq =  __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved);
+       }
+       blk_mq_put_ctx(ctx);
        return rq;
 }
-EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
-
-/*
- * Re-init and set pdu, if we have it
- */
-void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
-{
-       blk_rq_init(hctx->queue, rq);
-
-       if (hctx->cmd_size)
-               rq->special = blk_mq_rq_to_pdu(rq);
-}
+EXPORT_SYMBOL(blk_mq_alloc_request);
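
With the rework above, blk_mq_alloc_request() takes the queue, data direction, gfp mask and a reserved-pool flag directly. A hedged sketch of how a driver might use it for an internal command, assuming the tag set was created with reserved tags; struct my_dev and my_submit_internal() are hypothetical and not part of the patch:

#include <linux/blk-mq.h>
#include <linux/blkdev.h>

struct my_dev {                                 /* hypothetical driver context */
        struct request_queue *queue;
};

static int my_submit_internal(struct my_dev *dev)
{
        struct request *rq;

        /* May sleep (GFP_KERNEL includes __GFP_WAIT) and draws from the reserved pool. */
        rq = blk_mq_alloc_request(dev->queue, WRITE, GFP_KERNEL, true);
        if (!rq)
                return -ENOMEM;

        /* ... set up the command in the driver PDU via blk_mq_rq_to_pdu(rq) ... */

        blk_mq_free_request(rq);                /* or complete it via blk_mq_end_io() */
        return 0;
}
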
 
 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
                                  struct blk_mq_ctx *ctx, struct request *rq)
@@ -265,9 +290,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
        const int tag = rq->tag;
        struct request_queue *q = rq->q;
 
-       blk_mq_rq_init(hctx, rq);
-       blk_mq_put_tag(hctx->tags, tag);
+       if (rq->cmd_flags & REQ_MQ_INFLIGHT)
+               atomic_dec(&hctx->nr_active);
 
+       clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+       blk_mq_put_tag(hctx, tag, &ctx->last_tag);
        blk_mq_queue_exit(q);
 }
 
@@ -283,20 +310,47 @@ void blk_mq_free_request(struct request *rq)
        __blk_mq_free_request(hctx, ctx, rq);
 }
 
-bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes)
+/*
+ * Clone all relevant state from a request that has been put on hold in
+ * the flush state machine into the preallocated flush request that hangs
+ * off the request queue.
+ *
+ * For a driver the flush request should be invisible; that's why we are
+ * impersonating the original request here.
+ */
+void blk_mq_clone_flush_request(struct request *flush_rq,
+               struct request *orig_rq)
 {
-       if (blk_update_request(rq, error, blk_rq_bytes(rq)))
-               return true;
+       struct blk_mq_hw_ctx *hctx =
+               orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
+
+       flush_rq->mq_ctx = orig_rq->mq_ctx;
+       flush_rq->tag = orig_rq->tag;
+       memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
+               hctx->cmd_size);
+}
 
+inline void __blk_mq_end_io(struct request *rq, int error)
+{
        blk_account_io_done(rq);
 
-       if (rq->end_io)
+       if (rq->end_io) {
                rq->end_io(rq, error);
-       else
+       } else {
+               if (unlikely(blk_bidi_rq(rq)))
+                       blk_mq_free_request(rq->next_rq);
                blk_mq_free_request(rq);
-       return false;
+       }
+}
+EXPORT_SYMBOL(__blk_mq_end_io);
+
+void blk_mq_end_io(struct request *rq, int error)
+{
+       if (blk_update_request(rq, error, blk_rq_bytes(rq)))
+               BUG();
+       __blk_mq_end_io(rq, error);
 }
-EXPORT_SYMBOL(blk_mq_end_io_partial);
+EXPORT_SYMBOL(blk_mq_end_io);
 
 static void __blk_mq_complete_request_remote(void *data)
 {
@@ -305,18 +359,22 @@ static void __blk_mq_complete_request_remote(void *data)
        rq->q->softirq_done_fn(rq);
 }
 
-void __blk_mq_complete_request(struct request *rq)
+static void blk_mq_ipi_complete_request(struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
+       bool shared = false;
        int cpu;
 
-       if (!ctx->ipi_redirect) {
+       if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
                rq->q->softirq_done_fn(rq);
                return;
        }
 
        cpu = get_cpu();
-       if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
+       if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
+               shared = cpus_share_cache(cpu, ctx->cpu);
+
+       if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
                rq->csd.func = __blk_mq_complete_request_remote;
                rq->csd.info = rq;
                rq->csd.flags = 0;
@@ -327,6 +385,16 @@ void __blk_mq_complete_request(struct request *rq)
        put_cpu();
 }
 
+void __blk_mq_complete_request(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+
+       if (!q->softirq_done_fn)
+               blk_mq_end_io(rq, rq->errors);
+       else
+               blk_mq_ipi_complete_request(rq);
+}
+
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:                the request being processed
@@ -337,7 +405,9 @@ void __blk_mq_complete_request(struct request *rq)
  **/
 void blk_mq_complete_request(struct request *rq)
 {
-       if (unlikely(blk_should_fake_timeout(rq->q)))
+       struct request_queue *q = rq->q;
+
+       if (unlikely(blk_should_fake_timeout(q)))
                return;
        if (!blk_mark_rq_complete(rq))
                __blk_mq_complete_request(rq);
@@ -350,13 +420,31 @@ static void blk_mq_start_request(struct request *rq, bool last)
 
        trace_block_rq_issue(q, rq);
 
+       rq->resid_len = blk_rq_bytes(rq);
+       if (unlikely(blk_bidi_rq(rq)))
+               rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
+
        /*
         * Just mark start time and set the started bit. Due to memory
         * ordering, we know we'll see the correct deadline as long as
-        * REQ_ATOMIC_STARTED is seen.
+        * REQ_ATOM_STARTED is seen. Use the default queue timeout,
+        * unless one has been set in the request.
+        */
+       if (!rq->timeout)
+               rq->deadline = jiffies + q->rq_timeout;
+       else
+               rq->deadline = jiffies + rq->timeout;
+
+       /*
+        * Mark us as started and clear complete. Complete might have been
+        * set if requeue raced with timeout, which then marked it as
+        * complete. So be sure to clear complete again when we start
+        * the request, otherwise we'll ignore the completion event.
         */
-       rq->deadline = jiffies + q->rq_timeout;
-       set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+       if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+               set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+       if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+               clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 
        if (q->dma_drain_size && blk_rq_bytes(rq)) {
                /*
@@ -378,7 +466,7 @@ static void blk_mq_start_request(struct request *rq, bool last)
                rq->cmd_flags |= REQ_END;
 }
 
-static void blk_mq_requeue_request(struct request *rq)
+static void __blk_mq_requeue_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
@@ -391,6 +479,86 @@ static void blk_mq_requeue_request(struct request *rq)
                rq->nr_phys_segments--;
 }
 
+void blk_mq_requeue_request(struct request *rq)
+{
+       __blk_mq_requeue_request(rq);
+       blk_clear_rq_complete(rq);
+
+       BUG_ON(blk_queued_rq(rq));
+       blk_mq_add_to_requeue_list(rq, true);
+}
+EXPORT_SYMBOL(blk_mq_requeue_request);
+
+static void blk_mq_requeue_work(struct work_struct *work)
+{
+       struct request_queue *q =
+               container_of(work, struct request_queue, requeue_work);
+       LIST_HEAD(rq_list);
+       struct request *rq, *next;
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->requeue_lock, flags);
+       list_splice_init(&q->requeue_list, &rq_list);
+       spin_unlock_irqrestore(&q->requeue_lock, flags);
+
+       list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
+               if (!(rq->cmd_flags & REQ_SOFTBARRIER))
+                       continue;
+
+               rq->cmd_flags &= ~REQ_SOFTBARRIER;
+               list_del_init(&rq->queuelist);
+               blk_mq_insert_request(rq, true, false, false);
+       }
+
+       while (!list_empty(&rq_list)) {
+               rq = list_entry(rq_list.next, struct request, queuelist);
+               list_del_init(&rq->queuelist);
+               blk_mq_insert_request(rq, false, false, false);
+       }
+
+       blk_mq_run_queues(q, false);
+}
+
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
+{
+       struct request_queue *q = rq->q;
+       unsigned long flags;
+
+       /*
+        * We abuse this flag that is otherwise used by the I/O scheduler to
+        * request head insertion from the workqueue.
+        */
+       BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
+
+       spin_lock_irqsave(&q->requeue_lock, flags);
+       if (at_head) {
+               rq->cmd_flags |= REQ_SOFTBARRIER;
+               list_add(&rq->queuelist, &q->requeue_list);
+       } else {
+               list_add_tail(&rq->queuelist, &q->requeue_list);
+       }
+       spin_unlock_irqrestore(&q->requeue_lock, flags);
+}
+EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
+
+void blk_mq_kick_requeue_list(struct request_queue *q)
+{
+       kblockd_schedule_work(&q->requeue_work);
+}
+EXPORT_SYMBOL(blk_mq_kick_requeue_list);
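
A hedged sketch of how a driver might use the new requeue list from a retry path; my_retry_request() is hypothetical, and only blk_mq_requeue_request() and blk_mq_kick_requeue_list() come from the patch above:

#include <linux/blk-mq.h>

/* Called for a command the (hypothetical) driver wants the block layer to retry. */
static void my_retry_request(struct request *rq)
{
        /* Unprepare the request and park it on the queue's requeue list ... */
        blk_mq_requeue_request(rq);

        /* ... then have kblockd re-insert it and re-run the queues. */
        blk_mq_kick_requeue_list(rq->q);
}
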
+
+struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+{
+       struct request_queue *q = hctx->queue;
+
+       if ((q->flush_rq->cmd_flags & REQ_FLUSH_SEQ) &&
+           q->flush_rq->tag == tag)
+               return q->flush_rq;
+
+       return hctx->tags->rqs[tag];
+}
+EXPORT_SYMBOL(blk_mq_tag_to_rq);
+
 struct blk_mq_timeout_data {
        struct blk_mq_hw_ctx *hctx;
        unsigned long *next;
@@ -412,12 +580,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
        do {
                struct request *rq;
 
-               tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
-               if (tag >= hctx->queue_depth)
+               tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
+               if (tag >= hctx->tags->nr_tags)
                        break;
 
-               rq = hctx->rqs[tag++];
-
+               rq = blk_mq_tag_to_rq(hctx, tag++);
+               if (rq->q != hctx->queue)
+                       continue;
                if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
                        continue;
 
@@ -442,6 +611,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
        blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
 }
 
+static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
+{
+       struct request_queue *q = rq->q;
+
+       /*
+        * We know that complete is set at this point. If STARTED isn't set
+        * anymore, then the request isn't active and the "timeout" should
+        * just be ignored. This can happen due to the bitflag ordering.
+        * Timeout first checks if STARTED is set, and if it is, assumes
+        * the request is active. But if we race with completion, then
+        * both flags will get cleared. So check here again, and ignore
+        * a timeout event with a request that isn't active.
+        */
+       if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+               return BLK_EH_NOT_HANDLED;
+
+       if (!q->mq_ops->timeout)
+               return BLK_EH_RESET_TIMER;
+
+       return q->mq_ops->timeout(rq);
+}
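
blk_mq_rq_timed_out() above falls back to BLK_EH_RESET_TIMER when the driver has no timeout handler; otherwise it forwards the request to that handler. A hedged sketch of what such a driver-side handler could look like; my_timeout() and my_request_is_still_inflight() are hypothetical:

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/errno.h>

static bool my_request_is_still_inflight(struct request *rq)
{
        return false;   /* placeholder: a real driver would check its hardware state */
}

static enum blk_eh_timer_return my_timeout(struct request *rq)
{
        /* Hardware still owns the command: re-arm the timer and keep waiting. */
        if (my_request_is_still_inflight(rq))
                return BLK_EH_RESET_TIMER;

        /* Otherwise flag the failure and report the timeout as handled. */
        rq->errors = -ETIMEDOUT;
        return BLK_EH_HANDLED;
}
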
+
 static void blk_mq_rq_timer(unsigned long data)
 {
        struct request_queue *q = (struct request_queue *) data;
@@ -449,11 +640,24 @@ static void blk_mq_rq_timer(unsigned long data)
        unsigned long next = 0;
        int i, next_set = 0;
 
-       queue_for_each_hw_ctx(q, hctx, i)
+       queue_for_each_hw_ctx(q, hctx, i) {
+               /*
+                * If no software queues are currently mapped to this
+                * hardware queue, there's nothing to check
+                */
+               if (!hctx->nr_ctx || !hctx->tags)
+                       continue;
+
                blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
+       }
 
-       if (next_set)
-               mod_timer(&q->timeout, round_jiffies_up(next));
+       if (next_set) {
+               next = blk_rq_timeout(round_jiffies_up(next));
+               mod_timer(&q->timeout, next);
+       } else {
+               queue_for_each_hw_ctx(q, hctx, i)
+                       blk_mq_tag_idle(hctx);
+       }
 }
 
 /*
@@ -495,9 +699,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
        return false;
 }
 
-void blk_mq_add_timer(struct request *rq)
+/*
+ * Process software queues that have been marked busy, splicing them
+ * to the for-dispatch
+ */
+static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
-       __blk_add_timer(rq, NULL);
+       struct blk_mq_ctx *ctx;
+       int i;
+
+       for (i = 0; i < hctx->ctx_map.map_size; i++) {
+               struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
+               unsigned int off, bit;
+
+               if (!bm->word)
+                       continue;
+
+               bit = 0;
+               off = i * hctx->ctx_map.bits_per_word;
+               do {
+                       bit = find_next_bit(&bm->word, bm->depth, bit);
+                       if (bit >= bm->depth)
+                               break;
+
+                       ctx = hctx->ctxs[bit + off];
+                       clear_bit(bit, &bm->word);
+                       spin_lock(&ctx->lock);
+                       list_splice_tail_init(&ctx->rq_list, list);
+                       spin_unlock(&ctx->lock);
+
+                       bit++;
+               } while (1);
+       }
 }
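
The loop above visits every set bit of the context map, one word at a time. A standalone userspace sketch of the same scan pattern, using full-width words for simplicity (the real map uses ctx_map.bits_per_word bits per word) and GCC's __builtin_ctzl() in place of find_next_bit():

#include <stdio.h>

int main(void)
{
        unsigned long map[2] = { 0x5UL, 0x1UL };        /* bits 0, 2 and 64 set on a 64-bit box */
        unsigned int bits_per_word = 8 * sizeof(unsigned long);

        for (unsigned int i = 0; i < 2; i++) {
                unsigned long word = map[i];

                while (word) {
                        unsigned int bit = __builtin_ctzl(word);        /* lowest set bit */

                        printf("software queue %lu has pending requests\n",
                               (unsigned long)i * bits_per_word + bit);
                        word &= word - 1;                               /* clear it and go on */
                }
        }
        return 0;
}
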
 
 /*
@@ -509,10 +742,11 @@ void blk_mq_add_timer(struct request *rq)
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        struct request_queue *q = hctx->queue;
-       struct blk_mq_ctx *ctx;
        struct request *rq;
        LIST_HEAD(rq_list);
-       int bit, queued;
+       int queued;
+
+       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
 
        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                return;
@@ -522,15 +756,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
        /*
         * Touch any software queue that has pending entries.
         */
-       for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
-               clear_bit(bit, hctx->ctx_map);
-               ctx = hctx->ctxs[bit];
-               BUG_ON(bit != ctx->index_hw);
-
-               spin_lock(&ctx->lock);
-               list_splice_tail_init(&ctx->rq_list, &rq_list);
-               spin_unlock(&ctx->lock);
-       }
+       flush_busy_ctxs(hctx, &rq_list);
 
        /*
         * If we have previous entries on our dispatch list, grab them
@@ -543,14 +769,10 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                spin_unlock(&hctx->lock);
        }
 
-       /*
-        * Delete and return all entries from our dispatch list
-        */
-       queued = 0;
-
        /*
         * Now process all the entries, sending them to the driver.
         */
+       queued = 0;
        while (!list_empty(&rq_list)) {
                int ret;
 
@@ -565,13 +787,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                        queued++;
                        continue;
                case BLK_MQ_RQ_QUEUE_BUSY:
-                       /*
-                        * FIXME: we should have a mechanism to stop the queue
-                        * like blk_stop_queue, otherwise we will waste cpu
-                        * time
-                        */
                        list_add(&rq->queuelist, &rq_list);
-                       blk_mq_requeue_request(rq);
+                       __blk_mq_requeue_request(rq);
                        break;
                default:
                        pr_err("blk-mq: bad return on queue: %d\n", ret);
@@ -601,17 +818,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
        }
 }
 
+/*
+ * It'd be great if the workqueue API had a way to pass
+ * in a mask and had some smarts for more clever placement.
+ * For now we just round-robin here, switching for every
+ * BLK_MQ_CPU_WORK_BATCH queued items.
+ */
+static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
+{
+       int cpu = hctx->next_cpu;
+
+       if (--hctx->next_cpu_batch <= 0) {
+               int next_cpu;
+
+               next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+               if (next_cpu >= nr_cpu_ids)
+                       next_cpu = cpumask_first(hctx->cpumask);
+
+               hctx->next_cpu = next_cpu;
+               hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+       }
+
+       return cpu;
+}
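
A standalone userspace sketch of the same round-robin-with-batching policy, with a hypothetical CPU list and batch size; each mapped CPU gets BATCH consecutive work items before the next one is picked:

#include <stdio.h>

#define BATCH   8

int main(void)
{
        int cpus[] = { 1, 3, 5 };               /* CPUs mapped to this hardware queue */
        int nr_cpus = 3, cur = 0, batch = BATCH;

        for (int i = 0; i < 20; i++) {
                int cpu = cpus[cur];            /* hand out the current CPU ... */

                if (--batch <= 0) {             /* ... and advance after BATCH picks */
                        cur = (cur + 1) % nr_cpus;
                        batch = BATCH;
                }
                printf("work item %d runs on cpu %d\n", i, cpu);
        }
        return 0;
}
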
+
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
        if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                return;
 
-       if (!async)
+       if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
                __blk_mq_run_hw_queue(hctx);
+       else if (hctx->queue->nr_hw_queues == 1)
+               kblockd_schedule_delayed_work(&hctx->run_work, 0);
        else {
-               struct request_queue *q = hctx->queue;
+               unsigned int cpu;
 
-               kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
+               cpu = blk_mq_hctx_next_cpu(hctx);
+               kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
        }
 }
 
@@ -626,14 +870,17 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
                    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                        continue;
 
+               preempt_disable();
                blk_mq_run_hw_queue(hctx, async);
+               preempt_enable();
        }
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-       cancel_delayed_work(&hctx->delayed_work);
+       cancel_delayed_work(&hctx->run_work);
+       cancel_delayed_work(&hctx->delay_work);
        set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -651,11 +898,25 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+
+       preempt_disable();
        __blk_mq_run_hw_queue(hctx);
+       preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
-void blk_mq_start_stopped_hw_queues(struct request_queue *q)
+void blk_mq_start_hw_queues(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       int i;
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               blk_mq_start_hw_queue(hctx);
+}
+EXPORT_SYMBOL(blk_mq_start_hw_queues);
+
+
+void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 {
        struct blk_mq_hw_ctx *hctx;
        int i;
@@ -665,19 +926,47 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q)
                        continue;
 
                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-               blk_mq_run_hw_queue(hctx, true);
+               preempt_disable();
+               blk_mq_run_hw_queue(hctx, async);
+               preempt_enable();
        }
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
 
-static void blk_mq_work_fn(struct work_struct *work)
+static void blk_mq_run_work_fn(struct work_struct *work)
 {
        struct blk_mq_hw_ctx *hctx;
 
-       hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
+       hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
+
        __blk_mq_run_hw_queue(hctx);
 }
 
+static void blk_mq_delay_work_fn(struct work_struct *work)
+{
+       struct blk_mq_hw_ctx *hctx;
+
+       hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
+
+       if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
+               __blk_mq_run_hw_queue(hctx);
+}
+
+void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
+{
+       unsigned long tmo = msecs_to_jiffies(msecs);
+
+       if (hctx->queue->nr_hw_queues == 1)
+               kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
+       else {
+               unsigned int cpu;
+
+               cpu = blk_mq_hctx_next_cpu(hctx);
+               kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
+       }
+}
+EXPORT_SYMBOL(blk_mq_delay_queue);
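
A hedged sketch of how a driver's queue_rq handler might pair BLK_MQ_RQ_QUEUE_BUSY with the new blk_mq_delay_queue(); note that blk_mq_delay_work_fn() above only re-runs queues that are stopped, so the queue is stopped first. my_queue_rq(), my_ring_full() and the 3 ms delay are hypothetical:

#include <linux/blk-mq.h>

static bool my_ring_full(struct blk_mq_hw_ctx *hctx)
{
        return false;   /* placeholder: a real driver would check its submission ring */
}

static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
        if (my_ring_full(hctx)) {
                /*
                 * Stop the queue and have blk-mq restart it shortly;
                 * blk_mq_delay_work_fn() only re-runs stopped queues.
                 */
                blk_mq_stop_hw_queue(hctx);
                blk_mq_delay_queue(hctx, 3);
                return BLK_MQ_RQ_QUEUE_BUSY;
        }

        /* ... issue rq to the hardware here ... */
        return BLK_MQ_RQ_QUEUE_OK;
}
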
+
 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
                                    struct request *rq, bool at_head)
 {
@@ -689,12 +978,13 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
                list_add(&rq->queuelist, &ctx->rq_list);
        else
                list_add_tail(&rq->queuelist, &ctx->rq_list);
+
        blk_mq_hctx_mark_pending(hctx, ctx);
 
        /*
         * We do this early, to ensure we are on the right CPU.
         */
-       blk_mq_add_timer(rq);
+       blk_add_timer(rq);
 }
 
 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
@@ -719,10 +1009,10 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
                spin_unlock(&ctx->lock);
        }
 
-       blk_mq_put_ctx(current_ctx);
-
        if (run_queue)
                blk_mq_run_hw_queue(hctx, async);
+
+       blk_mq_put_ctx(current_ctx);
 }
 
 static void blk_mq_insert_requests(struct request_queue *q,
@@ -758,9 +1048,8 @@ static void blk_mq_insert_requests(struct request_queue *q,
        }
        spin_unlock(&ctx->lock);
 
-       blk_mq_put_ctx(current_ctx);
-
        blk_mq_run_hw_queue(hctx, from_schedule);
+       blk_mq_put_ctx(current_ctx);
 }
 
 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -823,63 +1112,185 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
        init_request_from_bio(rq, bio);
-       blk_account_io_start(rq, 1);
+
+       if (blk_do_io_stat(rq)) {
+               rq->start_time = jiffies;
+               blk_account_io_start(rq, 1);
+       }
 }
 
-static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
+static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
+                                        struct blk_mq_ctx *ctx,
+                                        struct request *rq, struct bio *bio)
+{
+       struct request_queue *q = hctx->queue;
+
+       if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
+               blk_mq_bio_to_request(rq, bio);
+               spin_lock(&ctx->lock);
+insert_rq:
+               __blk_mq_insert_request(hctx, rq, false);
+               spin_unlock(&ctx->lock);
+               return false;
+       } else {
+               spin_lock(&ctx->lock);
+               if (!blk_mq_attempt_merge(q, ctx, bio)) {
+                       blk_mq_bio_to_request(rq, bio);
+                       goto insert_rq;
+               }
+
+               spin_unlock(&ctx->lock);
+               __blk_mq_free_request(hctx, ctx, rq);
+               return true;
+       }
+}
+
+struct blk_map_ctx {
+       struct blk_mq_hw_ctx *hctx;
+       struct blk_mq_ctx *ctx;
+};
+
+static struct request *blk_mq_map_request(struct request_queue *q,
+                                         struct bio *bio,
+                                         struct blk_map_ctx *data)
 {
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
-       const int is_sync = rw_is_sync(bio->bi_rw);
-       const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
-       int rw = bio_data_dir(bio);
        struct request *rq;
-       unsigned int use_plug, request_count = 0;
-
-       /*
-        * If we have multiple hardware queues, just go directly to
-        * one of those for sync IO.
-        */
-       use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);
-
-       blk_queue_bounce(q, &bio);
+       int rw = bio_data_dir(bio);
 
-       if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
+       if (unlikely(blk_mq_queue_enter(q))) {
                bio_endio(bio, -EIO);
-               return;
-       }
-
-       if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
-               return;
-
-       if (blk_mq_queue_enter(q)) {
-               bio_endio(bio, -EIO);
-               return;
+               return NULL;
        }
 
        ctx = blk_mq_get_ctx(q);
        hctx = q->mq_ops->map_queue(q, ctx->cpu);
 
-       if (is_sync)
+       if (rw_is_sync(bio->bi_rw))
                rw |= REQ_SYNC;
+
        trace_block_getrq(q, bio, rw);
-       rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
-       if (likely(rq))
-               blk_mq_rq_ctx_init(q, ctx, rq, rw);
-       else {
+       rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false);
+       if (unlikely(!rq)) {
+               __blk_mq_run_hw_queue(hctx);
                blk_mq_put_ctx(ctx);
                trace_block_sleeprq(q, bio, rw);
-               rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
-                                                       false);
-               ctx = rq->mq_ctx;
+
+               ctx = blk_mq_get_ctx(q);
                hctx = q->mq_ops->map_queue(q, ctx->cpu);
+               rq = __blk_mq_alloc_request(q, hctx, ctx, rw,
+                                           __GFP_WAIT|GFP_ATOMIC, false);
        }
 
        hctx->queued++;
+       data->hctx = hctx;
+       data->ctx = ctx;
+       return rq;
+}
+
+/*
+ * Multiple hardware queue variant. This will not use per-process plugs,
+ * but will attempt to bypass the hctx queueing if we can go straight to
+ * hardware for SYNC IO.
+ */
+static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
+{
+       const int is_sync = rw_is_sync(bio->bi_rw);
+       const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+       struct blk_map_ctx data;
+       struct request *rq;
+
+       blk_queue_bounce(q, &bio);
+
+       if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
+               bio_endio(bio, -EIO);
+               return;
+       }
+
+       rq = blk_mq_map_request(q, bio, &data);
+       if (unlikely(!rq))
+               return;
+
+       if (unlikely(is_flush_fua)) {
+               blk_mq_bio_to_request(rq, bio);
+               blk_insert_flush(rq);
+               goto run_queue;
+       }
+
+       if (is_sync) {
+               int ret;
+
+               blk_mq_bio_to_request(rq, bio);
+               blk_mq_start_request(rq, true);
+               blk_add_timer(rq);
+
+               /*
+                * For an OK return, we are done. For an error, kill it. Any
+                * other return (busy), just add it back to our list as we
+                * previously would have done.
+                */
+               ret = q->mq_ops->queue_rq(data.hctx, rq);
+               if (ret == BLK_MQ_RQ_QUEUE_OK)
+                       goto done;
+               else {
+                       __blk_mq_requeue_request(rq);
+
+                       if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+                               rq->errors = -EIO;
+                               blk_mq_end_io(rq, rq->errors);
+                               goto done;
+                       }
+               }
+       }
+
+       if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+               /*
+                * For a SYNC request, send it to the hardware immediately. For
+                * an ASYNC request, just ensure that we run it later on. The
+                * latter allows for merging opportunities and more efficient
+                * dispatching.
+                */
+run_queue:
+               blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
+       }
+done:
+       blk_mq_put_ctx(data.ctx);
+}
+
+/*
+ * Single hardware queue variant. This will attempt to use any per-process
+ * plug for merging and IO deferral.
+ */
+static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
+{
+       const int is_sync = rw_is_sync(bio->bi_rw);
+       const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+       unsigned int use_plug, request_count = 0;
+       struct blk_map_ctx data;
+       struct request *rq;
+
+       /*
+        * Only use the per-process plug when the IO can be deferred
+        * and merged, i.e. not for flush/FUA or sync requests.
+        */
+       use_plug = !is_flush_fua && !is_sync;
+
+       blk_queue_bounce(q, &bio);
+
+       if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
+               bio_endio(bio, -EIO);
+               return;
+       }
+
+       if (use_plug && !blk_queue_nomerges(q) &&
+           blk_attempt_plug_merge(q, bio, &request_count))
+               return;
+
+       rq = blk_mq_map_request(q, bio, &data);
 
        if (unlikely(is_flush_fua)) {
                blk_mq_bio_to_request(rq, bio);
-               blk_mq_put_ctx(ctx);
                blk_insert_flush(rq);
                goto run_queue;
        }
@@ -901,31 +1312,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
                                trace_block_plug(q);
                        }
                        list_add_tail(&rq->queuelist, &plug->mq_list);
-                       blk_mq_put_ctx(ctx);
+                       blk_mq_put_ctx(data.ctx);
                        return;
                }
        }
 
-       spin_lock(&ctx->lock);
-
-       if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
-           blk_mq_attempt_merge(q, ctx, bio))
-               __blk_mq_free_request(hctx, ctx, rq);
-       else {
-               blk_mq_bio_to_request(rq, bio);
-               __blk_mq_insert_request(hctx, rq, false);
+       if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+               /*
+                * For a SYNC request, send it to the hardware immediately. For
+                * an ASYNC request, just ensure that we run it later on. The
+                * latter allows for merging opportunities and more efficient
+                * dispatching.
+                */
+run_queue:
+               blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
        }
 
-       spin_unlock(&ctx->lock);
-       blk_mq_put_ctx(ctx);
-
-       /*
-        * For a SYNC request, send it to the hardware immediately. For an
-        * ASYNC request, just ensure that we run it later on. The latter
-        * allows for merging opportunities and more efficient dispatching.
-        */
-run_queue:
-       blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
+       blk_mq_put_ctx(data.ctx);
 }
 
 /*
@@ -937,32 +1340,153 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
 }
 EXPORT_SYMBOL(blk_mq_map_queue);
 
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
-                                                  unsigned int hctx_index)
+static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
+               struct blk_mq_tags *tags, unsigned int hctx_idx)
+{
+       struct page *page;
+
+       if (tags->rqs && set->ops->exit_request) {
+               int i;
+
+               for (i = 0; i < tags->nr_tags; i++) {
+                       if (!tags->rqs[i])
+                               continue;
+                       set->ops->exit_request(set->driver_data, tags->rqs[i],
+                                               hctx_idx, i);
+               }
+       }
+
+       while (!list_empty(&tags->page_list)) {
+               page = list_first_entry(&tags->page_list, struct page, lru);
+               list_del_init(&page->lru);
+               __free_pages(page, page->private);
+       }
+
+       kfree(tags->rqs);
+
+       blk_mq_free_tags(tags);
+}
+
+static size_t order_to_size(unsigned int order)
 {
-       return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
-                               GFP_KERNEL | __GFP_ZERO, reg->numa_node);
+       return (size_t)PAGE_SIZE << order;
 }
-EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
 
-void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
-                                unsigned int hctx_index)
+static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+               unsigned int hctx_idx)
 {
-       kfree(hctx);
+       struct blk_mq_tags *tags;
+       unsigned int i, j, entries_per_page, max_order = 4;
+       size_t rq_size, left;
+
+       tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+                               set->numa_node);
+       if (!tags)
+               return NULL;
+
+       INIT_LIST_HEAD(&tags->page_list);
+
+       tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
+                                       GFP_KERNEL, set->numa_node);
+       if (!tags->rqs) {
+               blk_mq_free_tags(tags);
+               return NULL;
+       }
+
+       /*
+        * rq_size is the size of the request plus driver payload, rounded
+        * to the cacheline size
+        */
+       rq_size = round_up(sizeof(struct request) + set->cmd_size,
+                               cache_line_size());
+       left = rq_size * set->queue_depth;
+
+       for (i = 0; i < set->queue_depth; ) {
+               int this_order = max_order;
+               struct page *page;
+               int to_do;
+               void *p;
+
+               while (left < order_to_size(this_order - 1) && this_order)
+                       this_order--;
+
+               do {
+                       page = alloc_pages_node(set->numa_node, GFP_KERNEL,
+                                               this_order);
+                       if (page)
+                               break;
+                       if (!this_order--)
+                               break;
+                       if (order_to_size(this_order) < rq_size)
+                               break;
+               } while (1);
+
+               if (!page)
+                       goto fail;
+
+               page->private = this_order;
+               list_add_tail(&page->lru, &tags->page_list);
+
+               p = page_address(page);
+               entries_per_page = order_to_size(this_order) / rq_size;
+               to_do = min(entries_per_page, set->queue_depth - i);
+               left -= to_do * rq_size;
+               for (j = 0; j < to_do; j++) {
+                       tags->rqs[i] = p;
+                       if (set->ops->init_request) {
+                               if (set->ops->init_request(set->driver_data,
+                                               tags->rqs[i], hctx_idx, i,
+                                               set->numa_node))
+                                       goto fail;
+                       }
+
+                       p += rq_size;
+                       i++;
+               }
+       }
+
+       return tags;
+
+fail:
+       pr_warn("%s: failed to allocate requests\n", __func__);
+       blk_mq_free_rq_map(set, tags, hctx_idx);
+       return NULL;
 }
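
A simplified userspace sketch of the order-fallback allocation loop above, with malloc() standing in for alloc_pages_node() and hypothetical sizes: try a large contiguous chunk first and halve the order until something fits.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        size_t page_size = 4096, rq_size = 384, left = 128 * rq_size;
        int order = 4;                          /* start with 2^4 pages */
        void *chunk = NULL;

        /* Don't ask for a bigger chunk than we still need. */
        while (order && left < (page_size << (order - 1)))
                order--;

        while (order >= 0) {
                chunk = malloc(page_size << order);
                if (chunk)
                        break;
                order--;                        /* halve and retry on failure */
        }

        if (chunk) {
                printf("allocated a %zu-byte chunk (order %d)\n",
                       page_size << order, order);
                free(chunk);
        }
        return 0;
}
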
-EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
 
-static void blk_mq_hctx_notify(void *data, unsigned long action,
-                              unsigned int cpu)
+static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
+{
+       kfree(bitmap->map);
+}
+
+static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
+{
+       unsigned int bpw = 8, total, num_maps, i;
+
+       bitmap->bits_per_word = bpw;
+
+       num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
+       bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
+                                       GFP_KERNEL, node);
+       if (!bitmap->map)
+               return -ENOMEM;
+
+       bitmap->map_size = num_maps;
+
+       total = nr_cpu_ids;
+       for (i = 0; i < num_maps; i++) {
+               bitmap->map[i].depth = min(total, bitmap->bits_per_word);
+               total -= bitmap->map[i].depth;
+       }
+
+       return 0;
+}
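
A small userspace sketch of how blk_mq_alloc_bitmap() above spreads nr_cpu_ids over fixed-width words, with hypothetical numbers: 20 CPUs at 8 bits per word gives per-word depths of 8, 8 and 4.

#include <stdio.h>

int main(void)
{
        unsigned int nr_cpu_ids = 20, bpw = 8;
        unsigned int num_maps = (nr_cpu_ids + bpw - 1) / bpw;  /* ALIGN(nr_cpu_ids, bpw) / bpw */
        unsigned int total = nr_cpu_ids;

        for (unsigned int i = 0; i < num_maps; i++) {
                unsigned int depth = total < bpw ? total : bpw; /* min(total, bits_per_word) */

                printf("word %u: depth %u\n", i, depth);        /* 8, 8, 4 */
                total -= depth;
        }
        return 0;
}
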
+
+static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
 {
-       struct blk_mq_hw_ctx *hctx = data;
        struct request_queue *q = hctx->queue;
        struct blk_mq_ctx *ctx;
        LIST_HEAD(tmp);
 
-       if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
-               return;
-
        /*
         * Move ctx entries to new CPU, if this one is going away.
         */
@@ -971,12 +1495,12 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->rq_list)) {
                list_splice_init(&ctx->rq_list, &tmp);
-               clear_bit(ctx->index_hw, hctx->ctx_map);
+               blk_mq_hctx_clear_pending(hctx, ctx);
        }
        spin_unlock(&ctx->lock);
 
        if (list_empty(&tmp))
-               return;
+               return NOTIFY_OK;
 
        ctx = blk_mq_get_ctx(q);
        spin_lock(&ctx->lock);
@@ -993,210 +1517,103 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
        blk_mq_hctx_mark_pending(hctx, ctx);
 
        spin_unlock(&ctx->lock);
-       blk_mq_put_ctx(ctx);
 
        blk_mq_run_hw_queue(hctx, true);
+       blk_mq_put_ctx(ctx);
+       return NOTIFY_OK;
 }
 
-static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
-                                  int (*init)(void *, struct blk_mq_hw_ctx *,
-                                       struct request *, unsigned int),
-                                  void *data)
+static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
 {
-       unsigned int i;
-       int ret = 0;
-
-       for (i = 0; i < hctx->queue_depth; i++) {
-               struct request *rq = hctx->rqs[i];
-
-               ret = init(data, hctx, rq, i);
-               if (ret)
-                       break;
-       }
-
-       return ret;
-}
+       struct request_queue *q = hctx->queue;
+       struct blk_mq_tag_set *set = q->tag_set;
 
-int blk_mq_init_commands(struct request_queue *q,
-                        int (*init)(void *, struct blk_mq_hw_ctx *,
-                                       struct request *, unsigned int),
-                        void *data)
-{
-       struct blk_mq_hw_ctx *hctx;
-       unsigned int i;
-       int ret = 0;
+       if (set->tags[hctx->queue_num])
+               return NOTIFY_OK;
 
-       queue_for_each_hw_ctx(q, hctx, i) {
-               ret = blk_mq_init_hw_commands(hctx, init, data);
-               if (ret)
-                       break;
-       }
+       set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
+       if (!set->tags[hctx->queue_num])
+               return NOTIFY_STOP;
 
-       return ret;
+       hctx->tags = set->tags[hctx->queue_num];
+       return NOTIFY_OK;
 }
-EXPORT_SYMBOL(blk_mq_init_commands);
 
-static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx,
-                                   void (*free)(void *, struct blk_mq_hw_ctx *,
-                                       struct request *, unsigned int),
-                                   void *data)
+static int blk_mq_hctx_notify(void *data, unsigned long action,
+                             unsigned int cpu)
 {
-       unsigned int i;
+       struct blk_mq_hw_ctx *hctx = data;
 
-       for (i = 0; i < hctx->queue_depth; i++) {
-               struct request *rq = hctx->rqs[i];
+       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+               return blk_mq_hctx_cpu_offline(hctx, cpu);
+       else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
+               return blk_mq_hctx_cpu_online(hctx, cpu);
 
-               free(data, hctx, rq, i);
-       }
+       return NOTIFY_OK;
 }
 
-void blk_mq_free_commands(struct request_queue *q,
-                         void (*free)(void *, struct blk_mq_hw_ctx *,
-                                       struct request *, unsigned int),
-                         void *data)
+static void blk_mq_exit_hw_queues(struct request_queue *q,
+               struct blk_mq_tag_set *set, int nr_queue)
 {
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;
 
-       queue_for_each_hw_ctx(q, hctx, i)
-               blk_mq_free_hw_commands(hctx, free, data);
-}
-EXPORT_SYMBOL(blk_mq_free_commands);
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (i == nr_queue)
+                       break;
 
-static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
-{
-       struct page *page;
+               if (set->ops->exit_hctx)
+                       set->ops->exit_hctx(hctx, i);
 
-       while (!list_empty(&hctx->page_list)) {
-               page = list_first_entry(&hctx->page_list, struct page, lru);
-               list_del_init(&page->lru);
-               __free_pages(page, page->private);
+               blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
+               kfree(hctx->ctxs);
+               blk_mq_free_bitmap(&hctx->ctx_map);
        }
 
-       kfree(hctx->rqs);
-
-       if (hctx->tags)
-               blk_mq_free_tags(hctx->tags);
-}
-
-static size_t order_to_size(unsigned int order)
-{
-       size_t ret = PAGE_SIZE;
-
-       while (order--)
-               ret *= 2;
-
-       return ret;
 }
 
-static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
-                             unsigned int reserved_tags, int node)
+static void blk_mq_free_hw_queues(struct request_queue *q,
+               struct blk_mq_tag_set *set)
 {
-       unsigned int i, j, entries_per_page, max_order = 4;
-       size_t rq_size, left;
-
-       INIT_LIST_HEAD(&hctx->page_list);
-
-       hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
-                                       GFP_KERNEL, node);
-       if (!hctx->rqs)
-               return -ENOMEM;
-
-       /*
-        * rq_size is the size of the request plus driver payload, rounded
-        * to the cacheline size
-        */
-       rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
-                               cache_line_size());
-       left = rq_size * hctx->queue_depth;
-
-       for (i = 0; i < hctx->queue_depth;) {
-               int this_order = max_order;
-               struct page *page;
-               int to_do;
-               void *p;
-
-               while (left < order_to_size(this_order - 1) && this_order)
-                       this_order--;
-
-               do {
-                       page = alloc_pages_node(node, GFP_KERNEL, this_order);
-                       if (page)
-                               break;
-                       if (!this_order--)
-                               break;
-                       if (order_to_size(this_order) < rq_size)
-                               break;
-               } while (1);
-
-               if (!page)
-                       break;
-
-               page->private = this_order;
-               list_add_tail(&page->lru, &hctx->page_list);
-
-               p = page_address(page);
-               entries_per_page = order_to_size(this_order) / rq_size;
-               to_do = min(entries_per_page, hctx->queue_depth - i);
-               left -= to_do * rq_size;
-               for (j = 0; j < to_do; j++) {
-                       hctx->rqs[i] = p;
-                       blk_mq_rq_init(hctx, hctx->rqs[i]);
-                       p += rq_size;
-                       i++;
-               }
-       }
-
-       if (i < (reserved_tags + BLK_MQ_TAG_MIN))
-               goto err_rq_map;
-       else if (i != hctx->queue_depth) {
-               hctx->queue_depth = i;
-               pr_warn("%s: queue depth set to %u because of low memory\n",
-                                       __func__, i);
-       }
+       struct blk_mq_hw_ctx *hctx;
+       unsigned int i;
 
-       hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
-       if (!hctx->tags) {
-err_rq_map:
-               blk_mq_free_rq_map(hctx);
-               return -ENOMEM;
+       queue_for_each_hw_ctx(q, hctx, i) {
+               free_cpumask_var(hctx->cpumask);
+               kfree(hctx);
        }
-
-       return 0;
 }
 
 static int blk_mq_init_hw_queues(struct request_queue *q,
-                                struct blk_mq_reg *reg, void *driver_data)
+               struct blk_mq_tag_set *set)
 {
        struct blk_mq_hw_ctx *hctx;
-       unsigned int i, j;
+       unsigned int i;
 
        /*
         * Initialize hardware queues
         */
        queue_for_each_hw_ctx(q, hctx, i) {
-               unsigned int num_maps;
                int node;
 
                node = hctx->numa_node;
                if (node == NUMA_NO_NODE)
-                       node = hctx->numa_node = reg->numa_node;
+                       node = hctx->numa_node = set->numa_node;
 
-               INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
+               INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
+               INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
                spin_lock_init(&hctx->lock);
                INIT_LIST_HEAD(&hctx->dispatch);
                hctx->queue = q;
                hctx->queue_num = i;
-               hctx->flags = reg->flags;
-               hctx->queue_depth = reg->queue_depth;
-               hctx->cmd_size = reg->cmd_size;
+               hctx->flags = set->flags;
+               hctx->cmd_size = set->cmd_size;
 
                blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
                                                blk_mq_hctx_notify, hctx);
                blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
 
-               if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
-                       break;
+               hctx->tags = set->tags[i];
 
                /*
                 * Allocate space for all possible cpus to avoid allocation in
@@ -1207,17 +1624,13 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
                if (!hctx->ctxs)
                        break;
 
-               num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
-               hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
-                                               GFP_KERNEL, node);
-               if (!hctx->ctx_map)
+               if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
                        break;
 
-               hctx->nr_ctx_map = num_maps;
                hctx->nr_ctx = 0;
 
-               if (reg->ops->init_hctx &&
-                   reg->ops->init_hctx(hctx, driver_data, i))
+               if (set->ops->init_hctx &&
+                   set->ops->init_hctx(hctx, set->driver_data, i))
                        break;
        }
 
@@ -1227,17 +1640,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
        /*
         * Init failed
         */
-       queue_for_each_hw_ctx(q, hctx, j) {
-               if (i == j)
-                       break;
-
-               if (reg->ops->exit_hctx)
-                       reg->ops->exit_hctx(hctx, j);
-
-               blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
-               blk_mq_free_rq_map(hctx);
-               kfree(hctx->ctxs);
-       }
+       blk_mq_exit_hw_queues(q, set, i);
 
        return 1;
 }
@@ -1258,12 +1661,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
                __ctx->queue = q;
 
                /* If the cpu isn't online, the cpu is mapped to first hctx */
-               hctx = q->mq_ops->map_queue(q, i);
-               hctx->nr_ctx++;
-
                if (!cpu_online(i))
                        continue;
 
+               hctx = q->mq_ops->map_queue(q, i);
+               cpumask_set_cpu(i, hctx->cpumask);
+               hctx->nr_ctx++;
+
                /*
                 * Set local node, IFF we have more than one hw queue. If
                 * not, we remain on the home node of the device
@@ -1280,6 +1684,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
        struct blk_mq_ctx *ctx;
 
        queue_for_each_hw_ctx(q, hctx, i) {
+               cpumask_clear(hctx->cpumask);
                hctx->nr_ctx = 0;
        }
 
@@ -1288,115 +1693,208 @@ static void blk_mq_map_swqueue(struct request_queue *q)
         */
        queue_for_each_ctx(q, ctx, i) {
                /* If the cpu isn't online, the cpu is mapped to first hctx */
+               if (!cpu_online(i))
+                       continue;
+
                hctx = q->mq_ops->map_queue(q, i);
+               cpumask_set_cpu(i, hctx->cpumask);
                ctx->index_hw = hctx->nr_ctx;
                hctx->ctxs[hctx->nr_ctx++] = ctx;
        }
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               /*
+                * If no software queues are mapped to this hardware queue,
+                * disable it and free the request entries
+                */
+               if (!hctx->nr_ctx) {
+                       struct blk_mq_tag_set *set = q->tag_set;
+
+                       if (set->tags[i]) {
+                               blk_mq_free_rq_map(set, set->tags[i], i);
+                               set->tags[i] = NULL;
+                               hctx->tags = NULL;
+                       }
+                       continue;
+               }
+
+               /*
+                * Initialize batch roundrobin counts
+                */
+               hctx->next_cpu = cpumask_first(hctx->cpumask);
+               hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+       }
 }
 
-struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
-                                       void *driver_data)
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
 {
-       struct blk_mq_hw_ctx **hctxs;
-       struct blk_mq_ctx *ctx;
+       struct blk_mq_hw_ctx *hctx;
        struct request_queue *q;
+       bool shared;
        int i;
 
-       if (!reg->nr_hw_queues ||
-           !reg->ops->queue_rq || !reg->ops->map_queue ||
-           !reg->ops->alloc_hctx || !reg->ops->free_hctx)
-               return ERR_PTR(-EINVAL);
+       if (set->tag_list.next == set->tag_list.prev)
+               shared = false;
+       else
+               shared = true;
+
+       list_for_each_entry(q, &set->tag_list, tag_set_list) {
+               blk_mq_freeze_queue(q);
 
-       if (!reg->queue_depth)
-               reg->queue_depth = BLK_MQ_MAX_DEPTH;
-       else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
-               pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
-               reg->queue_depth = BLK_MQ_MAX_DEPTH;
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (shared)
+                               hctx->flags |= BLK_MQ_F_TAG_SHARED;
+                       else
+                               hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+               }
+               blk_mq_unfreeze_queue(q);
        }
+}
+
+static void blk_mq_del_queue_tag_set(struct request_queue *q)
+{
+       struct blk_mq_tag_set *set = q->tag_set;
+
+       blk_mq_freeze_queue(q);
 
-       if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
-               return ERR_PTR(-EINVAL);
+       mutex_lock(&set->tag_list_lock);
+       list_del_init(&q->tag_set_list);
+       blk_mq_update_tag_set_depth(set);
+       mutex_unlock(&set->tag_list_lock);
+
+       blk_mq_unfreeze_queue(q);
+}
+
+static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
+                                    struct request_queue *q)
+{
+       q->tag_set = set;
+
+       mutex_lock(&set->tag_list_lock);
+       list_add_tail(&q->tag_set_list, &set->tag_list);
+       blk_mq_update_tag_set_depth(set);
+       mutex_unlock(&set->tag_list_lock);
+}
+
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
+{
+       struct blk_mq_hw_ctx **hctxs;
+       struct blk_mq_ctx *ctx;
+       struct request_queue *q;
+       unsigned int *map;
+       int i;
 
        ctx = alloc_percpu(struct blk_mq_ctx);
        if (!ctx)
                return ERR_PTR(-ENOMEM);
 
-       hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
-                       reg->numa_node);
+       hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
+                       set->numa_node);
 
        if (!hctxs)
                goto err_percpu;
 
-       for (i = 0; i < reg->nr_hw_queues; i++) {
-               hctxs[i] = reg->ops->alloc_hctx(reg, i);
+       map = blk_mq_make_queue_map(set);
+       if (!map)
+               goto err_map;
+
+       for (i = 0; i < set->nr_hw_queues; i++) {
+               int node = blk_mq_hw_queue_to_node(map, i);
+
+               hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
+                                       GFP_KERNEL, node);
                if (!hctxs[i])
                        goto err_hctxs;
 
-               hctxs[i]->numa_node = NUMA_NO_NODE;
+               if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
+                       goto err_hctxs;
+
+               atomic_set(&hctxs[i]->nr_active, 0);
+               hctxs[i]->numa_node = node;
                hctxs[i]->queue_num = i;
        }
 
-       q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
+       q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
        if (!q)
                goto err_hctxs;
 
-       q->mq_map = blk_mq_make_queue_map(reg);
-       if (!q->mq_map)
+       if (percpu_counter_init(&q->mq_usage_counter, 0))
                goto err_map;
 
        setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
        blk_queue_rq_timeout(q, 30000);
 
        q->nr_queues = nr_cpu_ids;
-       q->nr_hw_queues = reg->nr_hw_queues;
+       q->nr_hw_queues = set->nr_hw_queues;
+       q->mq_map = map;
 
        q->queue_ctx = ctx;
        q->queue_hw_ctx = hctxs;
 
-       q->mq_ops = reg->ops;
+       q->mq_ops = set->ops;
        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
+       if (!(set->flags & BLK_MQ_F_SG_MERGE))
+               q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
+
        q->sg_reserved_size = INT_MAX;
 
-       blk_queue_make_request(q, blk_mq_make_request);
-       blk_queue_rq_timed_out(q, reg->ops->timeout);
-       if (reg->timeout)
-               blk_queue_rq_timeout(q, reg->timeout);
+       INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
+       INIT_LIST_HEAD(&q->requeue_list);
+       spin_lock_init(&q->requeue_lock);
+
+       if (q->nr_hw_queues > 1)
+               blk_queue_make_request(q, blk_mq_make_request);
+       else
+               blk_queue_make_request(q, blk_sq_make_request);
+
+       blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
+       if (set->timeout)
+               blk_queue_rq_timeout(q, set->timeout);
+
+       /*
+        * Do this after blk_queue_make_request() overrides it...
+        */
+       q->nr_requests = set->queue_depth;
 
-       if (reg->ops->complete)
-               blk_queue_softirq_done(q, reg->ops->complete);
+       if (set->ops->complete)
+               blk_queue_softirq_done(q, set->ops->complete);
 
        blk_mq_init_flush(q);
-       blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
+       blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
-       q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
-                               cache_line_size()), GFP_KERNEL);
+       q->flush_rq = kzalloc(round_up(sizeof(struct request) +
+                               set->cmd_size, cache_line_size()),
+                               GFP_KERNEL);
        if (!q->flush_rq)
                goto err_hw;
 
-       if (blk_mq_init_hw_queues(q, reg, driver_data))
+       if (blk_mq_init_hw_queues(q, set))
                goto err_flush_rq;
 
-       blk_mq_map_swqueue(q);
-
        mutex_lock(&all_q_mutex);
        list_add_tail(&q->all_q_node, &all_q_list);
        mutex_unlock(&all_q_mutex);
 
+       blk_mq_add_queue_tag_set(set, q);
+
+       blk_mq_map_swqueue(q);
+
        return q;
 
 err_flush_rq:
        kfree(q->flush_rq);
 err_hw:
-       kfree(q->mq_map);
-err_map:
        blk_cleanup_queue(q);
 err_hctxs:
-       for (i = 0; i < reg->nr_hw_queues; i++) {
+       kfree(map);
+       for (i = 0; i < set->nr_hw_queues; i++) {
                if (!hctxs[i])
                        break;
-               reg->ops->free_hctx(hctxs[i], i);
+               free_cpumask_var(hctxs[i]->cpumask);
+               kfree(hctxs[i]);
        }
+err_map:
        kfree(hctxs);
 err_percpu:
        free_percpu(ctx);
@@ -1406,18 +1904,14 @@ EXPORT_SYMBOL(blk_mq_init_queue);
 
 void blk_mq_free_queue(struct request_queue *q)
 {
-       struct blk_mq_hw_ctx *hctx;
-       int i;
+       struct blk_mq_tag_set   *set = q->tag_set;
 
-       queue_for_each_hw_ctx(q, hctx, i) {
-               kfree(hctx->ctx_map);
-               kfree(hctx->ctxs);
-               blk_mq_free_rq_map(hctx);
-               blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
-               if (q->mq_ops->exit_hctx)
-                       q->mq_ops->exit_hctx(hctx, i);
-               q->mq_ops->free_hctx(hctx, i);
-       }
+       blk_mq_del_queue_tag_set(q);
+
+       blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
+       blk_mq_free_hw_queues(q, set);
+
+       percpu_counter_destroy(&q->mq_usage_counter);
 
        free_percpu(q->queue_ctx);
        kfree(q->queue_hw_ctx);
@@ -1437,6 +1931,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 {
        blk_mq_freeze_queue(q);
 
+       blk_mq_sysfs_unregister(q);
+
        blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
 
        /*
@@ -1447,6 +1943,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
 
        blk_mq_map_swqueue(q);
 
+       blk_mq_sysfs_register(q);
+
        blk_mq_unfreeze_queue(q);
 }
 
@@ -1456,10 +1954,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
        struct request_queue *q;
 
        /*
-        * Before new mapping is established, hotadded cpu might already start
-        * handling requests. This doesn't break anything as we map offline
-        * CPUs to first hardware queue. We will re-init queue below to get
-        * optimal settings.
+        * Before new mappings are established, hotadded cpu might already
+        * start handling requests. This doesn't break anything as we map
+        * offline CPUs to first hardware queue. We will re-init the queue
+        * below to get optimal settings.
         */
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
            action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
@@ -1472,6 +1970,81 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
        return NOTIFY_OK;
 }
 
+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
+{
+       int i;
+
+       if (!set->nr_hw_queues)
+               return -EINVAL;
+       if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
+               return -EINVAL;
+       if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
+               return -EINVAL;
+
+       if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
+               return -EINVAL;
+
+
+       set->tags = kmalloc_node(set->nr_hw_queues *
+                                sizeof(struct blk_mq_tags *),
+                                GFP_KERNEL, set->numa_node);
+       if (!set->tags)
+               goto out;
+
+       for (i = 0; i < set->nr_hw_queues; i++) {
+               set->tags[i] = blk_mq_init_rq_map(set, i);
+               if (!set->tags[i])
+                       goto out_unwind;
+       }
+
+       mutex_init(&set->tag_list_lock);
+       INIT_LIST_HEAD(&set->tag_list);
+
+       return 0;
+
+out_unwind:
+       while (--i >= 0)
+               blk_mq_free_rq_map(set, set->tags[i], i);
+out:
+       return -ENOMEM;
+}
+EXPORT_SYMBOL(blk_mq_alloc_tag_set);
+
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
+{
+       int i;
+
+       for (i = 0; i < set->nr_hw_queues; i++) {
+               if (set->tags[i])
+                       blk_mq_free_rq_map(set, set->tags[i], i);
+       }
+
+       kfree(set->tags);
+}
+EXPORT_SYMBOL(blk_mq_free_tag_set);
+
+int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+{
+       struct blk_mq_tag_set *set = q->tag_set;
+       struct blk_mq_hw_ctx *hctx;
+       int i, ret;
+
+       if (!set || nr > set->queue_depth)
+               return -EINVAL;
+
+       ret = 0;
+       queue_for_each_hw_ctx(q, hctx, i) {
+               ret = blk_mq_tag_update_depth(hctx->tags, nr);
+               if (ret)
+                       break;
+       }
+
+       if (!ret)
+               q->nr_requests = nr;
+
+       return ret;
+}
+
 void blk_mq_disable_hotplug(void)
 {
        mutex_lock(&all_q_mutex);
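
The blk-mq.c changes above replace the old blk_mq_reg registration with a driver-owned tag set: a driver fills in a struct blk_mq_tag_set, calls blk_mq_alloc_tag_set() to preallocate the tag maps and requests, and then passes the set to blk_mq_init_queue(). A minimal sketch of that sequence follows; it is an illustration only, the mydrv_* names are hypothetical, and gendisk setup plus real command submission are omitted.

	#include <linux/err.h>
	#include <linux/string.h>
	#include <linux/blkdev.h>
	#include <linux/blk-mq.h>

	/* per-request payload, allocated by blk-mq according to cmd_size */
	struct mydrv_cmd {
		struct request *rq;
	};

	static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
	{
		struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

		cmd->rq = rq;
		/* hand the request to hardware here; the driver completes it later */
		return BLK_MQ_RQ_QUEUE_OK;
	}

	static struct blk_mq_ops mydrv_mq_ops = {
		.queue_rq	= mydrv_queue_rq,
		.map_queue	= blk_mq_map_queue,
	};

	static struct blk_mq_tag_set mydrv_tag_set;
	static struct request_queue *mydrv_queue;

	static int mydrv_setup_queue(void)
	{
		memset(&mydrv_tag_set, 0, sizeof(mydrv_tag_set));
		mydrv_tag_set.ops		= &mydrv_mq_ops;
		mydrv_tag_set.nr_hw_queues	= 1;
		mydrv_tag_set.queue_depth	= 64;
		mydrv_tag_set.numa_node		= NUMA_NO_NODE;
		mydrv_tag_set.cmd_size		= sizeof(struct mydrv_cmd);
		mydrv_tag_set.flags		= BLK_MQ_F_SHOULD_MERGE;

		if (blk_mq_alloc_tag_set(&mydrv_tag_set))
			return -ENOMEM;

		mydrv_queue = blk_mq_init_queue(&mydrv_tag_set);
		if (IS_ERR_OR_NULL(mydrv_queue)) {
			blk_mq_free_tag_set(&mydrv_tag_set);
			return -ENOMEM;
		}
		return 0;
	}

Teardown runs in the opposite order, as in the null_blk and virtio-blk conversions later in this series: blk_cleanup_queue() first, then blk_mq_free_tag_set() once no queue references the set. Because the set is now a standalone object, several request queues can attach to the same set; the tag_list bookkeeping above then marks every hardware context BLK_MQ_F_TAG_SHARED as soon as a second queue registers.
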
index ebbe6bac9d616d4a47a815870dc9d3682403c9d1..de7b3bbd5bd61d203799cc24af2f8af3f2c6e396 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef INT_BLK_MQ_H
 #define INT_BLK_MQ_H
 
+struct blk_mq_tag_set;
+
 struct blk_mq_ctx {
        struct {
                spinlock_t              lock;
@@ -9,7 +11,8 @@ struct blk_mq_ctx {
 
        unsigned int            cpu;
        unsigned int            index_hw;
-       unsigned int            ipi_redirect;
+
+       unsigned int            last_tag ____cacheline_aligned_in_smp;
 
        /* incremented at dispatch time */
        unsigned long           rq_dispatched[2];
@@ -20,21 +23,23 @@ struct blk_mq_ctx {
 
        struct request_queue    *queue;
        struct kobject          kobj;
-};
+} ____cacheline_aligned_in_smp;
 
 void __blk_mq_complete_request(struct request *rq);
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_init_flush(struct request_queue *q);
 void blk_mq_drain_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
-void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq);
+void blk_mq_clone_flush_request(struct request *flush_rq,
+               struct request *orig_rq);
+int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 
 /*
  * CPU hotplug helpers
  */
 struct blk_mq_cpu_notifier;
 void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
-                             void (*fn)(void *, unsigned long, unsigned int),
+                             int (*fn)(void *, unsigned long, unsigned int),
                              void *data);
 void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
 void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
@@ -45,10 +50,23 @@ void blk_mq_disable_hotplug(void);
 /*
  * CPU -> queue mappings
  */
-struct blk_mq_reg;
-extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
+extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
 extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
+extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
 
-void blk_mq_add_timer(struct request *rq);
+/*
+ * sysfs helpers
+ */
+extern int blk_mq_sysfs_register(struct request_queue *q);
+extern void blk_mq_sysfs_unregister(struct request_queue *q);
+
+/*
+ * Basic implementation of a sparser bitmap, allowing the user to spread
+ * the bits over more cachelines.
+ */
+struct blk_align_bitmap {
+       unsigned long word;
+       unsigned long depth;
+} ____cacheline_aligned_in_smp;
 
 #endif
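
The blk_align_bitmap introduced here pairs one bitmap word with the number of bits actually used in it, so a logical bitmap can be spread over several cachelines instead of packing all bits densely; the point is to keep frequently updated bits of different software queues off the same cacheline. As a rough illustration of the addressing scheme only (this is not the in-tree helper code, and bits_per_word is an assumed parameter), setting a logical bit in such an array could look like:

	#include <linux/bitops.h>

	/*
	 * Illustrative only: map a logical bit onto an array of
	 * blk_align_bitmap entries, assuming 'bits_per_word' bits are
	 * assigned per entry and 'depth' records how many of them are
	 * actually in use (consulted on the iteration side).
	 */
	static inline void sparse_bitmap_set(struct blk_align_bitmap *map,
					     unsigned int bits_per_word,
					     unsigned int bit)
	{
		struct blk_align_bitmap *bm = &map[bit / bits_per_word];

		set_bit(bit % bits_per_word, &bm->word);
	}
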
index 7500f876dae40e0b124b90adab21c60c1e3676b6..23321fbab29318ae5b550216c66eb9ae2d026c52 100644 (file)
@@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
 static ssize_t
 queue_requests_store(struct request_queue *q, const char *page, size_t count)
 {
-       struct request_list *rl;
        unsigned long nr;
-       int ret;
+       int ret, err;
 
-       if (!q->request_fn)
+       if (!q->request_fn && !q->mq_ops)
                return -EINVAL;
 
        ret = queue_var_store(&nr, page, count);
@@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
        if (nr < BLKDEV_MIN_RQ)
                nr = BLKDEV_MIN_RQ;
 
-       spin_lock_irq(q->queue_lock);
-       q->nr_requests = nr;
-       blk_queue_congestion_threshold(q);
-
-       /* congestion isn't cgroup aware and follows root blkcg for now */
-       rl = &q->root_rl;
-
-       if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, BLK_RW_SYNC);
-       else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, BLK_RW_SYNC);
-
-       if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, BLK_RW_ASYNC);
-       else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, BLK_RW_ASYNC);
-
-       blk_queue_for_each_rl(rl, q) {
-               if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
-                       blk_set_rl_full(rl, BLK_RW_SYNC);
-               } else {
-                       blk_clear_rl_full(rl, BLK_RW_SYNC);
-                       wake_up(&rl->wait[BLK_RW_SYNC]);
-               }
-
-               if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
-                       blk_set_rl_full(rl, BLK_RW_ASYNC);
-               } else {
-                       blk_clear_rl_full(rl, BLK_RW_ASYNC);
-                       wake_up(&rl->wait[BLK_RW_ASYNC]);
-               }
-       }
+       if (q->request_fn)
+               err = blk_update_nr_requests(q, nr);
+       else
+               err = blk_mq_update_nr_requests(q, nr);
+
+       if (err)
+               return err;
 
-       spin_unlock_irq(q->queue_lock);
        return ret;
 }
 
@@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj)
        if (q->queue_tags)
                __blk_queue_free_tags(q);
 
-       percpu_counter_destroy(&q->mq_usage_counter);
-
        if (q->mq_ops)
                blk_mq_free_queue(q);
 
index 033745cd7fba62b299301860f4ee0f50ec2ff04e..9353b468335930f32e9e82b160d8a40930eca839 100644 (file)
@@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
 static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
 {
        if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
-               return 0;
+               return false;
 
        return 1;
 }
@@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
        if (tg->io_disp[rw] + 1 <= io_allowed) {
                if (wait)
                        *wait = 0;
-               return 1;
+               return true;
        }
 
        /* Calc approx time to dispatch */
@@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
        if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
                if (wait)
                        *wait = 0;
-               return 1;
+               return true;
        }
 
        /* Calc approx time to dispatch */
@@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
        if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
                if (wait)
                        *wait = 0;
-               return 1;
+               return true;
        }
 
        /*
@@ -1258,7 +1258,7 @@ out_unlock:
  * of throtl_data->service_queue.  Those bio's are ready and issued by this
  * function.
  */
-void blk_throtl_dispatch_work_fn(struct work_struct *work)
+static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 {
        struct throtl_data *td = container_of(work, struct throtl_data,
                                              dispatch_work);
index d96f7061c6fd8727de9eb9fc02fae7b07dd357c1..95a09590ccfda8366f92efbee7c31f1654c11637 100644 (file)
@@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req)
                        __blk_complete_request(req);
                break;
        case BLK_EH_RESET_TIMER:
-               if (q->mq_ops)
-                       blk_mq_add_timer(req);
-               else
-                       blk_add_timer(req);
-
+               blk_add_timer(req);
                blk_clear_rq_complete(req);
                break;
        case BLK_EH_NOT_HANDLED:
@@ -170,7 +166,26 @@ void blk_abort_request(struct request *req)
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
 
-void __blk_add_timer(struct request *req, struct list_head *timeout_list)
+unsigned long blk_rq_timeout(unsigned long timeout)
+{
+       unsigned long maxt;
+
+       maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
+       if (time_after(timeout, maxt))
+               timeout = maxt;
+
+       return timeout;
+}
+
+/**
+ * blk_add_timer - Start timeout timer for a single request
+ * @req:       request that is about to start running.
+ *
+ * Notes:
+ *    Each request has its own timer, and as it is added to the queue, we
+ *    set up the timer. When the request completes, we cancel the timer.
+ */
+void blk_add_timer(struct request *req)
 {
        struct request_queue *q = req->q;
        unsigned long expiry;
@@ -188,32 +203,29 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list)
                req->timeout = q->rq_timeout;
 
        req->deadline = jiffies + req->timeout;
-       if (timeout_list)
-               list_add_tail(&req->timeout_list, timeout_list);
+       if (!q->mq_ops)
+               list_add_tail(&req->timeout_list, &req->q->timeout_list);
 
        /*
         * If the timer isn't already pending or this timeout is earlier
         * than an existing one, modify the timer. Round up to next nearest
         * second.
         */
-       expiry = round_jiffies_up(req->deadline);
+       expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
 
        if (!timer_pending(&q->timeout) ||
-           time_before(expiry, q->timeout.expires))
-               mod_timer(&q->timeout, expiry);
+           time_before(expiry, q->timeout.expires)) {
+               unsigned long diff = q->timeout.expires - expiry;
 
-}
+               /*
+                * Due to added timer slack to group timers, the timer
+                * will often be a little in front of what we asked for.
+                * So apply some tolerance here too, otherwise we keep
+                * modifying the timer because expires for value X
+                * will be X + something.
+                */
+               if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
+                       mod_timer(&q->timeout, expiry);
+       }
 
-/**
- * blk_add_timer - Start timeout timer for a single request
- * @req:       request that is about to start running.
- *
- * Notes:
- *    Each request has its own timer, and as it is added to the queue, we
- *    set up the timer. When the request completes, we cancel the timer.
- */
-void blk_add_timer(struct request *req)
-{
-       __blk_add_timer(req, &req->q->timeout_list);
 }
-
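
Two details in the blk-timeout.c rework above are worth a note: blk_rq_timeout() caps how far into the future the queue timer may be armed (BLK_MAX_TIMEOUT is defined as 5 * HZ in the blk.h hunk below), and the extra HZ / 2 tolerance avoids re-arming the timer for every request once timer slack has already grouped expiries. A small illustration of the clamp, under those definitions:

	#include <linux/jiffies.h>
	#include <linux/timer.h>
	#include "blk.h"	/* blk_rq_timeout(), BLK_MAX_TIMEOUT -- block core internal */

	/*
	 * Illustration only: a request with a very generous timeout no longer
	 * pushes the queue timer equally far out. The timer fires within about
	 * BLK_MAX_TIMEOUT, finds the request not yet expired, and re-arms.
	 */
	static unsigned long example_timer_expiry(void)
	{
		unsigned long deadline = jiffies + 300 * HZ;	/* request asks for 5 minutes */

		/* result is at most round_jiffies_up(jiffies + BLK_MAX_TIMEOUT) */
		return blk_rq_timeout(round_jiffies_up(deadline));
	}
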
index 1d880f1f957fe473fbb0f78ad8ad03a3726faa73..45385e9abf6f8e926e58d3fc8535b2133d6230e9 100644 (file)
@@ -9,6 +9,9 @@
 /* Number of requests a "batching" process may submit */
 #define BLK_BATCH_REQ  32
 
+/* Max future timer expiry for timeouts */
+#define BLK_MAX_TIMEOUT                (5 * HZ)
+
 extern struct kmem_cache *blk_requestq_cachep;
 extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
@@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error,
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
                          unsigned int *next_set);
-void __blk_add_timer(struct request *req, struct list_head *timeout_list);
+unsigned long blk_rq_timeout(unsigned long timeout);
+void blk_add_timer(struct request *req);
 void blk_delete_timer(struct request *);
-void blk_add_timer(struct request *);
 
 
 bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
@@ -185,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
        return q->nr_congestion_off;
 }
 
+extern int blk_update_nr_requests(struct request_queue *, unsigned int);
+
 /*
  * Contribute to IO statistics IFF:
  *
similarity index 100%
rename from mm/bounce.c
rename to block/bounce.c
index e0985f1955e7e0ea513ddf9cea104d571626a592..22dffebc7c73531ec235e27e188f66fa88b677d8 100644 (file)
@@ -908,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 {
        if (cfqd->busy_queues) {
                cfq_log(cfqd, "schedule dispatch");
-               kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
+               kblockd_schedule_work(&cfqd->unplug_work);
        }
 }
 
@@ -4460,7 +4460,7 @@ out_free:
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
 {
-       return sprintf(page, "%d\n", var);
+       return sprintf(page, "%u\n", var);
 }
 
 static ssize_t
similarity index 100%
rename from fs/ioprio.c
rename to block/ioprio.c
index 26487972ac549ba899a723201125e5b3c59934ff..9c28a5b38042bbd7e9904d00e2495afb6ee4517b 100644 (file)
@@ -205,10 +205,6 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
        if (capable(CAP_SYS_RAWIO))
                return 0;
 
-       /* if there's no filter set, assume we're filtering everything out */
-       if (!filter)
-               return -EPERM;
-
        /* Anybody who can open the device can do a read-safe command */
        if (test_bit(cmd[0], filter->read_ok))
                return 0;
index 748dea4f34dc1e18b089557478a2b00221bd83c5..758da2287d9a3d01143c1d588c6f28dbbf0d8c14 100644 (file)
@@ -1406,7 +1406,7 @@ next_segment:
 
                track = block / (floppy->dtype->sects * floppy->type->sect_mult);
                sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
-               data = rq->buffer + 512 * cnt;
+               data = bio_data(rq->bio) + 512 * cnt;
 #ifdef DEBUG
                printk("access to track %d, sector %d, with buffer at "
                       "0x%08lx\n", track, sector, data);
index cfa64bdf01c96ec41ad1b65917211524e0be8bf8..2104b1b4ccda276dd324b12b843468c322a39ed5 100644 (file)
@@ -1484,7 +1484,7 @@ repeat:
        ReqCnt = 0;
        ReqCmd = rq_data_dir(fd_request);
        ReqBlock = blk_rq_pos(fd_request);
-       ReqBuffer = fd_request->buffer;
+       ReqBuffer = bio_data(fd_request->bio);
        setup_req_params( drive );
        do_fd_action( drive );
 
index fa9bb742df6e0becfa8bca52576f17b5bdafe2bf..dc3a41c82b38a155a92af438ed7e21c77f8654ec 100644 (file)
@@ -2351,7 +2351,7 @@ static void rw_interrupt(void)
        }
 
        if (CT(COMMAND) != FD_READ ||
-           raw_cmd->kernel_data == current_req->buffer) {
+           raw_cmd->kernel_data == bio_data(current_req->bio)) {
                /* transfer directly from buffer */
                cont->done(1);
        } else if (CT(COMMAND) == FD_READ) {
@@ -2640,7 +2640,7 @@ static int make_raw_rw_request(void)
                raw_cmd->flags &= ~FD_RAW_WRITE;
                raw_cmd->flags |= FD_RAW_READ;
                COMMAND = FM_MODE(_floppy, FD_READ);
-       } else if ((unsigned long)current_req->buffer < MAX_DMA_ADDRESS) {
+       } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) {
                unsigned long dma_limit;
                int direct, indirect;
 
@@ -2654,13 +2654,13 @@ static int make_raw_rw_request(void)
                 */
                max_size = buffer_chain_size();
                dma_limit = (MAX_DMA_ADDRESS -
-                            ((unsigned long)current_req->buffer)) >> 9;
+                            ((unsigned long)bio_data(current_req->bio))) >> 9;
                if ((unsigned long)max_size > dma_limit)
                        max_size = dma_limit;
                /* 64 kb boundaries */
-               if (CROSS_64KB(current_req->buffer, max_size << 9))
+               if (CROSS_64KB(bio_data(current_req->bio), max_size << 9))
                        max_size = (K_64 -
-                                   ((unsigned long)current_req->buffer) %
+                                   ((unsigned long)bio_data(current_req->bio)) %
                                    K_64) >> 9;
                direct = transfer_size(ssize, max_sector, max_size) - fsector_t;
                /*
@@ -2677,7 +2677,7 @@ static int make_raw_rw_request(void)
                       (DP->read_track & (1 << DRS->probed_format)))))) {
                        max_size = blk_rq_sectors(current_req);
                } else {
-                       raw_cmd->kernel_data = current_req->buffer;
+                       raw_cmd->kernel_data = bio_data(current_req->bio);
                        raw_cmd->length = current_count_sectors << 9;
                        if (raw_cmd->length == 0) {
                                DPRINT("%s: zero dma transfer attempted\n", __func__);
@@ -2731,7 +2731,7 @@ static int make_raw_rw_request(void)
        raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1;
        raw_cmd->length <<= 9;
        if ((raw_cmd->length < current_count_sectors << 9) ||
-           (raw_cmd->kernel_data != current_req->buffer &&
+           (raw_cmd->kernel_data != bio_data(current_req->bio) &&
             CT(COMMAND) == FD_WRITE &&
             (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max ||
              aligned_sector_t < buffer_min)) ||
@@ -2739,7 +2739,7 @@ static int make_raw_rw_request(void)
            raw_cmd->length <= 0 || current_count_sectors <= 0) {
                DPRINT("fractionary current count b=%lx s=%lx\n",
                       raw_cmd->length, current_count_sectors);
-               if (raw_cmd->kernel_data != current_req->buffer)
+               if (raw_cmd->kernel_data != bio_data(current_req->bio))
                        pr_info("addr=%d, length=%ld\n",
                                (int)((raw_cmd->kernel_data -
                                       floppy_track_buffer) >> 9),
@@ -2756,7 +2756,7 @@ static int make_raw_rw_request(void)
                return 0;
        }
 
-       if (raw_cmd->kernel_data != current_req->buffer) {
+       if (raw_cmd->kernel_data != bio_data(current_req->bio)) {
                if (raw_cmd->kernel_data < floppy_track_buffer ||
                    current_count_sectors < 0 ||
                    raw_cmd->length < 0 ||
index bf397bf108b75de4ef302dc7b8f66f45303b891f..8a290c08262f26079ecd53235b3ee8c5bc43f681 100644 (file)
@@ -464,11 +464,11 @@ static void read_intr(void)
 
 ok_to_read:
        req = hd_req;
-       insw(HD_DATA, req->buffer, 256);
+       insw(HD_DATA, bio_data(req->bio), 256);
 #ifdef DEBUG
        printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
               req->rq_disk->disk_name, blk_rq_pos(req) + 1,
-              blk_rq_sectors(req) - 1, req->buffer+512);
+              blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
 #endif
        if (hd_end_request(0, 512)) {
                SET_HANDLER(&read_intr);
@@ -505,7 +505,7 @@ static void write_intr(void)
 ok_to_write:
        if (hd_end_request(0, 512)) {
                SET_HANDLER(&write_intr);
-               outsw(HD_DATA, req->buffer, 256);
+               outsw(HD_DATA, bio_data(req->bio), 256);
                return;
        }
 
@@ -624,7 +624,7 @@ repeat:
        printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
                req->rq_disk->disk_name,
                req_data_dir(req) == READ ? "read" : "writ",
-               cyl, head, sec, nsect, req->buffer);
+               cyl, head, sec, nsect, bio_data(req->bio));
 #endif
        if (req->cmd_type == REQ_TYPE_FS) {
                switch (rq_data_dir(req)) {
@@ -643,7 +643,7 @@ repeat:
                                bad_rw_intr();
                                goto repeat;
                        }
-                       outsw(HD_DATA, req->buffer, 256);
+                       outsw(HD_DATA, bio_data(req->bio), 256);
                        break;
                default:
                        printk("unknown hd-command\n");
index eb59b124136690e217897dd6003473e09e5bd64a..e352cac707e82f5193324c1feafc6296d86bd219 100644 (file)
@@ -479,7 +479,7 @@ static unsigned int mg_out(struct mg_host *host,
 
 static void mg_read_one(struct mg_host *host, struct request *req)
 {
-       u16 *buff = (u16 *)req->buffer;
+       u16 *buff = (u16 *)bio_data(req->bio);
        u32 i;
 
        for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@@ -496,7 +496,7 @@ static void mg_read(struct request *req)
                mg_bad_rw_intr(host);
 
        MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-              blk_rq_sectors(req), blk_rq_pos(req), req->buffer);
+              blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
 
        do {
                if (mg_wait(host, ATA_DRQ,
@@ -514,7 +514,7 @@ static void mg_read(struct request *req)
 
 static void mg_write_one(struct mg_host *host, struct request *req)
 {
-       u16 *buff = (u16 *)req->buffer;
+       u16 *buff = (u16 *)bio_data(req->bio);
        u32 i;
 
        for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@@ -534,7 +534,7 @@ static void mg_write(struct request *req)
        }
 
        MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-              rem, blk_rq_pos(req), req->buffer);
+              rem, blk_rq_pos(req), bio_data(req->bio));
 
        if (mg_wait(host, ATA_DRQ,
                    MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
@@ -585,7 +585,7 @@ ok_to_read:
        mg_read_one(host, req);
 
        MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-              blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer);
+              blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
 
        /* send read confirm */
        outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
@@ -624,7 +624,7 @@ ok_to_write:
                /* write 1 sector and set handler if remains */
                mg_write_one(host, req);
                MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-                      blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
+                      blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
                host->mg_do_intr = mg_write_intr;
                mod_timer(&host->timer, jiffies + 3 * HZ);
        }
index 091b9ea14feb5856ceada49671197dfac567376c..b40af63a54767a2463270a71d0988477b47f0e9e 100644 (file)
@@ -32,6 +32,7 @@ struct nullb {
        unsigned int index;
        struct request_queue *q;
        struct gendisk *disk;
+       struct blk_mq_tag_set tag_set;
        struct hrtimer timer;
        unsigned int queue_depth;
        spinlock_t lock;
@@ -226,7 +227,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
 
 static void null_softirq_done_fn(struct request *rq)
 {
-       end_cmd(rq->special);
+       end_cmd(blk_mq_rq_to_pdu(rq));
 }
 
 static inline void null_handle_cmd(struct nullb_cmd *cmd)
@@ -311,7 +312,7 @@ static void null_request_fn(struct request_queue *q)
 
 static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-       struct nullb_cmd *cmd = rq->special;
+       struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
        cmd->rq = rq;
        cmd->nq = hctx->driver_data;
@@ -320,46 +321,6 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
        return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
-{
-       int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes);
-       int tip = (reg->nr_hw_queues % nr_online_nodes);
-       int node = 0, i, n;
-
-       /*
-        * Split submit queues evenly wrt to the number of nodes. If uneven,
-        * fill the first buckets with one extra, until the rest is filled with
-        * no extra.
-        */
-       for (i = 0, n = 1; i < hctx_index; i++, n++) {
-               if (n % b_size == 0) {
-                       n = 0;
-                       node++;
-
-                       tip--;
-                       if (!tip)
-                               b_size = reg->nr_hw_queues / nr_online_nodes;
-               }
-       }
-
-       /*
-        * A node might not be online, therefore map the relative node id to the
-        * real node id.
-        */
-       for_each_online_node(n) {
-               if (!node)
-                       break;
-               node--;
-       }
-
-       return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n);
-}
-
-static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
-{
-       kfree(hctx);
-}
-
 static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
 {
        BUG_ON(!nullb);
@@ -389,19 +350,14 @@ static struct blk_mq_ops null_mq_ops = {
        .complete       = null_softirq_done_fn,
 };
 
-static struct blk_mq_reg null_mq_reg = {
-       .ops            = &null_mq_ops,
-       .queue_depth    = 64,
-       .cmd_size       = sizeof(struct nullb_cmd),
-       .flags          = BLK_MQ_F_SHOULD_MERGE,
-};
-
 static void null_del_dev(struct nullb *nullb)
 {
        list_del_init(&nullb->list);
 
        del_gendisk(nullb->disk);
        blk_cleanup_queue(nullb->q);
+       if (queue_mode == NULL_Q_MQ)
+               blk_mq_free_tag_set(&nullb->tag_set);
        put_disk(nullb->disk);
        kfree(nullb);
 }
@@ -506,7 +462,7 @@ static int null_add_dev(void)
 
        nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
        if (!nullb)
-               return -ENOMEM;
+               goto out;
 
        spin_lock_init(&nullb->lock);
 
@@ -514,49 +470,44 @@ static int null_add_dev(void)
                submit_queues = nr_online_nodes;
 
        if (setup_queues(nullb))
-               goto err;
+               goto out_free_nullb;
 
        if (queue_mode == NULL_Q_MQ) {
-               null_mq_reg.numa_node = home_node;
-               null_mq_reg.queue_depth = hw_queue_depth;
-               null_mq_reg.nr_hw_queues = submit_queues;
-
-               if (use_per_node_hctx) {
-                       null_mq_reg.ops->alloc_hctx = null_alloc_hctx;
-                       null_mq_reg.ops->free_hctx = null_free_hctx;
-               } else {
-                       null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue;
-                       null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue;
-               }
-
-               nullb->q = blk_mq_init_queue(&null_mq_reg, nullb);
+               nullb->tag_set.ops = &null_mq_ops;
+               nullb->tag_set.nr_hw_queues = submit_queues;
+               nullb->tag_set.queue_depth = hw_queue_depth;
+               nullb->tag_set.numa_node = home_node;
+               nullb->tag_set.cmd_size = sizeof(struct nullb_cmd);
+               nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+               nullb->tag_set.driver_data = nullb;
+
+               if (blk_mq_alloc_tag_set(&nullb->tag_set))
+                       goto out_cleanup_queues;
+
+               nullb->q = blk_mq_init_queue(&nullb->tag_set);
+               if (!nullb->q)
+                       goto out_cleanup_tags;
        } else if (queue_mode == NULL_Q_BIO) {
                nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+               if (!nullb->q)
+                       goto out_cleanup_queues;
                blk_queue_make_request(nullb->q, null_queue_bio);
                init_driver_queues(nullb);
        } else {
                nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+               if (!nullb->q)
+                       goto out_cleanup_queues;
                blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
-               if (nullb->q)
-                       blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+               blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
                init_driver_queues(nullb);
        }
 
-       if (!nullb->q)
-               goto queue_fail;
-
        nullb->q->queuedata = nullb;
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
 
        disk = nullb->disk = alloc_disk_node(1, home_node);
-       if (!disk) {
-queue_fail:
-               blk_cleanup_queue(nullb->q);
-               cleanup_queues(nullb);
-err:
-               kfree(nullb);
-               return -ENOMEM;
-       }
+       if (!disk)
+               goto out_cleanup_blk_queue;
 
        mutex_lock(&lock);
        list_add_tail(&nullb->list, &nullb_list);
@@ -579,6 +530,18 @@ err:
        sprintf(disk->disk_name, "nullb%d", nullb->index);
        add_disk(disk);
        return 0;
+
+out_cleanup_blk_queue:
+       blk_cleanup_queue(nullb->q);
+out_cleanup_tags:
+       if (queue_mode == NULL_Q_MQ)
+               blk_mq_free_tag_set(&nullb->tag_set);
+out_cleanup_queues:
+       cleanup_queues(nullb);
+out_free_nullb:
+       kfree(nullb);
+out:
+       return -ENOMEM;
 }
 
 static int __init null_init(void)
index e76bdc074dbe5678e52a33463a743a8b55335555..719cb1bc16401640a535ee1a0c3ae65de7f08e1c 100644 (file)
@@ -747,7 +747,7 @@ static void do_pcd_request(struct request_queue * q)
                        pcd_current = cd;
                        pcd_sector = blk_rq_pos(pcd_req);
                        pcd_count = blk_rq_cur_sectors(pcd_req);
-                       pcd_buf = pcd_req->buffer;
+                       pcd_buf = bio_data(pcd_req->bio);
                        pcd_busy = 1;
                        ps_set_intr(do_pcd_read, NULL, 0, nice);
                        return;
index 19ad8f0c83efe6942eb2dbabc95a978a41f05cd9..fea7e76a00de66e7d20dd6859ad851d8ecb40a35 100644 (file)
@@ -454,7 +454,7 @@ static enum action do_pd_io_start(void)
                if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
                        return Fail;
                pd_run = blk_rq_sectors(pd_req);
-               pd_buf = pd_req->buffer;
+               pd_buf = bio_data(pd_req->bio);
                pd_retries = 0;
                if (pd_cmd == READ)
                        return do_pd_read_start();
@@ -485,7 +485,7 @@ static int pd_next_buf(void)
        spin_lock_irqsave(&pd_lock, saved_flags);
        __blk_end_request_cur(pd_req, 0);
        pd_count = blk_rq_cur_sectors(pd_req);
-       pd_buf = pd_req->buffer;
+       pd_buf = bio_data(pd_req->bio);
        spin_unlock_irqrestore(&pd_lock, saved_flags);
        return 0;
 }
index f5c86d523ba0c8fe3a6154737f010246d67540d0..9a15fd3c9349403a19c518e278cf547d63733504 100644 (file)
@@ -795,7 +795,7 @@ repeat:
        }
 
        pf_cmd = rq_data_dir(pf_req);
-       pf_buf = pf_req->buffer;
+       pf_buf = bio_data(pf_req->bio);
        pf_retries = 0;
 
        pf_busy = 1;
@@ -827,7 +827,7 @@ static int pf_next_buf(void)
                if (!pf_req)
                        return 1;
                pf_count = blk_rq_cur_sectors(pf_req);
-               pf_buf = pf_req->buffer;
+               pf_buf = bio_data(pf_req->bio);
        }
        return 0;
 }
index a69dd93d1bd553d0b756407ea9cbb83df4f22c88..c48d9084c96503ca36addd882d2158cc93a9cccd 100644 (file)
@@ -563,7 +563,6 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
 
        req = skreq->req;
        blk_add_request_payload(req, page, len);
-       req->buffer = buf;
 }
 
 static void skd_request_fn_not_online(struct request_queue *q);
@@ -744,6 +743,7 @@ static void skd_request_fn(struct request_queue *q)
                                break;
                        }
                        skreq->discard_page = 1;
+                       req->completion_data = page;
                        skd_prep_discard_cdb(scsi_req, skreq, page, lba, count);
 
                } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) {
@@ -858,8 +858,7 @@ static void skd_end_request(struct skd_device *skdev,
                (skreq->discard_page == 1)) {
                pr_debug("%s:%s:%d, free the page!",
                         skdev->name, __func__, __LINE__);
-               free_page((unsigned long)req->buffer);
-               req->buffer = NULL;
+               __free_page(req->completion_data);
        }
 
        if (unlikely(error)) {
index b02d53a399f37818f58950fd50e2184b43c216ba..6b44bbe528b7b1089296f52d54d9c6aa8ed90f81 100644 (file)
@@ -549,7 +549,7 @@ static void redo_fd_request(struct request_queue *q)
                case READ:
                        err = floppy_read_sectors(fs, blk_rq_pos(req),
                                                  blk_rq_cur_sectors(req),
-                                                 req->buffer);
+                                                 bio_data(req->bio));
                        break;
                }
        done:
index c74f7b56e7c40106ddd5ebde357d161cf7929361..523ee8fd4c150e7671ab776368e8a0a017d3287e 100644 (file)
@@ -342,7 +342,7 @@ static void start_request(struct floppy_state *fs)
                swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
                          req->rq_disk->disk_name, req->cmd,
                          (long)blk_rq_pos(req), blk_rq_sectors(req),
-                         req->buffer);
+                         bio_data(req->bio));
                swim3_dbg("           errors=%d current_nr_sectors=%u\n",
                          req->errors, blk_rq_cur_sectors(req));
 #endif
@@ -479,11 +479,11 @@ static inline void setup_transfer(struct floppy_state *fs)
                /* Set up 3 dma commands: write preamble, data, postamble */
                init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
                ++cp;
-               init_dma(cp, OUTPUT_MORE, req->buffer, 512);
+               init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512);
                ++cp;
                init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
        } else {
-               init_dma(cp, INPUT_LAST, req->buffer, n * 512);
+               init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512);
        }
        ++cp;
        out_le16(&cp->command, DBDMA_STOP);
index cb9b1f8326c3c6e1327b2c4505bc94aa143cf215..c8f286e8d80f8e78acf1c7f59cee3ef583c2a50a 100644 (file)
@@ -30,6 +30,9 @@ struct virtio_blk
        /* The disk structure for the kernel. */
        struct gendisk *disk;
 
+       /* Block layer tags. */
+       struct blk_mq_tag_set tag_set;
+
        /* Process context for config space updates */
        struct work_struct config_work;
 
@@ -112,7 +115,7 @@ static int __virtblk_add_req(struct virtqueue *vq,
 
 static inline void virtblk_request_done(struct request *req)
 {
-       struct virtblk_req *vbr = req->special;
+       struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
        int error = virtblk_result(vbr);
 
        if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -147,14 +150,14 @@ static void virtblk_done(struct virtqueue *vq)
 
        /* In case queue is stopped waiting for more buffers. */
        if (req_done)
-               blk_mq_start_stopped_hw_queues(vblk->disk->queue);
+               blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
        spin_unlock_irqrestore(&vblk->vq_lock, flags);
 }
 
 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 {
        struct virtio_blk *vblk = hctx->queue->queuedata;
-       struct virtblk_req *vbr = req->special;
+       struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
        unsigned long flags;
        unsigned int num;
        const bool last = (req->cmd_flags & REQ_END) != 0;
@@ -480,33 +483,27 @@ static const struct device_attribute dev_attr_cache_type_rw =
        __ATTR(cache_type, S_IRUGO|S_IWUSR,
               virtblk_cache_type_show, virtblk_cache_type_store);
 
-static struct blk_mq_ops virtio_mq_ops = {
-       .queue_rq       = virtio_queue_rq,
-       .map_queue      = blk_mq_map_queue,
-       .alloc_hctx     = blk_mq_alloc_single_hw_queue,
-       .free_hctx      = blk_mq_free_single_hw_queue,
-       .complete       = virtblk_request_done,
-};
-
-static struct blk_mq_reg virtio_mq_reg = {
-       .ops            = &virtio_mq_ops,
-       .nr_hw_queues   = 1,
-       .queue_depth    = 0, /* Set in virtblk_probe */
-       .numa_node      = NUMA_NO_NODE,
-       .flags          = BLK_MQ_F_SHOULD_MERGE,
-};
-module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444);
-
-static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
-                            struct request *rq, unsigned int nr)
+static int virtblk_init_request(void *data, struct request *rq,
+               unsigned int hctx_idx, unsigned int request_idx,
+               unsigned int numa_node)
 {
        struct virtio_blk *vblk = data;
-       struct virtblk_req *vbr = rq->special;
+       struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
 
        sg_init_table(vbr->sg, vblk->sg_elems);
        return 0;
 }
 
+static struct blk_mq_ops virtio_mq_ops = {
+       .queue_rq       = virtio_queue_rq,
+       .map_queue      = blk_mq_map_queue,
+       .complete       = virtblk_request_done,
+       .init_request   = virtblk_init_request,
+};
+
+static unsigned int virtblk_queue_depth;
+module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
+
 static int virtblk_probe(struct virtio_device *vdev)
 {
        struct virtio_blk *vblk;
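
The virtio-blk hunks around here also drop blk_mq_init_commands() (removed in the probe hunk below) in favour of the new ->init_request hook in the ops table, so one-time per-request setup such as the scatterlist init happens up front when the tag set's request maps are built, rather than in a separate pass over a live queue. A stripped-down sketch of the same pattern, with hypothetical mydrv_* names:

	#include <linux/kernel.h>
	#include <linux/blk-mq.h>
	#include <linux/scatterlist.h>

	struct mydrv_cmd {
		struct scatterlist sg[8];	/* hypothetical per-command state */
	};

	/* runs once for every preallocated request, before any I/O is queued */
	static int mydrv_init_request(void *data, struct request *rq,
				      unsigned int hctx_idx, unsigned int request_idx,
				      unsigned int numa_node)
	{
		struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

		sg_init_table(cmd->sg, ARRAY_SIZE(cmd->sg));
		return 0;
	}

	static struct blk_mq_ops mydrv_mq_ops = {
		.queue_rq	= mydrv_queue_rq,	/* as in the earlier sketch */
		.map_queue	= blk_mq_map_queue,
		.init_request	= mydrv_init_request,
	};
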
@@ -561,24 +558,34 @@ static int virtblk_probe(struct virtio_device *vdev)
        }
 
        /* Default queue sizing is to fill the ring. */
-       if (!virtio_mq_reg.queue_depth) {
-               virtio_mq_reg.queue_depth = vblk->vq->num_free;
+       if (!virtblk_queue_depth) {
+               virtblk_queue_depth = vblk->vq->num_free;
                /* ... but without indirect descs, we use 2 descs per req */
                if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
-                       virtio_mq_reg.queue_depth /= 2;
+                       virtblk_queue_depth /= 2;
        }
-       virtio_mq_reg.cmd_size =
+
+       memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
+       vblk->tag_set.ops = &virtio_mq_ops;
+       vblk->tag_set.nr_hw_queues = 1;
+       vblk->tag_set.queue_depth = virtblk_queue_depth;
+       vblk->tag_set.numa_node = NUMA_NO_NODE;
+       vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+       vblk->tag_set.cmd_size =
                sizeof(struct virtblk_req) +
                sizeof(struct scatterlist) * sg_elems;
+       vblk->tag_set.driver_data = vblk;
 
-       q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk);
+       err = blk_mq_alloc_tag_set(&vblk->tag_set);
+       if (err)
+               goto out_put_disk;
+
+       q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
        if (!q) {
                err = -ENOMEM;
-               goto out_put_disk;
+               goto out_free_tags;
        }
 
-       blk_mq_init_commands(q, virtblk_init_vbr, vblk);
-
        q->queuedata = vblk;
 
        virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@@ -679,6 +686,8 @@ static int virtblk_probe(struct virtio_device *vdev)
 out_del_disk:
        del_gendisk(vblk->disk);
        blk_cleanup_queue(vblk->disk->queue);
+out_free_tags:
+       blk_mq_free_tag_set(&vblk->tag_set);
 out_put_disk:
        put_disk(vblk->disk);
 out_free_vq:
@@ -705,6 +714,8 @@ static void virtblk_remove(struct virtio_device *vdev)
        del_gendisk(vblk->disk);
        blk_cleanup_queue(vblk->disk->queue);
 
+       blk_mq_free_tag_set(&vblk->tag_set);
+
        /* Stop all the virtqueues. */
        vdev->config->reset(vdev);
 
@@ -749,7 +760,7 @@ static int virtblk_restore(struct virtio_device *vdev)
        vblk->config_enable = true;
        ret = init_vq(vdev->priv);
        if (!ret)
-               blk_mq_start_stopped_hw_queues(vblk->disk->queue);
+               blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
 
        return ret;
 }
index efe1b4761735a79faa30867ad625fdd51e043081..283a30e88287b6bbec1be6b683a87d52fc997b59 100644 (file)
@@ -612,10 +612,10 @@ static void do_blkif_request(struct request_queue *rq)
                }
 
                pr_debug("do_blk_req %p: cmd %p, sec %lx, "
-                        "(%u/%u) buffer:%p [%s]\n",
+                        "(%u/%u) [%s]\n",
                         req, req->cmd, (unsigned long)blk_rq_pos(req),
                         blk_rq_cur_sectors(req), blk_rq_sectors(req),
-                        req->buffer, rq_data_dir(req) ? "write" : "read");
+                        rq_data_dir(req) ? "write" : "read");
 
                if (blkif_queue_request(req)) {
                        blk_requeue_request(rq, req);
index 1393b8871a281a82ef4e613fc096ce4f0e933cfa..ab3ea62e5dfc70dc0e62488290688f468d69cbcc 100644 (file)
@@ -661,7 +661,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
                        rq_data_dir(req));
 
                ace->req = req;
-               ace->data_ptr = req->buffer;
+               ace->data_ptr = bio_data(req->bio);
                ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR;
                ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF);
 
@@ -733,7 +733,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
                         *      blk_rq_sectors(ace->req),
                         *      blk_rq_cur_sectors(ace->req));
                         */
-                       ace->data_ptr = ace->req->buffer;
+                       ace->data_ptr = bio_data(ace->req->bio);
                        ace->data_count = blk_rq_cur_sectors(ace->req) * 16;
                        ace_fsm_yieldirq(ace);
                        break;
index 27de5046708a233cbc99e42a4349387f9f5157cb..968f9e52effa8c401a66e11b4de8bae9f23756ec 100644 (file)
@@ -87,13 +87,15 @@ static void do_z2_request(struct request_queue *q)
                while (len) {
                        unsigned long addr = start & Z2RAM_CHUNKMASK;
                        unsigned long size = Z2RAM_CHUNKSIZE - addr;
+                       void *buffer = bio_data(req->bio);
+
                        if (len < size)
                                size = len;
                        addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ];
                        if (rq_data_dir(req) == READ)
-                               memcpy(req->buffer, (char *)addr, size);
+                               memcpy(buffer, (char *)addr, size);
                        else
-                               memcpy((char *)addr, req->buffer, size);
+                               memcpy((char *)addr, buffer, size);
                        start += size;
                        len -= size;
                }
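
This hunk, like the other driver updates in this series, replaces the removed 'buffer' field of struct request with the virtually mapped data of the first bio segment. For the simple single-segment PIO requests these drivers handle, a generic helper in that style might look as follows (hypothetical names, illustration only):

	#include <linux/bio.h>
	#include <linux/blkdev.h>
	#include <linux/string.h>

	/*
	 * Sketch of the common conversion: valid only for the simple,
	 * single-segment, virtually mapped requests these PIO drivers handle.
	 */
	static void mydrv_pio_transfer(struct request *req, void *dev_window)
	{
		void *buf = bio_data(req->bio);
		unsigned int len = blk_rq_cur_bytes(req);

		if (rq_data_dir(req) == WRITE)
			memcpy(dev_window, buf, len);
		else
			memcpy(buf, dev_window, len);
	}
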
index 51e75ad964223e07268e0781caa5fa9230f43824..584bc3126403d58955a915db2117fcd1d16de5a2 100644 (file)
@@ -602,7 +602,7 @@ static void gdrom_readdisk_dma(struct work_struct *work)
                spin_unlock(&gdrom_lock);
                block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET;
                block_cnt = blk_rq_sectors(req)/GD_TO_BLK;
-               __raw_writel(virt_to_phys(req->buffer), GDROM_DMA_STARTADDR_REG);
+               __raw_writel(virt_to_phys(bio_data(req->bio)), GDROM_DMA_STARTADDR_REG);
                __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG);
                __raw_writel(1, GDROM_DMA_DIRECTION_REG);
                __raw_writel(1, GDROM_DMA_ENABLE_REG);
index 102c50d38902ca43fed85641618ba202a7251679..06cea7ff3a7c34914c611d0a0a84703a3789edc1 100644 (file)
@@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk)
        add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
        trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool));
 }
+EXPORT_SYMBOL_GPL(add_disk_randomness);
 #endif
 
 /*********************************************************************
index 16f69be820c7bb13da94f253b5e7bb33c831fea5..ee880382e3bce50b03775e9ab48504febb4ab6b3 100644 (file)
@@ -188,10 +188,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
 
        ledtrig_ide_activity();
 
-       pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n",
+       pr_debug("%s: %sing: block=%llu, sectors=%u\n",
                 drive->name, rq_data_dir(rq) == READ ? "read" : "writ",
-                (unsigned long long)block, blk_rq_sectors(rq),
-                (unsigned long)rq->buffer);
+                (unsigned long long)block, blk_rq_sectors(rq));
 
        if (hwif->rw_disk)
                hwif->rw_disk(drive, rq);
index 455e6491649889d1970cd8ef3425e93f2ea6a93c..6a71bc7c9133a1d5e63ec13493593449a22f1044 100644 (file)
@@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq,
        clone->cmd = rq->cmd;
        clone->cmd_len = rq->cmd_len;
        clone->sense = rq->sense;
-       clone->buffer = rq->buffer;
        clone->end_io = end_clone_request;
        clone->end_io_data = tio;
 
index 0b2ccb68c0d0240efdb9c5de9189f8e89a4ca890..4dbfaee9aa9583c41508b27a0e993480a265ebc2 100644 (file)
@@ -82,8 +82,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 
        block = blk_rq_pos(req) << 9 >> tr->blkshift;
        nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
-
-       buf = req->buffer;
+       buf = bio_data(req->bio);
 
        if (req->cmd_type != REQ_TYPE_FS)
                return -EIO;
index 8d659e6a1b4c0899e32706b8bfa7fe3270ff715a..20a667c95da4b86e2076dec801d5e0770f88bf3f 100644 (file)
@@ -253,7 +253,7 @@ static int do_ubiblock_request(struct ubiblock *dev, struct request *req)
         * flash access anyway.
         */
        mutex_lock(&dev->dev_mutex);
-       ret = ubiblock_read(dev, req->buffer, sec, len);
+       ret = ubiblock_read(dev, bio_data(req->bio), sec, len);
        mutex_unlock(&dev->dev_mutex);
 
        return ret;
index 4ccb5d869389e353113692d20f02f66d002046cd..a40ee1e37486bd9b05062074507fd2a64d472677 100644 (file)
@@ -207,7 +207,7 @@ static void jsfd_do_request(struct request_queue *q)
                        goto end;
                }
 
-               jsfd_read(req->buffer, jdp->dbase + offset, len);
+               jsfd_read(bio_data(req->bio), jdp->dbase + offset, len);
                err = 0;
        end:
                if (!__blk_end_request_cur(req, err))
index 9db097a28a74588c793c0521c7f80f8540820f61..a0c95cac91f0fe55681830af4477e9513ec19a32 100644 (file)
@@ -140,7 +140,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
        cmd->result = 0;
        spin_lock_irqsave(q->queue_lock, flags);
        blk_requeue_request(q, cmd->request);
-       kblockd_schedule_work(q, &device->requeue_work);
+       kblockd_schedule_work(&device->requeue_work);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -1019,8 +1019,6 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
                return BLKPREP_DEFER;
        }
 
-       req->buffer = NULL;
-
        /* 
         * Next, walk the list, and fill in the addresses and sizes of
         * each segment.
@@ -1158,7 +1156,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
                BUG_ON(blk_rq_bytes(req));
 
                memset(&cmd->sdb, 0, sizeof(cmd->sdb));
-               req->buffer = NULL;
        }
 
        cmd->cmd_len = req->cmd_len;
index efcbcd182863318f296936bc7fa16c96bc58efe5..96af195224f2b9dae866ea109afb1d894c142c80 100644 (file)
@@ -737,16 +737,14 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq)
                goto out;
        }
 
+       rq->completion_data = page;
        blk_add_request_payload(rq, page, len);
        ret = scsi_setup_blk_pc_cmnd(sdp, rq);
-       rq->buffer = page_address(page);
        rq->__data_len = nr_bytes;
 
 out:
-       if (ret != BLKPREP_OK) {
+       if (ret != BLKPREP_OK)
                __free_page(page);
-               rq->buffer = NULL;
-       }
        return ret;
 }
 
@@ -842,10 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq)
 {
        struct scsi_cmnd *SCpnt = rq->special;
 
-       if (rq->cmd_flags & REQ_DISCARD) {
-               free_page((unsigned long)rq->buffer);
-               rq->buffer = NULL;
-       }
+       if (rq->cmd_flags & REQ_DISCARD)
+               __free_page(rq->completion_data);
+
        if (SCpnt->cmnd != rq->cmd) {
                mempool_free(SCpnt->cmnd, sd_cdb_pool);
                SCpnt->cmnd = NULL;
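
With rq->buffer gone, the discard path above parks its reference to the payload page in rq->completion_data, and the unprep hook frees it from there. A rough sketch of that prep/unprep pairing (the mydrv names and GFP flags are illustrative assumptions):

#include <linux/blkdev.h>
#include <linux/gfp.h>

static int mydrv_prep_payload(struct request *rq, unsigned int len)
{
        struct page *page = alloc_page(GFP_ATOMIC | __GFP_ZERO);

        if (!page)
                return BLKPREP_DEFER;

        rq->completion_data = page;             /* was stashed in rq->buffer */
        blk_add_request_payload(rq, page, len);
        return BLKPREP_OK;
}

static void mydrv_unprep(struct request_queue *q, struct request *rq)
{
        if (rq->cmd_flags & REQ_DISCARD)
                __free_page(rq->completion_data);
}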
index f9cb9876e466a02f7c0149412bea8a177031c509..4030cbfbc9af8572673eb06c997603ee529443e4 100644 (file)
@@ -14,14 +14,13 @@ obj-y :=    open.o read_write.o file_table.o super.o \
                stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
-obj-y +=       buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+obj-y +=       buffer.o block_dev.o direct-io.o mpage.o
 else
 obj-y +=       no-block.o
 endif
 
 obj-$(CONFIG_PROC_FS) += proc_namespace.o
 
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
 obj-y                          += notify/
 obj-$(CONFIG_EPOLL)            += eventpoll.o
 obj-$(CONFIG_ANON_INODES)      += anon_inodes.o
index bba5508269219a0726ede9f97cc3d22afa97f213..5a645769f020f956415c1fac11ac868a1922f161 100644 (file)
@@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
 
 extern struct bio_set *bioset_create(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
-extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries);
+extern mempool_t *biovec_create_pool(int pool_entries);
 
 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
 extern void bio_put(struct bio *);
index 0120451545d8d2b85cc5eb10475359442f9d7460..c15128833100b9a7b7a26269486ca00d066d5c9e 100644 (file)
@@ -8,7 +8,13 @@ struct blk_mq_tags;
 struct blk_mq_cpu_notifier {
        struct list_head list;
        void *data;
-       void (*notify)(void *data, unsigned long action, unsigned int cpu);
+       int (*notify)(void *data, unsigned long action, unsigned int cpu);
+};
+
+struct blk_mq_ctxmap {
+       unsigned int map_size;
+       unsigned int bits_per_word;
+       struct blk_align_bitmap *map;
 };
 
 struct blk_mq_hw_ctx {
@@ -18,7 +24,11 @@ struct blk_mq_hw_ctx {
        } ____cacheline_aligned_in_smp;
 
        unsigned long           state;          /* BLK_MQ_S_* flags */
-       struct delayed_work     delayed_work;
+       struct delayed_work     run_work;
+       struct delayed_work     delay_work;
+       cpumask_var_t           cpumask;
+       int                     next_cpu;
+       int                     next_cpu_batch;
 
        unsigned long           flags;          /* BLK_MQ_F_* flags */
 
@@ -27,13 +37,13 @@ struct blk_mq_hw_ctx {
 
        void                    *driver_data;
 
+       struct blk_mq_ctxmap    ctx_map;
+
        unsigned int            nr_ctx;
        struct blk_mq_ctx       **ctxs;
-       unsigned int            nr_ctx_map;
-       unsigned long           *ctx_map;
 
-       struct request          **rqs;
-       struct list_head        page_list;
+       unsigned int            wait_index;
+
        struct blk_mq_tags      *tags;
 
        unsigned long           queued;
@@ -41,31 +51,40 @@ struct blk_mq_hw_ctx {
 #define BLK_MQ_MAX_DISPATCH_ORDER      10
        unsigned long           dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
-       unsigned int            queue_depth;
        unsigned int            numa_node;
        unsigned int            cmd_size;       /* per-request extra data */
 
+       atomic_t                nr_active;
+
        struct blk_mq_cpu_notifier      cpu_notifier;
        struct kobject          kobj;
 };
 
-struct blk_mq_reg {
+struct blk_mq_tag_set {
        struct blk_mq_ops       *ops;
        unsigned int            nr_hw_queues;
-       unsigned int            queue_depth;
+       unsigned int            queue_depth;    /* max hw supported */
        unsigned int            reserved_tags;
        unsigned int            cmd_size;       /* per-request extra data */
        int                     numa_node;
        unsigned int            timeout;
        unsigned int            flags;          /* BLK_MQ_F_* */
+       void                    *driver_data;
+
+       struct blk_mq_tags      **tags;
+
+       struct mutex            tag_list_lock;
+       struct list_head        tag_list;
 };
 
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
-typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
-typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (init_request_fn)(void *, struct request *, unsigned int,
+               unsigned int, unsigned int);
+typedef void (exit_request_fn)(void *, struct request *, unsigned int,
+               unsigned int);
 
 struct blk_mq_ops {
        /*
@@ -85,12 +104,6 @@ struct blk_mq_ops {
 
        softirq_done_fn         *complete;
 
-       /*
-        * Override for hctx allocations (should probably go)
-        */
-       alloc_hctx_fn           *alloc_hctx;
-       free_hctx_fn            *free_hctx;
-
        /*
         * Called when the block layer side of a hardware queue has been
         * set up, allowing the driver to allocate/init matching structures.
@@ -98,6 +111,14 @@ struct blk_mq_ops {
         */
        init_hctx_fn            *init_hctx;
        exit_hctx_fn            *exit_hctx;
+
+       /*
+        * Called for every command allocated by the block layer to allow
+        * the driver to set up driver specific data.
+        * Ditto for exit/teardown.
+        */
+       init_request_fn         *init_request;
+       exit_request_fn         *exit_request;
 };
 
 enum {
@@ -107,18 +128,24 @@ enum {
 
        BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
        BLK_MQ_F_SHOULD_SORT    = 1 << 1,
-       BLK_MQ_F_SHOULD_IPI     = 1 << 2,
+       BLK_MQ_F_TAG_SHARED     = 1 << 2,
+       BLK_MQ_F_SG_MERGE       = 1 << 3,
+       BLK_MQ_F_SYSFS_UP       = 1 << 4,
 
        BLK_MQ_S_STOPPED        = 0,
+       BLK_MQ_S_TAG_ACTIVE     = 1,
 
        BLK_MQ_MAX_DEPTH        = 2048,
+
+       BLK_MQ_CPU_WORK_BATCH   = 8,
 };
 
-struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 int blk_mq_register_disk(struct gendisk *);
 void blk_mq_unregister_disk(struct gendisk *);
-int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
-void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
+
+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
@@ -126,28 +153,28 @@ void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
-struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
-struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+               gfp_t gfp, bool reserved);
+struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag);
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
-void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
 
-bool blk_mq_end_io_partial(struct request *rq, int error,
-               unsigned int nr_bytes);
-static inline void blk_mq_end_io(struct request *rq, int error)
-{
-       bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq));
-       BUG_ON(!done);
-}
+void blk_mq_end_io(struct request *rq, int error);
+void __blk_mq_end_io(struct request *rq, int error);
 
+void blk_mq_requeue_request(struct request *rq);
+void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
+void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_complete_request(struct request *rq);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
-void blk_mq_start_stopped_hw_queues(struct request_queue *q);
+void blk_mq_start_hw_queues(struct request_queue *q);
+void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
+void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
+void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
 
 /*
  * Driver command data is immediately after the request. So subtract request
@@ -162,12 +189,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
        return (void *) rq + sizeof(*rq);
 }
 
-static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
-                                              unsigned int tag)
-{
-       return hctx->rqs[tag];
-}
-
 #define queue_for_each_hw_ctx(q, hctx, i)                              \
        for ((i) = 0; (i) < (q)->nr_hw_queues &&                        \
             ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
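
The header changes above are the core of the new registration model: struct blk_mq_reg and the init_commands/free_commands hooks give way to a shareable struct blk_mq_tag_set with per-request init_request/exit_request callbacks, and per-request driver data is reached through blk_mq_rq_to_pdu(). Below is a condensed sketch of how a driver might wire up against these declarations; every mydrv_* name, the command layout, and the depth/flag choices are illustrative, not taken from this series:

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/numa.h>

struct mydrv {
        struct blk_mq_tag_set   tag_set;
        struct request_queue    *queue;
};

struct mydrv_cmd {                      /* lives in the per-request pdu */
        int     status;
};

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
        struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

        cmd->status = 0;
        /* submit to hardware; the completion path ends with blk_mq_complete_request(rq) */
        return BLK_MQ_RQ_QUEUE_OK;
}

static int mydrv_init_request(void *data, struct request *rq,
                              unsigned int hctx_idx, unsigned int rq_idx,
                              unsigned int numa_node)
{
        struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

        cmd->status = 0;                /* one-time, per-request setup */
        return 0;
}

static struct blk_mq_ops mydrv_mq_ops = {
        .queue_rq       = mydrv_queue_rq,
        .map_queue      = blk_mq_map_queue,
        .init_request   = mydrv_init_request,
};

static int mydrv_init_queue(struct mydrv *drv)
{
        int ret;

        drv->tag_set.ops          = &mydrv_mq_ops;
        drv->tag_set.nr_hw_queues = 1;
        drv->tag_set.queue_depth  = 64;
        drv->tag_set.numa_node    = NUMA_NO_NODE;
        drv->tag_set.cmd_size     = sizeof(struct mydrv_cmd);
        drv->tag_set.flags        = BLK_MQ_F_SHOULD_MERGE;
        drv->tag_set.driver_data  = drv;

        ret = blk_mq_alloc_tag_set(&drv->tag_set);
        if (ret)
                return ret;

        drv->queue = blk_mq_init_queue(&drv->tag_set);
        if (IS_ERR(drv->queue)) {
                blk_mq_free_tag_set(&drv->tag_set);
                return PTR_ERR(drv->queue);
        }
        return 0;
}

Teardown mirrors this: blk_cleanup_queue() on the queue, then blk_mq_free_tag_set() once no queue references the set.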
index aa0eaa2d0bd85854e231f9fecd56e4f037446d55..d8e4cea23a257c1b9c8b1514493c407b13d912e0 100644 (file)
@@ -190,6 +190,7 @@ enum rq_flag_bits {
        __REQ_PM,               /* runtime pm request */
        __REQ_END,              /* last of chain of requests */
        __REQ_HASHED,           /* on IO scheduler merge hash */
+       __REQ_MQ_INFLIGHT,      /* track inflight for MQ */
        __REQ_NR_BITS,          /* stops here */
 };
 
@@ -243,5 +244,6 @@ enum rq_flag_bits {
 #define REQ_PM                 (1ULL << __REQ_PM)
 #define REQ_END                        (1ULL << __REQ_END)
 #define REQ_HASHED             (1ULL << __REQ_HASHED)
+#define REQ_MQ_INFLIGHT                (1ULL << __REQ_MQ_INFLIGHT)
 
 #endif /* __LINUX_BLK_TYPES_H */
index 0d84981ee03fc1c9d7bd5b656611b8b87af696e0..695b9fd41efeca79d9c4fa037bf3a85587621731 100644 (file)
@@ -90,15 +90,15 @@ enum rq_cmd_type_bits {
 #define BLK_MAX_CDB    16
 
 /*
- * try to put the fields that are referenced together in the same cacheline.
- * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init()
- * as well!
+ * Try to put the fields that are referenced together in the same cacheline.
+ *
+ * If you modify this structure, make sure to update blk_rq_init() and
+ * especially blk_mq_rq_ctx_init() to take care of the added fields.
  */
 struct request {
        struct list_head queuelist;
        union {
                struct call_single_data csd;
-               struct work_struct mq_flush_work;
                unsigned long fifo_time;
        };
 
@@ -178,7 +178,6 @@ struct request {
        unsigned short ioprio;
 
        void *special;          /* opaque pointer available for LLD use */
-       char *buffer;           /* kaddr of the current segment if available */
 
        int tag;
        int errors;
@@ -463,6 +462,10 @@ struct request_queue {
        struct request          *flush_rq;
        spinlock_t              mq_flush_lock;
 
+       struct list_head        requeue_list;
+       spinlock_t              requeue_lock;
+       struct work_struct      requeue_work;
+
        struct mutex            sysfs_lock;
 
        int                     bypass_depth;
@@ -481,6 +484,9 @@ struct request_queue {
        wait_queue_head_t       mq_freeze_wq;
        struct percpu_counter   mq_usage_counter;
        struct list_head        all_q_node;
+
+       struct blk_mq_tag_set   *tag_set;
+       struct list_head        tag_set_list;
 };
 
 #define QUEUE_FLAG_QUEUED      1       /* uses generic tag queueing */
@@ -504,6 +510,7 @@ struct request_queue {
 #define QUEUE_FLAG_SAME_FORCE  18      /* force complete on same CPU */
 #define QUEUE_FLAG_DEAD        19      /* queue tear-down finished */
 #define QUEUE_FLAG_INIT_DONE   20      /* queue is initialized */
+#define QUEUE_FLAG_NO_SG_MERGE 21      /* don't attempt to merge SG segments*/
 
 #define QUEUE_FLAG_DEFAULT     ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_STACKABLE)    |       \
@@ -937,6 +944,7 @@ extern struct request *blk_fetch_request(struct request_queue *q);
  */
 extern bool blk_update_request(struct request *rq, int error,
                               unsigned int nr_bytes);
+extern void blk_finish_request(struct request *rq, int error);
 extern bool blk_end_request(struct request *rq, int error,
                            unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, int error);
@@ -1053,7 +1061,6 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
  * schedule() where blk_schedule_flush_plug() is called.
  */
 struct blk_plug {
-       unsigned long magic; /* detect uninitialized use-cases */
        struct list_head list; /* requests */
        struct list_head mq_list; /* blk-mq requests */
        struct list_head cb_list; /* md requires an unplug callback */
@@ -1102,7 +1109,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 /*
  * tag stuff
  */
-#define blk_rq_tagged(rq)              ((rq)->cmd_flags & REQ_QUEUED)
+#define blk_rq_tagged(rq) \
+       ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED))
 extern int blk_queue_start_tag(struct request_queue *, struct request *);
 extern struct request *blk_queue_find_tag(struct request_queue *, int);
 extern void blk_queue_end_tag(struct request_queue *, struct request *);
@@ -1370,8 +1378,9 @@ static inline void put_dev_sector(Sector p)
 }
 
 struct work_struct;
-int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
-int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
+int kblockd_schedule_work(struct work_struct *work);
+int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
+int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
 /*
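
Two smaller interface shifts are visible above: kblockd_schedule_work() and its delayed variants no longer take a request_queue, and struct blk_plug loses its debugging magic field. A minimal sketch of the former with an illustrative work item (the mydev names are assumptions):

#include <linux/blkdev.h>
#include <linux/workqueue.h>

struct mydev {
        struct work_struct requeue_work;
};

static void mydev_requeue_fn(struct work_struct *work)
{
        /* requeue or restart pending requests here */
}

static void mydev_init(struct mydev *dev)
{
        INIT_WORK(&dev->requeue_work, mydev_requeue_fn);
}

static void mydev_kick(struct mydev *dev)
{
        /* the request_queue argument is gone from kblockd_schedule_work() */
        kblockd_schedule_work(&dev->requeue_work);
}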
index b484452dac57ea5e531918837d658c9d5d159ea6..0173940407f6c0e50d033dfb1a56be4d4d117d9d 100644 (file)
@@ -30,7 +30,6 @@ endif
 
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
-obj-$(CONFIG_BOUNCE)   += bounce.o
 obj-$(CONFIG_SWAP)     += page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_FRONTSWAP)        += frontswap.o
 obj-$(CONFIG_ZSWAP)    += zswap.o