diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index f615c3bbb73e8b7a2a7bf3f5039efd84c724cdf2..beb5f098f32d6f7bb5851deb810065ab37e4ac4a 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 Shaohua Li <shli@kernel.org>
  * Copyright (C) 2014 Red Hat, Inc.
  * Copyright (C) 2015 Arrikto, Inc.
+ * Copyright (C) 2017 Chinamobile, Inc.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
 #include <linux/parser.h>
 #include <linux/vmalloc.h>
 #include <linux/uio_driver.h>
+#include <linux/radix-tree.h>
 #include <linux/stringify.h>
 #include <linux/bitops.h>
 #include <linux/highmem.h>
 #include <linux/configfs.h>
+#include <linux/mutex.h>
+#include <linux/kthread.h>
 #include <net/genetlink.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
  * this may have a 'UAM' comment.
  */
 
-
 #define TCMU_TIME_OUT (30 * MSEC_PER_SEC)
 
-#define DATA_BLOCK_BITS 256
-#define DATA_BLOCK_SIZE 4096
+/* For the cmd area, the size is fixed at 8MB */
+#define CMDR_SIZE (8 * 1024 * 1024)
 
-#define CMDR_SIZE (16 * 4096)
+/*
+ * For the data area, the block size is PAGE_SIZE and
+ * the total size is 256K * PAGE_SIZE.
+ */
+#define DATA_BLOCK_SIZE PAGE_SIZE
+#define DATA_BLOCK_BITS (256 * 1024)
 #define DATA_SIZE (DATA_BLOCK_BITS * DATA_BLOCK_SIZE)
+#define DATA_BLOCK_INIT_BITS 128
 
+/* The total size of the ring is 8M + 256K * PAGE_SIZE */
 #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE)
 
+/* Default maximum of the global data blocks (512K * PAGE_SIZE) */
+#define TCMU_GLOBAL_MAX_BLOCKS (512 * 1024)
+
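These constants change the per-device layout from a small fixed ring to a fixed 8MB command area plus a data area that is only backed by pages on demand, with a global cap shared by all devices. A minimal standalone sketch of the resulting sizes (userspace C, assuming 4 KiB pages; not part of the patch):

#include <stdio.h>

#define CMDR_SIZE              (8UL * 1024 * 1024)
#define DATA_BLOCK_SIZE        4096UL          /* PAGE_SIZE, assuming 4 KiB pages */
#define DATA_BLOCK_BITS        (256UL * 1024)
#define DATA_SIZE              (DATA_BLOCK_BITS * DATA_BLOCK_SIZE)
#define TCMU_RING_SIZE         (CMDR_SIZE + DATA_SIZE)
#define TCMU_GLOBAL_MAX_BLOCKS (512UL * 1024)

int main(void)
{
        /* Address space reserved per device vs. pages actually backed. */
        printf("cmd area:        %4lu MiB (vzalloc'ed up front)\n", CMDR_SIZE >> 20);
        printf("data area:       %4lu MiB of address space per device\n", DATA_SIZE >> 20);
        printf("ring size:       %4lu MiB exposed through uio per device\n",
               TCMU_RING_SIZE >> 20);
        printf("global page cap: %4lu MiB actually allocated across devices\n",
               (TCMU_GLOBAL_MAX_BLOCKS * DATA_BLOCK_SIZE) >> 20);
        return 0;
}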
 static struct device *tcmu_root_device;
 
 struct tcmu_hba {
@@ -83,6 +96,8 @@ struct tcmu_hba {
 #define TCMU_CONFIG_LEN 256
 
 struct tcmu_dev {
+       struct list_head node;
+       struct kref kref;
        struct se_device se_dev;
 
        char *name;
@@ -94,6 +109,8 @@ struct tcmu_dev {
 
        struct uio_info uio_info;
 
+       struct inode *inode;
+
        struct tcmu_mailbox *mb_addr;
        size_t dev_size;
        u32 cmdr_size;
@@ -103,11 +120,14 @@ struct tcmu_dev {
        size_t data_off;
        size_t data_size;
 
-       DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
-
        wait_queue_head_t wait_cmdr;
-       /* TODO should this be a mutex? */
-       spinlock_t cmdr_lock;
+       struct mutex cmdr_lock;
+
+       bool waiting_global;
+       uint32_t dbi_max;
+       uint32_t dbi_thresh;
+       DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
+       struct radix_tree_root data_blocks;
 
        struct idr commands;
        spinlock_t commands_lock;
@@ -130,7 +150,9 @@ struct tcmu_cmd {
 
        /* Can't use se_cmd when cleaning up expired cmds, because if
           cmd has been completed then accessing se_cmd is off limits */
-       DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
+       uint32_t dbi_cnt;
+       uint32_t dbi_cur;
+       uint32_t *dbi;
 
        unsigned long deadline;
 
@@ -138,6 +160,13 @@ struct tcmu_cmd {
        unsigned long flags;
 };
 
+static struct task_struct *unmap_thread;
+static wait_queue_head_t unmap_wait;
+static DEFINE_MUTEX(root_udev_mutex);
+static LIST_HEAD(root_udev);
+
+static atomic_t global_db_count = ATOMIC_INIT(0);
+
 static struct kmem_cache *tcmu_cmd_cache;
 
 /* multicast group */
@@ -161,6 +190,114 @@ static struct genl_family tcmu_genl_family __ro_after_init = {
        .netnsok = true,
 };
 
+#define tcmu_cmd_set_dbi_cur(cmd, index) ((cmd)->dbi_cur = (index))
+#define tcmu_cmd_reset_dbi_cur(cmd) tcmu_cmd_set_dbi_cur(cmd, 0)
+#define tcmu_cmd_set_dbi(cmd, index) ((cmd)->dbi[(cmd)->dbi_cur++] = (index))
+#define tcmu_cmd_get_dbi(cmd) ((cmd)->dbi[(cmd)->dbi_cur++])
+
+static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd, uint32_t len)
+{
+       struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
+       uint32_t i;
+
+       for (i = 0; i < len; i++)
+               clear_bit(tcmu_cmd->dbi[i], udev->data_bitmap);
+}
+
+static inline bool tcmu_get_empty_block(struct tcmu_dev *udev,
+                                       struct tcmu_cmd *tcmu_cmd)
+{
+       struct page *page;
+       int ret, dbi;
+
+       dbi = find_first_zero_bit(udev->data_bitmap, udev->dbi_thresh);
+       if (dbi == udev->dbi_thresh)
+               return false;
+
+       page = radix_tree_lookup(&udev->data_blocks, dbi);
+       if (!page) {
+
+               if (atomic_add_return(1, &global_db_count) >
+                                       TCMU_GLOBAL_MAX_BLOCKS) {
+                       atomic_dec(&global_db_count);
+                       return false;
+               }
+
+               /* try to get new page from the mm */
+               page = alloc_page(GFP_KERNEL);
+               if (!page)
+                       return false;
+
+               ret = radix_tree_insert(&udev->data_blocks, dbi, page);
+               if (ret) {
+                       __free_page(page);
+                       return false;
+               }
+
+       }
+
+       if (dbi > udev->dbi_max)
+               udev->dbi_max = dbi;
+
+       set_bit(dbi, udev->data_bitmap);
+       tcmu_cmd_set_dbi(tcmu_cmd, dbi);
+
+       return true;
+}
+
+static bool tcmu_get_empty_blocks(struct tcmu_dev *udev,
+                                 struct tcmu_cmd *tcmu_cmd)
+{
+       int i;
+
+       udev->waiting_global = false;
+
+       for (i = tcmu_cmd->dbi_cur; i < tcmu_cmd->dbi_cnt; i++) {
+               if (!tcmu_get_empty_block(udev, tcmu_cmd))
+                       goto err;
+       }
+       return true;
+
+err:
+       udev->waiting_global = true;
+       /* Try to wake up the unmap thread */
+       wake_up(&unmap_wait);
+       return false;
+}
+
+static inline struct page *
+tcmu_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
+{
+       return radix_tree_lookup(&udev->data_blocks, dbi);
+}
+
+static inline void tcmu_free_cmd(struct tcmu_cmd *tcmu_cmd)
+{
+       kfree(tcmu_cmd->dbi);
+       kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
+}
+
+static inline size_t tcmu_cmd_get_data_length(struct tcmu_cmd *tcmu_cmd)
+{
+       struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
+       size_t data_length = round_up(se_cmd->data_length, DATA_BLOCK_SIZE);
+
+       if (se_cmd->se_cmd_flags & SCF_BIDI) {
+               BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents));
+               data_length += round_up(se_cmd->t_bidi_data_sg->length,
+                               DATA_BLOCK_SIZE);
+       }
+
+       return data_length;
+}
+
+static inline uint32_t tcmu_cmd_get_block_cnt(struct tcmu_cmd *tcmu_cmd)
+{
+       size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd);
+
+       return data_length / DATA_BLOCK_SIZE;
+}
+
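Each tcmu_cmd now carries an array of data block indices (dbi) instead of a private copy of the bitmap: dbi_cnt is the transfer length rounded up to whole blocks and dbi_cur is a cursor that walks the array during scatter/gather. A standalone sketch of the rounding and the cursor macros (userspace C, hypothetical values, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define DATA_BLOCK_SIZE 4096UL

/* Same rounding the driver uses: data length -> number of data blocks. */
static uint32_t block_cnt(size_t data_length)
{
        size_t rounded = (data_length + DATA_BLOCK_SIZE - 1) &
                         ~(DATA_BLOCK_SIZE - 1);        /* round_up() */
        return rounded / DATA_BLOCK_SIZE;
}

struct cmd {
        uint32_t dbi_cnt;
        uint32_t dbi_cur;
        uint32_t dbi[8];
};

/* Mirrors tcmu_cmd_set_dbi()/tcmu_cmd_get_dbi(): a simple cursor. */
#define set_dbi(c, idx) ((c)->dbi[(c)->dbi_cur++] = (idx))
#define get_dbi(c)      ((c)->dbi[(c)->dbi_cur++])

int main(void)
{
        struct cmd c = { .dbi_cnt = block_cnt(4097), .dbi_cur = 0 };

        printf("4097 bytes -> %u blocks\n", c.dbi_cnt);         /* 2 */

        set_dbi(&c, 10);        /* blocks handed out by the allocator */
        set_dbi(&c, 42);
        c.dbi_cur = 0;          /* tcmu_cmd_reset_dbi_cur() equivalent */
        printf("blocks used: %u, %u\n", get_dbi(&c), get_dbi(&c));
        return 0;
}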
 static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
 {
        struct se_device *se_dev = se_cmd->se_dev;
@@ -178,6 +315,15 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
                tcmu_cmd->deadline = jiffies +
                                        msecs_to_jiffies(udev->cmd_time_out);
 
+       tcmu_cmd_reset_dbi_cur(tcmu_cmd);
+       tcmu_cmd->dbi_cnt = tcmu_cmd_get_block_cnt(tcmu_cmd);
+       tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t),
+                               GFP_KERNEL);
+       if (!tcmu_cmd->dbi) {
+               kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
+               return NULL;
+       }
+
        idr_preload(GFP_KERNEL);
        spin_lock_irq(&udev->commands_lock);
        cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 0,
@@ -186,7 +332,7 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
        idr_preload_end();
 
        if (cmd_id < 0) {
-               kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
+               tcmu_free_cmd(tcmu_cmd);
                return NULL;
        }
        tcmu_cmd->cmd_id = cmd_id;
@@ -248,10 +394,10 @@ static inline void new_iov(struct iovec **iov, int *iov_cnt,
 #define UPDATE_HEAD(head, used, size) smp_store_release(&head, ((head % size) + used) % size)
 
 /* offset is relative to mb_addr */
-static inline size_t get_block_offset(struct tcmu_dev *dev,
-               int block, int remaining)
+static inline size_t get_block_offset_user(struct tcmu_dev *dev,
+               int dbi, int remaining)
 {
-       return dev->data_off + block * DATA_BLOCK_SIZE +
+       return dev->data_off + dbi * DATA_BLOCK_SIZE +
                DATA_BLOCK_SIZE - remaining;
 }
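get_block_offset_user() translates a data block index plus the bytes still unfilled in that block into an offset relative to mb_addr, which is what ends up in the iov handed to userspace. A minimal sketch (userspace C, with data_off assumed to be CMDR_SIZE):

#include <stdio.h>
#include <stddef.h>

#define DATA_BLOCK_SIZE 4096UL
#define CMDR_SIZE       (8UL * 1024 * 1024)

static size_t get_block_offset_user(size_t data_off, int dbi, int remaining)
{
        return data_off + dbi * DATA_BLOCK_SIZE + DATA_BLOCK_SIZE - remaining;
}

int main(void)
{
        /* Block 3, whole block still free: offset of the block's start. */
        printf("%zu\n", get_block_offset_user(CMDR_SIZE, 3, DATA_BLOCK_SIZE));
        /* Block 3, only 100 bytes left: 3996 bytes into the block. */
        printf("%zu\n", get_block_offset_user(CMDR_SIZE, 3, 100));
        return 0;
}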
 
@@ -260,37 +406,45 @@ static inline size_t iov_tail(struct tcmu_dev *udev, struct iovec *iov)
        return (size_t)iov->iov_base + iov->iov_len;
 }
 
-static void alloc_and_scatter_data_area(struct tcmu_dev *udev,
-       struct scatterlist *data_sg, unsigned int data_nents,
-       struct iovec **iov, int *iov_cnt, bool copy_data)
+static int scatter_data_area(struct tcmu_dev *udev,
+       struct tcmu_cmd *tcmu_cmd, struct scatterlist *data_sg,
+       unsigned int data_nents, struct iovec **iov,
+       int *iov_cnt, bool copy_data)
 {
-       int i, block;
+       int i, dbi;
        int block_remaining = 0;
-       void *from, *to;
-       size_t copy_bytes, to_offset;
+       void *from, *to = NULL;
+       size_t copy_bytes, to_offset, offset;
        struct scatterlist *sg;
+       struct page *page;
 
        for_each_sg(data_sg, sg, data_nents, i) {
                int sg_remaining = sg->length;
                from = kmap_atomic(sg_page(sg)) + sg->offset;
                while (sg_remaining > 0) {
                        if (block_remaining == 0) {
-                               block = find_first_zero_bit(udev->data_bitmap,
-                                               DATA_BLOCK_BITS);
+                               if (to)
+                                       kunmap_atomic(to);
+
                                block_remaining = DATA_BLOCK_SIZE;
-                               set_bit(block, udev->data_bitmap);
+                               dbi = tcmu_cmd_get_dbi(tcmu_cmd);
+                               page = tcmu_get_block_page(udev, dbi);
+                               to = kmap_atomic(page);
                        }
+
                        copy_bytes = min_t(size_t, sg_remaining,
                                        block_remaining);
-                       to_offset = get_block_offset(udev, block,
+                       to_offset = get_block_offset_user(udev, dbi,
                                        block_remaining);
-                       to = (void *)udev->mb_addr + to_offset;
+                       offset = DATA_BLOCK_SIZE - block_remaining;
+                       to = (void *)(unsigned long)to + offset;
+
                        if (*iov_cnt != 0 &&
                            to_offset == iov_tail(udev, *iov)) {
                                (*iov)->iov_len += copy_bytes;
                        } else {
                                new_iov(iov, iov_cnt, udev);
-                               (*iov)->iov_base = (void __user *) to_offset;
+                               (*iov)->iov_base = (void __user *)to_offset;
                                (*iov)->iov_len = copy_bytes;
                        }
                        if (copy_data) {
@@ -303,33 +457,29 @@ static void alloc_and_scatter_data_area(struct tcmu_dev *udev,
                }
                kunmap_atomic(from - sg->offset);
        }
-}
+       if (to)
+               kunmap_atomic(to);
 
-static void free_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd)
-{
-       bitmap_xor(udev->data_bitmap, udev->data_bitmap, cmd->data_bitmap,
-                  DATA_BLOCK_BITS);
+       return 0;
 }
 
 static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
                             bool bidi)
 {
        struct se_cmd *se_cmd = cmd->se_cmd;
-       int i, block;
+       int i, dbi;
        int block_remaining = 0;
-       void *from, *to;
-       size_t copy_bytes, from_offset;
+       void *from = NULL, *to;
+       size_t copy_bytes, offset;
        struct scatterlist *sg, *data_sg;
+       struct page *page;
        unsigned int data_nents;
-       DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS);
-
-       bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS);
+       uint32_t count = 0;
 
        if (!bidi) {
                data_sg = se_cmd->t_data_sg;
                data_nents = se_cmd->t_data_nents;
        } else {
-               uint32_t count;
 
                /*
                 * For bidi case, the first count blocks are for Data-Out
@@ -337,30 +487,30 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
                 * the Data-Out buffer blocks should be discarded.
                 */
                count = DIV_ROUND_UP(se_cmd->data_length, DATA_BLOCK_SIZE);
-               while (count--) {
-                       block = find_first_bit(bitmap, DATA_BLOCK_BITS);
-                       clear_bit(block, bitmap);
-               }
 
                data_sg = se_cmd->t_bidi_data_sg;
                data_nents = se_cmd->t_bidi_data_nents;
        }
 
+       tcmu_cmd_set_dbi_cur(cmd, count);
+
        for_each_sg(data_sg, sg, data_nents, i) {
                int sg_remaining = sg->length;
                to = kmap_atomic(sg_page(sg)) + sg->offset;
                while (sg_remaining > 0) {
                        if (block_remaining == 0) {
-                               block = find_first_bit(bitmap,
-                                               DATA_BLOCK_BITS);
+                               if (from)
+                                       kunmap_atomic(from);
+
                                block_remaining = DATA_BLOCK_SIZE;
-                               clear_bit(block, bitmap);
+                               dbi = tcmu_cmd_get_dbi(cmd);
+                               page = tcmu_get_block_page(udev, dbi);
+                               from = kmap_atomic(page);
                        }
                        copy_bytes = min_t(size_t, sg_remaining,
                                        block_remaining);
-                       from_offset = get_block_offset(udev, block,
-                                       block_remaining);
-                       from = (void *) udev->mb_addr + from_offset;
+                       offset = DATA_BLOCK_SIZE - block_remaining;
+                       from = (void *)(unsigned long)from + offset;
                        tcmu_flush_dcache_range(from, copy_bytes);
                        memcpy(to + sg->length - sg_remaining, from,
                                        copy_bytes);
@@ -370,12 +520,13 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
                }
                kunmap_atomic(to - sg->offset);
        }
+       if (from)
+               kunmap_atomic(from);
 }
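For bidi commands the first DIV_ROUND_UP(data_length, DATA_BLOCK_SIZE) entries of the cmd's dbi array belong to the Data-Out buffer, so gather_data_area() positions dbi_cur past them before walking the Data-In blocks. A small sketch of that skip (userspace C, hypothetical lengths):

#include <stdio.h>
#include <stdint.h>

#define DATA_BLOCK_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        size_t data_out_len = 9000;     /* Data-Out payload of a bidi cmd */

        /* Index of the first Data-In block inside the cmd's dbi[] array,
         * i.e. the value passed to tcmu_cmd_set_dbi_cur() in the bidi case. */
        uint32_t count = DIV_ROUND_UP(data_out_len, DATA_BLOCK_SIZE);

        printf("Data-Out uses dbi[0..%u], Data-In starts at dbi[%u]\n",
               count - 1, count);       /* dbi[0..2], starts at dbi[3] */
        return 0;
}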
 
-static inline size_t spc_bitmap_free(unsigned long *bitmap)
+static inline size_t spc_bitmap_free(unsigned long *bitmap, uint32_t thresh)
 {
-       return DATA_BLOCK_SIZE * (DATA_BLOCK_BITS -
-                       bitmap_weight(bitmap, DATA_BLOCK_BITS));
+       return DATA_BLOCK_SIZE * (thresh - bitmap_weight(bitmap, thresh));
 }
 
 /*
@@ -384,9 +535,12 @@ static inline size_t spc_bitmap_free(unsigned long *bitmap)
  *
  * Called with ring lock held.
  */
-static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t data_needed)
+static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
+               size_t cmd_size, size_t data_needed)
 {
        struct tcmu_mailbox *mb = udev->mb_addr;
+       uint32_t blocks_needed = (data_needed + DATA_BLOCK_SIZE - 1)
+                               / DATA_BLOCK_SIZE;
        size_t space, cmd_needed;
        u32 cmd_head;
 
@@ -410,35 +564,63 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
                return false;
        }
 
-       space = spc_bitmap_free(udev->data_bitmap);
+       /* try to check and get the data blocks as needed */
+       space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
        if (space < data_needed) {
-               pr_debug("no data space: only %zu available, but ask for %zu\n",
-                               space, data_needed);
-               return false;
+               unsigned long blocks_left = DATA_BLOCK_BITS - udev->dbi_thresh;
+               unsigned long grow;
+
+               if (blocks_left < blocks_needed) {
+                       pr_debug("no data space: only %lu available, but ask for %zu\n",
+                                       blocks_left * DATA_BLOCK_SIZE,
+                                       data_needed);
+                       return false;
+               }
+
+               /* Try to expand the thresh */
+               if (!udev->dbi_thresh) {
+                       /* From idle state */
+                       uint32_t init_thresh = DATA_BLOCK_INIT_BITS;
+
+                       udev->dbi_thresh = max(blocks_needed, init_thresh);
+               } else {
+                       /*
+                        * Grow the data area by max(blocks needed,
+                        * dbi_thresh / 2), but limited to the max
+                        * DATA_BLOCK_BITS size.
+                        */
+                       grow = max(blocks_needed, udev->dbi_thresh / 2);
+                       udev->dbi_thresh += grow;
+                       if (udev->dbi_thresh > DATA_BLOCK_BITS)
+                               udev->dbi_thresh = DATA_BLOCK_BITS;
+               }
        }
 
+       if (!tcmu_get_empty_blocks(udev, cmd))
+               return false;
+
        return true;
 }
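is_ring_space_avail() now also grows dbi_thresh on demand: from the idle state it jumps to max(blocks needed, DATA_BLOCK_INIT_BITS), otherwise it grows by max(blocks needed, dbi_thresh / 2), clamped to DATA_BLOCK_BITS. A standalone sketch of just that policy (userspace C, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define DATA_BLOCK_BITS      (256 * 1024)
#define DATA_BLOCK_INIT_BITS 128

#define max(a, b) ((a) > (b) ? (a) : (b))

/* Mirrors the dbi_thresh expansion above (the free-space check is omitted). */
static uint32_t grow_thresh(uint32_t thresh, uint32_t blocks_needed)
{
        if (!thresh)                    /* from idle state */
                return max(blocks_needed, (uint32_t)DATA_BLOCK_INIT_BITS);

        thresh += max(blocks_needed, thresh / 2);
        if (thresh > DATA_BLOCK_BITS)
                thresh = DATA_BLOCK_BITS;
        return thresh;
}

int main(void)
{
        uint32_t t = 0;

        t = grow_thresh(t, 4);          /* 128: idle -> initial window   */
        printf("%u\n", t);
        t = grow_thresh(t, 4);          /* 192: grows by thresh / 2      */
        printf("%u\n", t);
        t = grow_thresh(t, 1000);       /* 1192: grows by blocks needed  */
        printf("%u\n", t);
        return 0;
}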
 
-static inline size_t tcmu_cmd_get_data_length(struct tcmu_cmd *tcmu_cmd)
+static inline size_t tcmu_cmd_get_base_cmd_size(size_t iov_cnt)
 {
-       struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
-       size_t data_length = round_up(se_cmd->data_length, DATA_BLOCK_SIZE);
-
-       if (se_cmd->se_cmd_flags & SCF_BIDI) {
-               BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents));
-               data_length += round_up(se_cmd->t_bidi_data_sg->length,
-                               DATA_BLOCK_SIZE);
-       }
-
-       return data_length;
+       return max(offsetof(struct tcmu_cmd_entry, req.iov[iov_cnt]),
+                       sizeof(struct tcmu_cmd_entry));
 }
 
-static inline uint32_t tcmu_cmd_get_block_cnt(struct tcmu_cmd *tcmu_cmd)
+static inline size_t tcmu_cmd_get_cmd_size(struct tcmu_cmd *tcmu_cmd,
+                                          size_t base_command_size)
 {
-       size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd);
+       struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
+       size_t command_size;
 
-       return data_length / DATA_BLOCK_SIZE;
+       command_size = base_command_size +
+               round_up(scsi_command_size(se_cmd->t_task_cdb),
+                               TCMU_OP_ALIGN_SIZE);
+
+       WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1));
+
+       return command_size;
 }
 
 static sense_reason_t
@@ -450,12 +632,11 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
        struct tcmu_mailbox *mb;
        struct tcmu_cmd_entry *entry;
        struct iovec *iov;
-       int iov_cnt;
+       int iov_cnt, ret;
        uint32_t cmd_head;
        uint64_t cdb_off;
        bool copy_to_data_area;
        size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd);
-       DECLARE_BITMAP(old_bitmap, DATA_BLOCK_BITS);
 
        if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags))
                return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
@@ -464,18 +645,18 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
         * Must be a certain minimum size for response sense info, but
         * also may be larger if the iov array is large.
         *
-        * We prepare way too many iovs for potential uses here, because it's
-        * expensive to tell how many regions are freed in the bitmap
-       */
-       base_command_size = max(offsetof(struct tcmu_cmd_entry,
-                               req.iov[tcmu_cmd_get_block_cnt(tcmu_cmd)]),
-                               sizeof(struct tcmu_cmd_entry));
-       command_size = base_command_size
-               + round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE);
-
-       WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1));
+        * We prepare as many iovs as possible for potential uses here,
+        * because it's expensive to tell how many regions are freed in
+        * the bitmap & global data pool, as the size calculated here
+        * will only be used to do the checks.
+        *
+        * The size will be recalculated later as actually needed to save
+        * cmd area memory.
+        */
+       base_command_size = tcmu_cmd_get_base_cmd_size(tcmu_cmd->dbi_cnt);
+       command_size = tcmu_cmd_get_cmd_size(tcmu_cmd, base_command_size);
 
-       spin_lock_irq(&udev->cmdr_lock);
+       mutex_lock(&udev->cmdr_lock);
 
        mb = udev->mb_addr;
        cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */
@@ -484,18 +665,18 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
                pr_warn("TCMU: Request of size %zu/%zu is too big for %u/%zu "
                        "cmd ring/data area\n", command_size, data_length,
                        udev->cmdr_size, udev->data_size);
-               spin_unlock_irq(&udev->cmdr_lock);
+               mutex_unlock(&udev->cmdr_lock);
                return TCM_INVALID_CDB_FIELD;
        }
 
-       while (!is_ring_space_avail(udev, command_size, data_length)) {
+       while (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) {
                int ret;
                DEFINE_WAIT(__wait);
 
                prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE);
 
                pr_debug("sleeping for ring space\n");
-               spin_unlock_irq(&udev->cmdr_lock);
+               mutex_unlock(&udev->cmdr_lock);
                if (udev->cmd_time_out)
                        ret = schedule_timeout(
                                        msecs_to_jiffies(udev->cmd_time_out));
@@ -507,7 +688,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
                        return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
                }
 
-               spin_lock_irq(&udev->cmdr_lock);
+               mutex_lock(&udev->cmdr_lock);
 
                /* We dropped cmdr_lock, cmd_head is stale */
                cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */
@@ -534,20 +715,26 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
        entry = (void *) mb + CMDR_OFF + cmd_head;
        tcmu_flush_dcache_range(entry, sizeof(*entry));
        tcmu_hdr_set_op(&entry->hdr.len_op, TCMU_OP_CMD);
-       tcmu_hdr_set_len(&entry->hdr.len_op, command_size);
        entry->hdr.cmd_id = tcmu_cmd->cmd_id;
        entry->hdr.kflags = 0;
        entry->hdr.uflags = 0;
 
-       bitmap_copy(old_bitmap, udev->data_bitmap, DATA_BLOCK_BITS);
-
        /* Handle allocating space from the data area */
+       tcmu_cmd_reset_dbi_cur(tcmu_cmd);
        iov = &entry->req.iov[0];
        iov_cnt = 0;
        copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE
                || se_cmd->se_cmd_flags & SCF_BIDI);
-       alloc_and_scatter_data_area(udev, se_cmd->t_data_sg,
-               se_cmd->t_data_nents, &iov, &iov_cnt, copy_to_data_area);
+       ret = scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg,
+                               se_cmd->t_data_nents, &iov, &iov_cnt,
+                               copy_to_data_area);
+       if (ret) {
+               tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
+               mutex_unlock(&udev->cmdr_lock);
+
+               pr_err("tcmu: alloc and scatter data failed\n");
+               return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+       }
        entry->req.iov_cnt = iov_cnt;
        entry->req.iov_dif_cnt = 0;
 
@@ -555,14 +742,29 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
        if (se_cmd->se_cmd_flags & SCF_BIDI) {
                iov_cnt = 0;
                iov++;
-               alloc_and_scatter_data_area(udev, se_cmd->t_bidi_data_sg,
-                               se_cmd->t_bidi_data_nents, &iov, &iov_cnt,
-                               false);
+               ret = scatter_data_area(udev, tcmu_cmd,
+                                       se_cmd->t_bidi_data_sg,
+                                       se_cmd->t_bidi_data_nents,
+                                       &iov, &iov_cnt, false);
+               if (ret) {
+                       tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
+                       mutex_unlock(&udev->cmdr_lock);
+
+                       pr_err("tcmu: alloc and scatter bidi data failed\n");
+                       return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+               }
                entry->req.iov_bidi_cnt = iov_cnt;
        }
-       /* cmd's data_bitmap is what changed in process */
-       bitmap_xor(tcmu_cmd->data_bitmap, old_bitmap, udev->data_bitmap,
-                       DATA_BLOCK_BITS);
+
+       /*
+        * Recalculate the command's base size and total size according
+        * to the actual needs
+        */
+       base_command_size = tcmu_cmd_get_base_cmd_size(entry->req.iov_cnt +
+                                                      entry->req.iov_bidi_cnt);
+       command_size = tcmu_cmd_get_cmd_size(tcmu_cmd, base_command_size);
+
+       tcmu_hdr_set_len(&entry->hdr.len_op, command_size);
 
        /* All offsets relative to mb_addr, not start of entry! */
        cdb_off = CMDR_OFF + cmd_head + base_command_size;
@@ -572,8 +774,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 
        UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size);
        tcmu_flush_dcache_range(mb, sizeof(*mb));
-
-       spin_unlock_irq(&udev->cmdr_lock);
+       mutex_unlock(&udev->cmdr_lock);
 
        /* TODO: only if FLUSH and FUA? */
        uio_event_notify(&udev->uio_info);
@@ -604,7 +805,7 @@ tcmu_queue_cmd(struct se_cmd *se_cmd)
                idr_remove(&udev->commands, tcmu_cmd->cmd_id);
                spin_unlock_irq(&udev->commands_lock);
 
-               kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
+               tcmu_free_cmd(tcmu_cmd);
        }
 
        return ret;
@@ -615,50 +816,45 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
        struct se_cmd *se_cmd = cmd->se_cmd;
        struct tcmu_dev *udev = cmd->tcmu_dev;
 
-       if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) {
-               /*
-                * cmd has been completed already from timeout, just reclaim
-                * data area space and free cmd
-                */
-               free_data_area(udev, cmd);
+       /*
+        * cmd has been completed already from timeout, just reclaim
+        * data area space and free cmd
+        */
+       if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags))
+               goto out;
 
-               kmem_cache_free(tcmu_cmd_cache, cmd);
-               return;
-       }
+       tcmu_cmd_reset_dbi_cur(cmd);
 
        if (entry->hdr.uflags & TCMU_UFLAG_UNKNOWN_OP) {
-               free_data_area(udev, cmd);
                pr_warn("TCMU: Userspace set UNKNOWN_OP flag on se_cmd %p\n",
                        cmd->se_cmd);
                entry->rsp.scsi_status = SAM_STAT_CHECK_CONDITION;
        } else if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) {
                memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer,
                               se_cmd->scsi_sense_length);
-               free_data_area(udev, cmd);
        } else if (se_cmd->se_cmd_flags & SCF_BIDI) {
                /* Get Data-In buffer before clean up */
                gather_data_area(udev, cmd, true);
-               free_data_area(udev, cmd);
        } else if (se_cmd->data_direction == DMA_FROM_DEVICE) {
                gather_data_area(udev, cmd, false);
-               free_data_area(udev, cmd);
        } else if (se_cmd->data_direction == DMA_TO_DEVICE) {
-               free_data_area(udev, cmd);
+               /* TODO: */
        } else if (se_cmd->data_direction != DMA_NONE) {
                pr_warn("TCMU: data direction was %d!\n",
                        se_cmd->data_direction);
        }
 
        target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status);
-       cmd->se_cmd = NULL;
 
-       kmem_cache_free(tcmu_cmd_cache, cmd);
+out:
+       cmd->se_cmd = NULL;
+       tcmu_cmd_free_data(cmd, cmd->dbi_cnt);
+       tcmu_free_cmd(cmd);
 }
 
 static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 {
        struct tcmu_mailbox *mb;
-       unsigned long flags;
        int handled = 0;
 
        if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) {
@@ -666,8 +862,6 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
                return 0;
        }
 
-       spin_lock_irqsave(&udev->cmdr_lock, flags);
-
        mb = udev->mb_addr;
        tcmu_flush_dcache_range(mb, sizeof(*mb));
 
@@ -708,8 +902,6 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
        if (mb->cmd_tail == mb->cmd_head)
                del_timer(&udev->timeout); /* no more pending cmds */
 
-       spin_unlock_irqrestore(&udev->cmdr_lock, flags);
-
        wake_up(&udev->wait_cmdr);
 
        return handled;
@@ -736,16 +928,14 @@ static void tcmu_device_timedout(unsigned long data)
 {
        struct tcmu_dev *udev = (struct tcmu_dev *)data;
        unsigned long flags;
-       int handled;
-
-       handled = tcmu_handle_completions(udev);
-
-       pr_warn("%d completions handled from timeout\n", handled);
 
        spin_lock_irqsave(&udev->commands_lock, flags);
        idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL);
        spin_unlock_irqrestore(&udev->commands_lock, flags);
 
+       /* Try to wake up the unmap thread */
+       wake_up(&unmap_wait);
+
        /*
         * We don't need to wakeup threads on wait_cmdr since they have their
         * own timeout.
@@ -779,6 +969,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
        udev = kzalloc(sizeof(struct tcmu_dev), GFP_KERNEL);
        if (!udev)
                return NULL;
+       kref_init(&udev->kref);
 
        udev->name = kstrdup(name, GFP_KERNEL);
        if (!udev->name) {
@@ -790,7 +981,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
        udev->cmd_time_out = TCMU_TIME_OUT;
 
        init_waitqueue_head(&udev->wait_cmdr);
-       spin_lock_init(&udev->cmdr_lock);
+       mutex_init(&udev->cmdr_lock);
 
        idr_init(&udev->commands);
        spin_lock_init(&udev->commands_lock);
@@ -805,7 +996,9 @@ static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on)
 {
        struct tcmu_dev *tcmu_dev = container_of(info, struct tcmu_dev, uio_info);
 
+       mutex_lock(&tcmu_dev->cmdr_lock);
        tcmu_handle_completions(tcmu_dev);
+       mutex_unlock(&tcmu_dev->cmdr_lock);
 
        return 0;
 }
@@ -827,6 +1020,60 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma)
        return -1;
 }
 
+static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
+{
+       struct page *page;
+       int ret;
+
+       mutex_lock(&udev->cmdr_lock);
+       page = tcmu_get_block_page(udev, dbi);
+       if (likely(page)) {
+               mutex_unlock(&udev->cmdr_lock);
+               return page;
+       }
+
+       /*
+        * Normally we shouldn't get here: this only happens when
+        * userspace has touched blocks that are outside the
+        * tcmu_cmd's data iov[], in which case a zeroed page is
+        * returned.
+        */
+       pr_warn("Block(%u) out of cmd's iov[] has been touched!\n", dbi);
+       pr_warn("This is most likely a userspace bug, please check it!\n");
+
+       if (dbi >= udev->dbi_thresh) {
+               /* Extend udev->dbi_thresh to dbi + 1 */
+               udev->dbi_thresh = dbi + 1;
+               udev->dbi_max = dbi;
+       }
+
+       page = radix_tree_lookup(&udev->data_blocks, dbi);
+       if (!page) {
+               page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+               if (!page) {
+                       mutex_unlock(&udev->cmdr_lock);
+                       return NULL;
+               }
+
+               ret = radix_tree_insert(&udev->data_blocks, dbi, page);
+               if (ret) {
+                       mutex_unlock(&udev->cmdr_lock);
+                       __free_page(page);
+                       return NULL;
+               }
+
+               /*
+                * Since this case is rare in the page fault routine, we
+                * allow global_db_count to exceed TCMU_GLOBAL_MAX_BLOCKS
+                * here rather than failing the fault.
+                */
+               atomic_inc(&global_db_count);
+       }
+       mutex_unlock(&udev->cmdr_lock);
+
+       return page;
+}
+
 static int tcmu_vma_fault(struct vm_fault *vmf)
 {
        struct tcmu_dev *udev = vmf->vma->vm_private_data;
@@ -845,11 +1092,20 @@ static int tcmu_vma_fault(struct vm_fault *vmf)
         */
        offset = (vmf->pgoff - mi) << PAGE_SHIFT;
 
-       addr = (void *)(unsigned long)info->mem[mi].addr + offset;
-       if (info->mem[mi].memtype == UIO_MEM_LOGICAL)
-               page = virt_to_page(addr);
-       else
+       if (offset < udev->data_off) {
+               /* For the vmalloc()ed cmd area pages */
+               addr = (void *)(unsigned long)info->mem[mi].addr + offset;
                page = vmalloc_to_page(addr);
+       } else {
+               uint32_t dbi;
+
+               /* For the dynamically growing data area pages */
+               dbi = (offset - udev->data_off) / DATA_BLOCK_SIZE;
+               page = tcmu_try_get_block_page(udev, dbi);
+               if (!page)
+                       return VM_FAULT_NOPAGE;
+       }
+
        get_page(page);
        vmf->page = page;
        return 0;
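With UIO_MEM_NONE the whole ring is faulted in page by page: offsets below data_off resolve into the vmalloc'ed command area, anything above is converted to a data block index and looked up (or allocated) in the radix tree. A minimal sketch of that offset split (userspace C, data_off assumed to be CMDR_SIZE):

#include <stdio.h>
#include <stddef.h>

#define DATA_BLOCK_SIZE 4096UL
#define CMDR_SIZE       (8UL * 1024 * 1024)     /* data_off == CMDR_SIZE */

int main(void)
{
        size_t offsets[] = { 0, CMDR_SIZE - 4096, CMDR_SIZE, CMDR_SIZE + 5 * 4096 };

        for (int i = 0; i < 4; i++) {
                size_t off = offsets[i];

                if (off < CMDR_SIZE)
                        printf("offset %zu -> cmd area page\n", off);
                else
                        printf("offset %zu -> data block dbi=%zu\n", off,
                               (off - CMDR_SIZE) / DATA_BLOCK_SIZE);
        }
        return 0;
}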
@@ -883,11 +1139,31 @@ static int tcmu_open(struct uio_info *info, struct inode *inode)
        if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags))
                return -EBUSY;
 
+       udev->inode = inode;
+
        pr_debug("open\n");
 
        return 0;
 }
 
+static void tcmu_dev_call_rcu(struct rcu_head *p)
+{
+       struct se_device *dev = container_of(p, struct se_device, rcu_head);
+       struct tcmu_dev *udev = TCMU_DEV(dev);
+
+       kfree(udev->uio_info.name);
+       kfree(udev->name);
+       kfree(udev);
+}
+
+static void tcmu_dev_kref_release(struct kref *kref)
+{
+       struct tcmu_dev *udev = container_of(kref, struct tcmu_dev, kref);
+       struct se_device *dev = &udev->se_dev;
+
+       call_rcu(&dev->rcu_head, tcmu_dev_call_rcu);
+}
+
 static int tcmu_release(struct uio_info *info, struct inode *inode)
 {
        struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info);
@@ -895,7 +1171,8 @@ static int tcmu_release(struct uio_info *info, struct inode *inode)
        clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags);
 
        pr_debug("close\n");
-
+       /* release ref from configure */
+       kref_put(&udev->kref, tcmu_dev_kref_release);
        return 0;
 }
 
@@ -963,7 +1240,7 @@ static int tcmu_configure_device(struct se_device *dev)
 
        info->name = str;
 
-       udev->mb_addr = vzalloc(TCMU_RING_SIZE);
+       udev->mb_addr = vzalloc(CMDR_SIZE);
        if (!udev->mb_addr) {
                ret = -ENOMEM;
                goto err_vzalloc;
@@ -972,8 +1249,11 @@ static int tcmu_configure_device(struct se_device *dev)
        /* mailbox fits in first part of CMDR space */
        udev->cmdr_size = CMDR_SIZE - CMDR_OFF;
        udev->data_off = CMDR_SIZE;
-       udev->data_size = TCMU_RING_SIZE - CMDR_SIZE;
+       udev->data_size = DATA_SIZE;
+       udev->dbi_thresh = 0; /* Default in Idle state */
+       udev->waiting_global = false;
 
+       /* Initialise the mailbox of the ring buffer */
        mb = udev->mb_addr;
        mb->version = TCMU_MAILBOX_VERSION;
        mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC;
@@ -984,12 +1264,14 @@ static int tcmu_configure_device(struct se_device *dev)
        WARN_ON(udev->data_size % PAGE_SIZE);
        WARN_ON(udev->data_size % DATA_BLOCK_SIZE);
 
+       INIT_RADIX_TREE(&udev->data_blocks, GFP_KERNEL);
+
        info->version = __stringify(TCMU_MAILBOX_VERSION);
 
        info->mem[0].name = "tcm-user command & data buffer";
        info->mem[0].addr = (phys_addr_t)(uintptr_t)udev->mb_addr;
        info->mem[0].size = TCMU_RING_SIZE;
-       info->mem[0].memtype = UIO_MEM_VIRTUAL;
+       info->mem[0].memtype = UIO_MEM_NONE;
 
        info->irqcontrol = tcmu_irqcontrol;
        info->irq = UIO_IRQ_CUSTOM;
@@ -1010,19 +1292,31 @@ static int tcmu_configure_device(struct se_device *dev)
                dev->dev_attrib.hw_max_sectors = 128;
        dev->dev_attrib.hw_queue_depth = 128;
 
+       /*
+        * Get a ref in case userspace does a close on the uio device before
+        * LIO has initiated tcmu_free_device.
+        */
+       kref_get(&udev->kref);
+
        ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name,
                                 udev->uio_info.uio_dev->minor);
        if (ret)
                goto err_netlink;
 
+       mutex_lock(&root_udev_mutex);
+       list_add(&udev->node, &root_udev);
+       mutex_unlock(&root_udev_mutex);
+
        return 0;
 
 err_netlink:
+       kref_put(&udev->kref, tcmu_dev_kref_release);
        uio_unregister_device(&udev->uio_info);
 err_register:
        vfree(udev->mb_addr);
 err_vzalloc:
        kfree(info->name);
+       info->name = NULL;
 
        return ret;
 }
@@ -1036,17 +1330,26 @@ static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
        return -EINVAL;
 }
 
-static void tcmu_dev_call_rcu(struct rcu_head *p)
+static bool tcmu_dev_configured(struct tcmu_dev *udev)
 {
-       struct se_device *dev = container_of(p, struct se_device, rcu_head);
-       struct tcmu_dev *udev = TCMU_DEV(dev);
-
-       kfree(udev);
+       return udev->uio_info.uio_dev ? true : false;
 }
 
-static bool tcmu_dev_configured(struct tcmu_dev *udev)
+static void tcmu_blocks_release(struct tcmu_dev *udev)
 {
-       return udev->uio_info.uio_dev ? true : false;
+       int i;
+       struct page *page;
+
+       /* Try to release all block pages */
+       mutex_lock(&udev->cmdr_lock);
+       for (i = 0; i <= udev->dbi_max; i++) {
+               page = radix_tree_delete(&udev->data_blocks, i);
+               if (page) {
+                       __free_page(page);
+                       atomic_dec(&global_db_count);
+               }
+       }
+       mutex_unlock(&udev->cmdr_lock);
 }
 
 static void tcmu_free_device(struct se_device *dev)
@@ -1058,6 +1361,10 @@ static void tcmu_free_device(struct se_device *dev)
 
        del_timer_sync(&udev->timeout);
 
+       mutex_lock(&root_udev_mutex);
+       list_del(&udev->node);
+       mutex_unlock(&root_udev_mutex);
+
        vfree(udev->mb_addr);
 
        /* Upper layer should drain all requests before calling this */
@@ -1070,15 +1377,17 @@ static void tcmu_free_device(struct se_device *dev)
        spin_unlock_irq(&udev->commands_lock);
        WARN_ON(!all_expired);
 
+       tcmu_blocks_release(udev);
+
        if (tcmu_dev_configured(udev)) {
                tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name,
                                   udev->uio_info.uio_dev->minor);
 
                uio_unregister_device(&udev->uio_info);
-               kfree(udev->uio_info.name);
-               kfree(udev->name);
        }
-       call_rcu(&dev->rcu_head, tcmu_dev_call_rcu);
+
+       /* release ref from init */
+       kref_put(&udev->kref, tcmu_dev_kref_release);
 }
 
 enum {
@@ -1256,6 +1565,84 @@ static struct target_backend_ops tcmu_ops = {
        .tb_dev_attrib_attrs    = NULL,
 };
 
+static int unmap_thread_fn(void *data)
+{
+       struct tcmu_dev *udev;
+       loff_t off;
+       uint32_t start, end, block;
+       struct page *page;
+       int i;
+
+       while (1) {
+               DEFINE_WAIT(__wait);
+
+               prepare_to_wait(&unmap_wait, &__wait, TASK_INTERRUPTIBLE);
+               schedule();
+               finish_wait(&unmap_wait, &__wait);
+
+               if (kthread_should_stop())
+                       break;
+
+               mutex_lock(&root_udev_mutex);
+               list_for_each_entry(udev, &root_udev, node) {
+                       mutex_lock(&udev->cmdr_lock);
+
+                       /* Try to complete the finished commands first */
+                       tcmu_handle_completions(udev);
+
+                       /* Skip udevs waiting for the global pool or sitting idle */
+                       if (udev->waiting_global || !udev->dbi_thresh) {
+                               mutex_unlock(&udev->cmdr_lock);
+                               continue;
+                       }
+
+                       end = udev->dbi_max + 1;
+                       block = find_last_bit(udev->data_bitmap, end);
+                       if (block == udev->dbi_max) {
+                               /*
+                                * The last bit is dbi_max, so there is
+                                * no need to shrink any blocks.
+                                */
+                               mutex_unlock(&udev->cmdr_lock);
+                               continue;
+                       } else if (block == end) {
+                               /* The current udev will go back to the idle state */
+                               udev->dbi_thresh = start = 0;
+                               udev->dbi_max = 0;
+                       } else {
+                               udev->dbi_thresh = start = block + 1;
+                               udev->dbi_max = block;
+                       }
+
+                       /* Unmap the data area pages from off to the end */
+                       off = udev->data_off + start * DATA_BLOCK_SIZE;
+                       unmap_mapping_range(udev->inode->i_mapping, off, 0, 1);
+
+                       /* Release the block pages */
+                       for (i = start; i < end; i++) {
+                               page = radix_tree_delete(&udev->data_blocks, i);
+                               if (page) {
+                                       __free_page(page);
+                                       atomic_dec(&global_db_count);
+                               }
+                       }
+                       mutex_unlock(&udev->cmdr_lock);
+               }
+
+               /*
+                * Try to wake up the udevs that are waiting
+                * for the global data pool.
+                */
+               list_for_each_entry(udev, &root_udev, node) {
+                       if (udev->waiting_global)
+                               wake_up(&udev->wait_cmdr);
+               }
+               mutex_unlock(&root_udev_mutex);
+       }
+
+       return 0;
+}
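The unmap thread shrinks a device's data area by finding the highest block still in use: if only dbi_max is set there is nothing to release, if the bitmap is empty the device goes back to idle, otherwise everything above the last set bit is unmapped and freed. A small sketch of that decision (userspace C, 64-bit bitmap for brevity, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Toy find_last_bit(): returns 'size' if no bit below 'size' is set. */
static unsigned int find_last_bit64(uint64_t map, unsigned int size)
{
        for (unsigned int i = size; i-- > 0; )
                if (map & (1ULL << i))
                        return i;
        return size;
}

static void shrink(uint64_t map, unsigned int dbi_max)
{
        unsigned int end = dbi_max + 1;
        unsigned int block = find_last_bit64(map, end);

        if (block == dbi_max)
                printf("last bit is dbi_max=%u: nothing to shrink\n", dbi_max);
        else if (block == end)
                printf("bitmap empty: device goes idle, free blocks 0..%u\n",
                       dbi_max);
        else
                printf("shrink: new dbi_max=%u, free blocks %u..%u\n",
                       block, block + 1, dbi_max);
}

int main(void)
{
        shrink(0x1F, 4);        /* blocks 0-4 in use, dbi_max = 4  */
        shrink(0x07, 9);        /* blocks 0-2 in use, dbi_max = 9  */
        shrink(0x00, 9);        /* nothing in use                  */
        return 0;
}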
+
 static int __init tcmu_module_init(void)
 {
        int ret, i, len = 0;
@@ -1301,8 +1688,17 @@ static int __init tcmu_module_init(void)
        if (ret)
                goto out_attrs;
 
+       init_waitqueue_head(&unmap_wait);
+       unmap_thread = kthread_run(unmap_thread_fn, NULL, "tcmu_unmap");
+       if (IS_ERR(unmap_thread)) {
+               ret = PTR_ERR(unmap_thread);
+               goto out_unreg_transport;
+       }
+
        return 0;
 
+out_unreg_transport:
+       target_backend_unregister(&tcmu_ops);
 out_attrs:
        kfree(tcmu_attrs);
 out_unreg_genl:
@@ -1317,6 +1713,7 @@ out_free_cache:
 
 static void __exit tcmu_module_exit(void)
 {
+       kthread_stop(unmap_thread);
        target_backend_unregister(&tcmu_ops);
        kfree(tcmu_attrs);
        genl_unregister_family(&tcmu_genl_family);