git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - fs/ext4/extents.c
Merge branch 'urgent' of git://amd64.org/linux/rric into perf/urgent
[karo-tx-linux.git] / fs / ext4 / extents.c
index 385ebb4353321b573422f029e4ad7baeee25fbf2..61fa9e1614afd1922bae4cf5ce0d26dfdbab6b25 100644 (file)
@@ -117,11 +117,9 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                              struct ext4_ext_path *path,
                              ext4_lblk_t block)
 {
-       int depth;
-
        if (path) {
+               int depth = path->p_depth;
                struct ext4_extent *ex;
-               depth = path->p_depth;
 
                /*
                 * Try to predict block placement assuming that we are
@@ -183,12 +181,10 @@ static inline int ext4_ext_space_block(struct inode *inode, int check)
 
        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                        / sizeof(struct ext4_extent);
-       if (!check) {
 #ifdef AGGRESSIVE_TEST
-               if (size > 6)
-                       size = 6;
+       if (!check && size > 6)
+               size = 6;
 #endif
-       }
        return size;
 }
 
@@ -198,12 +194,10 @@ static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
 
        size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                        / sizeof(struct ext4_extent_idx);
-       if (!check) {
 #ifdef AGGRESSIVE_TEST
-               if (size > 5)
-                       size = 5;
+       if (!check && size > 5)
+               size = 5;
 #endif
-       }
        return size;
 }
 
@@ -214,12 +208,10 @@ static inline int ext4_ext_space_root(struct inode *inode, int check)
        size = sizeof(EXT4_I(inode)->i_data);
        size -= sizeof(struct ext4_extent_header);
        size /= sizeof(struct ext4_extent);
-       if (!check) {
 #ifdef AGGRESSIVE_TEST
-               if (size > 3)
-                       size = 3;
+       if (!check && size > 3)
+               size = 3;
 #endif
-       }
        return size;
 }
 
@@ -230,12 +222,10 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
        size = sizeof(EXT4_I(inode)->i_data);
        size -= sizeof(struct ext4_extent_header);
        size /= sizeof(struct ext4_extent_idx);
-       if (!check) {
 #ifdef AGGRESSIVE_TEST
-               if (size > 4)
-                       size = 4;
+       if (!check && size > 4)
+               size = 4;
 #endif
-       }
        return size;
 }
 
@@ -247,7 +237,7 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 {
        struct ext4_inode_info *ei = EXT4_I(inode);
-       int idxs, num = 0;
+       int idxs;
 
        idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
                / sizeof(struct ext4_extent_idx));
@@ -262,6 +252,8 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
         */
        if (ei->i_da_metadata_calc_len &&
            ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+               int num = 0;
+
                if ((ei->i_da_metadata_calc_len % idxs) == 0)
                        num++;
                if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
@@ -324,8 +316,6 @@ static int ext4_valid_extent_entries(struct inode *inode,
                                struct ext4_extent_header *eh,
                                int depth)
 {
-       struct ext4_extent *ext;
-       struct ext4_extent_idx *ext_idx;
        unsigned short entries;
        if (eh->eh_entries == 0)
                return 1;
@@ -334,7 +324,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
 
        if (depth == 0) {
                /* leaf entries */
-               ext = EXT_FIRST_EXTENT(eh);
+               struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
                while (entries) {
                        if (!ext4_valid_extent(inode, ext))
                                return 0;
@@ -342,7 +332,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
                        entries--;
                }
        } else {
-               ext_idx = EXT_FIRST_INDEX(eh);
+               struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
                while (entries) {
                        if (!ext4_valid_extent_idx(inode, ext_idx))
                                return 0;
@@ -754,31 +744,25 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
                return -EIO;
        }
 
-       len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
        if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
                /* insert after */
-               if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
-                       len = (len - 1) * sizeof(struct ext4_extent_idx);
-                       len = len < 0 ? 0 : len;
-                       ext_debug("insert new index %d after: %llu. "
-                                       "move %d from 0x%p to 0x%p\n",
-                                       logical, ptr, len,
-                                       (curp->p_idx + 1), (curp->p_idx + 2));
-                       memmove(curp->p_idx + 2, curp->p_idx + 1, len);
-               }
+               ext_debug("insert new index %d after: %llu\n", logical, ptr);
                ix = curp->p_idx + 1;
        } else {
                /* insert before */
-               len = len * sizeof(struct ext4_extent_idx);
-               len = len < 0 ? 0 : len;
-               ext_debug("insert new index %d before: %llu. "
-                               "move %d from 0x%p to 0x%p\n",
-                               logical, ptr, len,
-                               curp->p_idx, (curp->p_idx + 1));
-               memmove(curp->p_idx + 1, curp->p_idx, len);
+               ext_debug("insert new index %d before: %llu\n", logical, ptr);
                ix = curp->p_idx;
        }
 
+       len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
+       BUG_ON(len < 0);
+       if (len > 0) {
+               ext_debug("insert new index %d: "
+                               "move %d indices from 0x%p to 0x%p\n",
+                               logical, len, ix, ix + 1);
+               memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
+       }
+
        if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
                EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
                return -EIO;
@@ -1392,7 +1376,8 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
        while (depth >= 0) {
                if (depth == path->p_depth) {
                        /* leaf */
-                       if (path[depth].p_ext !=
+                       if (path[depth].p_ext &&
+                               path[depth].p_ext !=
                                        EXT_LAST_EXTENT(path[depth].p_hdr))
                          return le32_to_cpu(path[depth].p_ext[1].ee_block);
                } else {
@@ -1697,7 +1682,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        /* try to insert block into found extent and return */
        if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
                && ext4_can_extents_be_merged(inode, ex, newext)) {
-               ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+               ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
                          ext4_ext_is_uninitialized(newext),
                          ext4_ext_get_actual_len(newext),
                          le32_to_cpu(ex->ee_block),
@@ -1735,7 +1720,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
                next = ext4_ext_next_leaf_block(path);
        if (next != EXT_MAX_BLOCKS) {
-               ext_debug("next leaf block - %d\n", next);
+               ext_debug("next leaf block - %u\n", next);
                BUG_ON(npath != NULL);
                npath = ext4_ext_find_extent(inode, next, NULL);
                if (IS_ERR(npath))
@@ -1773,46 +1758,51 @@ has_space:
 
        if (!nearex) {
                /* there is no extent in this leaf, create first one */
-               ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
+               ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
                                le32_to_cpu(newext->ee_block),
                                ext4_ext_pblock(newext),
                                ext4_ext_is_uninitialized(newext),
                                ext4_ext_get_actual_len(newext));
-               path[depth].p_ext = EXT_FIRST_EXTENT(eh);
-       } else if (le32_to_cpu(newext->ee_block)
+               nearex = EXT_FIRST_EXTENT(eh);
+       } else {
+               if (le32_to_cpu(newext->ee_block)
                           > le32_to_cpu(nearex->ee_block)) {
-/*             BUG_ON(newext->ee_block == nearex->ee_block); */
-               if (nearex != EXT_LAST_EXTENT(eh)) {
-                       len = EXT_MAX_EXTENT(eh) - nearex;
-                       len = (len - 1) * sizeof(struct ext4_extent);
-                       len = len < 0 ? 0 : len;
-                       ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
-                                       "move %d from 0x%p to 0x%p\n",
+                       /* Insert after */
+                       ext_debug("insert %u:%llu:[%d]%d before: "
+                                       "nearest %p\n",
+                                       le32_to_cpu(newext->ee_block),
+                                       ext4_ext_pblock(newext),
+                                       ext4_ext_is_uninitialized(newext),
+                                       ext4_ext_get_actual_len(newext),
+                                       nearex);
+                       nearex++;
+               } else {
+                       /* Insert before */
+                       BUG_ON(newext->ee_block == nearex->ee_block);
+                       ext_debug("insert %u:%llu:[%d]%d after: "
+                                       "nearest %p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_uninitialized(newext),
                                        ext4_ext_get_actual_len(newext),
-                                       nearex, len, nearex + 1, nearex + 2);
-                       memmove(nearex + 2, nearex + 1, len);
+                                       nearex);
+               }
+               len = EXT_LAST_EXTENT(eh) - nearex + 1;
+               if (len > 0) {
+                       ext_debug("insert %u:%llu:[%d]%d: "
+                                       "move %d extents from 0x%p to 0x%p\n",
+                                       le32_to_cpu(newext->ee_block),
+                                       ext4_ext_pblock(newext),
+                                       ext4_ext_is_uninitialized(newext),
+                                       ext4_ext_get_actual_len(newext),
+                                       len, nearex, nearex + 1);
+                       memmove(nearex + 1, nearex,
+                               len * sizeof(struct ext4_extent));
                }
-               path[depth].p_ext = nearex + 1;
-       } else {
-               BUG_ON(newext->ee_block == nearex->ee_block);
-               len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
-               len = len < 0 ? 0 : len;
-               ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
-                               "move %d from 0x%p to 0x%p\n",
-                               le32_to_cpu(newext->ee_block),
-                               ext4_ext_pblock(newext),
-                               ext4_ext_is_uninitialized(newext),
-                               ext4_ext_get_actual_len(newext),
-                               nearex, len, nearex, nearex + 1);
-               memmove(nearex + 1, nearex, len);
-               path[depth].p_ext = nearex;
        }
 
        le16_add_cpu(&eh->eh_entries, 1);
-       nearex = path[depth].p_ext;
+       path[depth].p_ext = nearex;
        nearex->ee_block = newext->ee_block;
        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
        nearex->ee_len = newext->ee_len;
@@ -2311,13 +2301,12 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        int err = 0, correct_index = 0;
        int depth = ext_depth(inode), credits;
        struct ext4_extent_header *eh;
-       ext4_lblk_t a, b, block;
+       ext4_lblk_t a, b;
        unsigned num;
        ext4_lblk_t ex_ee_block;
        unsigned short ex_ee_len;
        unsigned uninitialized = 0;
        struct ext4_extent *ex;
-       struct ext4_map_blocks map;
 
        /* the header must be checked already in ext4_ext_remove_space() */
        ext_debug("truncate since %u in leaf\n", start);
@@ -2360,86 +2349,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                        ex_ee_block = le32_to_cpu(ex->ee_block);
                        ex_ee_len = ext4_ext_get_actual_len(ex);
                        continue;
-               } else if (a != ex_ee_block &&
-                       b != ex_ee_block + ex_ee_len - 1) {
-                       /*
-                        * If this is a truncate, then this condition should
-                        * never happen because at least one of the end points
-                        * needs to be on the edge of the extent.
-                        */
-                       if (end == EXT_MAX_BLOCKS - 1) {
-                               ext_debug("  bad truncate %u:%u\n",
-                                               start, end);
-                               block = 0;
-                               num = 0;
-                               err = -EIO;
-                               goto out;
-                       }
-                       /*
-                        * else this is a hole punch, so the extent needs to
-                        * be split since neither edge of the hole is on the
-                        * extent edge
-                        */
-                       else{
-                               map.m_pblk = ext4_ext_pblock(ex);
-                               map.m_lblk = ex_ee_block;
-                               map.m_len = b - ex_ee_block;
-
-                               err = ext4_split_extent(handle,
-                                       inode, path, &map, 0,
-                                       EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
-                                       EXT4_GET_BLOCKS_PRE_IO);
-
-                               if (err < 0)
-                                       goto out;
-
-                               ex_ee_len = ext4_ext_get_actual_len(ex);
-
-                               b = ex_ee_block+ex_ee_len - 1 < end ?
-                                       ex_ee_block+ex_ee_len - 1 : end;
-
-                               /* Then remove tail of this extent */
-                               block = ex_ee_block;
-                               num = a - block;
-                       }
+               } else if (b != ex_ee_block + ex_ee_len - 1) {
+                       EXT4_ERROR_INODE(inode,"  bad truncate %u:%u\n",
+                                        start, end);
+                       err = -EIO;
+                       goto out;
                } else if (a != ex_ee_block) {
                        /* remove tail of the extent */
-                       block = ex_ee_block;
-                       num = a - block;
-               } else if (b != ex_ee_block + ex_ee_len - 1) {
-                       /* remove head of the extent */
-                       block = b;
-                       num =  ex_ee_block + ex_ee_len - b;
-
-                       /*
-                        * If this is a truncate, this condition
-                        * should never happen
-                        */
-                       if (end == EXT_MAX_BLOCKS - 1) {
-                               ext_debug("  bad truncate %u:%u\n",
-                                       start, end);
-                               err = -EIO;
-                               goto out;
-                       }
+                       num = a - ex_ee_block;
                } else {
                        /* remove whole extent: excellent! */
-                       block = ex_ee_block;
                        num = 0;
-                       if (a != ex_ee_block) {
-                               ext_debug("  bad truncate %u:%u\n",
-                                       start, end);
-                               err = -EIO;
-                               goto out;
-                       }
-
-                       if (b != ex_ee_block + ex_ee_len - 1) {
-                               ext_debug("  bad truncate %u:%u\n",
-                                       start, end);
-                               err = -EIO;
-                               goto out;
-                       }
                }
-
                /*
                 * 3 for leaf, sb, and inode plus 2 (bmap and group
                 * descriptor) for each block group; assume two block
@@ -2466,19 +2387,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                if (err)
                        goto out;
 
-               if (num == 0) {
+               if (num == 0)
                        /* this extent is removed; mark slot entirely unused */
                        ext4_ext_store_pblock(ex, 0);
-               } else if (block != ex_ee_block) {
-                       /*
-                        * If this was a head removal, then we need to update
-                        * the physical block since it is now at a different
-                        * location
-                        */
-                       ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
-               }
 
-               ex->ee_block = cpu_to_le32(block);
                ex->ee_len = cpu_to_le16(num);
                /*
                 * Do not mark uninitialized if all the blocks in the
@@ -2486,11 +2398,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                 */
                if (uninitialized && num)
                        ext4_ext_mark_uninitialized(ex);
-
-               err = ext4_ext_dirty(handle, inode, path + depth);
-               if (err)
-                       goto out;
-
                /*
                 * If the extent was completely released,
                 * we need to remove it from the leaf
@@ -2513,7 +2420,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                } else
                        *partial_cluster = 0;
 
-               ext_debug("new extent: %u:%u:%llu\n", block, num,
+               err = ext4_ext_dirty(handle, inode, path + depth);
+               if (err)
+                       goto out;
+
+               ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
                                ext4_ext_pblock(ex));
                ex--;
                ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2997,17 +2908,29 @@ out:
  *   a> There is no split required: Entire extent should be initialized
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ *  - The extent pointed to by 'path' is uninitialized.
+ *  - The extent pointed to by 'path' contains a superset
+ *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ *  - the returned value is the number of blocks beyond map->m_lblk
+ *    that are allocated and initialized.
+ *    It is guaranteed to be >= map->m_len.
  */
 static int ext4_ext_convert_to_initialized(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_map_blocks *map,
                                           struct ext4_ext_path *path)
 {
+       struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
        struct ext4_extent zero_ex;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block, eof_block;
-       unsigned int allocated, ee_len, depth;
+       unsigned int ee_len, depth;
+       int allocated;
        int err = 0;
        int split_flag = 0;
 
@@ -3021,11 +2944,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                eof_block = map->m_lblk + map->m_len;
 
        depth = ext_depth(inode);
+       eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
 
+       trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+       /* Pre-conditions */
+       BUG_ON(!ext4_ext_is_uninitialized(ex));
+       BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+       BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
+
+       /*
+        * Attempt to transfer newly initialized blocks from the currently
+        * uninitialized extent to its left neighbor. This is much cheaper
+        * than an insertion followed by a merge as those involve costly
+        * memmove() calls. This is the common case in steady state for
+        * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
+        * writes.
+        *
+        * Limitations of the current logic:
+        *  - L1: we only deal with writes at the start of the extent.
+        *    The approach could be extended to writes at the end
+        *    of the extent but this scenario was deemed less common.
+        *  - L2: we do not deal with writes covering the whole extent.
+        *    This would require removing the extent if the transfer
+        *    is possible.
+        *  - L3: we only attempt to merge with an extent stored in the
+        *    same extent tree node.
+        */
+       if ((map->m_lblk == ee_block) &&        /*L1*/
+               (map->m_len < ee_len) &&        /*L2*/
+               (ex > EXT_FIRST_EXTENT(eh))) {  /*L3*/
+               struct ext4_extent *prev_ex;
+               ext4_lblk_t prev_lblk;
+               ext4_fsblk_t prev_pblk, ee_pblk;
+               unsigned int prev_len, write_len;
+
+               prev_ex = ex - 1;
+               prev_lblk = le32_to_cpu(prev_ex->ee_block);
+               prev_len = ext4_ext_get_actual_len(prev_ex);
+               prev_pblk = ext4_ext_pblock(prev_ex);
+               ee_pblk = ext4_ext_pblock(ex);
+               write_len = map->m_len;
+
+               /*
+                * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+                * under these conditions:
+                * - C1: prev_ex is initialized,
+                * - C2: prev_ex is logically abutting ex,
+                * - C3: prev_ex is physically abutting ex,
+                * - C4: prev_ex can receive the additional blocks without
+                *   overflowing the (initialized) length limit.
+                */
+               if ((!ext4_ext_is_uninitialized(prev_ex)) &&            /*C1*/
+                       ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
+                       ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
+                       (prev_len < (EXT_INIT_MAX_LEN - write_len))) {  /*C4*/
+                       err = ext4_ext_get_access(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       trace_ext4_ext_convert_to_initialized_fastpath(inode,
+                               map, ex, prev_ex);
+
+                       /* Shift the start of ex by 'write_len' blocks */
+                       ex->ee_block = cpu_to_le32(ee_block + write_len);
+                       ext4_ext_store_pblock(ex, ee_pblk + write_len);
+                       ex->ee_len = cpu_to_le16(ee_len - write_len);
+                       ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+                       /* Extend prev_ex by 'write_len' blocks */
+                       prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+
+                       /* Mark the block containing both extents as dirty */
+                       ext4_ext_dirty(handle, inode, path + depth);
+
+                       /* Update path to point to the right extent */
+                       path[depth].p_ext = prev_ex;
+
+                       /* Result: number of initialized blocks past m_lblk */
+                       allocated = write_len;
+                       goto out;
+               }
+       }
+
        WARN_ON(map->m_lblk < ee_block);
        /*
         * It is safe to convert extent to initialized via explicit
@@ -3467,10 +3472,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 * that this IO needs to conversion to written when IO is
                 * completed
                 */
-               if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
-                       io->flag = EXT4_IO_END_UNWRITTEN;
-                       atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-               } else
+               if (io)
+                       ext4_set_io_unwritten_flag(inode, io);
+               else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                if (ext4_should_dioread_nolock(inode))
                        map->m_flags |= EXT4_MAP_UNINIT;
@@ -3511,14 +3515,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
        /* buffered write, writepage time, convert*/
        ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-       if (ret >= 0) {
+       if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
-               err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
-                                        map->m_len);
-               if (err < 0)
-                       goto out2;
-       }
-
 out:
        if (ret <= 0) {
                err = ret;
@@ -3559,6 +3557,12 @@ out:
 
 map_out:
        map->m_flags |= EXT4_MAP_MAPPED;
+       if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
+               err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+                                        map->m_len);
+               if (err < 0)
+                       goto out2;
+       }
 out1:
        if (allocated > map->m_len)
                allocated = map->m_len;
@@ -3705,13 +3709,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        ext4_fsblk_t newblock = 0;
        int free_on_err = 0, err = 0, depth, ret;
        unsigned int allocated = 0, offset = 0;
-       unsigned int allocated_clusters = 0, reserved_clusters = 0;
+       unsigned int allocated_clusters = 0;
        unsigned int punched_out = 0;
        unsigned int result = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
        ext4_lblk_t cluster_offset;
-       struct ext4_map_blocks punch_map;
 
        ext_debug("blocks %u/%u requested for inode %lu\n",
                  map->m_lblk, map->m_len, inode->i_ino);
@@ -3787,6 +3790,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
                /* if found extent covers block, simply return it */
                if (in_range(map->m_lblk, ee_block, ee_len)) {
+                       struct ext4_map_blocks punch_map;
                        ext4_fsblk_t partial_cluster = 0;
 
                        newblock = map->m_lblk - ee_block + ee_start;
@@ -4023,10 +4027,9 @@ got_allocated_blocks:
                 * that we need to perform conversion when IO is done.
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-                       if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
-                               io->flag = EXT4_IO_END_UNWRITTEN;
-                               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
-                       } else
+                       if (io)
+                               ext4_set_io_unwritten_flag(inode, io);
+                       else
                                ext4_set_inode_state(inode,
                                                     EXT4_STATE_DIO_UNWRITTEN);
                }
@@ -4034,7 +4037,10 @@ got_allocated_blocks:
                        map->m_flags |= EXT4_MAP_UNINIT;
        }
 
-       err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
+       err = 0;
+       if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
+               err = check_eofblocks_fl(handle, inode, map->m_lblk,
+                                        path, ar.len);
        if (!err)
                err = ext4_ext_insert_extent(handle, inode, path,
                                             &newex, flags);
@@ -4062,8 +4068,9 @@ got_allocated_blocks:
         * block allocation which had been deferred till now.
         */
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+               unsigned int reserved_clusters;
                /*
-                * Check how many clusters we had reserved this allocted range.
+                * Check how many clusters we had reserved this allocated range
                 */
                reserved_clusters = get_reserved_cluster_alloc(inode,
                                                map->m_lblk, allocated);
@@ -4158,12 +4165,12 @@ out2:
                ext4_ext_drop_refs(path);
                kfree(path);
        }
-       trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
-               newblock, map->m_len, err ? err : allocated);
-
        result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
                        punched_out : allocated;
 
+       trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+               newblock, map->m_len, err ? err : result);
+
        return err ? err : result;
 }
 
@@ -4293,6 +4300,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        int ret = 0;
        int ret2 = 0;
        int retries = 0;
+       int flags;
        struct ext4_map_blocks map;
        unsigned int credits, blkbits = inode->i_blkbits;
 
@@ -4329,6 +4337,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
                trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
                return ret;
        }
+       flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
+       if (mode & FALLOC_FL_KEEP_SIZE)
+               flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+       /*
+        * Don't normalize the request if it can fit in one extent so
+        * that it doesn't get unnecessarily split into multiple
+        * extents.
+        */
+       if (len <= EXT_UNINIT_MAX_LEN << blkbits)
+               flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 retry:
        while (ret >= 0 && ret < max_blocks) {
                map.m_lblk = map.m_lblk + ret;
@@ -4338,9 +4356,7 @@ retry:
                        ret = PTR_ERR(handle);
                        break;
                }
-               ret = ext4_map_blocks(handle, inode, &map,
-                                     EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
-                                     EXT4_GET_BLOCKS_NO_NORMALIZE);
+               ret = ext4_map_blocks(handle, inode, &map, flags);
                if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
                        WARN_ON(ret <= 0);