struct ext4_ext_path *path,
ext4_lblk_t block)
{
- int depth;
-
if (path) {
+ int depth = path->p_depth;
struct ext4_extent *ex;
- depth = path->p_depth;
/*
* Try to predict block placement assuming that we are
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 6)
- size = 6;
+ if (!check && size > 6)
+ size = 6;
#endif
- }
return size;
}
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 5)
- size = 5;
+ if (!check && size > 5)
+ size = 5;
#endif
- }
return size;
}
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 3)
- size = 3;
+ if (!check && size > 3)
+ size = 3;
#endif
- }
return size;
}
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent_idx);
- if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 4)
- size = 4;
+ if (!check && size > 4)
+ size = 4;
#endif
- }
return size;
}
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- int idxs, num = 0;
+ int idxs;
idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx));
*/
if (ei->i_da_metadata_calc_len &&
ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+ int num = 0;
+
if ((ei->i_da_metadata_calc_len % idxs) == 0)
num++;
if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
struct ext4_extent_header *eh,
int depth)
{
- struct ext4_extent *ext;
- struct ext4_extent_idx *ext_idx;
unsigned short entries;
if (eh->eh_entries == 0)
return 1;
if (depth == 0) {
/* leaf entries */
- ext = EXT_FIRST_EXTENT(eh);
+ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
entries--;
}
} else {
- ext_idx = EXT_FIRST_INDEX(eh);
+ struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
return -EIO;
}
- len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
/* insert after */
- if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
- len = (len - 1) * sizeof(struct ext4_extent_idx);
- len = len < 0 ? 0 : len;
- ext_debug("insert new index %d after: %llu. "
- "move %d from 0x%p to 0x%p\n",
- logical, ptr, len,
- (curp->p_idx + 1), (curp->p_idx + 2));
- memmove(curp->p_idx + 2, curp->p_idx + 1, len);
- }
+ ext_debug("insert new index %d after: %llu\n", logical, ptr);
ix = curp->p_idx + 1;
} else {
/* insert before */
- len = len * sizeof(struct ext4_extent_idx);
- len = len < 0 ? 0 : len;
- ext_debug("insert new index %d before: %llu. "
- "move %d from 0x%p to 0x%p\n",
- logical, ptr, len,
- curp->p_idx, (curp->p_idx + 1));
- memmove(curp->p_idx + 1, curp->p_idx, len);
+ ext_debug("insert new index %d before: %llu\n", logical, ptr);
ix = curp->p_idx;
}
+ len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
+ BUG_ON(len < 0);
+ if (len > 0) {
+ ext_debug("insert new index %d: "
+ "move %d indices from 0x%p to 0x%p\n",
+ logical, len, ix, ix + 1);
+ memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
+ }
+
if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
return -EIO;
while (depth >= 0) {
if (depth == path->p_depth) {
/* leaf */
- if (path[depth].p_ext !=
+ if (path[depth].p_ext &&
+ path[depth].p_ext !=
EXT_LAST_EXTENT(path[depth].p_hdr))
return le32_to_cpu(path[depth].p_ext[1].ee_block);
} else {
/* try to insert block into found extent and return */
if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
&& ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+ ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
next = ext4_ext_next_leaf_block(path);
if (next != EXT_MAX_BLOCKS) {
- ext_debug("next leaf block - %d\n", next);
+ ext_debug("next leaf block - %u\n", next);
BUG_ON(npath != NULL);
npath = ext4_ext_find_extent(inode, next, NULL);
if (IS_ERR(npath))
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
+ ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext));
- path[depth].p_ext = EXT_FIRST_EXTENT(eh);
- } else if (le32_to_cpu(newext->ee_block)
+ nearex = EXT_FIRST_EXTENT(eh);
+ } else {
+ if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
-/* BUG_ON(newext->ee_block == nearex->ee_block); */
- if (nearex != EXT_LAST_EXTENT(eh)) {
- len = EXT_MAX_EXTENT(eh) - nearex;
- len = (len - 1) * sizeof(struct ext4_extent);
- len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
- "move %d from 0x%p to 0x%p\n",
+ /* Insert after */
+ ext_debug("insert %u:%llu:[%d]%d before: "
+ "nearest %p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ nearex);
+ nearex++;
+ } else {
+ /* Insert before */
+ BUG_ON(newext->ee_block == nearex->ee_block);
+ ext_debug("insert %u:%llu:[%d]%d after: "
+ "nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
- nearex, len, nearex + 1, nearex + 2);
- memmove(nearex + 2, nearex + 1, len);
+ nearex);
+ }
+ len = EXT_LAST_EXTENT(eh) - nearex + 1;
+ if (len > 0) {
+ ext_debug("insert %u:%llu:[%d]%d: "
+ "move %d extents from 0x%p to 0x%p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ len, nearex, nearex + 1);
+ memmove(nearex + 1, nearex,
+ len * sizeof(struct ext4_extent));
}
- path[depth].p_ext = nearex + 1;
- } else {
- BUG_ON(newext->ee_block == nearex->ee_block);
- len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
- len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
- "move %d from 0x%p to 0x%p\n",
- le32_to_cpu(newext->ee_block),
- ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
- ext4_ext_get_actual_len(newext),
- nearex, len, nearex, nearex + 1);
- memmove(nearex + 1, nearex, len);
- path[depth].p_ext = nearex;
}
le16_add_cpu(&eh->eh_entries, 1);
- nearex = path[depth].p_ext;
+ path[depth].p_ext = nearex;
nearex->ee_block = newext->ee_block;
ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
nearex->ee_len = newext->ee_len;
int err = 0, correct_index = 0;
int depth = ext_depth(inode), credits;
struct ext4_extent_header *eh;
- ext4_lblk_t a, b, block;
+ ext4_lblk_t a, b;
unsigned num;
ext4_lblk_t ex_ee_block;
unsigned short ex_ee_len;
unsigned uninitialized = 0;
struct ext4_extent *ex;
- struct ext4_map_blocks map;
/* the header must be checked already in ext4_ext_remove_space() */
ext_debug("truncate since %u in leaf\n", start);
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
continue;
- } else if (a != ex_ee_block &&
- b != ex_ee_block + ex_ee_len - 1) {
- /*
- * If this is a truncate, then this condition should
- * never happen because at least one of the end points
- * needs to be on the edge of the extent.
- */
- if (end == EXT_MAX_BLOCKS - 1) {
- ext_debug(" bad truncate %u:%u\n",
- start, end);
- block = 0;
- num = 0;
- err = -EIO;
- goto out;
- }
- /*
- * else this is a hole punch, so the extent needs to
- * be split since neither edge of the hole is on the
- * extent edge
- */
- else{
- map.m_pblk = ext4_ext_pblock(ex);
- map.m_lblk = ex_ee_block;
- map.m_len = b - ex_ee_block;
-
- err = ext4_split_extent(handle,
- inode, path, &map, 0,
- EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
- EXT4_GET_BLOCKS_PRE_IO);
-
- if (err < 0)
- goto out;
-
- ex_ee_len = ext4_ext_get_actual_len(ex);
-
- b = ex_ee_block+ex_ee_len - 1 < end ?
- ex_ee_block+ex_ee_len - 1 : end;
-
- /* Then remove tail of this extent */
- block = ex_ee_block;
- num = a - block;
- }
+ } else if (b != ex_ee_block + ex_ee_len - 1) {
+ EXT4_ERROR_INODE(inode," bad truncate %u:%u\n",
+ start, end);
+ err = -EIO;
+ goto out;
} else if (a != ex_ee_block) {
/* remove tail of the extent */
- block = ex_ee_block;
- num = a - block;
- } else if (b != ex_ee_block + ex_ee_len - 1) {
- /* remove head of the extent */
- block = b;
- num = ex_ee_block + ex_ee_len - b;
-
- /*
- * If this is a truncate, this condition
- * should never happen
- */
- if (end == EXT_MAX_BLOCKS - 1) {
- ext_debug(" bad truncate %u:%u\n",
- start, end);
- err = -EIO;
- goto out;
- }
+ num = a - ex_ee_block;
} else {
/* remove whole extent: excellent! */
- block = ex_ee_block;
num = 0;
- if (a != ex_ee_block) {
- ext_debug(" bad truncate %u:%u\n",
- start, end);
- err = -EIO;
- goto out;
- }
-
- if (b != ex_ee_block + ex_ee_len - 1) {
- ext_debug(" bad truncate %u:%u\n",
- start, end);
- err = -EIO;
- goto out;
- }
}
-
/*
* 3 for leaf, sb, and inode plus 2 (bmap and group
* descriptor) for each block group; assume two block
if (err)
goto out;
- if (num == 0) {
+ if (num == 0)
/* this extent is removed; mark slot entirely unused */
ext4_ext_store_pblock(ex, 0);
- } else if (block != ex_ee_block) {
- /*
- * If this was a head removal, then we need to update
- * the physical block since it is now at a different
- * location
- */
- ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
- }
- ex->ee_block = cpu_to_le32(block);
ex->ee_len = cpu_to_le16(num);
/*
* Do not mark uninitialized if all the blocks in the
*/
if (uninitialized && num)
ext4_ext_mark_uninitialized(ex);
-
- err = ext4_ext_dirty(handle, inode, path + depth);
- if (err)
- goto out;
-
/*
* If the extent was completely released,
* we need to remove it from the leaf
} else
*partial_cluster = 0;
- ext_debug("new extent: %u:%u:%llu\n", block, num,
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
ext4_ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
* a> There is no split required: Entire extent should be initialized
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ * - The extent pointed to by 'path' is uninitialized.
+ * - The extent pointed to by 'path' contains a superset
+ * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ * - the returned value is the number of blocks beyond map->l_lblk
+ * that are allocated and initialized.
+ * It is guaranteed to be >= map->m_len.
*/
static int ext4_ext_convert_to_initialized(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
{
+ struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
struct ext4_extent zero_ex;
struct ext4_extent *ex;
ext4_lblk_t ee_block, eof_block;
- unsigned int allocated, ee_len, depth;
+ unsigned int ee_len, depth;
+ int allocated;
int err = 0;
int split_flag = 0;
eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (map->m_lblk - ee_block);
+ trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+ /* Pre-conditions */
+ BUG_ON(!ext4_ext_is_uninitialized(ex));
+ BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+ BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
+
+ /*
+ * Attempt to transfer newly initialized blocks from the currently
+ * uninitialized extent to its left neighbor. This is much cheaper
+ * than an insertion followed by a merge as those involve costly
+ * memmove() calls. This is the common case in steady state for
+ * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
+ * writes.
+ *
+ * Limitations of the current logic:
+ * - L1: we only deal with writes at the start of the extent.
+ * The approach could be extended to writes at the end
+ * of the extent but this scenario was deemed less common.
+ * - L2: we do not deal with writes covering the whole extent.
+ * This would require removing the extent if the transfer
+ * is possible.
+ * - L3: we only attempt to merge with an extent stored in the
+ * same extent tree node.
+ */
+ if ((map->m_lblk == ee_block) && /*L1*/
+ (map->m_len < ee_len) && /*L2*/
+ (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
+ struct ext4_extent *prev_ex;
+ ext4_lblk_t prev_lblk;
+ ext4_fsblk_t prev_pblk, ee_pblk;
+ unsigned int prev_len, write_len;
+
+ prev_ex = ex - 1;
+ prev_lblk = le32_to_cpu(prev_ex->ee_block);
+ prev_len = ext4_ext_get_actual_len(prev_ex);
+ prev_pblk = ext4_ext_pblock(prev_ex);
+ ee_pblk = ext4_ext_pblock(ex);
+ write_len = map->m_len;
+
+ /*
+ * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+ * upon those conditions:
+ * - C1: prev_ex is initialized,
+ * - C2: prev_ex is logically abutting ex,
+ * - C3: prev_ex is physically abutting ex,
+ * - C4: prev_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
+ */
+ if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
+ ((prev_lblk + prev_len) == ee_block) && /*C2*/
+ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
+ (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, prev_ex);
+
+ /* Shift the start of ex by 'write_len' blocks */
+ ex->ee_block = cpu_to_le32(ee_block + write_len);
+ ext4_ext_store_pblock(ex, ee_pblk + write_len);
+ ex->ee_len = cpu_to_le16(ee_len - write_len);
+ ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+ /* Extend prev_ex by 'write_len' blocks */
+ prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+
+ /* Mark the block containing both extents as dirty */
+ ext4_ext_dirty(handle, inode, path + depth);
+
+ /* Update path to point to the right extent */
+ path[depth].p_ext = prev_ex;
+
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = write_len;
+ goto out;
+ }
+ }
+
WARN_ON(map->m_lblk < ee_block);
/*
* It is safe to convert extent to initialized via explicit
* that this IO needs to conversion to written when IO is
* completed
*/
- if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
- io->flag = EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
- } else
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
/* buffered write, writepage time, convert*/
ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
- if (ret >= 0) {
+ if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
- map->m_len);
- if (err < 0)
- goto out2;
- }
-
out:
if (ret <= 0) {
err = ret;
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
+ if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
+ err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+ map->m_len);
+ if (err < 0)
+ goto out2;
+ }
out1:
if (allocated > map->m_len)
allocated = map->m_len;
ext4_fsblk_t newblock = 0;
int free_on_err = 0, err = 0, depth, ret;
unsigned int allocated = 0, offset = 0;
- unsigned int allocated_clusters = 0, reserved_clusters = 0;
+ unsigned int allocated_clusters = 0;
unsigned int punched_out = 0;
unsigned int result = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
ext4_lblk_t cluster_offset;
- struct ext4_map_blocks punch_map;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
/* if found extent covers block, simply return it */
if (in_range(map->m_lblk, ee_block, ee_len)) {
+ struct ext4_map_blocks punch_map;
ext4_fsblk_t partial_cluster = 0;
newblock = map->m_lblk - ee_block + ee_start;
* that we need to perform conversion when IO is done.
*/
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
- io->flag = EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
- } else
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
ext4_set_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN);
}
map->m_flags |= EXT4_MAP_UNINIT;
}
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
+ err = 0;
+ if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, ar.len);
if (!err)
err = ext4_ext_insert_extent(handle, inode, path,
&newex, flags);
* block allocation which had been deferred till now.
*/
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ unsigned int reserved_clusters;
/*
- * Check how many clusters we had reserved this allocted range.
+ * Check how many clusters we had reserved this allocated range
*/
reserved_clusters = get_reserved_cluster_alloc(inode,
map->m_lblk, allocated);
ext4_ext_drop_refs(path);
kfree(path);
}
- trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
- newblock, map->m_len, err ? err : allocated);
-
result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
punched_out : allocated;
+ trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+ newblock, map->m_len, err ? err : result);
+
return err ? err : result;
}
int ret = 0;
int ret2 = 0;
int retries = 0;
+ int flags;
struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
return ret;
}
+ flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+ /*
+ * Don't normalize the request if it can fit in one extent so
+ * that it doesn't get unnecessarily split into multiple
+ * extents.
+ */
+ if (len <= EXT_UNINIT_MAX_LEN << blkbits)
+ flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
retry:
while (ret >= 0 && ret < max_blocks) {
map.m_lblk = map.m_lblk + ret;
ret = PTR_ERR(handle);
break;
}
- ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
- EXT4_GET_BLOCKS_NO_NORMALIZE);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);