git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - fs/btrfs/inode.c
Btrfs: Throttle file_write when data=ordered is flushing the inode
[karo-tx-linux.git] / fs / btrfs / inode.c
index 46f54dfa2b0aa9a35893e0fd6f841228b29642e8..b31f52d4f2caeb64c125af85ac9e9e4cf86fd38f 100644 (file)
@@ -80,6 +80,7 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
        u64 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
        u64 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
        u64 thresh;
+       unsigned long flags;
        int ret = 0;
 
        if (for_del)
@@ -89,10 +90,10 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 
        do_div(thresh, 100);
 
-       spin_lock(&root->fs_info->delalloc_lock);
+       spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
        if (used + root->fs_info->delalloc_bytes + num_required > thresh)
                ret = -ENOSPC;
-       spin_unlock(&root->fs_info->delalloc_lock);
+       spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
        return ret;
 }
 
@@ -122,9 +123,12 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
        if (alloc_hint == EXTENT_MAP_INLINE)
                goto out;
 
+       BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
+
        while(num_bytes > 0) {
                cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
                ret = btrfs_alloc_extent(trans, root, cur_alloc_size,
+                                        root->sectorsize,
                                         root->root_key.objectid,
                                         trans->transid,
                                         inode->i_ino, start, 0,
@@ -133,11 +137,17 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
                        WARN_ON(1);
                        goto out;
                }
+               cur_alloc_size = ins.offset;
                ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
                                               start, ins.objectid, ins.offset,
                                               ins.offset);
                inode->i_blocks += ins.offset >> 9;
                btrfs_check_file(root, inode);
+               if (num_bytes < cur_alloc_size) {
+                       printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+                              cur_alloc_size);
+                       break;
+               }
                num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
@@ -266,12 +276,13 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+/*
+ * Bit-set hook: when EXTENT_DELALLOC is being newly set on [start, end]
+ * (present in 'bits' but not in 'old'), add the length of the range to
+ * both the inode's and the filesystem-wide delalloc_bytes counters.
+ *
+ * Always returns 0.
+ */
 int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
                       unsigned long old, unsigned long bits)
 {
+       unsigned long flags;
        if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
-               spin_lock(&root->fs_info->delalloc_lock);
+               /*
+                * both counters are updated under delalloc_lock, which is
+                * now taken with local interrupts disabled.
+                * NOTE(review): presumably the lock can be taken from irq
+                * context after this change -- confirm against the callers
+                */
+               spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
                BTRFS_I(inode)->delalloc_bytes += end - start + 1;
                root->fs_info->delalloc_bytes += end - start + 1;
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
        }
        return 0;
 }
@@ -281,7 +292,9 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 {
        if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
-               spin_lock(&root->fs_info->delalloc_lock);
+               unsigned long flags;
+
+               spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
                if (end - start + 1 > root->fs_info->delalloc_bytes) {
                        printk("warning: delalloc account %Lu %Lu\n",
                               end - start + 1, root->fs_info->delalloc_bytes);
@@ -291,7 +304,7 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
                        root->fs_info->delalloc_bytes -= end - start + 1;
                        BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
                }
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
        }
        return 0;
 }
@@ -304,13 +317,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
        u64 logical = bio->bi_sector << 9;
        u64 length = 0;
        u64 map_length;
-       struct bio_vec *bvec;
-       int i;
        int ret;
 
-       bio_for_each_segment(bvec, bio, i) {
-               length += bvec->bv_len;
-       }
+       length = bio->bi_size;
        map_tree = &root->fs_info->mapping_tree;
        map_length = length;
        ret = btrfs_map_block(map_tree, READ, logical,
@@ -322,12 +331,37 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
        return 0;
 }
 
-int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+/*
+ * Checksum a bio, record the checksums inside a transaction and then map
+ * the bio for submission.  btrfs_submit_bio_hook() hands this function to
+ * btrfs_wq_submit_bio() as a callback instead of doing the work inline.
+ *
+ * Returns the result of btrfs_map_bio(); a failure from
+ * btrfs_csum_one_bio() or btrfs_end_transaction() triggers BUG_ON().
+ */
+int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                          int mirror_num)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        int ret = 0;
+       char *sums = NULL;
+
+       /* compute the checksums before fs_mutex is taken */
+       ret = btrfs_csum_one_bio(root, bio, &sums);
+       BUG_ON(ret);
+
+       mutex_lock(&root->fs_info->fs_mutex);
+       /*
+        * NOTE(review): the handle returned here is used without a check;
+        * confirm btrfs_start_transaction() cannot return NULL/ERR_PTR
+        */
+       trans = btrfs_start_transaction(root, 1);
+
+       btrfs_set_trans_block_group(trans, inode);
+       /*
+        * NOTE(review): the return value of btrfs_csum_file_blocks() is
+        * ignored -- confirm that is intentional
+        */
+       btrfs_csum_file_blocks(trans, root, inode, bio, sums);
+
+       ret = btrfs_end_transaction(trans, root);
+       BUG_ON(ret);
+       mutex_unlock(&root->fs_info->fs_mutex);
+
+       /* sums was filled in by btrfs_csum_one_bio() and is released here */
+       kfree(sums);
+
+       return btrfs_map_bio(root, rw, bio, mirror_num);
+}
+
+int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+                         int mirror_num)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret = 0;
 
        if (!(rw & (1 << BIO_RW))) {
                ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
@@ -340,13 +374,9 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                goto mapit;
        }
 
-       mutex_lock(&root->fs_info->fs_mutex);
-       trans = btrfs_start_transaction(root, 1);
-       btrfs_set_trans_block_group(trans, inode);
-       btrfs_csum_file_blocks(trans, root, inode, bio);
-       ret = btrfs_end_transaction(trans, root);
-       BUG_ON(ret);
-       mutex_unlock(&root->fs_info->fs_mutex);
+       return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+                                  inode, rw, bio, mirror_num,
+                                  __btrfs_submit_bio_hook);
 mapit:
        return btrfs_map_bio(root, rw, bio, mirror_num);
 }
@@ -360,9 +390,11 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
        struct btrfs_csum_item *item;
        struct btrfs_path *path = NULL;
        u32 csum;
+
        if (btrfs_test_opt(root, NODATASUM) ||
            btrfs_test_flag(inode, NODATASUM))
                return 0;
+
        mutex_lock(&root->fs_info->fs_mutex);
        path = btrfs_alloc_path();
        item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
@@ -402,6 +434,7 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
        struct extent_map *em;
        struct inode *inode = page->mapping->host;
        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct bio *bio;
        int num_copies;
        int ret;
@@ -409,7 +442,6 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 
        ret = get_state_private(failure_tree, start, &private);
        if (ret) {
-               size_t pg_offset = start - page_offset(page);
                failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
                if (!failrec)
                        return -ENOMEM;
@@ -417,8 +449,13 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
                failrec->len = end - start + 1;
                failrec->last_mirror = 0;
 
-               em = btrfs_get_extent(inode, NULL, pg_offset, start,
-                                     failrec->len, 0);
+               spin_lock(&em_tree->lock);
+               em = lookup_extent_mapping(em_tree, start, failrec->len);
+               if (em->start > start || em->start + em->len < start) {
+                       free_extent_map(em);
+                       em = NULL;
+               }
+               spin_unlock(&em_tree->lock);
 
                if (!em || IS_ERR(em)) {
                        kfree(failrec);
@@ -461,6 +498,7 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
        bio->bi_end_io = failed_bio->bi_end_io;
        bio->bi_sector = failrec->logical >> 9;
        bio->bi_bdev = failed_bio->bi_bdev;
+       bio->bi_size = 0;
        bio_add_page(bio, page, failrec->len, start - page_offset(page));
        btrfs_submit_bio_hook(inode, READ, bio, failrec->last_mirror);
        return 0;
@@ -534,6 +572,8 @@ zeroit:
        flush_dcache_page(page);
        kunmap_atomic(kaddr, KM_IRQ0);
        local_irq_restore(flags);
+       if (private == 0)
+               return 0;
        return -EIO;
 }
 
@@ -883,8 +923,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
        int pending_del_nr = 0;
        int pending_del_slot = 0;
        int extent_type = -1;
+       u64 mask = root->sectorsize - 1;
 
-       btrfs_drop_extent_cache(inode, inode->i_size, (u64)-1);
+       btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
        path = btrfs_alloc_path();
        path->reada = -1;
        BUG_ON(!path);
@@ -1187,7 +1228,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                                                       hole_start, 0, 0,
                                                       hole_size);
                        btrfs_drop_extent_cache(inode, hole_start,
-                                               hole_size - 1);
+                                               (u64)-1);
                        btrfs_check_file(root, inode);
                }
                btrfs_end_transaction(trans, root);
@@ -1378,6 +1419,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
                             inode->i_mapping, GFP_NOFS);
        extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
                             inode->i_mapping, GFP_NOFS);
+       atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
        return 0;
 }
 
@@ -1687,6 +1729,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                             inode->i_mapping, GFP_NOFS);
        extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
                             inode->i_mapping, GFP_NOFS);
+       atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
        BTRFS_I(inode)->delalloc_bytes = 0;
        BTRFS_I(inode)->root = root;
 
@@ -1915,6 +1958,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
                extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
                                     inode->i_mapping, GFP_NOFS);
                BTRFS_I(inode)->delalloc_bytes = 0;
+               atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        }
        dir->i_sb->s_dirt = 1;
@@ -2058,6 +2102,68 @@ out_unlock:
        return err;
 }
 
+/*
+ * Merge 'em' with 'existing', a mapping already present in em_tree that
+ * overlaps or directly touches it, and insert the combined mapping.
+ *
+ * On success 'existing' is removed from the tree and the tree's reference
+ * on it is dropped, 'em' is grown to cover both ranges and is added to
+ * em_tree.  The caller keeps (and must drop) its own reference on
+ * 'existing'.
+ *
+ * Returns the result of add_extent_mapping(), or -EIO without touching
+ * the tree when the two mappings cannot be merged:
+ *   - one maps real disk blocks and the other does not, or both are
+ *     special mappings with different block_start values
+ *   - there is a gap between the two file ranges
+ *   - both map real blocks but the disk offsets do not line up with the
+ *     file offsets
+ */
+static int merge_extent_mapping(struct extent_map_tree *em_tree,
+                               struct extent_map *existing,
+                               struct extent_map *em)
+{
+       u64 start_diff;
+       u64 new_end;
+       int real_blocks = existing->block_start < EXTENT_MAP_LAST_BYTE;
+
+       if (real_blocks && em->block_start >= EXTENT_MAP_LAST_BYTE)
+               goto invalid;
+
+       if (!real_blocks && em->block_start != existing->block_start)
+               goto invalid;
+
+       new_end = max(existing->start + existing->len, em->start + em->len);
+
+       if (existing->start >= em->start) {
+               /* em starts first (or at the same offset): extend its tail */
+               if (em->start + em->len < existing->start)
+                       goto invalid;
+
+               start_diff = existing->start - em->start;
+               if (real_blocks && em->block_start + start_diff !=
+                   existing->block_start)
+                       goto invalid;
+
+               em->len = new_end - em->start;
+       } else {
+               /* existing starts first: pull em's start back to match it */
+               if (existing->start + existing->len < em->start)
+                       goto invalid;
+
+               start_diff = em->start - existing->start;
+               if (real_blocks && existing->block_start + start_diff !=
+                   em->block_start)
+                       goto invalid;
+
+               em->block_start = existing->block_start;
+               em->start = existing->start;
+               em->len = new_end - existing->start;
+       }
+
+       remove_extent_mapping(em_tree, existing);
+       /* free for the tree */
+       free_extent_map(existing);
+
+       return add_extent_mapping(em_tree, em);
+
+invalid:
+       printk("invalid extent map merge [%Lu %Lu %Lu] [%Lu %Lu %Lu]\n",
+              existing->start, existing->len, existing->block_start,
+              em->start, em->len, em->block_start);
+       return -EIO;
+}
+
+
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
                                    size_t pg_offset, u64 start, u64 len,
                                    int create)
@@ -2089,12 +2195,9 @@ again:
        spin_unlock(&em_tree->lock);
 
        if (em) {
-               if (em->start > start) {
-                       printk("get_extent lookup [%Lu %Lu] em [%Lu %Lu]\n",
-                              start, len, em->start, em->len);
-                       WARN_ON(1);
-               }
-               if (em->block_start == EXTENT_MAP_INLINE && page)
+               if (em->start > start || em->start + em->len <= start)
+                       free_extent_map(em);
+               else if (em->block_start == EXTENT_MAP_INLINE && page)
                        free_extent_map(em);
                else
                        goto out;
@@ -2242,12 +2345,39 @@ insert:
        err = 0;
        spin_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em);
+       /* it is possible that someone inserted the extent into the tree
+        * while we had the lock dropped.  It is also possible that
+        * an overlapping map exists in the tree
+        */
        if (ret == -EEXIST) {
-               free_extent_map(em);
-               em = lookup_extent_mapping(em_tree, start, len);
-               if (!em) {
-                       err = -EIO;
-                       printk("failing to insert %Lu %Lu\n", start, len);
+               struct extent_map *existing;
+               existing = lookup_extent_mapping(em_tree, start, len);
+               if (existing && (existing->start > start ||
+                   existing->start + existing->len <= start)) {
+                       free_extent_map(existing);
+                       existing = NULL;
+               }
+               if (!existing) {
+                       existing = lookup_extent_mapping(em_tree, em->start,
+                                                        em->len);
+                       if (existing) {
+                               err = merge_extent_mapping(em_tree, existing,
+                                                          em);
+                               free_extent_map(existing);
+                               if (err) {
+                                       free_extent_map(em);
+                                       em = NULL;
+                               }
+                       } else {
+                               err = -EIO;
+                               printk("failing to insert %Lu %Lu\n",
+                                      start, len);
+                               free_extent_map(em);
+                               em = NULL;
+                       }
+               } else {
+                       free_extent_map(em);
+                       em = existing;
                }
        }
        spin_unlock(&em_tree->lock);
@@ -2267,6 +2397,7 @@ out:
        return em;
 }
 
+#if 0 /* waiting for O_DIRECT reads */
 static int btrfs_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
 {
@@ -2284,22 +2415,24 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
        if (!em || IS_ERR(em))
                goto out;
 
-       if (em->start > start || em->start + em->len <= start)
+       if (em->start > start || em->start + em->len <= start) {
            goto out;
+       }
 
        if (em->block_start == EXTENT_MAP_INLINE) {
                ret = -EINVAL;
                goto out;
        }
 
+       len = em->start + em->len - start;
+       len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
+
        if (em->block_start == EXTENT_MAP_HOLE ||
            em->block_start == EXTENT_MAP_DELALLOC) {
+               bh_result->b_size = len;
                goto out;
        }
 
-       len = em->start + em->len - start;
-       len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
-
        logical = start - em->start;
        logical = em->block_start + logical;
 
@@ -2309,6 +2442,7 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
        BUG_ON(ret);
        bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
        bh_result->b_size = min(map_length, len);
+
        bh_result->b_bdev = multi->stripes[0].dev->bdev;
        set_buffer_mapped(bh_result);
        kfree(multi);
@@ -2316,11 +2450,14 @@ out:
        free_extent_map(em);
        return ret;
 }
+#endif
 
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
                        unsigned long nr_segs)
 {
+       return -EINVAL;
+#if 0
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
 
@@ -2329,6 +2466,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 
        return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
                                  offset, nr_segs, btrfs_get_block, NULL);
+#endif
 }
 
 static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
@@ -2385,6 +2523,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
        map = &BTRFS_I(page->mapping->host)->extent_tree;
        ret = try_release_extent_mapping(map, tree, page, gfp_flags);
        if (ret == 1) {
+               invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
                ClearPagePrivate(page);
                set_page_private(page, 0);
                page_cache_release(page);
@@ -2399,6 +2538,12 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
        tree = &BTRFS_I(page->mapping->host)->io_tree;
        extent_invalidatepage(tree, page, offset);
        btrfs_releasepage(page, GFP_NOFS);
+       if (PagePrivate(page)) {
+               invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
+               ClearPagePrivate(page);
+               set_page_private(page, 0);
+               page_cache_release(page);
+       }
 }
 
 /*
@@ -3150,6 +3295,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
                                     inode->i_mapping, GFP_NOFS);
                BTRFS_I(inode)->delalloc_bytes = 0;
+               atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        }
        dir->i_sb->s_dirt = 1;