Merge branch 'modules-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty...

[karo-tx-linux.git] / fs / btrfs / disk-io.c
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index 62e0cafd6e250d5d1717656d3d5821e2d1bfec42..7cda51995c1e589eaf36fe048518bbbe0bd21109 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,10 @@
  #include "check-integrity.h"
  #include "rcu-string.h"
  
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+
  static struct extent_io_ops btree_extent_io_ops;
  static void end_workqueue_fn(struct btrfs_work *work);
  static void free_fs_root(struct btrfs_root *root);
@@ -217,26 +221,16 @@ static struct extent_map *btree_get_extent(struct inode *inode,
         write_lock(&em_tree->lock);
         ret = add_extent_mapping(em_tree, em);
         if (ret == -EEXIST) {
-               u64 failed_start = em->start;
-               u64 failed_len = em->len;
-
                 free_extent_map(em);
                 em = lookup_extent_mapping(em_tree, start, len);
-               if (em) {
-                       ret = 0;
-               } else {
-                       em = lookup_extent_mapping(em_tree, failed_start,
-                                                  failed_len);
-                       ret = -EIO;
-               }
+               if (!em)
+                       em = ERR_PTR(-EIO);
         } else if (ret) {
                 free_extent_map(em);
-               em = NULL;
+               em = ERR_PTR(ret);
         }
         write_unlock(&em_tree->lock);
  
-       if (ret)
-               em = ERR_PTR(ret);
  out:
         return em;
  }
@@ -377,9 +371,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
                 ret = read_extent_buffer_pages(io_tree, eb, start,
                                                WAIT_COMPLETE,
                                                btree_get_extent, mirror_num);
-               if (!ret && !verify_parent_transid(io_tree, eb,
+               if (!ret) {
+                       if (!verify_parent_transid(io_tree, eb,
                                                    parent_transid, 0))
-                       break;
+                               break;
+                       else
+                               ret = -EIO;
+               }
  
                 /*
                  * This buffer's crc is fine, but its contents are corrupted, so
@@ -435,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
                 WARN_ON(1);
                 return 0;
         }
-       if (eb->pages[0] != page) {
-               WARN_ON(1);
-               return 0;
-       }
         if (!PageUptodate(page)) {
                 WARN_ON(1);
                 return 0;
@@ -754,9 +748,7 @@ static void run_one_async_done(struct btrfs_work *work)
         limit = btrfs_async_submit_limit(fs_info);
         limit = limit * 2 / 3;
  
-       atomic_dec(&fs_info->nr_async_submits);
-
-       if (atomic_read(&fs_info->nr_async_submits) < limit &&
+       if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
             waitqueue_active(&fs_info->async_submit_wait))
                 wake_up(&fs_info->async_submit_wait);
  
@@ -867,10 +859,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
         return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
  }
  
+/* Returns 0 if the caller should csum the bio inline, 1 to take the other path. */
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+#ifdef CONFIG_X86
+       /* NOTE(review): presumably SSE4.2 makes the crc cheap enough - confirm */
+       if (cpu_has_xmm4_2)
+               return 0;
+#endif
+       return !(bio_flags & EXTENT_BIO_TREE_LOG);
+}
+
  static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                                  int mirror_num, unsigned long bio_flags,
                                  u64 bio_offset)
  {
+       int async = check_async_write(inode, bio_flags);
         int ret;
  
         if (!(rw & REQ_WRITE)) {
@@ -885,6 +889,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                         return ret;
                 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                      mirror_num, 0);
+       } else if (!async) {
+               ret = btree_csum_one_bio(bio);
+               if (ret)
+                       return ret;
+               return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+                                    mirror_num, 0);
         }
  
         /*
@@ -1166,8 +1176,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
         atomic_set(&root->log_commit[0], 0);
         atomic_set(&root->log_commit[1], 0);
         atomic_set(&root->log_writers, 0);
+       atomic_set(&root->log_batch, 0);
         atomic_set(&root->orphan_inodes, 0);
-       root->log_batch = 0;
         root->log_transid = 0;
         root->last_log_commit = 0;
         extent_io_tree_init(&root->dirty_log_pages,
@@ -1665,9 +1675,10 @@ static int transaction_kthread(void *arg)
                 spin_unlock(&root->fs_info->trans_lock);
  
                 /* If the file system is aborted, this will always fail. */
-               trans = btrfs_join_transaction(root);
+               trans = btrfs_attach_transaction(root);
                 if (IS_ERR(trans)) {
-                       cannot_commit = true;
+                       if (PTR_ERR(trans) != -ENOENT)
+                               cannot_commit = true;
                         goto sleep;
                 }
                 if (transid == trans->transid) {
@@ -1992,13 +2003,11 @@ int open_ctree(struct super_block *sb,
         INIT_LIST_HEAD(&fs_info->trans_list);
         INIT_LIST_HEAD(&fs_info->dead_roots);
         INIT_LIST_HEAD(&fs_info->delayed_iputs);
-       INIT_LIST_HEAD(&fs_info->hashers);
         INIT_LIST_HEAD(&fs_info->delalloc_inodes);
         INIT_LIST_HEAD(&fs_info->ordered_operations);
         INIT_LIST_HEAD(&fs_info->caching_block_groups);
         spin_lock_init(&fs_info->delalloc_lock);
         spin_lock_init(&fs_info->trans_lock);
-       spin_lock_init(&fs_info->ref_cache_lock);
         spin_lock_init(&fs_info->fs_roots_radix_lock);
         spin_lock_init(&fs_info->delayed_iput_lock);
         spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -2012,12 +2021,15 @@ int open_ctree(struct super_block *sb,
         INIT_LIST_HEAD(&fs_info->space_info);
         INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
         btrfs_mapping_init(&fs_info->mapping_tree);
-       btrfs_init_block_rsv(&fs_info->global_block_rsv);
-       btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
-       btrfs_init_block_rsv(&fs_info->trans_block_rsv);
-       btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
-       btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-       btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+       btrfs_init_block_rsv(&fs_info->global_block_rsv,
+                            BTRFS_BLOCK_RSV_GLOBAL);
+       btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+                            BTRFS_BLOCK_RSV_DELALLOC);
+       btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+       btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+       btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+       btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+                            BTRFS_BLOCK_RSV_DELOPS);
         atomic_set(&fs_info->nr_async_submits, 0);
         atomic_set(&fs_info->async_delalloc_pages, 0);
         atomic_set(&fs_info->async_submit_draining, 0);
@@ -2032,8 +2044,6 @@ int open_ctree(struct super_block *sb,
         fs_info->free_chunk_space = 0;
         fs_info->tree_mod_log = RB_ROOT;
  
-       init_waitqueue_head(&fs_info->tree_mod_seq_wait);
-
         /* readahead state */
         INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
         spin_lock_init(&fs_info->reada_lock);
@@ -2491,6 +2501,8 @@ retry_root_backup:
                 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
                 goto fail_block_groups;
         }
+       fs_info->num_tolerated_disk_barrier_failures =
+               btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
  
         fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
                                                "btrfs-cleaner");
@@ -2528,8 +2540,7 @@ retry_root_backup:
                 goto fail_trans_kthread;
  
         /* do not make disk changes in broken FS */
-       if (btrfs_super_log_root(disk_super) != 0 &&
-           !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
+       if (btrfs_super_log_root(disk_super) != 0) {
                 u64 bytenr = btrfs_super_log_root(disk_super);
  
                 if (fs_devices->rw_devices == 0) {
@@ -2875,12 +2886,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
                         printk_in_rcu("btrfs: disabling barriers on dev %s\n",
                                       rcu_str_deref(device->name));
                         device->nobarriers = 1;
-               }
-               if (!bio_flagged(bio, BIO_UPTODATE)) {
+               } else if (!bio_flagged(bio, BIO_UPTODATE)) {
                         ret = -EIO;
-                       if (!bio_flagged(bio, BIO_EOPNOTSUPP))
-                               btrfs_dev_stat_inc_and_print(device,
-                                       BTRFS_DEV_STAT_FLUSH_ERRS);
+                       btrfs_dev_stat_inc_and_print(device,
+                               BTRFS_DEV_STAT_FLUSH_ERRS);
                 }
  
                 /* drop the reference from the wait == 0 run */
@@ -2919,14 +2928,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
  {
         struct list_head *head;
         struct btrfs_device *dev;
-       int errors = 0;
+       int errors_send = 0;
+       int errors_wait = 0;
         int ret;
  
         /* send down all the barriers */
         head = &info->fs_devices->devices;
         list_for_each_entry_rcu(dev, head, dev_list) {
                 if (!dev->bdev) {
-                       errors++;
+                       errors_send++;
                         continue;
                 }
                 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2934,13 +2944,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
  
                 ret = write_dev_flush(dev, 0);
                 if (ret)
-                       errors++;
+                       errors_send++;
         }
  
         /* wait for all the barriers */
         list_for_each_entry_rcu(dev, head, dev_list) {
                 if (!dev->bdev) {
-                       errors++;
+                       errors_wait++;
                         continue;
                 }
                 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2948,13 +2958,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
  
                 ret = write_dev_flush(dev, 1);
                 if (ret)
-                       errors++;
+                       errors_wait++;
         }
-       if (errors)
+       if (errors_send > info->num_tolerated_disk_barrier_failures ||
+           errors_wait > info->num_tolerated_disk_barrier_failures)
                 return -EIO;
         return 0;
  }
  
+/*
+ * Work out how many devices may fail a barrier before a super block write
+ * has to be refused.  The limit starts at the number of devices and is
+ * lowered according to the profile of every block group list in use:
+ * 0 for DUP, RAID0 or no profile bit, otherwise at most 1 for RAID1/RAID10.
+ *
+ * Returns the limit.  barrier_all_devices() compares its error counts with
+ * the copy that open_ctree() stores in fs_info.
+ */
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+       struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_ioctl_space_info space;
+       struct btrfs_space_info *sinfo;
+       u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+                      BTRFS_BLOCK_GROUP_SYSTEM,
+                      BTRFS_BLOCK_GROUP_METADATA,
+                      BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+       int i;
+       int c;
+       int num_tolerated_disk_barrier_failures =
+               (int)fs_info->fs_devices->num_devices;
+
+       for (i = 0; i < ARRAY_SIZE(types); i++) {
+               struct btrfs_space_info *tmp;
+
+               sinfo = NULL;
+               rcu_read_lock();
+               list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+                       if (tmp->flags == types[i]) {
+                               sinfo = tmp;
+                               break;
+                       }
+               }
+               rcu_read_unlock();
+
+               if (!sinfo)
+                       continue;
+
+               down_read(&sinfo->groups_sem);
+               for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+                       u64 flags;
+
+                       if (list_empty(&sinfo->block_groups[c]))
+                               continue;
+                       btrfs_get_block_group_info(&sinfo->block_groups[c],
+                                                  &space);
+                       /* lists with no space or nothing used are skipped */
+                       if (space.total_bytes == 0 || space.used_bytes == 0)
+                               continue;
+                       flags = space.flags;
+                       /*
+                        * DUP, RAID0 or no profile bit: tolerate 0 failures.
+                        * RAID1 or RAID10: tolerate at most 1.  The limit is
+                        * only ever lowered.  RAID5/6 and more than two
+                        * mirrors are not handled here.
+                        */
+                       if (num_tolerated_disk_barrier_failures > 0 &&
+                           ((flags & (BTRFS_BLOCK_GROUP_DUP |
+                                      BTRFS_BLOCK_GROUP_RAID0)) ||
+                            (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
+                               num_tolerated_disk_barrier_failures = 0;
+                       else if (num_tolerated_disk_barrier_failures > 1 &&
+                                (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                                          BTRFS_BLOCK_GROUP_RAID10)))
+                               num_tolerated_disk_barrier_failures = 1;
+               }
+               up_read(&sinfo->groups_sem);
+       }
+
+       return num_tolerated_disk_barrier_failures;
+}
+
  int write_all_supers(struct btrfs_root *root, int max_mirrors)
  {
         struct list_head *head;
@@ -2977,8 +3061,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
         head = &root->fs_info->fs_devices->devices;
  
-       if (do_barriers)
-               barrier_all_devices(root->fs_info);
+       if (do_barriers) {
+               ret = barrier_all_devices(root->fs_info);
+               if (ret) {
+                       mutex_unlock(
+                               &root->fs_info->fs_devices->device_list_mutex);
+                       btrfs_error(root->fs_info, ret,
+                                   "errors while submitting device barriers.");
+                       return ret;
+               }
+       }
  
         list_for_each_entry_rcu(dev, head, dev_list) {
                 if (!dev->bdev) {
@@ -3189,30 +3281,14 @@ int close_ctree(struct btrfs_root *root)
         /* clear out the rbtree of defraggable inodes */
         btrfs_run_defrag_inodes(fs_info);
  
-       /*
-        * Here come 2 situations when btrfs is broken to flip readonly:
-        *
-        * 1. when btrfs flips readonly somewhere else before
-        * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
-        * and btrfs will skip to write sb directly to keep
-        * ERROR state on disk.
-        *
-        * 2. when btrfs flips readonly just in btrfs_commit_super,
-        * and in such case, btrfs cannot write sb via btrfs_commit_super,
-        * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
-        * btrfs will cleanup all FS resources first and write sb then.
-        */
         if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                 ret = btrfs_commit_super(root);
                 if (ret)
                         printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
         }
  
-       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-               ret = btrfs_error_commit_super(root);
-               if (ret)
-                       printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
-       }
+       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+               btrfs_error_commit_super(root);
  
         btrfs_put_block_group_cache(fs_info);
  
@@ -3228,10 +3304,6 @@ int close_ctree(struct btrfs_root *root)
                 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
                        (unsigned long long)fs_info->delalloc_bytes);
         }
-       if (fs_info->total_ref_cache_size) {
-               printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
-                      (unsigned long long)fs_info->total_ref_cache_size);
-       }
  
         free_extent_buffer(fs_info->extent_root->node);
         free_extent_buffer(fs_info->extent_root->commit_root);
@@ -3377,52 +3449,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
         return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
  }
  
-int btree_lock_page_hook(struct page *page, void *data,
-                               void (*flush_fn)(void *))
-{
-       struct inode *inode = page->mapping->host;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct extent_buffer *eb;
-
-       /*
-        * We culled this eb but the page is still hanging out on the mapping,
-        * carry on.
-        */
-       if (!PagePrivate(page))
-               goto out;
-
-       eb = (struct extent_buffer *)page->private;
-       if (!eb) {
-               WARN_ON(1);
-               goto out;
-       }
-       if (page != eb->pages[0])
-               goto out;
-
-       if (!btrfs_try_tree_write_lock(eb)) {
-               flush_fn(data);
-               btrfs_tree_lock(eb);
-       }
-       btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
-       if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               if (root->fs_info->dirty_metadata_bytes >= eb->len)
-                       root->fs_info->dirty_metadata_bytes -= eb->len;
-               else
-                       WARN_ON(1);
-               spin_unlock(&root->fs_info->delalloc_lock);
-       }
-
-       btrfs_tree_unlock(eb);
-out:
-       if (!trylock_page(page)) {
-               flush_fn(data);
-               lock_page(page);
-       }
-       return 0;
-}
-
  static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
                               int read_only)
  {
@@ -3434,18 +3460,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
         if (read_only)
                 return 0;
  
-       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-               printk(KERN_WARNING "warning: mount fs with errors, "
-                      "running btrfsck is recommended\n");
-       }
-
         return 0;
  }
  
-int btrfs_error_commit_super(struct btrfs_root *root)
+void btrfs_error_commit_super(struct btrfs_root *root)
  {
-       int ret;
-
         mutex_lock(&root->fs_info->cleaner_mutex);
         btrfs_run_delayed_iputs(root);
         mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -3455,10 +3474,6 @@ int btrfs_error_commit_super(struct btrfs_root *root)
  
         /* cleanup FS via transaction */
         btrfs_cleanup_transaction(root);
-
-       ret = write_ctree_super(NULL, root, 0);
-
-       return ret;
  }
  
  static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
@@ -3636,7 +3651,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
  
         while (1) {
                 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-                                           mark);
+                                           mark, NULL);
                 if (ret)
                         break;
  
@@ -3691,7 +3706,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
  again:
         while (1) {
                 ret = find_first_extent_bit(unpin, 0, &start, &end,
-                                           EXTENT_DIRTY);
+                                           EXTENT_DIRTY, NULL);
                 if (ret)
                         break;
  
@@ -3782,14 +3797,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
                 /* FIXME: cleanup wait for commit */
                 t->in_commit = 1;
                 t->blocked = 1;
+               smp_mb();
                 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
                         wake_up(&root->fs_info->transaction_blocked_wait);
  
                 t->blocked = 0;
+               smp_mb();
                 if (waitqueue_active(&root->fs_info->transaction_wait))
                         wake_up(&root->fs_info->transaction_wait);
  
                 t->commit_done = 1;
+               smp_mb();
                 if (waitqueue_active(&t->commit_wait))
                         wake_up(&t->commit_wait);
  
@@ -3825,7 +3843,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
  }
  
  static struct extent_io_ops btree_extent_io_ops = {
-       .write_cache_pages_lock_hook = btree_lock_page_hook,
         .readpage_end_io_hook = btree_readpage_end_io_hook,
         .readpage_io_failed_hook = btree_io_failed_hook,
         .submit_bio_hook = btree_submit_bio_hook,