Merge git://git.jan-o-sch.net/btrfs-unstable into integration
author    Chris Mason <chris.mason@oracle.com>
Sun, 6 Nov 2011 08:07:10 +0000 (03:07 -0500)
committer Chris Mason <chris.mason@oracle.com>
Sun, 6 Nov 2011 08:07:10 +0000 (03:07 -0500)
Conflicts:
fs/btrfs/Makefile
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/scrub.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/Makefile
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/reada.c
fs/btrfs/scrub.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

diff --combined fs/btrfs/Makefile
index bdd6fb238ce16a9fee87044c3d91d92ec282e87d,89b6ce3634fd852ab9c16160117e019bf75d3010..c0ddfd29c5e5a348464d5c3d15a77a7708fd8d79
@@@ -7,7 -7,7 +7,7 @@@ btrfs-y += super.o ctree.o extent-tree.
           extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 -         compression.o delayed-ref.o relocation.o delayed-inode.o backref.o \
 -         scrub.o
 +         compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-          reada.o
++         reada.o backref.o
  
  btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --combined fs/btrfs/disk-io.c
index cedfbfb278eb6c7d7edb602d1ea42b1cd23b4efa,dc03438025353b2a1a33b71fc267b4debed53073..0eb1f09512514a229182ed6e96cc608e6f31168a
@@@ -256,7 -256,8 +256,7 @@@ void btrfs_csum_final(u32 crc, char *re
  static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
                           int verify)
  {
 -      u16 csum_size =
 -              btrfs_super_csum_size(&root->fs_info->super_copy);
 +      u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
        char *result = NULL;
        unsigned long len;
        unsigned long cur_len;
@@@ -366,8 -367,7 +366,8 @@@ static int btree_read_extent_buffer_pag
        clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
        io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
        while (1) {
 -              ret = read_extent_buffer_pages(io_tree, eb, start, 1,
 +              ret = read_extent_buffer_pages(io_tree, eb, start,
 +                                             WAIT_COMPLETE,
                                               btree_get_extent, mirror_num);
                if (!ret &&
                    !verify_parent_transid(io_tree, eb, parent_transid))
@@@ -608,47 -608,11 +608,47 @@@ static int btree_readpage_end_io_hook(s
        end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
        end = eb->start + end - 1;
  err:
 +      if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
 +              clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
 +              btree_readahead_hook(root, eb, eb->start, ret);
 +      }
 +
        free_extent_buffer(eb);
  out:
        return ret;
  }
  
-                        struct extent_state *state)
 +static int btree_io_failed_hook(struct bio *failed_bio,
 +                       struct page *page, u64 start, u64 end,
++                       u64 mirror_num, struct extent_state *state)
 +{
 +      struct extent_io_tree *tree;
 +      unsigned long len;
 +      struct extent_buffer *eb;
 +      struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 +
 +      tree = &BTRFS_I(page->mapping->host)->io_tree;
 +      if (page->private == EXTENT_PAGE_PRIVATE)
 +              goto out;
 +      if (!page->private)
 +              goto out;
 +
 +      len = page->private >> 2;
 +      WARN_ON(len == 0);
 +
 +      eb = alloc_extent_buffer(tree, start, len, page);
 +      if (eb == NULL)
 +              goto out;
 +
 +      if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
 +              clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
 +              btree_readahead_hook(root, eb, eb->start, -EIO);
 +      }
 +
 +out:
 +      return -EIO;    /* we fixed nothing */
 +}
 +
  static void end_workqueue_bio(struct bio *bio, int err)
  {
        struct end_io_wq *end_io_wq = bio->bi_private;
@@@ -944,7 -908,7 +944,7 @@@ static int btree_readpage(struct file *
  {
        struct extent_io_tree *tree;
        tree = &BTRFS_I(page->mapping->host)->io_tree;
-       return extent_read_full_page(tree, page, btree_get_extent);
+       return extent_read_full_page(tree, page, btree_get_extent, 0);
  }
  
  static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@@ -1010,43 -974,11 +1010,43 @@@ int readahead_tree_block(struct btrfs_r
        if (!buf)
                return 0;
        read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
 -                               buf, 0, 0, btree_get_extent, 0);
 +                               buf, 0, WAIT_NONE, btree_get_extent, 0);
        free_extent_buffer(buf);
        return ret;
  }
  
 +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 +                       int mirror_num, struct extent_buffer **eb)
 +{
 +      struct extent_buffer *buf = NULL;
 +      struct inode *btree_inode = root->fs_info->btree_inode;
 +      struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
 +      int ret;
 +
 +      buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 +      if (!buf)
 +              return 0;
 +
 +      set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
 +
 +      ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
 +                                     btree_get_extent, mirror_num);
 +      if (ret) {
 +              free_extent_buffer(buf);
 +              return ret;
 +      }
 +
 +      if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
 +              free_extent_buffer(buf);
 +              return -EIO;
 +      } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
 +              *eb = buf;
 +      } else {
 +              free_extent_buffer(buf);
 +      }
 +      return 0;
 +}
 +
  struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize)
  {
@@@ -1203,12 -1135,10 +1203,12 @@@ static int find_and_setup_root(struct b
  
        generation = btrfs_root_generation(&root->root_item);
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 +      root->commit_root = NULL;
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
        if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
                free_extent_buffer(root->node);
 +              root->node = NULL;
                return -EIO;
        }
        root->commit_root = btrfs_root_node(root);
@@@ -1647,228 -1577,6 +1647,228 @@@ sleep
        return 0;
  }
  
 +/*
 + * this will find the highest generation in the array of
 + * root backups.  The index of the highest array is returned,
 + * or -1 if we can't find anything.
 + *
 + * We check to make sure the array is valid by comparing the
 + * generation of the latest root in the array with the generation
 + * in the super block.  If they don't match we pitch it.
 + */
 +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
 +{
 +      u64 cur;
 +      int newest_index = -1;
 +      struct btrfs_root_backup *root_backup;
 +      int i;
 +
 +      for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
 +              root_backup = info->super_copy->super_roots + i;
 +              cur = btrfs_backup_tree_root_gen(root_backup);
 +              if (cur == newest_gen)
 +                      newest_index = i;
 +      }
 +
 +      /* check to see if we actually wrapped around */
 +      if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
 +              root_backup = info->super_copy->super_roots;
 +              cur = btrfs_backup_tree_root_gen(root_backup);
 +              if (cur == newest_gen)
 +                      newest_index = 0;
 +      }
 +      return newest_index;
 +}
 +
 +
 +/*
 + * find the oldest backup so we know where to store new entries
 + * in the backup array.  This will set the backup_root_index
 + * field in the fs_info struct
 + */
 +static void find_oldest_super_backup(struct btrfs_fs_info *info,
 +                                   u64 newest_gen)
 +{
 +      int newest_index = -1;
 +
 +      newest_index = find_newest_super_backup(info, newest_gen);
 +      /* if there was garbage in there, just move along */
 +      if (newest_index == -1) {
 +              info->backup_root_index = 0;
 +      } else {
 +              info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
 +      }
 +}
 +
 +/*
 + * copy all the root pointers into the super backup array.
 + * this will bump the backup pointer by one when it is
 + * done
 + */
 +static void backup_super_roots(struct btrfs_fs_info *info)
 +{
 +      int next_backup;
 +      struct btrfs_root_backup *root_backup;
 +      int last_backup;
 +
 +      next_backup = info->backup_root_index;
 +      last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
 +              BTRFS_NUM_BACKUP_ROOTS;
 +
 +      /*
 +       * just overwrite the last backup if we're at the same generation
 +       * this happens only at umount
 +       */
 +      root_backup = info->super_for_commit->super_roots + last_backup;
 +      if (btrfs_backup_tree_root_gen(root_backup) ==
 +          btrfs_header_generation(info->tree_root->node))
 +              next_backup = last_backup;
 +
 +      root_backup = info->super_for_commit->super_roots + next_backup;
 +
 +      /*
 +       * make sure all of our padding and empty slots get zero filled
 +       * regardless of which ones we use today
 +       */
 +      memset(root_backup, 0, sizeof(*root_backup));
 +
 +      info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
 +
 +      btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
 +      btrfs_set_backup_tree_root_gen(root_backup,
 +                             btrfs_header_generation(info->tree_root->node));
 +
 +      btrfs_set_backup_tree_root_level(root_backup,
 +                             btrfs_header_level(info->tree_root->node));
 +
 +      btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
 +      btrfs_set_backup_chunk_root_gen(root_backup,
 +                             btrfs_header_generation(info->chunk_root->node));
 +      btrfs_set_backup_chunk_root_level(root_backup,
 +                             btrfs_header_level(info->chunk_root->node));
 +
 +      btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
 +      btrfs_set_backup_extent_root_gen(root_backup,
 +                             btrfs_header_generation(info->extent_root->node));
 +      btrfs_set_backup_extent_root_level(root_backup,
 +                             btrfs_header_level(info->extent_root->node));
 +
 +      btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
 +      btrfs_set_backup_fs_root_gen(root_backup,
 +                             btrfs_header_generation(info->fs_root->node));
 +      btrfs_set_backup_fs_root_level(root_backup,
 +                             btrfs_header_level(info->fs_root->node));
 +
 +      btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
 +      btrfs_set_backup_dev_root_gen(root_backup,
 +                             btrfs_header_generation(info->dev_root->node));
 +      btrfs_set_backup_dev_root_level(root_backup,
 +                                     btrfs_header_level(info->dev_root->node));
 +
 +      btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
 +      btrfs_set_backup_csum_root_gen(root_backup,
 +                             btrfs_header_generation(info->csum_root->node));
 +      btrfs_set_backup_csum_root_level(root_backup,
 +                             btrfs_header_level(info->csum_root->node));
 +
 +      btrfs_set_backup_total_bytes(root_backup,
 +                           btrfs_super_total_bytes(info->super_copy));
 +      btrfs_set_backup_bytes_used(root_backup,
 +                           btrfs_super_bytes_used(info->super_copy));
 +      btrfs_set_backup_num_devices(root_backup,
 +                           btrfs_super_num_devices(info->super_copy));
 +
 +      /*
 +       * if we don't copy this out to the super_copy, it won't get remembered
 +       * for the next commit
 +       */
 +      memcpy(&info->super_copy->super_roots,
 +             &info->super_for_commit->super_roots,
 +             sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
 +}
 +
 +/*
 + * this copies info out of the root backup array and back into
 + * the in-memory super block.  It is meant to help iterate through
 + * the array, so you send it the number of backups you've already
 + * tried and the last backup index you used.
 + *
 + * this returns -1 when it has tried all the backups
 + */
 +static noinline int next_root_backup(struct btrfs_fs_info *info,
 +                                   struct btrfs_super_block *super,
 +                                   int *num_backups_tried, int *backup_index)
 +{
 +      struct btrfs_root_backup *root_backup;
 +      int newest = *backup_index;
 +
 +      if (*num_backups_tried == 0) {
 +              u64 gen = btrfs_super_generation(super);
 +
 +              newest = find_newest_super_backup(info, gen);
 +              if (newest == -1)
 +                      return -1;
 +
 +              *backup_index = newest;
 +              *num_backups_tried = 1;
 +      } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
 +              /* we've tried all the backups, all done */
 +              return -1;
 +      } else {
 +              /* jump to the next oldest backup */
 +              newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
 +                      BTRFS_NUM_BACKUP_ROOTS;
 +              *backup_index = newest;
 +              *num_backups_tried += 1;
 +      }
 +      root_backup = super->super_roots + newest;
 +
 +      btrfs_set_super_generation(super,
 +                                 btrfs_backup_tree_root_gen(root_backup));
 +      btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
 +      btrfs_set_super_root_level(super,
 +                                 btrfs_backup_tree_root_level(root_backup));
 +      btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
 +
 +      /*
 +       * fixme: the total bytes and num_devices need to match or we should
 +       * need a fsck
 +       */
 +      btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
 +      btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
 +      return 0;
 +}
 +
 +/* helper to cleanup tree roots */
 +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 +{
 +      free_extent_buffer(info->tree_root->node);
 +      free_extent_buffer(info->tree_root->commit_root);
 +      free_extent_buffer(info->dev_root->node);
 +      free_extent_buffer(info->dev_root->commit_root);
 +      free_extent_buffer(info->extent_root->node);
 +      free_extent_buffer(info->extent_root->commit_root);
 +      free_extent_buffer(info->csum_root->node);
 +      free_extent_buffer(info->csum_root->commit_root);
 +
 +      info->tree_root->node = NULL;
 +      info->tree_root->commit_root = NULL;
 +      info->dev_root->node = NULL;
 +      info->dev_root->commit_root = NULL;
 +      info->extent_root->node = NULL;
 +      info->extent_root->commit_root = NULL;
 +      info->csum_root->node = NULL;
 +      info->csum_root->commit_root = NULL;
 +
 +      if (chunk_root) {
 +              free_extent_buffer(info->chunk_root->node);
 +              free_extent_buffer(info->chunk_root->commit_root);
 +              info->chunk_root->node = NULL;
 +              info->chunk_root->commit_root = NULL;
 +      }
 +}
 +
 +
  struct btrfs_root *open_ctree(struct super_block *sb,
                              struct btrfs_fs_devices *fs_devices,
                              char *options)
  
        int ret;
        int err = -EINVAL;
 +      int num_backups_tried = 0;
 +      int backup_index = 0;
  
        struct btrfs_super_block *disk_super;
  
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
 +      spin_lock_init(&fs_info->free_chunk_lock);
        mutex_init(&fs_info->reloc_mutex);
  
        init_completion(&fs_info->kobj_unregister);
        btrfs_init_block_rsv(&fs_info->trans_block_rsv);
        btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
        btrfs_init_block_rsv(&fs_info->empty_block_rsv);
 -      INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
 -      mutex_init(&fs_info->durable_block_rsv_mutex);
 +      btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
        atomic_set(&fs_info->nr_async_submits, 0);
        atomic_set(&fs_info->async_delalloc_pages, 0);
        atomic_set(&fs_info->async_submit_draining, 0);
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
        fs_info->trans_no_join = 0;
 +      fs_info->free_chunk_space = 0;
 +
 +      /* readahead state */
 +      INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
 +      spin_lock_init(&fs_info->reada_lock);
  
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
                goto fail_alloc;
        }
  
 -      memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
 -      memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
 -             sizeof(fs_info->super_for_commit));
 +      memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
 +      memcpy(fs_info->super_for_commit, fs_info->super_copy,
 +             sizeof(*fs_info->super_for_commit));
        brelse(bh);
  
 -      memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
 +      memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
  
 -      disk_super = &fs_info->super_copy;
 +      disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                goto fail_alloc;
  
  
        btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
  
 +      /*
 +       * run through our array of backup supers and setup
 +       * our ring pointer to the oldest one
 +       */
 +      generation = btrfs_super_generation(disk_super);
 +      find_oldest_super_backup(fs_info, generation);
 +
        /*
         * In the long term, we'll store the compression type in the super
         * block, and it'll be used for per file compression control.
        btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
                           fs_info->thread_pool_size,
                           &fs_info->generic_worker);
 +      btrfs_init_workers(&fs_info->readahead_workers, "readahead",
 +                         fs_info->thread_pool_size,
 +                         &fs_info->generic_worker);
  
        /*
         * endios are largely parallel and should have a very
  
        fs_info->endio_write_workers.idle_thresh = 2;
        fs_info->endio_meta_write_workers.idle_thresh = 2;
 +      fs_info->readahead_workers.idle_thresh = 2;
  
        btrfs_start_workers(&fs_info->workers, 1);
        btrfs_start_workers(&fs_info->generic_worker, 1);
        btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
        btrfs_start_workers(&fs_info->delayed_workers, 1);
        btrfs_start_workers(&fs_info->caching_workers, 1);
 +      btrfs_start_workers(&fs_info->readahead_workers, 1);
  
        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
        if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
                printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
                       sb->s_id);
 -              goto fail_chunk_root;
 +              goto fail_tree_roots;
        }
        btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
        chunk_root->commit_root = btrfs_root_node(chunk_root);
        if (ret) {
                printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
                       sb->s_id);
 -              goto fail_chunk_root;
 +              goto fail_tree_roots;
        }
  
        btrfs_close_extra_devices(fs_devices);
  
 +retry_root_backup:
        blocksize = btrfs_level_size(tree_root,
                                     btrfs_super_root_level(disk_super));
        generation = btrfs_super_generation(disk_super);
        tree_root->node = read_tree_block(tree_root,
                                          btrfs_super_root(disk_super),
                                          blocksize, generation);
 -      if (!tree_root->node)
 -              goto fail_chunk_root;
 -      if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
 +      if (!tree_root->node ||
 +          !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
                printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
                       sb->s_id);
 -              goto fail_tree_root;
 +
 +              goto recovery_tree_root;
        }
 +
        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
        tree_root->commit_root = btrfs_root_node(tree_root);
  
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
        if (ret)
 -              goto fail_tree_root;
 +              goto recovery_tree_root;
        extent_root->track_dirty = 1;
  
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_DEV_TREE_OBJECTID, dev_root);
        if (ret)
 -              goto fail_extent_root;
 +              goto recovery_tree_root;
        dev_root->track_dirty = 1;
  
        ret = find_and_setup_root(tree_root, fs_info,
                                  BTRFS_CSUM_TREE_OBJECTID, csum_root);
        if (ret)
 -              goto fail_dev_root;
 +              goto recovery_tree_root;
  
        csum_root->track_dirty = 1;
  
@@@ -2437,10 -2124,20 +2437,10 @@@ fail_cleaner
  
  fail_block_groups:
        btrfs_free_block_groups(fs_info);
 -      free_extent_buffer(csum_root->node);
 -      free_extent_buffer(csum_root->commit_root);
 -fail_dev_root:
 -      free_extent_buffer(dev_root->node);
 -      free_extent_buffer(dev_root->commit_root);
 -fail_extent_root:
 -      free_extent_buffer(extent_root->node);
 -      free_extent_buffer(extent_root->commit_root);
 -fail_tree_root:
 -      free_extent_buffer(tree_root->node);
 -      free_extent_buffer(tree_root->commit_root);
 -fail_chunk_root:
 -      free_extent_buffer(chunk_root->node);
 -      free_extent_buffer(chunk_root->commit_root);
 +
 +fail_tree_roots:
 +      free_root_pointers(fs_info, 1);
 +
  fail_sb_buffer:
        btrfs_stop_workers(&fs_info->generic_worker);
        btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
  fail_alloc:
 -      kfree(fs_info->delayed_root);
  fail_iput:
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
        iput(fs_info->btree_inode);
@@@ -2466,27 -2164,13 +2466,27 @@@ fail_bdi
  fail_srcu:
        cleanup_srcu_struct(&fs_info->subvol_srcu);
  fail:
 -      kfree(extent_root);
 -      kfree(tree_root);
 -      kfree(fs_info);
 -      kfree(chunk_root);
 -      kfree(dev_root);
 -      kfree(csum_root);
 +      free_fs_info(fs_info);
        return ERR_PTR(err);
 +
 +recovery_tree_root:
 +
 +      if (!btrfs_test_opt(tree_root, RECOVERY))
 +              goto fail_tree_roots;
 +
 +      free_root_pointers(fs_info, 0);
 +
 +      /* don't use the log in recovery mode, it won't be valid */
 +      btrfs_set_super_log_root(disk_super, 0);
 +
 +      /* we can't trust the free space cache either */
 +      btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
 +
 +      ret = next_root_backup(fs_info, fs_info->super_copy,
 +                             &num_backups_tried, &backup_index);
 +      if (ret == -1)
 +              goto fail_block_groups;
 +      goto retry_root_backup;
  }
  
  static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@@ -2654,11 -2338,10 +2654,11 @@@ int write_all_supers(struct btrfs_root 
        int total_errors = 0;
        u64 flags;
  
 -      max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 +      max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
        do_barriers = !btrfs_test_opt(root, NOBARRIER);
 +      backup_super_roots(root->fs_info);
  
 -      sb = &root->fs_info->super_for_commit;
 +      sb = root->fs_info->super_for_commit;
        dev_item = &sb->dev_item;
  
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@@ -2862,6 -2545,8 +2862,6 @@@ int close_ctree(struct btrfs_root *root
        /* clear out the rbtree of defraggable inodes */
        btrfs_run_defrag_inodes(root->fs_info);
  
 -      btrfs_put_block_group_cache(fs_info);
 -
        /*
         * Here come 2 situations when btrfs is broken to flip readonly:
         *
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
  
 +      btrfs_put_block_group_cache(fs_info);
 +
        kthread_stop(root->fs_info->transaction_kthread);
        kthread_stop(root->fs_info->cleaner_kthread);
  
        del_fs_roots(fs_info);
  
        iput(fs_info->btree_inode);
 -      kfree(fs_info->delayed_root);
  
        btrfs_stop_workers(&fs_info->generic_worker);
        btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->submit_workers);
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
 +      btrfs_stop_workers(&fs_info->readahead_workers);
  
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
        bdi_destroy(&fs_info->bdi);
        cleanup_srcu_struct(&fs_info->subvol_srcu);
  
 -      kfree(fs_info->extent_root);
 -      kfree(fs_info->tree_root);
 -      kfree(fs_info->chunk_root);
 -      kfree(fs_info->dev_root);
 -      kfree(fs_info->csum_root);
 -      kfree(fs_info);
 +      free_fs_info(fs_info);
  
        return 0;
  }
@@@ -3047,8 -2735,7 +3047,8 @@@ int btrfs_read_buffer(struct extent_buf
        return ret;
  }
  
 -int btree_lock_page_hook(struct page *page)
 +static int btree_lock_page_hook(struct page *page, void *data,
 +                              void (*flush_fn)(void *))
  {
        struct inode *inode = page->mapping->host;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        if (!eb)
                goto out;
  
 -      btrfs_tree_lock(eb);
 +      if (!btrfs_try_tree_write_lock(eb)) {
 +              flush_fn(data);
 +              btrfs_tree_lock(eb);
 +      }
        btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
  
        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);
  out:
 -      lock_page(page);
 +      if (!trylock_page(page)) {
 +              flush_fn(data);
 +              lock_page(page);
 +      }
        return 0;
  }
  
@@@ -3442,7 -3123,6 +3442,7 @@@ static int btrfs_cleanup_transaction(st
  static struct extent_io_ops btree_extent_io_ops = {
        .write_cache_pages_lock_hook = btree_lock_page_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
 +      .readpage_io_failed_hook = btree_io_failed_hook,
        .submit_bio_hook = btree_submit_bio_hook,
        /* note we're sharing with inode.c for the merge bio hook */
        .merge_bio_hook = btrfs_merge_bio_hook,
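
Illustrative note (not part of the merge): the disk-io.c hunks above introduce a small ring of super-block root backups. backup_super_roots() writes each commit's tree pointers into the next slot and advances backup_root_index modulo BTRFS_NUM_BACKUP_ROOTS, while next_root_backup() walks the slots backwards from the newest one when the tree root cannot be read and the RECOVERY mount option is set. The stand-alone sketch below shows only that ring indexing; the 4-slot ring size is an assumption for illustration (the real BTRFS_NUM_BACKUP_ROOTS is defined in ctree.h, which is not part of this diff).

/*
 * Sketch only -- not kernel code from the merge above.  It imitates the
 * slot arithmetic of backup_super_roots()/next_root_backup(): commits
 * advance the write index around a small ring, recovery walks backwards
 * from the newest slot.  NUM_BACKUP_ROOTS = 4 is assumed for illustration.
 */
#include <stdio.h>

#define NUM_BACKUP_ROOTS 4

/* slot the next backup will be written to */
static int next_slot(int index)
{
	return (index + 1) % NUM_BACKUP_ROOTS;
}

/* previous (older) slot, as used when retrying backups during recovery */
static int prev_slot(int index)
{
	return (index + NUM_BACKUP_ROOTS - 1) % NUM_BACKUP_ROOTS;
}

int main(void)
{
	int idx = 0;
	int i;

	/* five commits: the fifth wraps around and overwrites slot 0 */
	for (i = 0; i < 5; i++) {
		printf("commit %d -> backup slot %d\n", i, idx);
		idx = next_slot(idx);
	}

	/* recovery starts at the newest backup and then tries older ones */
	idx = prev_slot(idx);
	for (i = 0; i < NUM_BACKUP_ROOTS; i++) {
		printf("recovery attempt %d -> slot %d\n", i, idx);
		idx = prev_slot(idx);
	}
	return 0;
}

In the patch itself this walk only runs when btrfs_test_opt(tree_root, RECOVERY) is set, and the log root is zeroed and CLEAR_CACHE forced before each retry, since neither can be trusted against an older backup root.
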
diff --combined fs/btrfs/extent-tree.c
index 23e936c3de76aaed14cb364c8dae89049ae3ea63,119f842c1d4f3331da21ab035928f847e87ff25f..18ea90c8943b77faebf68dfa3499193a6a02ab25
@@@ -23,7 -23,6 +23,7 @@@
  #include <linux/rcupdate.h>
  #include <linux/kthread.h>
  #include <linux/slab.h>
 +#include <linux/ratelimit.h>
  #include "compat.h"
  #include "hash.h"
  #include "ctree.h"
@@@ -53,21 -52,6 +53,21 @@@ enum 
        CHUNK_ALLOC_LIMITED = 2,
  };
  
 +/*
 + * Control how reservations are dealt with.
 + *
 + * RESERVE_FREE - freeing a reservation.
 + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 + *   ENOSPC accounting
 + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 + *   bytes_may_use as the ENOSPC accounting is done elsewhere
 + */
 +enum {
 +      RESERVE_FREE = 0,
 +      RESERVE_ALLOC = 1,
 +      RESERVE_ALLOC_NO_ACCOUNT = 2,
 +};
 +
  static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc);
@@@ -97,8 -81,6 +97,8 @@@ static int find_next_key(struct btrfs_p
                         struct btrfs_key *key);
  static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
 +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 +                                     u64 num_bytes, int reserve);
  
  static noinline int
  block_group_cache_done(struct btrfs_block_group_cache *cache)
@@@ -122,6 -104,7 +122,6 @@@ void btrfs_put_block_group(struct btrfs
        if (atomic_dec_and_test(&cache->count)) {
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
 -              WARN_ON(cache->reserved_pinned > 0);
                kfree(cache->free_space_ctl);
                kfree(cache);
        }
@@@ -482,8 -465,7 +482,8 @@@ static int cache_block_group(struct btr
         * we likely hold important locks.
         */
        if (trans && (!trans->transaction->in_commit) &&
 -          (root && root != root->fs_info->tree_root)) {
 +          (root && root != root->fs_info->tree_root) &&
 +          btrfs_test_opt(root, SPACE_CACHE)) {
                spin_lock(&cache->lock);
                if (cache->cached != BTRFS_CACHE_NO) {
                        spin_unlock(&cache->lock);
@@@ -1788,18 -1770,18 +1788,18 @@@ static int btrfs_discard_extent(struct 
  {
        int ret;
        u64 discarded_bytes = 0;
-       struct btrfs_multi_bio *multi = NULL;
+       struct btrfs_bio *bbio = NULL;
  
  
        /* Tell the block device(s) that the sectors can be discarded */
        ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
-                             bytenr, &num_bytes, &multi, 0);
+                             bytenr, &num_bytes, &bbio, 0);
        if (!ret) {
-               struct btrfs_bio_stripe *stripe = multi->stripes;
+               struct btrfs_bio_stripe *stripe = bbio->stripes;
                int i;
  
  
-               for (i = 0; i < multi->num_stripes; i++, stripe++) {
+               for (i = 0; i < bbio->num_stripes; i++, stripe++) {
                        if (!stripe->dev->can_discard)
                                continue;
  
                         */
                        ret = 0;
                }
-               kfree(multi);
+               kfree(bbio);
        }
  
        if (actual_bytes)
@@@ -2718,13 -2700,6 +2718,13 @@@ again
                goto again;
        }
  
 +      /* We've already setup this transaction, go ahead and exit */
 +      if (block_group->cache_generation == trans->transid &&
 +          i_size_read(inode)) {
 +              dcs = BTRFS_DC_SETUP;
 +              goto out_put;
 +      }
 +
        /*
         * We want to set the generation to 0, that way if anything goes wrong
         * from here on out we know not to trust this cache when we load up next
        if (!ret)
                dcs = BTRFS_DC_SETUP;
        btrfs_free_reserved_data_space(inode, num_pages);
 +
  out_put:
        iput(inode);
  out_free:
        btrfs_release_path(path);
  out:
        spin_lock(&block_group->lock);
 +      if (!ret)
 +              block_group->cache_generation = trans->transid;
        block_group->disk_cache_state = dcs;
        spin_unlock(&block_group->lock);
  
@@@ -3150,13 -3122,16 +3150,13 @@@ commit_trans
                return -ENOSPC;
        }
        data_sinfo->bytes_may_use += bytes;
 -      BTRFS_I(inode)->reserved_bytes += bytes;
        spin_unlock(&data_sinfo->lock);
  
        return 0;
  }
  
  /*
 - * called when we are clearing an delalloc extent from the
 - * inode's io_tree or there was an error for whatever reason
 - * after calling btrfs_check_data_free_space
 + * Called if we need to clear a data reservation for this inode.
   */
  void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
  {
        data_sinfo = BTRFS_I(inode)->space_info;
        spin_lock(&data_sinfo->lock);
        data_sinfo->bytes_may_use -= bytes;
 -      BTRFS_I(inode)->reserved_bytes -= bytes;
        spin_unlock(&data_sinfo->lock);
  }
  
@@@ -3189,7 -3165,6 +3189,7 @@@ static int should_alloc_chunk(struct bt
                              struct btrfs_space_info *sinfo, u64 alloc_bytes,
                              int force)
  {
 +      struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
        u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
        u64 thresh;
        if (force == CHUNK_ALLOC_FORCE)
                return 1;
  
 +      /*
 +       * We need to take into account the global rsv because for all intents
 +       * and purposes it's used space.  Don't worry about locking the
 +       * global_rsv, it doesn't change except when the transaction commits.
 +       */
 +      num_allocated += global_rsv->size;
 +
        /*
         * in limited mode, we want to have some free space up to
         * about 1% of the FS size.
         */
        if (force == CHUNK_ALLOC_LIMITED) {
 -              thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
 +              thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
                thresh = max_t(u64, 64 * 1024 * 1024,
                               div_factor_fine(thresh, 1));
  
        if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
                return 0;
  
 -      thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
 +      thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
  
        /* 256MB or 5% of the FS */
        thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
  /*
   * shrink metadata reservation for delalloc
   */
 -static int shrink_delalloc(struct btrfs_trans_handle *trans,
 -                         struct btrfs_root *root, u64 to_reclaim, int sync)
 +static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
 +                         bool wait_ordered)
  {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_space_info *space_info;
 +      struct btrfs_trans_handle *trans;
        u64 reserved;
        u64 max_reclaim;
        u64 reclaimed = 0;
        long time_left;
 -      int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 +      unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
        unsigned long progress;
  
 +      trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
        space_info = block_rsv->space_info;
  
        smp_mb();
 -      reserved = space_info->bytes_reserved;
 +      reserved = space_info->bytes_may_use;
        progress = space_info->reservation_progress;
  
        if (reserved == 0)
        }
  
        max_reclaim = min(reserved, to_reclaim);
 -
 +      nr_pages = max_t(unsigned long, nr_pages,
 +                       max_reclaim >> PAGE_CACHE_SHIFT);
        while (loops < 1024) {
                /* have the flusher threads jump in and do some IO */
                smp_mb();
                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
  
                spin_lock(&space_info->lock);
 -              if (reserved > space_info->bytes_reserved)
 -                      reclaimed += reserved - space_info->bytes_reserved;
 -              reserved = space_info->bytes_reserved;
 +              if (reserved > space_info->bytes_may_use)
 +                      reclaimed += reserved - space_info->bytes_may_use;
 +              reserved = space_info->bytes_may_use;
                spin_unlock(&space_info->lock);
  
                loops++;
                if (trans && trans->transaction->blocked)
                        return -EAGAIN;
  
 -              time_left = schedule_timeout_interruptible(1);
 +              if (wait_ordered && !trans) {
 +                      btrfs_wait_ordered_extents(root, 0, 0);
 +              } else {
 +                      time_left = schedule_timeout_interruptible(1);
  
 -              /* We were interrupted, exit */
 -              if (time_left)
 -                      break;
 +                      /* We were interrupted, exit */
 +                      if (time_left)
 +                              break;
 +              }
  
                /* we've kicked the IO a few times, if anything has been freed,
                 * exit.  There is no sense in looping here for a long time
                }
  
        }
 -      if (reclaimed >= to_reclaim && !trans)
 -              btrfs_wait_ordered_extents(root, 0, 0);
 +
        return reclaimed >= to_reclaim;
  }
  
 -/*
 - * Retries tells us how many times we've called reserve_metadata_bytes.  The
 - * idea is if this is the first call (retries == 0) then we will add to our
 - * reserved count if we can't make the allocation in order to hold our place
 - * while we go and try and free up space.  That way for retries > 1 we don't try
 - * and add space, we just check to see if the amount of unused space is >= the
 - * total space, meaning that our reservation is valid.
 +/**
 + * may_commit_transaction - possibly commit the transaction if it's ok to
 + * @root - the root we're allocating for
 + * @bytes - the number of bytes we want to reserve
 + * @force - force the commit
   *
 - * However if we don't intend to retry this reservation, pass -1 as retries so
 - * that it short circuits this logic.
 + * This will check to make sure that committing the transaction will actually
 + * get us somewhere and then commit the transaction if it does.  Otherwise it
 + * will return -ENOSPC.
   */
 -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
 -                                struct btrfs_root *root,
 +static int may_commit_transaction(struct btrfs_root *root,
 +                                struct btrfs_space_info *space_info,
 +                                u64 bytes, int force)
 +{
 +      struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
 +      struct btrfs_trans_handle *trans;
 +
 +      trans = (struct btrfs_trans_handle *)current->journal_info;
 +      if (trans)
 +              return -EAGAIN;
 +
 +      if (force)
 +              goto commit;
 +
 +      /* See if there is enough pinned space to make this reservation */
 +      spin_lock(&space_info->lock);
 +      if (space_info->bytes_pinned >= bytes) {
 +              spin_unlock(&space_info->lock);
 +              goto commit;
 +      }
 +      spin_unlock(&space_info->lock);
 +
 +      /*
 +       * See if there is some space in the delayed insertion reservation for
 +       * this reservation.
 +       */
 +      if (space_info != delayed_rsv->space_info)
 +              return -ENOSPC;
 +
 +      spin_lock(&delayed_rsv->lock);
 +      if (delayed_rsv->size < bytes) {
 +              spin_unlock(&delayed_rsv->lock);
 +              return -ENOSPC;
 +      }
 +      spin_unlock(&delayed_rsv->lock);
 +
 +commit:
 +      trans = btrfs_join_transaction(root);
 +      if (IS_ERR(trans))
 +              return -ENOSPC;
 +
 +      return btrfs_commit_transaction(trans, root);
 +}
 +
 +/**
 + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 + * @root - the root we're allocating for
 + * @block_rsv - the block_rsv we're allocating for
 + * @orig_bytes - the number of bytes we want
 + * @flush - whether or not we can flush to make our reservation
 + *
 + * This will reserve orig_bytes number of bytes from the space info associated
 + * with the block_rsv.  If there is not enough space it will make an attempt to
 + * flush out space to make room.  It will do this by flushing delalloc if
 + * possible or committing the transaction.  If flush is 0 then no attempts to
 + * regain reservations will be made and this will fail if there is not enough
 + * space already.
 + */
 +static int reserve_metadata_bytes(struct btrfs_root *root,
                                  struct btrfs_block_rsv *block_rsv,
                                  u64 orig_bytes, int flush)
  {
        struct btrfs_space_info *space_info = block_rsv->space_info;
 -      u64 unused;
 +      u64 used;
        u64 num_bytes = orig_bytes;
        int retries = 0;
        int ret = 0;
        bool committed = false;
        bool flushing = false;
 +      bool wait_ordered = false;
  
  again:
        ret = 0;
                 * deadlock since we are waiting for the flusher to finish, but
                 * hold the current transaction open.
                 */
 -              if (trans)
 +              if (current->journal_info)
                        return -EAGAIN;
                ret = wait_event_interruptible(space_info->wait,
                                               !space_info->flush);
        }
  
        ret = -ENOSPC;
 -      unused = space_info->bytes_used + space_info->bytes_reserved +
 -               space_info->bytes_pinned + space_info->bytes_readonly +
 -               space_info->bytes_may_use;
 +      used = space_info->bytes_used + space_info->bytes_reserved +
 +              space_info->bytes_pinned + space_info->bytes_readonly +
 +              space_info->bytes_may_use;
  
        /*
         * The idea here is that we've not already over-reserved the block group
         * lets start flushing stuff first and then come back and try to make
         * our reservation.
         */
 -      if (unused <= space_info->total_bytes) {
 -              unused = space_info->total_bytes - unused;
 -              if (unused >= num_bytes) {
 -                      space_info->bytes_reserved += orig_bytes;
 +      if (used <= space_info->total_bytes) {
 +              if (used + orig_bytes <= space_info->total_bytes) {
 +                      space_info->bytes_may_use += orig_bytes;
                        ret = 0;
                } else {
                        /*
                 * amount plus the amount of bytes that we need for this
                 * reservation.
                 */
 -              num_bytes = unused - space_info->total_bytes +
 +              wait_ordered = true;
 +              num_bytes = used - space_info->total_bytes +
                        (orig_bytes * (retries + 1));
        }
  
 +      if (ret) {
 +              u64 profile = btrfs_get_alloc_profile(root, 0);
 +              u64 avail;
 +
 +              /*
 +               * If we have a lot of space that's pinned, don't bother doing
 +               * the overcommit dance yet and just commit the transaction.
 +               */
 +              avail = (space_info->total_bytes - space_info->bytes_used) * 8;
 +              do_div(avail, 10);
 +              if (space_info->bytes_pinned >= avail && flush && !committed) {
 +                      space_info->flush = 1;
 +                      flushing = true;
 +                      spin_unlock(&space_info->lock);
 +                      ret = may_commit_transaction(root, space_info,
 +                                                   orig_bytes, 1);
 +                      if (ret)
 +                              goto out;
 +                      committed = true;
 +                      goto again;
 +              }
 +
 +              spin_lock(&root->fs_info->free_chunk_lock);
 +              avail = root->fs_info->free_chunk_space;
 +
 +              /*
 +               * If we have dup, raid1 or raid10 then only half of the free
 +               * space is actually useable.
 +               */
 +              if (profile & (BTRFS_BLOCK_GROUP_DUP |
 +                             BTRFS_BLOCK_GROUP_RAID1 |
 +                             BTRFS_BLOCK_GROUP_RAID10))
 +                      avail >>= 1;
 +
 +              /*
 +               * If we aren't flushing don't let us overcommit too much, say
 +               * 1/8th of the space.  If we can flush, let it overcommit up to
 +               * 1/2 of the space.
 +               */
 +              if (flush)
 +                      avail >>= 3;
 +              else
 +                      avail >>= 1;
 +               spin_unlock(&root->fs_info->free_chunk_lock);
 +
 +              if (used + num_bytes < space_info->total_bytes + avail) {
 +                      space_info->bytes_may_use += orig_bytes;
 +                      ret = 0;
 +              } else {
 +                      wait_ordered = true;
 +              }
 +      }
 +
        /*
         * Couldn't make our reservation, save our place so while we're trying
         * to reclaim space we can actually use it instead of somebody else
         * We do synchronous shrinking since we don't actually unreserve
         * metadata until after the IO is completed.
         */
 -      ret = shrink_delalloc(trans, root, num_bytes, 1);
 +      ret = shrink_delalloc(root, num_bytes, wait_ordered);
        if (ret < 0)
                goto out;
  
         * so go back around and try again.
         */
        if (retries < 2) {
 +              wait_ordered = true;
                retries++;
                goto again;
        }
  
 -      /*
 -       * Not enough space to be reclaimed, don't bother committing the
 -       * transaction.
 -       */
 -      spin_lock(&space_info->lock);
 -      if (space_info->bytes_pinned < orig_bytes)
 -              ret = -ENOSPC;
 -      spin_unlock(&space_info->lock);
 -      if (ret)
 -              goto out;
 -
 -      ret = -EAGAIN;
 -      if (trans)
 -              goto out;
 -
        ret = -ENOSPC;
        if (committed)
                goto out;
  
 -      trans = btrfs_join_transaction(root);
 -      if (IS_ERR(trans))
 -              goto out;
 -      ret = btrfs_commit_transaction(trans, root);
 +      ret = may_commit_transaction(root, space_info, orig_bytes, 0);
        if (!ret) {
 -              trans = NULL;
                committed = true;
                goto again;
        }
  static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root)
  {
 -      struct btrfs_block_rsv *block_rsv;
 -      if (root->ref_cows)
 +      struct btrfs_block_rsv *block_rsv = NULL;
 +
 +      if (root->ref_cows || root == root->fs_info->csum_root)
                block_rsv = trans->block_rsv;
 -      else
 +
 +      if (!block_rsv)
                block_rsv = root->block_rsv;
  
        if (!block_rsv)
@@@ -3748,7 -3616,7 +3748,7 @@@ static void block_rsv_release_bytes(str
                }
                if (num_bytes) {
                        spin_lock(&space_info->lock);
 -                      space_info->bytes_reserved -= num_bytes;
 +                      space_info->bytes_may_use -= num_bytes;
                        space_info->reservation_progress++;
                        spin_unlock(&space_info->lock);
                }
@@@ -3772,6 -3640,9 +3772,6 @@@ void btrfs_init_block_rsv(struct btrfs_
  {
        memset(rsv, 0, sizeof(*rsv));
        spin_lock_init(&rsv->lock);
 -      atomic_set(&rsv->usage, 1);
 -      rsv->priority = 6;
 -      INIT_LIST_HEAD(&rsv->list);
  }
  
  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
  void btrfs_free_block_rsv(struct btrfs_root *root,
                          struct btrfs_block_rsv *rsv)
  {
 -      if (rsv && atomic_dec_and_test(&rsv->usage)) {
 -              btrfs_block_rsv_release(root, rsv, (u64)-1);
 -              if (!rsv->durable)
 -                      kfree(rsv);
 -      }
 +      btrfs_block_rsv_release(root, rsv, (u64)-1);
 +      kfree(rsv);
  }
  
 -/*
 - * make the block_rsv struct be able to capture freed space.
 - * the captured space will re-add to the the block_rsv struct
 - * after transaction commit
 - */
 -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
 -                               struct btrfs_block_rsv *block_rsv)
 +int btrfs_block_rsv_add(struct btrfs_root *root,
 +                      struct btrfs_block_rsv *block_rsv,
 +                      u64 num_bytes)
  {
 -      block_rsv->durable = 1;
 -      mutex_lock(&fs_info->durable_block_rsv_mutex);
 -      list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
 -      mutex_unlock(&fs_info->durable_block_rsv_mutex);
 +      int ret;
 +
 +      if (num_bytes == 0)
 +              return 0;
 +
 +      ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
 +      if (!ret) {
 +              block_rsv_add_bytes(block_rsv, num_bytes, 1);
 +              return 0;
 +      }
 +
 +      return ret;
  }
  
 -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 -                      struct btrfs_root *root,
 -                      struct btrfs_block_rsv *block_rsv,
 -                      u64 num_bytes)
 +int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
 +                              struct btrfs_block_rsv *block_rsv,
 +                              u64 num_bytes)
  {
        int ret;
  
        if (num_bytes == 0)
                return 0;
  
 -      ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
 +      ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 1);
                return 0;
        return ret;
  }
  
 -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 -                        struct btrfs_root *root,
 -                        struct btrfs_block_rsv *block_rsv,
 -                        u64 min_reserved, int min_factor)
 +int btrfs_block_rsv_check(struct btrfs_root *root,
 +                        struct btrfs_block_rsv *block_rsv, int min_factor)
  {
        u64 num_bytes = 0;
 -      int commit_trans = 0;
        int ret = -ENOSPC;
  
        if (!block_rsv)
                return 0;
  
        spin_lock(&block_rsv->lock);
 -      if (min_factor > 0)
 -              num_bytes = div_factor(block_rsv->size, min_factor);
 -      if (min_reserved > num_bytes)
 -              num_bytes = min_reserved;
 +      num_bytes = div_factor(block_rsv->size, min_factor);
 +      if (block_rsv->reserved >= num_bytes)
 +              ret = 0;
 +      spin_unlock(&block_rsv->lock);
  
 -      if (block_rsv->reserved >= num_bytes) {
 +      return ret;
 +}
 +
 +int btrfs_block_rsv_refill(struct btrfs_root *root,
 +                        struct btrfs_block_rsv *block_rsv,
 +                        u64 min_reserved)
 +{
 +      u64 num_bytes = 0;
 +      int ret = -ENOSPC;
 +
 +      if (!block_rsv)
 +              return 0;
 +
 +      spin_lock(&block_rsv->lock);
 +      num_bytes = min_reserved;
 +      if (block_rsv->reserved >= num_bytes)
                ret = 0;
 -      } else {
 +      else
                num_bytes -= block_rsv->reserved;
 -              if (block_rsv->durable &&
 -                  block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
 -                      commit_trans = 1;
 -      }
        spin_unlock(&block_rsv->lock);
 +
        if (!ret)
                return 0;
  
 -      if (block_rsv->refill_used) {
 -              ret = reserve_metadata_bytes(trans, root, block_rsv,
 -                                           num_bytes, 0);
 -              if (!ret) {
 -                      block_rsv_add_bytes(block_rsv, num_bytes, 0);
 -                      return 0;
 -              }
 -      }
 -
 -      if (commit_trans) {
 -              if (trans)
 -                      return -EAGAIN;
 -              trans = btrfs_join_transaction(root);
 -              BUG_ON(IS_ERR(trans));
 -              ret = btrfs_commit_transaction(trans, root);
 +      ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
 +      if (!ret) {
 +              block_rsv_add_bytes(block_rsv, num_bytes, 0);
                return 0;
        }
  
 -      return -ENOSPC;
 +      return ret;
  }
  
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@@ -3909,7 -3783,7 +3909,7 @@@ static u64 calc_global_metadata_size(st
        u64 num_bytes;
        u64 meta_used;
        u64 data_used;
 -      int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
 +      int csum_size = btrfs_super_csum_size(fs_info->super_copy);
  
        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
        spin_lock(&sinfo->lock);
@@@ -3953,12 -3827,12 +3953,12 @@@ static void update_global_block_rsv(str
        if (sinfo->total_bytes > num_bytes) {
                num_bytes = sinfo->total_bytes - num_bytes;
                block_rsv->reserved += num_bytes;
 -              sinfo->bytes_reserved += num_bytes;
 +              sinfo->bytes_may_use += num_bytes;
        }
  
        if (block_rsv->reserved >= block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
 -              sinfo->bytes_reserved -= num_bytes;
 +              sinfo->bytes_may_use -= num_bytes;
                sinfo->reservation_progress++;
                block_rsv->reserved = block_rsv->size;
                block_rsv->full = 1;
@@@ -3974,13 -3848,16 +3974,13 @@@ static void init_global_block_rsv(struc
  
        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        fs_info->chunk_block_rsv.space_info = space_info;
 -      fs_info->chunk_block_rsv.priority = 10;
  
        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        fs_info->global_block_rsv.space_info = space_info;
 -      fs_info->global_block_rsv.priority = 10;
 -      fs_info->global_block_rsv.refill_used = 1;
        fs_info->delalloc_block_rsv.space_info = space_info;
        fs_info->trans_block_rsv.space_info = space_info;
        fs_info->empty_block_rsv.space_info = space_info;
 -      fs_info->empty_block_rsv.priority = 10;
 +      fs_info->delayed_block_rsv.space_info = space_info;
  
        fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
        fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
  
 -      btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
 -
 -      btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
 -
        update_global_block_rsv(fs_info);
  }
  
@@@ -4000,8 -3881,37 +4000,8 @@@ static void release_global_block_rsv(st
        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
        WARN_ON(fs_info->chunk_block_rsv.size > 0);
        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 -}
 -
 -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
 -                                  struct btrfs_root *root,
 -                                  struct btrfs_block_rsv *rsv)
 -{
 -      struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
 -      u64 num_bytes;
 -      int ret;
 -
 -      /*
 -       * Truncate should be freeing data, but give us 2 items just in case it
 -       * needs to use some space.  We may want to be smarter about this in the
 -       * future.
 -       */
 -      num_bytes = btrfs_calc_trans_metadata_size(root, 2);
 -
 -      /* We already have enough bytes, just return */
 -      if (rsv->reserved >= num_bytes)
 -              return 0;
 -
 -      num_bytes -= rsv->reserved;
 -
 -      /*
 -       * You should have reserved enough space before hand to do this, so this
 -       * should not fail.
 -       */
 -      ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
 -      BUG_ON(ret);
 -
 -      return 0;
 +      WARN_ON(fs_info->delayed_block_rsv.size > 0);
 +      WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
  }
  
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
        if (!trans->bytes_reserved)
                return;
  
 -      BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
 -      btrfs_block_rsv_release(root, trans->block_rsv,
 -                              trans->bytes_reserved);
 +      btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
        trans->bytes_reserved = 0;
  }
  
@@@ -4052,19 -3964,11 +4052,19 @@@ int btrfs_snap_reserve_metadata(struct 
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
  }
  
 +/**
 + * drop_outstanding_extent - drop an outstanding extent
 + * @inode: the inode we're dropping the extent for
 + *
 + * This is called when we are freeing up an outstanding extent, either after
 + * an error or after an extent is written.  This will return the number of
 + * reserved extents that need to be freed.  This must be called with
 + * BTRFS_I(inode)->lock held.
 + */
  static unsigned drop_outstanding_extent(struct inode *inode)
  {
        unsigned dropped_extents = 0;
  
 -      spin_lock(&BTRFS_I(inode)->lock);
        BUG_ON(!BTRFS_I(inode)->outstanding_extents);
        BTRFS_I(inode)->outstanding_extents--;
  
         */
        if (BTRFS_I(inode)->outstanding_extents >=
            BTRFS_I(inode)->reserved_extents)
 -              goto out;
 +              return 0;
  
        dropped_extents = BTRFS_I(inode)->reserved_extents -
                BTRFS_I(inode)->outstanding_extents;
        BTRFS_I(inode)->reserved_extents -= dropped_extents;
 -out:
 -      spin_unlock(&BTRFS_I(inode)->lock);
        return dropped_extents;
  }
  
 -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
 +/**
 + * calc_csum_metadata_size - return the amount of metadata space that must be
 + *    reserved/freed for the given bytes.
 + * @inode: the inode we're manipulating
 + * @num_bytes: the number of bytes in question
 + * @reserve: 1 if we are reserving space, 0 if we are freeing space
 + *
 + * This adjusts the number of csum_bytes in the inode and then returns the
 + * correct amount of metadata that must either be reserved or freed.  We
 + * calculate how many checksums we can fit into one leaf and then divide the
 + * number of bytes that will need to be checksummed by this value to figure out
 + * how many checksums will be required.  If we are adding bytes then the number
 + * may go up and we will return the number of additional bytes that must be
 + * reserved.  If it is going down we will return the number of bytes that must
 + * be freed.
 + *
 + * This must be called with BTRFS_I(inode)->lock held.
 + */
 +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 +                                 int reserve)
  {
 -      return num_bytes >>= 3;
 +      struct btrfs_root *root = BTRFS_I(inode)->root;
 +      u64 csum_size;
 +      int num_csums_per_leaf;
 +      int num_csums;
 +      int old_csums;
 +
 +      if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
 +          BTRFS_I(inode)->csum_bytes == 0)
 +              return 0;
 +
 +      old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
 +      if (reserve)
 +              BTRFS_I(inode)->csum_bytes += num_bytes;
 +      else
 +              BTRFS_I(inode)->csum_bytes -= num_bytes;
 +      csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
 +      num_csums_per_leaf = (int)div64_u64(csum_size,
 +                                          sizeof(struct btrfs_csum_item) +
 +                                          sizeof(struct btrfs_disk_key));
 +      num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
 +      num_csums = num_csums + num_csums_per_leaf - 1;
 +      num_csums = num_csums / num_csums_per_leaf;
 +
 +      old_csums = old_csums + num_csums_per_leaf - 1;
 +      old_csums = old_csums / num_csums_per_leaf;
 +
 +      /* No change, no need to reserve more */
 +      if (old_csums == num_csums)
 +              return 0;
 +
 +      if (reserve)
 +              return btrfs_calc_trans_metadata_size(root,
 +                                                    num_csums - old_csums);
 +
 +      return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
  }
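
The kernel-doc above describes the rounding calc_csum_metadata_size() performs: work out how many checksum items fit in one leaf, round the checksum count up to whole leaves before and after the change, and charge or refund only the difference. A minimal userspace sketch of that arithmetic, using hypothetical stand-in sizes (the real values come from the superblock and the on-disk item structures), could look like:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-ins for the sizes the kernel computes at runtime. */
#define LEAF_DATA_SIZE  3995ULL   /* BTRFS_LEAF_DATA_SIZE(root) stand-in */
#define ITEM_SIZE         25ULL   /* sizeof(struct btrfs_item) stand-in */
#define CSUM_ITEM_SIZE    18ULL   /* csum item + disk key stand-in */
#define SECTORSIZE      4096ULL

/* Leaves needed to hold checksums covering csum_bytes bytes of data. */
static uint64_t csum_leaves(uint64_t csum_bytes)
{
        uint64_t per_leaf = (LEAF_DATA_SIZE - ITEM_SIZE) / CSUM_ITEM_SIZE;
        uint64_t num_csums = csum_bytes / SECTORSIZE;

        return (num_csums + per_leaf - 1) / per_leaf;   /* round up */
}

int main(void)
{
        uint64_t old_bytes = 8ULL << 20;                /* csum_bytes before */
        uint64_t new_bytes = old_bytes + (8ULL << 20);  /* after reserving */

        /* The kernel multiplies this leaf delta by the per-item trans cost. */
        printf("extra csum leaves to reserve: %llu\n",
               (unsigned long long)(csum_leaves(new_bytes) -
                                    csum_leaves(old_bytes)));
        return 0;
}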
  
  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
        u64 to_reserve = 0;
        unsigned nr_extents = 0;
 +      int flush = 1;
        int ret;
  
 -      if (btrfs_transaction_in_commit(root->fs_info))
 +      if (btrfs_is_free_space_inode(root, inode))
 +              flush = 0;
 +
 +      if (flush && btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
  
        num_bytes = ALIGN(num_bytes, root->sectorsize);
  
                to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
        }
 +      to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
        spin_unlock(&BTRFS_I(inode)->lock);
  
 -      to_reserve += calc_csum_metadata_size(inode, num_bytes);
 -      ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
 +      ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (ret) {
 +              u64 to_free = 0;
                unsigned dropped;
 +
 +              spin_lock(&BTRFS_I(inode)->lock);
 +              dropped = drop_outstanding_extent(inode);
 +              to_free = calc_csum_metadata_size(inode, num_bytes, 0);
 +              spin_unlock(&BTRFS_I(inode)->lock);
 +              to_free += btrfs_calc_trans_metadata_size(root, dropped);
 +
                /*
 -               * We don't need the return value since our reservation failed,
 -               * we just need to clean up our counter.
 +               * Somebody could have come in and twiddled with the
 +               * reservation, so if we have to free more than we would have
 +               * reserved from this reservation go ahead and release those
 +               * bytes.
                 */
 -              dropped = drop_outstanding_extent(inode);
 -              WARN_ON(dropped > 1);
 +              to_free -= to_reserve;
 +              if (to_free)
 +                      btrfs_block_rsv_release(root, block_rsv, to_free);
                return ret;
        }
  
        return 0;
  }
  
 +/**
 + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 + * @inode: the inode to release the reservation for
 + * @num_bytes: the number of bytes we're releasing
 + *
 + * This will release the metadata reservation for an inode.  This can be called
 + * once we complete IO for a given set of bytes to release their metadata
 + * reservations.
 + */
  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        unsigned dropped;
  
        num_bytes = ALIGN(num_bytes, root->sectorsize);
 +      spin_lock(&BTRFS_I(inode)->lock);
        dropped = drop_outstanding_extent(inode);
  
 -      to_free = calc_csum_metadata_size(inode, num_bytes);
 +      to_free = calc_csum_metadata_size(inode, num_bytes, 0);
 +      spin_unlock(&BTRFS_I(inode)->lock);
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
  
                                to_free);
  }
  
 +/**
 + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
 + * @inode: inode we're writing to
 + * @num_bytes: the number of bytes we want to allocate
 + *
 + * This will do the following things
 + *
 + * o reserve space in the data space info for num_bytes
 + * o reserve space in the metadata space info based on number of outstanding
 + *   extents and how much csums will be needed
 + * o add to the inode's ->delalloc_bytes
 + * o add it to the fs_info's delalloc inodes list.
 + *
 + * This will return 0 for success and -ENOSPC if there is no space left.
 + */
  int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
  {
        int ret;
        return 0;
  }
  
 +/**
 + * btrfs_delalloc_release_space - release data and metadata space for delalloc
 + * @inode: inode we're releasing space for
 + * @num_bytes: the number of bytes we want to free up
 + *
 + * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
 + * called in the case that we don't need the metadata AND data reservations
 + * anymore, for example if there is an error or we insert an inline extent.
 + *
 + * This function will release the metadata space that was not used and will
 + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
 + * list if there are no delalloc bytes left.
 + */
  void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
  {
        btrfs_delalloc_release_metadata(inode, num_bytes);
@@@ -4291,12 -4090,12 +4291,12 @@@ static int update_block_group(struct bt
  
        /* block accounting for super block */
        spin_lock(&info->delalloc_lock);
 -      old_val = btrfs_super_bytes_used(&info->super_copy);
 +      old_val = btrfs_super_bytes_used(info->super_copy);
        if (alloc)
                old_val += num_bytes;
        else
                old_val -= num_bytes;
 -      btrfs_set_super_bytes_used(&info->super_copy, old_val);
 +      btrfs_set_super_bytes_used(info->super_copy, old_val);
        spin_unlock(&info->delalloc_lock);
  
        while (total) {
                spin_lock(&cache->space_info->lock);
                spin_lock(&cache->lock);
  
 -              if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
 +              if (btrfs_test_opt(root, SPACE_CACHE) &&
                    cache->disk_cache_state < BTRFS_DC_CLEAR)
                        cache->disk_cache_state = BTRFS_DC_CLEAR;
  
                        btrfs_set_block_group_used(&cache->item, old_val);
                        cache->reserved -= num_bytes;
                        cache->space_info->bytes_reserved -= num_bytes;
 -                      cache->space_info->reservation_progress++;
                        cache->space_info->bytes_used += num_bytes;
                        cache->space_info->disk_used += num_bytes * factor;
                        spin_unlock(&cache->lock);
@@@ -4387,6 -4187,7 +4387,6 @@@ static int pin_down_extent(struct btrfs
        if (reserved) {
                cache->reserved -= num_bytes;
                cache->space_info->bytes_reserved -= num_bytes;
 -              cache->space_info->reservation_progress++;
        }
        spin_unlock(&cache->lock);
        spin_unlock(&cache->space_info->lock);
@@@ -4414,82 -4215,45 +4414,82 @@@ int btrfs_pin_extent(struct btrfs_root 
  }
  
  /*
 - * update size of reserved extents. this function may return -EAGAIN
 - * if 'reserve' is true or 'sinfo' is false.
 + * this function must be called within transaction
 + */
 +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
 +                                  struct btrfs_root *root,
 +                                  u64 bytenr, u64 num_bytes)
 +{
 +      struct btrfs_block_group_cache *cache;
 +
 +      cache = btrfs_lookup_block_group(root->fs_info, bytenr);
 +      BUG_ON(!cache);
 +
 +      /*
 +       * pull in the free space cache (if any) so that our pin
 +       * removes the free space from the cache.  We have load_only set
 +       * to one because the slow code to read in the free extents does check
 +       * the pinned extents.
 +       */
 +      cache_block_group(cache, trans, root, 1);
 +
 +      pin_down_extent(root, cache, bytenr, num_bytes, 0);
 +
 +      /* remove us from the free space cache (if we're there at all) */
 +      btrfs_remove_free_space(cache, bytenr, num_bytes);
 +      btrfs_put_block_group(cache);
 +      return 0;
 +}
 +
 +/**
 + * btrfs_update_reserved_bytes - update the block_group and space info counters
 + * @cache:    The cache we are manipulating
 + * @num_bytes:        The number of bytes in question
 + * @reserve:  One of the reservation enums
 + *
 + * This is called by the allocator when it reserves space, or by somebody who is
 + * freeing space that was never actually used on disk.  For example if you
 + * reserve some space for a new leaf in transaction A and before transaction A
 + * commits you free that leaf, you call this with reserve set to 0 in order to
 + * clear the reservation.
 + *
 + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
 + * ENOSPC accounting.  For data we handle the reservation through clearing the
 + * delalloc bits in the io_tree.  We have to do this since we could end up
 + * allocating less disk space for the amount of data we have reserved in the
 + * case of compression.
 + *
 + * If this is a reservation and the block group has become read only we cannot
 + * make the reservation and return -EAGAIN, otherwise this function always
 + * succeeds.
   */
 -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 -                              u64 num_bytes, int reserve, int sinfo)
 +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 +                                     u64 num_bytes, int reserve)
  {
 +      struct btrfs_space_info *space_info = cache->space_info;
        int ret = 0;
 -      if (sinfo) {
 -              struct btrfs_space_info *space_info = cache->space_info;
 -              spin_lock(&space_info->lock);
 -              spin_lock(&cache->lock);
 -              if (reserve) {
 -                      if (cache->ro) {
 -                              ret = -EAGAIN;
 -                      } else {
 -                              cache->reserved += num_bytes;
 -                              space_info->bytes_reserved += num_bytes;
 -                      }
 -              } else {
 -                      if (cache->ro)
 -                              space_info->bytes_readonly += num_bytes;
 -                      cache->reserved -= num_bytes;
 -                      space_info->bytes_reserved -= num_bytes;
 -                      space_info->reservation_progress++;
 -              }
 -              spin_unlock(&cache->lock);
 -              spin_unlock(&space_info->lock);
 -      } else {
 -              spin_lock(&cache->lock);
 +      spin_lock(&space_info->lock);
 +      spin_lock(&cache->lock);
 +      if (reserve != RESERVE_FREE) {
                if (cache->ro) {
                        ret = -EAGAIN;
                } else {
 -                      if (reserve)
 -                              cache->reserved += num_bytes;
 -                      else
 -                              cache->reserved -= num_bytes;
 +                      cache->reserved += num_bytes;
 +                      space_info->bytes_reserved += num_bytes;
 +                      if (reserve == RESERVE_ALLOC) {
 +                              BUG_ON(space_info->bytes_may_use < num_bytes);
 +                              space_info->bytes_may_use -= num_bytes;
 +                      }
                }
 -              spin_unlock(&cache->lock);
 +      } else {
 +              if (cache->ro)
 +                      space_info->bytes_readonly += num_bytes;
 +              cache->reserved -= num_bytes;
 +              space_info->bytes_reserved -= num_bytes;
 +              space_info->reservation_progress++;
        }
 +      spin_unlock(&cache->lock);
 +      spin_unlock(&space_info->lock);
        return ret;
  }
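
The comment above distinguishes three cases: RESERVE_ALLOC reserves bytes and settles the matching bytes_may_use debt, RESERVE_ALLOC_NO_ACCOUNT reserves without touching bytes_may_use (the data path accounts through the delalloc bits instead), and RESERVE_FREE hands back a reservation that never reached disk. A small userspace sketch of just that counter bookkeeping, with hypothetical types and the locking and read-only handling left out, might be:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-ins for the enum and the counters the hunk touches. */
enum reserve_type { RESERVE_FREE = 0, RESERVE_ALLOC, RESERVE_ALLOC_NO_ACCOUNT };

struct counters {
        uint64_t group_reserved;        /* block group cache->reserved */
        uint64_t bytes_reserved;        /* space_info->bytes_reserved */
        uint64_t bytes_may_use;         /* space_info->bytes_may_use */
};

static void update_reserved(struct counters *c, uint64_t bytes,
                            enum reserve_type reserve)
{
        if (reserve != RESERVE_FREE) {
                c->group_reserved += bytes;
                c->bytes_reserved += bytes;
                /* only metadata allocations settle their bytes_may_use debt */
                if (reserve == RESERVE_ALLOC)
                        c->bytes_may_use -= bytes;
        } else {
                /* a reservation that never hit disk is simply handed back */
                c->group_reserved -= bytes;
                c->bytes_reserved -= bytes;
        }
}

int main(void)
{
        struct counters c = { 0, 0, 4096 };             /* one 4K leaf promised */

        update_reserved(&c, 4096, RESERVE_ALLOC);       /* allocator reserves */
        update_reserved(&c, 4096, RESERVE_FREE);        /* freed before use */
        printf("reserved=%llu may_use=%llu\n",
               (unsigned long long)c.bytes_reserved,
               (unsigned long long)c.bytes_may_use);
        return 0;
}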
  
@@@ -4555,8 -4319,13 +4555,8 @@@ static int unpin_extent_range(struct bt
                spin_lock(&cache->lock);
                cache->pinned -= len;
                cache->space_info->bytes_pinned -= len;
 -              if (cache->ro) {
 +              if (cache->ro)
                        cache->space_info->bytes_readonly += len;
 -              } else if (cache->reserved_pinned > 0) {
 -                      len = min(len, cache->reserved_pinned);
 -                      cache->reserved_pinned -= len;
 -                      cache->space_info->bytes_reserved += len;
 -              }
                spin_unlock(&cache->lock);
                spin_unlock(&cache->space_info->lock);
        }
@@@ -4571,8 -4340,11 +4571,8 @@@ int btrfs_finish_extent_commit(struct b
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_io_tree *unpin;
 -      struct btrfs_block_rsv *block_rsv;
 -      struct btrfs_block_rsv *next_rsv;
        u64 start;
        u64 end;
 -      int idx;
        int ret;
  
        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
                cond_resched();
        }
  
 -      mutex_lock(&fs_info->durable_block_rsv_mutex);
 -      list_for_each_entry_safe(block_rsv, next_rsv,
 -                               &fs_info->durable_block_rsv_list, list) {
 -
 -              idx = trans->transid & 0x1;
 -              if (block_rsv->freed[idx] > 0) {
 -                      block_rsv_add_bytes(block_rsv,
 -                                          block_rsv->freed[idx], 0);
 -                      block_rsv->freed[idx] = 0;
 -              }
 -              if (atomic_read(&block_rsv->usage) == 0) {
 -                      btrfs_block_rsv_release(root, block_rsv, (u64)-1);
 -
 -                      if (block_rsv->freed[0] == 0 &&
 -                          block_rsv->freed[1] == 0) {
 -                              list_del_init(&block_rsv->list);
 -                              kfree(block_rsv);
 -                      }
 -              } else {
 -                      btrfs_block_rsv_release(root, block_rsv, 0);
 -              }
 -      }
 -      mutex_unlock(&fs_info->durable_block_rsv_mutex);
 -
        return 0;
  }
  
@@@ -4872,6 -4668,7 +4872,6 @@@ void btrfs_free_tree_block(struct btrfs
                           struct extent_buffer *buf,
                           u64 parent, int last_ref)
  {
 -      struct btrfs_block_rsv *block_rsv;
        struct btrfs_block_group_cache *cache = NULL;
        int ret;
  
        if (!last_ref)
                return;
  
 -      block_rsv = get_block_rsv(trans, root);
        cache = btrfs_lookup_block_group(root->fs_info, buf->start);
 -      if (block_rsv->space_info != cache->space_info)
 -              goto out;
  
        if (btrfs_header_generation(buf) == trans->transid) {
                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                        ret = check_ref_cleanup(trans, root, buf->start);
                        if (!ret)
 -                              goto pin;
 +                              goto out;
                }
  
                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
                        pin_down_extent(root, cache, buf->start, buf->len, 1);
 -                      goto pin;
 +                      goto out;
                }
  
                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
  
                btrfs_add_free_space(cache, buf->start, buf->len);
 -              ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
 -              if (ret == -EAGAIN) {
 -                      /* block group became read-only */
 -                      btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
 -                      goto out;
 -              }
 -
 -              ret = 1;
 -              spin_lock(&block_rsv->lock);
 -              if (block_rsv->reserved < block_rsv->size) {
 -                      block_rsv->reserved += buf->len;
 -                      ret = 0;
 -              }
 -              spin_unlock(&block_rsv->lock);
 -
 -              if (ret) {
 -                      spin_lock(&cache->space_info->lock);
 -                      cache->space_info->bytes_reserved -= buf->len;
 -                      cache->space_info->reservation_progress++;
 -                      spin_unlock(&cache->space_info->lock);
 -              }
 -              goto out;
 -      }
 -pin:
 -      if (block_rsv->durable && !cache->ro) {
 -              ret = 0;
 -              spin_lock(&cache->lock);
 -              if (!cache->ro) {
 -                      cache->reserved_pinned += buf->len;
 -                      ret = 1;
 -              }
 -              spin_unlock(&cache->lock);
 -
 -              if (ret) {
 -                      spin_lock(&block_rsv->lock);
 -                      block_rsv->freed[trans->transid & 0x1] += buf->len;
 -                      spin_unlock(&block_rsv->lock);
 -              }
 +              btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
        }
  out:
        /*
@@@ -5046,13 -4883,10 +5046,13 @@@ static noinline int find_free_extent(st
        int last_ptr_loop = 0;
        int loop = 0;
        int index = 0;
 +      int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
 +              RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
        bool found_uncached_bg = false;
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
        bool use_cluster = true;
 +      bool have_caching_bg = false;
        u64 ideal_cache_percent = 0;
        u64 ideal_cache_offset = 0;
  
@@@ -5135,7 -4969,6 +5135,7 @@@ ideal_cache
                }
        }
  search:
 +      have_caching_bg = false;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
                            list) {
@@@ -5344,8 -5177,6 +5344,8 @@@ refill_cluster
                        failed_alloc = true;
                        goto have_block_group;
                } else if (!offset) {
 +                      if (!cached)
 +                              have_caching_bg = true;
                        goto loop;
                }
  checks:
                                             search_start - offset);
                BUG_ON(offset > search_start);
  
 -              ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
 -                                          (data & BTRFS_BLOCK_GROUP_DATA));
 +              ret = btrfs_update_reserved_bytes(block_group, num_bytes,
 +                                                alloc_type);
                if (ret == -EAGAIN) {
                        btrfs_add_free_space(block_group, offset, num_bytes);
                        goto loop;
@@@ -5396,9 -5227,6 +5396,9 @@@ loop
        }
        up_read(&space_info->groups_sem);
  
 +      if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
 +              goto search;
 +
        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
                goto search;
  
@@@ -5497,8 -5325,7 +5497,8 @@@ static void dump_space_info(struct btrf
        int index = 0;
  
        spin_lock(&info->lock);
 -      printk(KERN_INFO "space_info has %llu free, is %sfull\n",
 +      printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
 +             (unsigned long long)info->flags,
               (unsigned long long)(info->total_bytes - info->bytes_used -
                                    info->bytes_pinned - info->bytes_reserved -
                                    info->bytes_readonly),
@@@ -5584,8 -5411,7 +5584,8 @@@ again
        return ret;
  }
  
 -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 +static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 +                                      u64 start, u64 len, int pin)
  {
        struct btrfs_block_group_cache *cache;
        int ret = 0;
        if (btrfs_test_opt(root, DISCARD))
                ret = btrfs_discard_extent(root, start, len, NULL);
  
 -      btrfs_add_free_space(cache, start, len);
 -      btrfs_update_reserved_bytes(cache, len, 0, 1);
 +      if (pin)
 +              pin_down_extent(root, cache, start, len, 1);
 +      else {
 +              btrfs_add_free_space(cache, start, len);
 +              btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
 +      }
        btrfs_put_block_group(cache);
  
        trace_btrfs_reserved_extent_free(root, start, len);
        return ret;
  }
  
 +int btrfs_free_reserved_extent(struct btrfs_root *root,
 +                                      u64 start, u64 len)
 +{
 +      return __btrfs_free_reserved_extent(root, start, len, 0);
 +}
 +
 +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
 +                                     u64 start, u64 len)
 +{
 +      return __btrfs_free_reserved_extent(root, start, len, 1);
 +}
 +
  static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
@@@ -5820,8 -5630,7 +5820,8 @@@ int btrfs_alloc_logged_file_extent(stru
                put_caching_control(caching_ctl);
        }
  
 -      ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
 +      ret = btrfs_update_reserved_bytes(block_group, ins->offset,
 +                                        RESERVE_ALLOC_NO_ACCOUNT);
        BUG_ON(ret);
        btrfs_put_block_group(block_group);
        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@@ -5878,7 -5687,8 +5878,7 @@@ use_block_rsv(struct btrfs_trans_handl
        block_rsv = get_block_rsv(trans, root);
  
        if (block_rsv->size == 0) {
 -              ret = reserve_metadata_bytes(trans, root, block_rsv,
 -                                           blocksize, 0);
 +              ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
                /*
                 * If we couldn't reserve metadata bytes try and use some from
                 * the global reserve.
        if (!ret)
                return block_rsv;
        if (ret) {
 -              WARN_ON(1);
 -              ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
 -                                           0);
 +              static DEFINE_RATELIMIT_STATE(_rs,
 +                              DEFAULT_RATELIMIT_INTERVAL,
 +                              /*DEFAULT_RATELIMIT_BURST*/ 2);
 +              if (__ratelimit(&_rs)) {
 +                      printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
 +                      WARN_ON(1);
 +              }
 +              ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
                if (!ret) {
 -                      spin_lock(&block_rsv->lock);
 -                      block_rsv->size += blocksize;
 -                      spin_unlock(&block_rsv->lock);
                        return block_rsv;
                } else if (ret && block_rsv != global_rsv) {
                        ret = block_rsv_use_bytes(global_rsv, blocksize);
@@@ -6784,9 -6592,12 +6784,9 @@@ static int set_block_group_ro(struct bt
                    cache->bytes_super - btrfs_block_group_used(&cache->item);
  
        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
 -          sinfo->bytes_may_use + sinfo->bytes_readonly +
 -          cache->reserved_pinned + num_bytes + min_allocable_bytes <=
 -          sinfo->total_bytes) {
 +          sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
 +          min_allocable_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
 -              sinfo->bytes_reserved += cache->reserved_pinned;
 -              cache->reserved_pinned = 0;
                cache->ro = 1;
                ret = 0;
        }
@@@ -7153,8 -6964,7 +7153,8 @@@ int btrfs_free_block_groups(struct btrf
                                        struct btrfs_space_info,
                                        list);
                if (space_info->bytes_pinned > 0 ||
 -                  space_info->bytes_reserved > 0) {
 +                  space_info->bytes_reserved > 0 ||
 +                  space_info->bytes_may_use > 0) {
                        WARN_ON(1);
                        dump_space_info(space_info, 0, 0);
                }
@@@ -7196,12 -7006,14 +7196,12 @@@ int btrfs_read_block_groups(struct btrf
                return -ENOMEM;
        path->reada = 1;
  
 -      cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
 -      if (cache_gen != 0 &&
 -          btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
 +      cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
 +      if (btrfs_test_opt(root, SPACE_CACHE) &&
 +          btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
                need_clear = 1;
        if (btrfs_test_opt(root, CLEAR_CACHE))
                need_clear = 1;
 -      if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
 -              printk(KERN_INFO "btrfs: disk space caching is enabled\n");
  
        while (1) {
                ret = find_first_block_group(root, path, &key);
@@@ -7440,7 -7252,7 +7440,7 @@@ int btrfs_remove_block_group(struct btr
                goto out;
        }
  
 -      inode = lookup_free_space_inode(root, block_group, path);
 +      inode = lookup_free_space_inode(tree_root, block_group, path);
        if (!IS_ERR(inode)) {
                ret = btrfs_orphan_add(trans, inode);
                BUG_ON(ret);
                        spin_unlock(&block_group->lock);
                }
                /* One for our lookup ref */
 -              iput(inode);
 +              btrfs_add_delayed_iput(inode);
        }
  
        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@@ -7527,7 -7339,7 +7527,7 @@@ int btrfs_init_space_info(struct btrfs_
        int mixed = 0;
        int ret;
  
 -      disk_super = &fs_info->super_copy;
 +      disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                return 1;
  
diff --combined fs/btrfs/extent_io.c
index c12705682c6543a4b180ccc32521cc4011a20dc4,624ef10d36cc11eff2c2b24a74e5d61a1718aa50..1f87c4d0e7a072c6361fb218b32f024b111bb0a8
@@@ -17,6 -17,7 +17,7 @@@
  #include "compat.h"
  #include "ctree.h"
  #include "btrfs_inode.h"
+ #include "volumes.h"
  
  static struct kmem_cache *extent_state_cache;
  static struct kmem_cache *extent_buffer_cache;
@@@ -894,194 -895,6 +895,194 @@@ search_again
        goto again;
  }
  
 +/**
 + * convert_extent_bit - convert all bits in a given range from one bit to another
 + * @tree:     the io tree to search
 + * @start:    the start offset in bytes
 + * @end:      the end offset in bytes (inclusive)
 + * @bits:     the bits to set in this range
 + * @clear_bits:       the bits to clear in this range
 + * @mask:     the allocation mask
 + *
 + * This will go through and set bits for the given range.  If any states exist
 + * already in this range they are set with the given bit and cleared of the
 + * clear_bits.  This is only meant to be used by things that are mergeable, i.e.
 + * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 + * boundary bits like LOCK.
 + */
 +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 +                     int bits, int clear_bits, gfp_t mask)
 +{
 +      struct extent_state *state;
 +      struct extent_state *prealloc = NULL;
 +      struct rb_node *node;
 +      int err = 0;
 +      u64 last_start;
 +      u64 last_end;
 +
 +again:
 +      if (!prealloc && (mask & __GFP_WAIT)) {
 +              prealloc = alloc_extent_state(mask);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +      }
 +
 +      spin_lock(&tree->lock);
 +      /*
 +       * this search will find all the extents that end after
 +       * our range starts.
 +       */
 +      node = tree_search(tree, start);
 +      if (!node) {
 +              prealloc = alloc_extent_state_atomic(prealloc);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +              err = insert_state(tree, prealloc, start, end, &bits);
 +              prealloc = NULL;
 +              BUG_ON(err == -EEXIST);
 +              goto out;
 +      }
 +      state = rb_entry(node, struct extent_state, rb_node);
 +hit_next:
 +      last_start = state->start;
 +      last_end = state->end;
 +
 +      /*
 +       * | ---- desired range ---- |
 +       * | state |
 +       *
 +       * Just lock what we found and keep going
 +       */
 +      if (state->start == start && state->end <= end) {
 +              struct rb_node *next_node;
 +
 +              set_state_bits(tree, state, &bits);
 +              clear_state_bit(tree, state, &clear_bits, 0);
 +
 +              merge_state(tree, state);
 +              if (last_end == (u64)-1)
 +                      goto out;
 +
 +              start = last_end + 1;
 +              next_node = rb_next(&state->rb_node);
 +              if (next_node && start < end && prealloc && !need_resched()) {
 +                      state = rb_entry(next_node, struct extent_state,
 +                                       rb_node);
 +                      if (state->start == start)
 +                              goto hit_next;
 +              }
 +              goto search_again;
 +      }
 +
 +      /*
 +       *     | ---- desired range ---- |
 +       * | state |
 +       *   or
 +       * | ------------- state -------------- |
 +       *
 +       * We need to split the extent we found, and may flip bits on
 +       * second half.
 +       *
 +       * If the extent we found extends past our
 +       * range, we just split and search again.  It'll get split
 +       * again the next time though.
 +       *
 +       * If the extent we found is inside our range, we set the
 +       * desired bit on it.
 +       */
 +      if (state->start < start) {
 +              prealloc = alloc_extent_state_atomic(prealloc);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +              err = split_state(tree, state, prealloc, start);
 +              BUG_ON(err == -EEXIST);
 +              prealloc = NULL;
 +              if (err)
 +                      goto out;
 +              if (state->end <= end) {
 +                      set_state_bits(tree, state, &bits);
 +                      clear_state_bit(tree, state, &clear_bits, 0);
 +                      merge_state(tree, state);
 +                      if (last_end == (u64)-1)
 +                              goto out;
 +                      start = last_end + 1;
 +              }
 +              goto search_again;
 +      }
 +      /*
 +       * | ---- desired range ---- |
 +       *     | state | or               | state |
 +       *
 +       * There's a hole, we need to insert something in it and
 +       * ignore the extent we found.
 +       */
 +      if (state->start > start) {
 +              u64 this_end;
 +              if (end < last_start)
 +                      this_end = end;
 +              else
 +                      this_end = last_start - 1;
 +
 +              prealloc = alloc_extent_state_atomic(prealloc);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +
 +              /*
 +               * Avoid freeing 'prealloc' if it can be merged with
 +               * the later extent.
 +               */
 +              err = insert_state(tree, prealloc, start, this_end,
 +                                 &bits);
 +              BUG_ON(err == -EEXIST);
 +              if (err) {
 +                      free_extent_state(prealloc);
 +                      prealloc = NULL;
 +                      goto out;
 +              }
 +              prealloc = NULL;
 +              start = this_end + 1;
 +              goto search_again;
 +      }
 +      /*
 +       * | ---- desired range ---- |
 +       *                        | state |
 +       * We need to split the extent, and set the bit
 +       * on the first half
 +       */
 +      if (state->start <= end && state->end > end) {
 +              prealloc = alloc_extent_state_atomic(prealloc);
 +              if (!prealloc)
 +                      return -ENOMEM;
 +
 +              err = split_state(tree, state, prealloc, end + 1);
 +              BUG_ON(err == -EEXIST);
 +
 +              set_state_bits(tree, prealloc, &bits);
 +              clear_state_bit(tree, prealloc, &clear_bits, 0);
 +
 +              merge_state(tree, prealloc);
 +              prealloc = NULL;
 +              goto out;
 +      }
 +
 +      goto search_again;
 +
 +out:
 +      spin_unlock(&tree->lock);
 +      if (prealloc)
 +              free_extent_state(prealloc);
 +
 +      return err;
 +
 +search_again:
 +      if (start > end)
 +              goto out;
 +      spin_unlock(&tree->lock);
 +      if (mask & __GFP_WAIT)
 +              cond_resched();
 +      goto again;
 +}
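
The kernel-doc for convert_extent_bit() above says it sets one group of bits and clears another across a range, e.g. flipping DELALLOC to DIRTY. A toy userspace model of that semantic over a flat array of states follows; it deliberately ignores the splitting, merging and state preallocation that the real tree-based code handles, and every name in it is illustrative only.

#include <stdio.h>
#include <stdint.h>

#define TOY_DELALLOC (1 << 0)
#define TOY_DIRTY    (1 << 1)

struct toy_state {                      /* stand-in for struct extent_state */
        uint64_t start;
        uint64_t end;                   /* inclusive, like the io tree */
        unsigned bits;
};

/* Set 'bits' and clear 'clear_bits' on every state overlapping [start, end]. */
static void toy_convert(struct toy_state *s, int n, uint64_t start,
                        uint64_t end, unsigned bits, unsigned clear_bits)
{
        for (int i = 0; i < n; i++) {
                if (s[i].end < start || s[i].start > end)
                        continue;
                s[i].bits |= bits;
                s[i].bits &= ~clear_bits;
        }
}

int main(void)
{
        struct toy_state states[] = {
                { 0,    4095, TOY_DELALLOC },
                { 4096, 8191, TOY_DELALLOC },
        };

        toy_convert(states, 2, 0, 8191, TOY_DIRTY, TOY_DELALLOC);
        printf("first range bits: %#x\n", states[0].bits);     /* prints 0x2 */
        return 0;
}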
 +
  /* wrappers around set/clear extent bit */
  int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
@@@ -1107,7 -920,7 +1108,7 @@@ int set_extent_delalloc(struct extent_i
                        struct extent_state **cached_state, gfp_t mask)
  {
        return set_extent_bit(tree, start, end,
 -                            EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
 +                            EXTENT_DELALLOC | EXTENT_UPTODATE,
                              0, NULL, cached_state, mask);
  }
  
@@@ -1787,6 -1600,368 +1788,368 @@@ static int check_page_writeback(struct 
        return 0;
  }
  
+ /*
+  * When IO fails, either with EIO or csum verification fails, we
+  * try other mirrors that might have a good copy of the data.  This
+  * io_failure_record is used to record state as we go through all the
+  * mirrors.  If another mirror has good data, the page is set up to date
+  * and things continue.  If a good mirror can't be found, the original
+  * bio end_io callback is called to indicate things have failed.
+  */
+ struct io_failure_record {
+       struct page *page;
+       u64 start;
+       u64 len;
+       u64 logical;
+       unsigned long bio_flags;
+       int this_mirror;
+       int failed_mirror;
+       int in_validation;
+ };
+ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+                               int did_repair)
+ {
+       int ret;
+       int err = 0;
+       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+       set_state_private(failure_tree, rec->start, 0);
+       ret = clear_extent_bits(failure_tree, rec->start,
+                               rec->start + rec->len - 1,
+                               EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+       if (ret)
+               err = ret;
+       if (did_repair) {
+               ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+                                       rec->start + rec->len - 1,
+                                       EXTENT_DAMAGED, GFP_NOFS);
+               if (ret && !err)
+                       err = ret;
+       }
+       kfree(rec);
+       return err;
+ }
+ static void repair_io_failure_callback(struct bio *bio, int err)
+ {
+       complete(bio->bi_private);
+ }
+ /*
+  * this bypasses the standard btrfs submit functions deliberately, as
+  * the standard behavior is to write all copies in a raid setup. here we only
+  * want to write the one bad copy. so we do the mapping for ourselves and issue
+  * submit_bio directly.
+  * to avoid any synchronization issues, wait for the data after writing, which
+  * actually prevents the read that triggered the error from finishing.
+  * currently, there can be no more than two copies of every data bit. thus,
+  * exactly one rewrite is required.
+  */
+ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+                       u64 length, u64 logical, struct page *page,
+                       int mirror_num)
+ {
+       struct bio *bio;
+       struct btrfs_device *dev;
+       DECLARE_COMPLETION_ONSTACK(compl);
+       u64 map_length = 0;
+       u64 sector;
+       struct btrfs_bio *bbio = NULL;
+       int ret;
+       BUG_ON(!mirror_num);
+       bio = bio_alloc(GFP_NOFS, 1);
+       if (!bio)
+               return -EIO;
+       bio->bi_private = &compl;
+       bio->bi_end_io = repair_io_failure_callback;
+       bio->bi_size = 0;
+       map_length = length;
+       ret = btrfs_map_block(map_tree, WRITE, logical,
+                             &map_length, &bbio, mirror_num);
+       if (ret) {
+               bio_put(bio);
+               return -EIO;
+       }
+       BUG_ON(mirror_num != bbio->mirror_num);
+       sector = bbio->stripes[mirror_num-1].physical >> 9;
+       bio->bi_sector = sector;
+       dev = bbio->stripes[mirror_num-1].dev;
+       kfree(bbio);
+       if (!dev || !dev->bdev || !dev->writeable) {
+               bio_put(bio);
+               return -EIO;
+       }
+       bio->bi_bdev = dev->bdev;
+       bio_add_page(bio, page, length, start-page_offset(page));
+       submit_bio(WRITE_SYNC, bio);
+       wait_for_completion(&compl);
+       if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+               /* try to remap that extent elsewhere? */
+               bio_put(bio);
+               return -EIO;
+       }
+       printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+                       "sector %llu)\n", page->mapping->host->i_ino, start,
+                       dev->name, sector);
+       bio_put(bio);
+       return 0;
+ }
+ /*
+  * each time an IO finishes, we do a fast check in the IO failure tree
+  * to see if we need to process or clean up an io_failure_record
+  */
+ static int clean_io_failure(u64 start, struct page *page)
+ {
+       u64 private;
+       u64 private_failure;
+       struct io_failure_record *failrec;
+       struct btrfs_mapping_tree *map_tree;
+       struct extent_state *state;
+       int num_copies;
+       int did_repair = 0;
+       int ret;
+       struct inode *inode = page->mapping->host;
+       private = 0;
+       ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+                               (u64)-1, 1, EXTENT_DIRTY, 0);
+       if (!ret)
+               return 0;
+       ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+                               &private_failure);
+       if (ret)
+               return 0;
+       failrec = (struct io_failure_record *)(unsigned long) private_failure;
+       BUG_ON(!failrec->this_mirror);
+       if (failrec->in_validation) {
+               /* there was no real error, just free the record */
+               pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+                        failrec->start);
+               did_repair = 1;
+               goto out;
+       }
+       spin_lock(&BTRFS_I(inode)->io_tree.lock);
+       state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+                                           failrec->start,
+                                           EXTENT_LOCKED);
+       spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+       if (state && state->start == failrec->start) {
+               map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+               num_copies = btrfs_num_copies(map_tree, failrec->logical,
+                                               failrec->len);
+               if (num_copies > 1)  {
+                       ret = repair_io_failure(map_tree, start, failrec->len,
+                                               failrec->logical, page,
+                                               failrec->failed_mirror);
+                       did_repair = !ret;
+               }
+       }
+ out:
+       if (!ret)
+               ret = free_io_failure(inode, failrec, did_repair);
+       return ret;
+ }
+ /*
+  * this is a generic handler for readpage errors (default
+  * readpage_io_failed_hook). if other copies exist, read those and write back
+  * good data to the failed position. does not attempt to remap the failed
+  * extent elsewhere, hoping the device will be smart enough to do this as
+  * needed
+  */
+ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+                               u64 start, u64 end, int failed_mirror,
+                               struct extent_state *state)
+ {
+       struct io_failure_record *failrec = NULL;
+       u64 private;
+       struct extent_map *em;
+       struct inode *inode = page->mapping->host;
+       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct bio *bio;
+       int num_copies;
+       int ret;
+       int read_mode;
+       u64 logical;
+       BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+       ret = get_state_private(failure_tree, start, &private);
+       if (ret) {
+               failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+               if (!failrec)
+                       return -ENOMEM;
+               failrec->start = start;
+               failrec->len = end - start + 1;
+               failrec->this_mirror = 0;
+               failrec->bio_flags = 0;
+               failrec->in_validation = 0;
+               read_lock(&em_tree->lock);
+               em = lookup_extent_mapping(em_tree, start, failrec->len);
+               if (!em) {
+                       read_unlock(&em_tree->lock);
+                       kfree(failrec);
+                       return -EIO;
+               }
+               if (em->start > start || em->start + em->len < start) {
+                       free_extent_map(em);
+                       em = NULL;
+               }
+               read_unlock(&em_tree->lock);
+               if (!em || IS_ERR(em)) {
+                       kfree(failrec);
+                       return -EIO;
+               }
+               logical = start - em->start;
+               logical = em->block_start + logical;
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+                       logical = em->block_start;
+                       failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+                       extent_set_compress_type(&failrec->bio_flags,
+                                                em->compress_type);
+               }
+               pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+                        "len=%llu\n", logical, start, failrec->len);
+               failrec->logical = logical;
+               free_extent_map(em);
+               /* set the bits in the private failure tree */
+               ret = set_extent_bits(failure_tree, start, end,
+                                       EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+               if (ret >= 0)
+                       ret = set_state_private(failure_tree, start,
+                                               (u64)(unsigned long)failrec);
+               /* set the bits in the inode's tree */
+               if (ret >= 0)
+                       ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+                                               GFP_NOFS);
+               if (ret < 0) {
+                       kfree(failrec);
+                       return ret;
+               }
+       } else {
+               failrec = (struct io_failure_record *)(unsigned long)private;
+               pr_debug("bio_readpage_error: (found) logical=%llu, "
+                        "start=%llu, len=%llu, validation=%d\n",
+                        failrec->logical, failrec->start, failrec->len,
+                        failrec->in_validation);
+               /*
+                * when data can be on disk more than twice, add to failrec here
+                * (e.g. with a list for failed_mirror) to make
+                * clean_io_failure() clean all those errors at once.
+                */
+       }
+       num_copies = btrfs_num_copies(
+                             &BTRFS_I(inode)->root->fs_info->mapping_tree,
+                             failrec->logical, failrec->len);
+       if (num_copies == 1) {
+               /*
+                * we only have a single copy of the data, so don't bother with
+                * all the retry and error correction code that follows. no
+                * matter what the error is, it is very likely to persist.
+                */
+               pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+                        "state=%p, num_copies=%d, next_mirror %d, "
+                        "failed_mirror %d\n", state, num_copies,
+                        failrec->this_mirror, failed_mirror);
+               free_io_failure(inode, failrec, 0);
+               return -EIO;
+       }
+       if (!state) {
+               spin_lock(&tree->lock);
+               state = find_first_extent_bit_state(tree, failrec->start,
+                                                   EXTENT_LOCKED);
+               if (state && state->start != failrec->start)
+                       state = NULL;
+               spin_unlock(&tree->lock);
+       }
+       /*
+        * there are two premises:
+        *      a) deliver good data to the caller
+        *      b) correct the bad sectors on disk
+        */
+       if (failed_bio->bi_vcnt > 1) {
+               /*
+                * to fulfill b), we need to know the exact failing sectors, as
+                * we don't want to rewrite any more than the failed ones. thus,
+                * we need separate read requests for the failed bio
+                *
+                * if the following BUG_ON triggers, our validation request got
+                * merged. we need separate requests for our algorithm to work.
+                */
+               BUG_ON(failrec->in_validation);
+               failrec->in_validation = 1;
+               failrec->this_mirror = failed_mirror;
+               read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+       } else {
+               /*
+                * we're ready to fulfill a) and b) alongside. get a good copy
+                * of the failed sector and if we succeed, we have setup
+                * everything for repair_io_failure to do the rest for us.
+                */
+               if (failrec->in_validation) {
+                       BUG_ON(failrec->this_mirror != failed_mirror);
+                       failrec->in_validation = 0;
+                       failrec->this_mirror = 0;
+               }
+               failrec->failed_mirror = failed_mirror;
+               failrec->this_mirror++;
+               if (failrec->this_mirror == failed_mirror)
+                       failrec->this_mirror++;
+               read_mode = READ_SYNC;
+       }
+       if (!state || failrec->this_mirror > num_copies) {
+               pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+                        "next_mirror %d, failed_mirror %d\n", state,
+                        num_copies, failrec->this_mirror, failed_mirror);
+               free_io_failure(inode, failrec, 0);
+               return -EIO;
+       }
+       bio = bio_alloc(GFP_NOFS, 1);
+       bio->bi_private = state;
+       bio->bi_end_io = failed_bio->bi_end_io;
+       bio->bi_sector = failrec->logical >> 9;
+       bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+       bio->bi_size = 0;
+       bio_add_page(bio, page, failrec->len, start - page_offset(page));
+       pr_debug("bio_readpage_error: submitting new read[%#x] to "
+                "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+                failrec->this_mirror, num_copies, failrec->in_validation);
+       tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+                                       failrec->bio_flags, 0);
+       return 0;
+ }
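
bio_readpage_error() above picks the next mirror to read from: start at mirror 1, skip the mirror that just failed, and give up once the counter passes num_copies. A short userspace sketch of that selection, with hypothetical values for the copy count and the failed mirror, is:

#include <stdio.h>

int main(void)
{
        int num_copies = 3;     /* hypothetical: three copies of the data */
        int failed_mirror = 2;  /* hypothetical: mirror 2 returned bad data */
        int this_mirror = 0;

        for (;;) {
                this_mirror++;
                if (this_mirror == failed_mirror)
                        this_mirror++;          /* never re-read the bad copy */
                if (this_mirror > num_copies) {
                        printf("no good copy found, giving up\n");
                        break;
                }
                printf("retrying read from mirror %d\n", this_mirror);
                /* the real code submits a new bio here and stops on success */
        }
        return 0;
}
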
  /* lots and lots of room for performance fixes in the end_bio funcs */
  
  /*
@@@ -1885,6 -2060,9 +2248,9 @@@ static void end_bio_extent_readpage(str
                struct extent_state *cached = NULL;
                struct extent_state *state;
  
+               pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+                        "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+                        (long int)bio->bi_bdev);
                tree = &BTRFS_I(page->mapping->host)->io_tree;
  
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
                                                              state);
                        if (ret)
                                uptodate = 0;
+                       else
+                               clean_io_failure(start, page);
                }
-               if (!uptodate && tree->ops &&
-                   tree->ops->readpage_io_failed_hook) {
-                       ret = tree->ops->readpage_io_failed_hook(bio, page,
-                                                        start, end, state);
+               if (!uptodate) {
+                       u64 failed_mirror;
+                       failed_mirror = (u64)bio->bi_bdev;
+                       if (tree->ops && tree->ops->readpage_io_failed_hook)
+                               ret = tree->ops->readpage_io_failed_hook(
+                                               bio, page, start, end,
 -                                              failed_mirror, NULL);
++                                              failed_mirror, state);
+                       else
+                               ret = bio_readpage_error(bio, page, start, end,
+                                                        failed_mirror, NULL);
                        if (ret == 0) {
                                uptodate =
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
@@@ -1999,6 -2185,7 +2373,7 @@@ static int submit_one_bio(int rw, struc
                                           mirror_num, bio_flags, start);
        else
                submit_bio(rw, bio);
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
                ret = -EOPNOTSUPP;
        bio_put(bio);
  }
  
  int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
-                           get_extent_t *get_extent)
+                           get_extent_t *get_extent, int mirror_num)
  {
        struct bio *bio = NULL;
        unsigned long bio_flags = 0;
        int ret;
  
-       ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+       ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
                                      &bio_flags);
        if (bio)
-               ret = submit_one_bio(READ, bio, 0, bio_flags);
+               ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
        return ret;
  }
  
@@@ -2324,7 -2511,6 +2699,7 @@@ static int __extent_writepage(struct pa
        int compressed;
        int write_flags;
        unsigned long nr_written = 0;
 +      bool fill_delalloc = true;
  
        if (wbc->sync_mode == WB_SYNC_ALL)
                write_flags = WRITE_SYNC;
        trace___extent_writepage(page, inode, wbc);
  
        WARN_ON(!PageLocked(page));
 +
 +      ClearPageError(page);
 +
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
           (page->index == end_index && !pg_offset)) {
  
        set_page_extent_mapped(page);
  
 +      if (!tree->ops || !tree->ops->fill_delalloc)
 +              fill_delalloc = false;
 +
        delalloc_start = start;
        delalloc_end = 0;
        page_started = 0;
 -      if (!epd->extent_locked) {
 +      if (!epd->extent_locked && fill_delalloc) {
                u64 delalloc_to_write = 0;
                /*
                 * make sure the wbc mapping index is at least updated
@@@ -2616,16 -2796,10 +2991,16 @@@ retry
                         * swizzled back from swapper_space to tmpfs file
                         * mapping
                         */
 -                      if (tree->ops && tree->ops->write_cache_pages_lock_hook)
 -                              tree->ops->write_cache_pages_lock_hook(page);
 -                      else
 -                              lock_page(page);
 +                      if (tree->ops &&
 +                          tree->ops->write_cache_pages_lock_hook) {
 +                              tree->ops->write_cache_pages_lock_hook(page,
 +                                                             data, flush_fn);
 +                      } else {
 +                              if (!trylock_page(page)) {
 +                                      flush_fn(data);
 +                                      lock_page(page);
 +                              }
 +                      }
  
                        if (unlikely(page->mapping != mapping)) {
                                unlock_page(page);
@@@ -3127,7 -3301,7 +3502,7 @@@ out
        return ret;
  }
  
- static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+ inline struct page *extent_buffer_page(struct extent_buffer *eb,
                                              unsigned long i)
  {
        struct page *p;
        return p;
  }
  
- static inline unsigned long num_extent_pages(u64 start, u64 len)
+ inline unsigned long num_extent_pages(u64 start, u64 len)
  {
        return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
                (start >> PAGE_CACHE_SHIFT);
@@@ -3405,7 -3579,6 +3780,7 @@@ int clear_extent_buffer_dirty(struct ex
                                                PAGECACHE_TAG_DIRTY);
                }
                spin_unlock_irq(&page->mapping->tree_lock);
 +              ClearPageError(page);
                unlock_page(page);
        }
        return 0;
@@@ -3551,7 -3724,8 +3926,7 @@@ int extent_buffer_uptodate(struct exten
  }
  
  int read_extent_buffer_pages(struct extent_io_tree *tree,
 -                           struct extent_buffer *eb,
 -                           u64 start, int wait,
 +                           struct extent_buffer *eb, u64 start, int wait,
                             get_extent_t *get_extent, int mirror_num)
  {
        unsigned long i;
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
 -              if (!wait) {
 +              if (wait == WAIT_NONE) {
                        if (!trylock_page(page))
                                goto unlock_exit;
                } else {
        if (bio)
                submit_one_bio(READ, bio, mirror_num, bio_flags);
  
 -      if (ret || !wait)
 +      if (ret || wait != WAIT_COMPLETE)
                return ret;
  
        for (i = start_i; i < num_pages; i++) {
diff --combined fs/btrfs/extent_io.h
index 697570eed9e8228c700782e9f6c3d93945d902ca,a8e20b6729227f4f469bc06b473fa5b7135a9be3..feb9be0e23bcca09a77497d08c74a3c8864d2a8c
@@@ -17,7 -17,7 +17,8 @@@
  #define EXTENT_NODATASUM (1 << 10)
  #define EXTENT_DO_ACCOUNTING (1 << 11)
  #define EXTENT_FIRST_DELALLOC (1 << 12)
 -#define EXTENT_DAMAGED (1 << 13)
 +#define EXTENT_NEED_WAIT (1 << 13)
++#define EXTENT_DAMAGED (1 << 14)
  #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
  #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
  
@@@ -33,7 -33,6 +34,7 @@@
  #define EXTENT_BUFFER_BLOCKING 1
  #define EXTENT_BUFFER_DIRTY 2
  #define EXTENT_BUFFER_CORRUPT 3
 +#define EXTENT_BUFFER_READAHEAD 4     /* this got triggered by readahead */
  
  /* these are flags for extent_clear_unlock_delalloc */
  #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@@ -69,7 -68,7 +70,7 @@@ struct extent_io_ops 
                              unsigned long bio_flags);
        int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
        int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
-                                      u64 start, u64 end,
+                                      u64 start, u64 end, u64 failed_mirror,
                                       struct extent_state *state);
        int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
                                        u64 start, u64 end,
@@@ -87,8 -86,7 +88,8 @@@
                                  struct extent_state *other);
        void (*split_extent_hook)(struct inode *inode,
                                  struct extent_state *orig, u64 split);
 -      int (*write_cache_pages_lock_hook)(struct page *page);
 +      int (*write_cache_pages_lock_hook)(struct page *page, void *data,
 +                                         void (*flush_fn)(void *));
  };
  
  struct extent_io_tree {
@@@ -188,7 -186,7 +189,7 @@@ int unlock_extent_cached(struct extent_
  int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
                    gfp_t mask);
  int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
-                         get_extent_t *get_extent);
+                         get_extent_t *get_extent, int mirror_num);
  int __init extent_io_init(void);
  void extent_io_exit(void);
  
@@@ -217,8 -215,6 +218,8 @@@ int set_extent_dirty(struct extent_io_t
                     gfp_t mask);
  int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask);
 +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 +                     int bits, int clear_bits, gfp_t mask);
  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask);
  int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@@ -253,12 -249,11 +254,14 @@@ struct extent_buffer *alloc_extent_buff
  struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
                                         u64 start, unsigned long len);
  void free_extent_buffer(struct extent_buffer *eb);
 +#define WAIT_NONE     0
 +#define WAIT_COMPLETE 1
 +#define WAIT_PAGE_LOCK        2
  int read_extent_buffer_pages(struct extent_io_tree *tree,
                             struct extent_buffer *eb, u64 start, int wait,
                             get_extent_t *get_extent, int mirror_num);
+ unsigned long num_extent_pages(u64 start, u64 len);
+ struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
  
  static inline void extent_buffer_get(struct extent_buffer *eb)
  {
@@@ -308,4 -303,10 +311,10 @@@ int extent_clear_unlock_delalloc(struc
  struct bio *
  btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
                gfp_t gfp_flags);
+ struct btrfs_mapping_tree;
+ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+                       u64 length, u64 logical, struct page *page,
+                       int mirror_num);
  #endif
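
The extent_io.h hunk above changes two read-path interfaces: extent_read_full_page() now takes the mirror to read from, and the wait argument of read_extent_buffer_pages() is interpreted as a mode (WAIT_NONE, WAIT_COMPLETE, WAIT_PAGE_LOCK) rather than a boolean. A minimal sketch of how a caller might drive the new signatures, based only on the declarations above; the wrapper names are hypothetical and not part of the patch:

static int read_data_page(struct extent_io_tree *tree, struct page *page,
                          get_extent_t *get_extent)
{
        /* mirror 0 lets the lower layers choose a copy; a retry after a
         * checksum failure would pass a specific mirror number instead */
        return extent_read_full_page(tree, page, get_extent, 0);
}

static int read_eb_and_wait(struct extent_io_tree *tree,
                            struct extent_buffer *eb,
                            get_extent_t *get_extent, int mirror_num)
{
        /* WAIT_COMPLETE blocks until the reads finish; WAIT_NONE only
         * submits the I/O and returns without waiting */
        return read_extent_buffer_pages(tree, eb, 0, WAIT_COMPLETE,
                                        get_extent, mirror_num);
}
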
diff --combined fs/btrfs/inode.c
index b6b612e14ed78cada82e566c5f281d3081f64711,9327f45434e813839d7f0dec10ce34107344c642..9d0eaa57d4ee357f0eb14495dbdc55b76fc6c8d8
  #include "btrfs_inode.h"
  #include "ioctl.h"
  #include "print-tree.h"
- #include "volumes.h"
  #include "ordered-data.h"
  #include "xattr.h"
  #include "tree-log.h"
+ #include "volumes.h"
  #include "compression.h"
  #include "locking.h"
  #include "free-space-cache.h"
@@@ -393,10 -393,7 +393,10 @@@ again
             (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 -              BUG_ON(!pages);
 +              if (!pages) {
 +                      /* just bail out to the uncompressed code */
 +                      goto cont;
 +              }
  
                if (BTRFS_I(inode)->force_compress)
                        compress_type = BTRFS_I(inode)->force_compress;
                        will_compress = 1;
                }
        }
 +cont:
        if (start == 0) {
                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
@@@ -824,7 -820,7 +824,7 @@@ static noinline int cow_file_range(stru
        }
  
        BUG_ON(disk_num_bytes >
 -             btrfs_super_total_bytes(&root->fs_info->super_copy));
 +             btrfs_super_total_bytes(root->fs_info->super_copy));
  
        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@@ -1796,12 -1792,12 +1796,12 @@@ static int btrfs_finish_ordered_io(stru
        }
        ret = 0;
  out:
 -      if (nolock) {
 -              if (trans)
 -                      btrfs_end_transaction_nolock(trans, root);
 -      } else {
 +      if (root != root->fs_info->tree_root)
                btrfs_delalloc_release_metadata(inode, ordered_extent->len);
 -              if (trans)
 +      if (trans) {
 +              if (nolock)
 +                      btrfs_end_transaction_nolock(trans, root);
 +              else
                        btrfs_end_transaction(trans, root);
        }
  
@@@ -1822,154 -1818,10 +1822,10 @@@ static int btrfs_writepage_end_io_hook(
        return btrfs_finish_ordered_io(page->mapping->host, start, end);
  }
  
- /*
-  * When IO fails, either with EIO or csum verification fails, we
-  * try other mirrors that might have a good copy of the data.  This
-  * io_failure_record is used to record state as we go through all the
-  * mirrors.  If another mirror has good data, the page is set up to date
-  * and things continue.  If a good mirror can't be found, the original
-  * bio end_io callback is called to indicate things have failed.
-  */
- struct io_failure_record {
-       struct page *page;
-       u64 start;
-       u64 len;
-       u64 logical;
-       unsigned long bio_flags;
-       int last_mirror;
- };
- static int btrfs_io_failed_hook(struct bio *failed_bio,
-                        struct page *page, u64 start, u64 end,
-                        struct extent_state *state)
- {
-       struct io_failure_record *failrec = NULL;
-       u64 private;
-       struct extent_map *em;
-       struct inode *inode = page->mapping->host;
-       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
-       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-       struct bio *bio;
-       int num_copies;
-       int ret;
-       int rw;
-       u64 logical;
-       ret = get_state_private(failure_tree, start, &private);
-       if (ret) {
-               failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
-               if (!failrec)
-                       return -ENOMEM;
-               failrec->start = start;
-               failrec->len = end - start + 1;
-               failrec->last_mirror = 0;
-               failrec->bio_flags = 0;
-               read_lock(&em_tree->lock);
-               em = lookup_extent_mapping(em_tree, start, failrec->len);
-               if (em->start > start || em->start + em->len < start) {
-                       free_extent_map(em);
-                       em = NULL;
-               }
-               read_unlock(&em_tree->lock);
-               if (IS_ERR_OR_NULL(em)) {
-                       kfree(failrec);
-                       return -EIO;
-               }
-               logical = start - em->start;
-               logical = em->block_start + logical;
-               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-                       logical = em->block_start;
-                       failrec->bio_flags = EXTENT_BIO_COMPRESSED;
-                       extent_set_compress_type(&failrec->bio_flags,
-                                                em->compress_type);
-               }
-               failrec->logical = logical;
-               free_extent_map(em);
-               set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
-                               EXTENT_DIRTY, GFP_NOFS);
-               set_state_private(failure_tree, start,
-                                (u64)(unsigned long)failrec);
-       } else {
-               failrec = (struct io_failure_record *)(unsigned long)private;
-       }
-       num_copies = btrfs_num_copies(
-                             &BTRFS_I(inode)->root->fs_info->mapping_tree,
-                             failrec->logical, failrec->len);
-       failrec->last_mirror++;
-       if (!state) {
-               spin_lock(&BTRFS_I(inode)->io_tree.lock);
-               state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
-                                                   failrec->start,
-                                                   EXTENT_LOCKED);
-               if (state && state->start != failrec->start)
-                       state = NULL;
-               spin_unlock(&BTRFS_I(inode)->io_tree.lock);
-       }
-       if (!state || failrec->last_mirror > num_copies) {
-               set_state_private(failure_tree, failrec->start, 0);
-               clear_extent_bits(failure_tree, failrec->start,
-                                 failrec->start + failrec->len - 1,
-                                 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
-               kfree(failrec);
-               return -EIO;
-       }
-       bio = bio_alloc(GFP_NOFS, 1);
-       bio->bi_private = state;
-       bio->bi_end_io = failed_bio->bi_end_io;
-       bio->bi_sector = failrec->logical >> 9;
-       bio->bi_bdev = failed_bio->bi_bdev;
-       bio->bi_size = 0;
-       bio_add_page(bio, page, failrec->len, start - page_offset(page));
-       if (failed_bio->bi_rw & REQ_WRITE)
-               rw = WRITE;
-       else
-               rw = READ;
-       ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-                                                     failrec->last_mirror,
-                                                     failrec->bio_flags, 0);
-       return ret;
- }
- /*
-  * each time an IO finishes, we do a fast check in the IO failure tree
-  * to see if we need to process or clean up an io_failure_record
-  */
- static int btrfs_clean_io_failures(struct inode *inode, u64 start)
- {
-       u64 private;
-       u64 private_failure;
-       struct io_failure_record *failure;
-       int ret;
-       private = 0;
-       if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-                            (u64)-1, 1, EXTENT_DIRTY, 0)) {
-               ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
-                                       start, &private_failure);
-               if (ret == 0) {
-                       failure = (struct io_failure_record *)(unsigned long)
-                                  private_failure;
-                       set_state_private(&BTRFS_I(inode)->io_failure_tree,
-                                         failure->start, 0);
-                       clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
-                                         failure->start,
-                                         failure->start + failure->len - 1,
-                                         EXTENT_DIRTY | EXTENT_LOCKED,
-                                         GFP_NOFS);
-                       kfree(failure);
-               }
-       }
-       return 0;
- }
  /*
   * when reads are done, we need to check csums to verify the data is correct
-  * if there's a match, we allow the bio to finish.  If not, we go through
-  * the io_failure_record routines to find good copies
+  * if there's a match, we allow the bio to finish.  If not, the code in
+  * extent_io.c will try to find good copies for us.
   */
  static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                               struct extent_state *state)
  
        kunmap_atomic(kaddr, KM_USER0);
  good:
-       /* if the io failure tree for this inode is non-empty,
-        * check to see if we've recovered from a failed IO
-        */
-       btrfs_clean_io_failures(inode, start);
        return 0;
  
  zeroit:
@@@ -2083,6 -1931,89 +1935,6 @@@ void btrfs_run_delayed_iputs(struct btr
        up_read(&root->fs_info->cleanup_work_sem);
  }
  
 -/*
 - * calculate extra metadata reservation when snapshotting a subvolume
 - * contains orphan files.
 - */
 -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
 -                              struct btrfs_pending_snapshot *pending,
 -                              u64 *bytes_to_reserve)
 -{
 -      struct btrfs_root *root;
 -      struct btrfs_block_rsv *block_rsv;
 -      u64 num_bytes;
 -      int index;
 -
 -      root = pending->root;
 -      if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
 -              return;
 -
 -      block_rsv = root->orphan_block_rsv;
 -
 -      /* orphan block reservation for the snapshot */
 -      num_bytes = block_rsv->size;
 -
 -      /*
 -       * after the snapshot is created, COWing tree blocks may use more
 -       * space than it frees. So we should make sure there is enough
 -       * reserved space.
 -       */
 -      index = trans->transid & 0x1;
 -      if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
 -              num_bytes += block_rsv->size -
 -                           (block_rsv->reserved + block_rsv->freed[index]);
 -      }
 -
 -      *bytes_to_reserve += num_bytes;
 -}
 -
 -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
 -                              struct btrfs_pending_snapshot *pending)
 -{
 -      struct btrfs_root *root = pending->root;
 -      struct btrfs_root *snap = pending->snap;
 -      struct btrfs_block_rsv *block_rsv;
 -      u64 num_bytes;
 -      int index;
 -      int ret;
 -
 -      if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
 -              return;
 -
 -      /* refill source subvolume's orphan block reservation */
 -      block_rsv = root->orphan_block_rsv;
 -      index = trans->transid & 0x1;
 -      if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
 -              num_bytes = block_rsv->size -
 -                          (block_rsv->reserved + block_rsv->freed[index]);
 -              ret = btrfs_block_rsv_migrate(&pending->block_rsv,
 -                                            root->orphan_block_rsv,
 -                                            num_bytes);
 -              BUG_ON(ret);
 -      }
 -
 -      /* setup orphan block reservation for the snapshot */
 -      block_rsv = btrfs_alloc_block_rsv(snap);
 -      BUG_ON(!block_rsv);
 -
 -      btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
 -      snap->orphan_block_rsv = block_rsv;
 -
 -      num_bytes = root->orphan_block_rsv->size;
 -      ret = btrfs_block_rsv_migrate(&pending->block_rsv,
 -                                    block_rsv, num_bytes);
 -      BUG_ON(ret);
 -
 -#if 0
 -      /* insert orphan item for the snapshot */
 -      WARN_ON(!root->orphan_item_inserted);
 -      ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
 -                                     snap->root_key.objectid);
 -      BUG_ON(ret);
 -      snap->orphan_item_inserted = 1;
 -#endif
 -}
 -
  enum btrfs_orphan_cleanup_state {
        ORPHAN_CLEANUP_STARTED  = 1,
        ORPHAN_CLEANUP_DONE     = 2,
@@@ -2168,6 -2099,9 +2020,6 @@@ int btrfs_orphan_add(struct btrfs_trans
        }
        spin_unlock(&root->orphan_lock);
  
 -      if (block_rsv)
 -              btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
 -
        /* grab metadata reservation from transaction handle */
        if (reserve) {
                ret = btrfs_orphan_reserve_metadata(trans, inode);
@@@ -2234,7 -2168,6 +2086,7 @@@ int btrfs_orphan_cleanup(struct btrfs_r
        struct btrfs_key key, found_key;
        struct btrfs_trans_handle *trans;
        struct inode *inode;
 +      u64 last_objectid = 0;
        int ret = 0, nr_unlink = 0, nr_truncate = 0;
  
        if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
                 * crossing root thing.  we store the inode number in the
                 * offset of the orphan item.
                 */
 +
 +              if (found_key.offset == last_objectid) {
 +                      printk(KERN_ERR "btrfs: Error removing orphan entry, "
 +                             "stopping orphan cleanup\n");
 +                      ret = -EINVAL;
 +                      goto out;
 +              }
 +
 +              last_objectid = found_key.offset;
 +
                found_key.objectid = found_key.offset;
                found_key.type = BTRFS_INODE_ITEM_KEY;
                found_key.offset = 0;
                inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
 -              if (IS_ERR(inode)) {
 -                      ret = PTR_ERR(inode);
 +              ret = PTR_RET(inode);
 +              if (ret && ret != -ESTALE)
                        goto out;
 -              }
  
                /*
 -               * add this inode to the orphan list so btrfs_orphan_del does
 -               * the proper thing when we hit it
 -               */
 -              spin_lock(&root->orphan_lock);
 -              list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
 -              spin_unlock(&root->orphan_lock);
 -
 -              /*
 -               * if this is a bad inode, means we actually succeeded in
 -               * removing the inode, but not the orphan record, which means
 -               * we need to manually delete the orphan since iput will just
 -               * do a destroy_inode
 +               * Inode is already gone but the orphan item is still there,
 +               * kill the orphan item.
                 */
 -              if (is_bad_inode(inode)) {
 -                      trans = btrfs_start_transaction(root, 0);
 +              if (ret == -ESTALE) {
 +                      trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
                                goto out;
                        }
 -                      btrfs_orphan_del(trans, inode);
 +                      ret = btrfs_del_orphan_item(trans, root,
 +                                                  found_key.objectid);
 +                      BUG_ON(ret);
                        btrfs_end_transaction(trans, root);
 -                      iput(inode);
                        continue;
                }
  
 +              /*
 +               * add this inode to the orphan list so btrfs_orphan_del does
 +               * the proper thing when we hit it
 +               */
 +              spin_lock(&root->orphan_lock);
 +              list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
 +              spin_unlock(&root->orphan_lock);
 +
                /* if we have links, this was a truncate, lets do that */
                if (inode->i_nlink) {
                        if (!S_ISREG(inode->i_mode)) {
@@@ -2762,16 -2687,7 +2614,16 @@@ static struct btrfs_trans_handle *__unl
        u64 ino = btrfs_ino(inode);
        u64 dir_ino = btrfs_ino(dir);
  
 -      trans = btrfs_start_transaction(root, 10);
 +      /*
 +       * 1 for the possible orphan item
 +       * 1 for the dir item
 +       * 1 for the dir index
 +       * 1 for the inode ref
 +       * 1 for the inode ref in the tree log
 +       * 2 for the dir entries in the log
 +       * 1 for the inode
 +       */
 +      trans = btrfs_start_transaction(root, 8);
        if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
                return trans;
  
                return ERR_PTR(-ENOMEM);
        }
  
 -      trans = btrfs_start_transaction(root, 0);
 +      /* 1 for the orphan item */
 +      trans = btrfs_start_transaction(root, 1);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                root->fs_info->enospc_unlink = 0;
        err = 0;
  out:
        btrfs_free_path(path);
 +      /* Migrate the orphan reservation over */
 +      if (!err)
 +              err = btrfs_block_rsv_migrate(trans->block_rsv,
 +                              &root->fs_info->global_block_rsv,
 +                              trans->bytes_reserved);
 +
        if (err) {
                btrfs_end_transaction(trans, root);
                root->fs_info->enospc_unlink = 0;
@@@ -2920,9 -2829,6 +2772,9 @@@ static void __unlink_end_trans(struct b
                               struct btrfs_root *root)
  {
        if (trans->block_rsv == &root->fs_info->global_block_rsv) {
 +              btrfs_block_rsv_release(root, trans->block_rsv,
 +                                      trans->bytes_reserved);
 +              trans->block_rsv = &root->fs_info->trans_block_rsv;
                BUG_ON(!root->fs_info->enospc_unlink);
                root->fs_info->enospc_unlink = 0;
        }
@@@ -3314,7 -3220,6 +3166,7 @@@ static int btrfs_truncate_page(struct a
        pgoff_t index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
        struct page *page;
 +      gfp_t mask = btrfs_alloc_write_mask(mapping);
        int ret = 0;
        u64 page_start;
        u64 page_end;
  
        ret = -ENOMEM;
  again:
 -      page = find_or_create_page(mapping, index, GFP_NOFS);
 +      page = find_or_create_page(mapping, index, mask);
        if (!page) {
                btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
                goto out;
@@@ -3560,8 -3465,6 +3412,8 @@@ void btrfs_evict_inode(struct inode *in
  {
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
 +      struct btrfs_block_rsv *rsv, *global_rsv;
 +      u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
        unsigned long nr;
        int ret;
  
                goto no_delete;
        }
  
 +      rsv = btrfs_alloc_block_rsv(root);
 +      if (!rsv) {
 +              btrfs_orphan_del(NULL, inode);
 +              goto no_delete;
 +      }
 +      rsv->size = min_size;
 +      global_rsv = &root->fs_info->global_block_rsv;
 +
        btrfs_i_size_write(inode, 0);
  
 +      /*
 +       * This is a bit simpler than btrfs_truncate since
 +       *
 +       * 1) We've already reserved our space for our orphan item in the
 +       *    unlink.
 +       * 2) We're going to delete the inode item, so we don't need to update
 +       *    it at all.
 +       *
 +       * So we just need to reserve some slack space in case we add bytes when
 +       * doing the truncate.
 +       */
        while (1) {
 -              trans = btrfs_join_transaction(root);
 -              BUG_ON(IS_ERR(trans));
 -              trans->block_rsv = root->orphan_block_rsv;
 +              ret = btrfs_block_rsv_refill(root, rsv, min_size);
 +
 +              /*
 +               * Try and steal from the global reserve since we will
 +               * likely not use this space anyway, we want to try as
 +               * hard as possible to get this to work.
 +               */
 +              if (ret)
 +                      ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
  
 -              ret = btrfs_block_rsv_check(trans, root,
 -                                          root->orphan_block_rsv, 0, 5);
                if (ret) {
 -                      BUG_ON(ret != -EAGAIN);
 -                      ret = btrfs_commit_transaction(trans, root);
 -                      BUG_ON(ret);
 -                      continue;
 +                      printk(KERN_WARNING "Could not get space for a "
 +                             "delete, will truncate on mount %d\n", ret);
 +                      btrfs_orphan_del(NULL, inode);
 +                      btrfs_free_block_rsv(root, rsv);
 +                      goto no_delete;
 +              }
 +
 +              trans = btrfs_start_transaction(root, 0);
 +              if (IS_ERR(trans)) {
 +                      btrfs_orphan_del(NULL, inode);
 +                      btrfs_free_block_rsv(root, rsv);
 +                      goto no_delete;
                }
  
 +              trans->block_rsv = rsv;
 +
                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
                if (ret != -EAGAIN)
                        break;
                btrfs_end_transaction(trans, root);
                trans = NULL;
                btrfs_btree_balance_dirty(root, nr);
 -
        }
  
 +      btrfs_free_block_rsv(root, rsv);
 +
        if (ret == 0) {
 +              trans->block_rsv = root->orphan_block_rsv;
                ret = btrfs_orphan_del(trans, inode);
                BUG_ON(ret);
        }
  
 +      trans->block_rsv = &root->fs_info->trans_block_rsv;
        if (!(root == root->fs_info->tree_root ||
              root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
                btrfs_return_ino(root, btrfs_ino(inode));
@@@ -5780,7 -5647,8 +5632,7 @@@ again
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
                ret = btrfs_ordered_update_i_size(inode, 0, ordered);
                if (!ret)
 -                      ret = btrfs_update_inode(trans, root, inode);
 -              err = ret;
 +                      err = btrfs_update_inode(trans, root, inode);
                goto out;
        }
  
@@@ -6273,7 -6141,7 +6125,7 @@@ int btrfs_readpage(struct file *file, s
  {
        struct extent_io_tree *tree;
        tree = &BTRFS_I(page->mapping->host)->io_tree;
-       return extent_read_full_page(tree, page, btrfs_get_extent);
+       return extent_read_full_page(tree, page, btrfs_get_extent, 0);
  }
  
  static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@@ -6525,7 -6393,6 +6377,7 @@@ static int btrfs_truncate(struct inode 
        struct btrfs_trans_handle *trans;
        unsigned long nr;
        u64 mask = root->sectorsize - 1;
 +      u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
  
        ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
        if (ret)
        rsv = btrfs_alloc_block_rsv(root);
        if (!rsv)
                return -ENOMEM;
 -      btrfs_add_durable_block_rsv(root->fs_info, rsv);
 +      rsv->size = min_size;
  
 +      /*
 +       * 1 for the truncate slack space
 +       * 1 for the orphan item we're going to add
 +       * 1 for the orphan item deletion
 +       * 1 for updating the inode.
 +       */
        trans = btrfs_start_transaction(root, 4);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out;
        }
  
 -      /*
 -       * Reserve space for the truncate process.  Truncate should be adding
 -       * space, but if there are snapshots it may end up using space.
 -       */
 -      ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
 +      /* Migrate the slack space for the truncate to our reserve */
 +      ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
 +                                    min_size);
        BUG_ON(ret);
  
        ret = btrfs_orphan_add(trans, inode);
                goto out;
        }
  
 -      nr = trans->blocks_used;
 -      btrfs_end_transaction(trans, root);
 -      btrfs_btree_balance_dirty(root, nr);
 -
 -      /*
 -       * Ok so we've already migrated our bytes over for the truncate, so here
 -       * just reserve the one slot we need for updating the inode.
 -       */
 -      trans = btrfs_start_transaction(root, 1);
 -      if (IS_ERR(trans)) {
 -              err = PTR_ERR(trans);
 -              goto out;
 -      }
 -      trans->block_rsv = rsv;
 -
        /*
         * setattr is responsible for setting the ordered_data_close flag,
         * but that is only tested during the last file release.  That
                btrfs_add_ordered_operation(trans, root, inode);
  
        while (1) {
 +              ret = btrfs_block_rsv_refill(root, rsv, min_size);
 +              if (ret) {
 +                      /*
 +                       * This can only happen with the original transaction we
 +                       * started above, every other time we shouldn't have a
 +                       * transaction started yet.
 +                       */
 +                      if (ret == -EAGAIN)
 +                              goto end_trans;
 +                      err = ret;
 +                      break;
 +              }
 +
                if (!trans) {
 -                      trans = btrfs_start_transaction(root, 3);
 +                      /* Just need the 1 for updating the inode */
 +                      trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                err = PTR_ERR(trans);
                                goto out;
                        }
 -
 -                      ret = btrfs_truncate_reserve_metadata(trans, root,
 -                                                            rsv);
 -                      BUG_ON(ret);
 -
 -                      trans->block_rsv = rsv;
                }
  
 +              trans->block_rsv = rsv;
 +
                ret = btrfs_truncate_inode_items(trans, root, inode,
                                                 inode->i_size,
                                                 BTRFS_EXTENT_DATA_KEY);
                        err = ret;
                        break;
                }
 -
 +end_trans:
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
                trans = NULL;
@@@ -6739,9 -6607,9 +6591,9 @@@ struct inode *btrfs_alloc_inode(struct 
        ei->last_sub_trans = 0;
        ei->logged_trans = 0;
        ei->delalloc_bytes = 0;
 -      ei->reserved_bytes = 0;
        ei->disk_i_size = 0;
        ei->flags = 0;
 +      ei->csum_bytes = 0;
        ei->index_cnt = (u64)-1;
        ei->last_unlink_trans = 0;
  
@@@ -6787,8 -6655,6 +6639,8 @@@ void btrfs_destroy_inode(struct inode *
        WARN_ON(inode->i_data.nrpages);
        WARN_ON(BTRFS_I(inode)->outstanding_extents);
        WARN_ON(BTRFS_I(inode)->reserved_extents);
 +      WARN_ON(BTRFS_I(inode)->delalloc_bytes);
 +      WARN_ON(BTRFS_I(inode)->csum_bytes);
  
        /*
         * This can happen where we create an inode, but somebody else also
@@@ -7406,7 -7272,6 +7258,6 @@@ static struct extent_io_ops btrfs_exten
        .readpage_end_io_hook = btrfs_readpage_end_io_hook,
        .writepage_end_io_hook = btrfs_writepage_end_io_hook,
        .writepage_start_hook = btrfs_writepage_start_hook,
-       .readpage_io_failed_hook = btrfs_io_failed_hook,
        .set_bit_hook = btrfs_set_bit_hook,
        .clear_bit_hook = btrfs_clear_bit_hook,
        .merge_extent_hook = btrfs_merge_extent_hook,
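
One idiom worth noting in the inode.c hunks above: the orphan-cleanup loop now uses PTR_RET() on the result of btrfs_iget() so that a stale inode (-ESTALE) can be handled by deleting the orphan item instead of aborting the whole cleanup. PTR_RET() comes from linux/err.h and is equivalent to the following hand-written helper (the name here is hypothetical, for illustration only):

/* what ret = PTR_RET(inode) evaluates to in the hunk above */
static inline int inode_errno(struct inode *inode)
{
        return IS_ERR(inode) ? PTR_ERR(inode) : 0;
}
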
diff --combined fs/btrfs/ioctl.c
index 8f6e14279409dc9fcef4dd0702ed0f02c296404a,7f57efa76d11e5027e6cec4d3910672dfc79bd7f..cc98939903417238bb9a0be978bd0c1aef6ff99a
@@@ -51,6 -51,7 +51,7 @@@
  #include "volumes.h"
  #include "locking.h"
  #include "inode-map.h"
+ #include "backref.h"
  
  /* Mask out flags that are inappropriate for the given type of inode. */
  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@@ -117,7 -118,7 +118,7 @@@ void btrfs_update_iflags(struct inode *
  /*
   * Inherit flags from the parent inode.
   *
 - * Unlike extN we don't have any flags we don't want to inherit currently.
 + * Currently only the compression flags and the cow flags are inherited.
   */
  void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
  {
  
        flags = BTRFS_I(dir)->flags;
  
 -      if (S_ISREG(inode->i_mode))
 -              flags &= ~BTRFS_INODE_DIRSYNC;
 -      else if (!S_ISDIR(inode->i_mode))
 -              flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
 +      if (flags & BTRFS_INODE_NOCOMPRESS) {
 +              BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
 +              BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
 +      } else if (flags & BTRFS_INODE_COMPRESS) {
 +              BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
 +              BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
 +      }
 +
 +      if (flags & BTRFS_INODE_NODATACOW)
 +              BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
  
 -      BTRFS_I(inode)->flags = flags;
        btrfs_update_iflags(inode);
  }
  
@@@ -282,7 -278,6 +283,7 @@@ static noinline int btrfs_ioctl_fitrim(
        struct fstrim_range range;
        u64 minlen = ULLONG_MAX;
        u64 num_devices = 0;
 +      u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
        int ret;
  
        if (!capable(CAP_SYS_ADMIN))
                }
        }
        rcu_read_unlock();
 +
        if (!num_devices)
                return -EOPNOTSUPP;
 -
        if (copy_from_user(&range, arg, sizeof(range)))
                return -EFAULT;
 +      if (range.start > total_bytes)
 +              return -EINVAL;
  
 +      range.len = min(range.len, total_bytes - range.start);
        range.minlen = max(range.minlen, minlen);
        ret = btrfs_trim_fs(root, &range);
        if (ret < 0)
@@@ -769,7 -761,7 +770,7 @@@ static int should_defrag_range(struct i
        int ret = 1;
  
        /*
 -       * make sure that once we start defragging and extent, we keep on
 +       * make sure that once we start defragging an extent, we keep on
         * defragging it
         */
        if (start < *defrag_end)
         * extent will force at least part of that big extent to be defragged.
         */
        if (ret) {
 -              *last_len += len;
                *defrag_end = extent_map_end(em);
        } else {
                *last_len = 0;
@@@ -851,7 -844,6 +852,7 @@@ static int cluster_pages_for_defrag(str
        int i_done;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
 +      gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
  
        if (isize == 0)
                return 0;
@@@ -869,7 -861,7 +870,7 @@@ again
        for (i = 0; i < num_pages; i++) {
                struct page *page;
                page = find_or_create_page(inode->i_mapping,
 -                                          start_index + i, GFP_NOFS);
 +                                          start_index + i, mask);
                if (!page)
                        break;
  
@@@ -981,20 -973,18 +982,20 @@@ int btrfs_defrag_file(struct inode *ino
        struct btrfs_super_block *disk_super;
        struct file_ra_state *ra = NULL;
        unsigned long last_index;
 +      u64 isize = i_size_read(inode);
        u64 features;
        u64 last_len = 0;
        u64 skip = 0;
        u64 defrag_end = 0;
        u64 newer_off = range->start;
 -      int newer_left = 0;
        unsigned long i;
 +      unsigned long ra_index = 0;
        int ret;
        int defrag_count = 0;
        int compress_type = BTRFS_COMPRESS_ZLIB;
        int extent_thresh = range->extent_thresh;
 -      int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
 +      int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
 +      int cluster = max_cluster;
        u64 new_align = ~((u64)128 * 1024 - 1);
        struct page **pages = NULL;
  
                        compress_type = range->compress_type;
        }
  
 -      if (inode->i_size == 0)
 +      if (isize == 0)
                return 0;
  
        /*
                ra = &file->f_ra;
        }
  
 -      pages = kmalloc(sizeof(struct page *) * newer_cluster,
 +      pages = kmalloc(sizeof(struct page *) * max_cluster,
                        GFP_NOFS);
        if (!pages) {
                ret = -ENOMEM;
  
        /* find the last page to defrag */
        if (range->start + range->len > range->start) {
 -              last_index = min_t(u64, inode->i_size - 1,
 +              last_index = min_t(u64, isize - 1,
                         range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
        } else {
 -              last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
 +              last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
        }
  
        if (newer_than) {
                         * the extents in the file evenly spaced
                         */
                        i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
 -                      newer_left = newer_cluster;
                } else
                        goto out_ra;
        } else {
                i = range->start >> PAGE_CACHE_SHIFT;
        }
        if (!max_to_defrag)
 -              max_to_defrag = last_index - 1;
 +              max_to_defrag = last_index;
 +
 +      /*
  +       * make writeback start from i, so the defrag range can be
 +       * written sequentially.
 +       */
 +      if (i < inode->i_mapping->writeback_index)
 +              inode->i_mapping->writeback_index = i;
  
 -      while (i <= last_index && defrag_count < max_to_defrag) {
 +      while (i <= last_index && defrag_count < max_to_defrag &&
 +             (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
 +              PAGE_CACHE_SHIFT)) {
                /*
                 * make sure we stop running if someone unmounts
                 * the FS
                        i = max(i + 1, next);
                        continue;
                }
 +
 +              if (!newer_than) {
 +                      cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
 +                                 PAGE_CACHE_SHIFT) - i;
 +                      cluster = min(cluster, max_cluster);
 +              } else {
 +                      cluster = max_cluster;
 +              }
 +
                if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
                        BTRFS_I(inode)->force_compress = compress_type;
  
 -              btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
 +              if (i + cluster > ra_index) {
 +                      ra_index = max(i, ra_index);
 +                      btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
 +                                     cluster);
 +                      ra_index += max_cluster;
 +              }
  
 -              ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
 +              ret = cluster_pages_for_defrag(inode, pages, i, cluster);
                if (ret < 0)
                        goto out_ra;
  
                defrag_count += ret;
                balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
 -              i += ret;
  
                if (newer_than) {
                        if (newer_off == (u64)-1)
                        if (!ret) {
                                range->start = newer_off;
                                i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
 -                              newer_left = newer_cluster;
                        } else {
                                break;
                        }
                } else {
 -                      i++;
 +                      if (ret > 0) {
 +                              i += ret;
 +                              last_len += ret << PAGE_CACHE_SHIFT;
 +                      } else {
 +                              i++;
 +                              last_len = 0;
 +                      }
                }
        }
  
                mutex_unlock(&inode->i_mutex);
        }
  
 -      disk_super = &root->fs_info->super_copy;
 +      disk_super = root->fs_info->super_copy;
        features = btrfs_super_incompat_flags(disk_super);
        if (range->compress_type == BTRFS_COMPRESS_LZO) {
                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
                btrfs_set_super_incompat_flags(disk_super, features);
        }
  
 -      if (!file)
 -              kfree(ra);
 -      return defrag_count;
 +      ret = defrag_count;
  
  out_ra:
        if (!file)
@@@ -2613,7 -2579,7 +2614,7 @@@ static long btrfs_ioctl_default_subvol(
                return PTR_ERR(trans);
        }
  
 -      dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
 +      dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
        di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
                                   dir_id, "default", 7, 1);
        if (IS_ERR_OR_NULL(di)) {
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
  
 -      disk_super = &root->fs_info->super_copy;
 +      disk_super = root->fs_info->super_copy;
        features = btrfs_super_incompat_flags(disk_super);
        if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
                features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@@ -2890,6 -2856,144 +2891,144 @@@ static long btrfs_ioctl_scrub_progress(
        return ret;
  }
  
 -      struct btrfs_ioctl_ino_path_args *ipa;
+ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
+ {
+       int ret = 0;
+       int i;
+       unsigned long rel_ptr;
+       int size;
++      struct btrfs_ioctl_ino_path_args *ipa = NULL;
+       struct inode_fs_paths *ipath = NULL;
+       struct btrfs_path *path;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ipa = memdup_user(arg, sizeof(*ipa));
+       if (IS_ERR(ipa)) {
+               ret = PTR_ERR(ipa);
+               ipa = NULL;
+               goto out;
+       }
+       size = min_t(u32, ipa->size, 4096);
+       ipath = init_ipath(size, root, path);
+       if (IS_ERR(ipath)) {
+               ret = PTR_ERR(ipath);
+               ipath = NULL;
+               goto out;
+       }
+       ret = paths_from_inode(ipa->inum, ipath);
+       if (ret < 0)
+               goto out;
+       for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
+               rel_ptr = ipath->fspath->str[i] - (char *)ipath->fspath->str;
+               ipath->fspath->str[i] = (void *)rel_ptr;
+       }
+       ret = copy_to_user(ipa->fspath, ipath->fspath, size);
+       if (ret) {
+               ret = -EFAULT;
+               goto out;
+       }
+ out:
+       btrfs_free_path(path);
+       free_ipath(ipath);
+       kfree(ipa);
+       return ret;
+ }
+
+ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+ {
+       struct btrfs_data_container *inodes = ctx;
+       const size_t c = 3 * sizeof(u64);
+       if (inodes->bytes_left >= c) {
+               inodes->bytes_left -= c;
+               inodes->val[inodes->elem_cnt] = inum;
+               inodes->val[inodes->elem_cnt + 1] = offset;
+               inodes->val[inodes->elem_cnt + 2] = root;
+               inodes->elem_cnt += 3;
+       } else {
+               inodes->bytes_missing += c - inodes->bytes_left;
+               inodes->bytes_left = 0;
+               inodes->elem_missed += 3;
+       }
+       return 0;
+ }
+
+ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
+                                       void __user *arg)
+ {
+       int ret = 0;
+       int size;
+       u64 extent_offset;
+       struct btrfs_ioctl_logical_ino_args *loi;
+       struct btrfs_data_container *inodes = NULL;
+       struct btrfs_path *path = NULL;
+       struct btrfs_key key;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       loi = memdup_user(arg, sizeof(*loi));
+       if (IS_ERR(loi)) {
+               ret = PTR_ERR(loi);
+               loi = NULL;
+               goto out;
+       }
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       size = min_t(u32, loi->size, 4096);
+       inodes = init_data_container(size);
+       if (IS_ERR(inodes)) {
+               ret = PTR_ERR(inodes);
+               inodes = NULL;
+               goto out;
+       }
+       ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+       if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+               ret = -ENOENT;
+       if (ret < 0)
+               goto out;
+       extent_offset = loi->logical - key.objectid;
+       ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
+                                       extent_offset, build_ino_list, inodes);
+       if (ret < 0)
+               goto out;
+       ret = copy_to_user(loi->inodes, inodes, size);
+       if (ret)
+               ret = -EFAULT;
+ out:
+       btrfs_free_path(path);
+       kfree(inodes);
+       kfree(loi);
+       return ret;
+ }
  long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
  {
                return btrfs_ioctl_tree_search(file, argp);
        case BTRFS_IOC_INO_LOOKUP:
                return btrfs_ioctl_ino_lookup(file, argp);
+       case BTRFS_IOC_INO_PATHS:
+               return btrfs_ioctl_ino_to_path(root, argp);
+       case BTRFS_IOC_LOGICAL_INO:
+               return btrfs_ioctl_logical_to_ino(root, argp);
        case BTRFS_IOC_SPACE_INFO:
                return btrfs_ioctl_space_info(root, argp);
        case BTRFS_IOC_SYNC:
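
Besides the new BTRFS_IOC_INO_PATHS and BTRFS_IOC_LOGICAL_INO cases, this file's diff also tightens btrfs_ioctl_fitrim(): a trim start beyond the filesystem size now returns -EINVAL, and the requested length is clamped to the space the filesystem actually covers. A small userspace sketch of the call being validated, assuming the handler is reached through the generic FITRIM ioctl; the function below is illustrative and not part of the commit:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */

/* fd: an open descriptor on a mounted btrfs filesystem */
int trim_whole_fs(int fd)
{
        struct fstrim_range range = {
                .start  = 0,
                .len    = (unsigned long long)-1, /* kernel clamps this to fs size */
                .minlen = 0,
        };

        if (ioctl(fd, FITRIM, &range) < 0) {
                perror("FITRIM");       /* e.g. EINVAL if start is past the fs */
                return -1;
        }
        return 0;
}
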
diff --combined fs/btrfs/reada.c
index 2b701d0822274990ba65a111bfd34927f34495d4,0000000000000000000000000000000000000000..cd857119ba8a4cd2ea9ee85bb1a60fe07fb3ad11
mode 100644,000000..100644
--- /dev/null
@@@ -1,949 -1,0 +1,949 @@@
-                                         struct btrfs_multi_bio *multi)
 +/*
 + * Copyright (C) 2011 STRATO.  All rights reserved.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public
 + * License v2 as published by the Free Software Foundation.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public
 + * License along with this program; if not, write to the
 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 + * Boston, MA 021110-1307, USA.
 + */
 +
 +#include <linux/sched.h>
 +#include <linux/pagemap.h>
 +#include <linux/writeback.h>
 +#include <linux/blkdev.h>
 +#include <linux/rbtree.h>
 +#include <linux/slab.h>
 +#include <linux/workqueue.h>
 +#include "ctree.h"
 +#include "volumes.h"
 +#include "disk-io.h"
 +#include "transaction.h"
 +
 +#undef DEBUG
 +
 +/*
 + * This is the implementation for the generic read ahead framework.
 + *
 + * To trigger a readahead, btrfs_reada_add must be called. It will start
 + * a read ahead for the given range [start, end) on tree root. The returned
 + * handle can either be used to wait on the readahead to finish
 + * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
 + *
 + * The read ahead works as follows:
 + * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
 + * reada_start_machine will then search for extents to prefetch and trigger
 + * some reads. When a read finishes for a node, all contained node/leaf
 + * pointers that lie in the given range will also be enqueued. The reads will
 + * be triggered in sequential order, thus giving a big win over a naive
 + * enumeration. It will also make use of multi-device layouts. Each disk
  + * will have its own read pointer and all disks will be utilized in parallel.
  + * Also, no two disks will read both sides of a mirror simultaneously, as this
 + * would waste seeking capacity. Instead both disks will read different parts
 + * of the filesystem.
 + * Any number of readaheads can be started in parallel. The read order will be
 + * determined globally, i.e. 2 parallel readaheads will normally finish faster
 + * than the 2 started one after another.
 + */
 +
 +#define MAX_MIRRORS 2
 +#define MAX_IN_FLIGHT 6
 +
 +struct reada_extctl {
 +      struct list_head        list;
 +      struct reada_control    *rc;
 +      u64                     generation;
 +};
 +
 +struct reada_extent {
 +      u64                     logical;
 +      struct btrfs_key        top;
 +      u32                     blocksize;
 +      int                     err;
 +      struct list_head        extctl;
 +      struct kref             refcnt;
 +      spinlock_t              lock;
 +      struct reada_zone       *zones[MAX_MIRRORS];
 +      int                     nzones;
 +      struct btrfs_device     *scheduled_for;
 +};
 +
 +struct reada_zone {
 +      u64                     start;
 +      u64                     end;
 +      u64                     elems;
 +      struct list_head        list;
 +      spinlock_t              lock;
 +      int                     locked;
 +      struct btrfs_device     *device;
 +      struct btrfs_device     *devs[MAX_MIRRORS]; /* full list, incl self */
 +      int                     ndevs;
 +      struct kref             refcnt;
 +};
 +
 +struct reada_machine_work {
 +      struct btrfs_work       work;
 +      struct btrfs_fs_info    *fs_info;
 +};
 +
 +static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
 +static void reada_control_release(struct kref *kref);
 +static void reada_zone_release(struct kref *kref);
 +static void reada_start_machine(struct btrfs_fs_info *fs_info);
 +static void __reada_start_machine(struct btrfs_fs_info *fs_info);
 +
 +static int reada_add_block(struct reada_control *rc, u64 logical,
 +                         struct btrfs_key *top, int level, u64 generation);
 +
 +/* recurses */
 +/* in case of err, eb might be NULL */
 +static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 +                          u64 start, int err)
 +{
 +      int level = 0;
 +      int nritems;
 +      int i;
 +      u64 bytenr;
 +      u64 generation;
 +      struct reada_extent *re;
 +      struct btrfs_fs_info *fs_info = root->fs_info;
 +      struct list_head list;
 +      unsigned long index = start >> PAGE_CACHE_SHIFT;
 +      struct btrfs_device *for_dev;
 +
 +      if (eb)
 +              level = btrfs_header_level(eb);
 +
 +      /* find extent */
 +      spin_lock(&fs_info->reada_lock);
 +      re = radix_tree_lookup(&fs_info->reada_tree, index);
 +      if (re)
 +              kref_get(&re->refcnt);
 +      spin_unlock(&fs_info->reada_lock);
 +
 +      if (!re)
 +              return -1;
 +
 +      spin_lock(&re->lock);
 +      /*
 +       * just take the full list from the extent. afterwards we
 +       * don't need the lock anymore
 +       */
 +      list_replace_init(&re->extctl, &list);
 +      for_dev = re->scheduled_for;
 +      re->scheduled_for = NULL;
 +      spin_unlock(&re->lock);
 +
 +      if (err == 0) {
 +              nritems = level ? btrfs_header_nritems(eb) : 0;
 +              generation = btrfs_header_generation(eb);
 +              /*
 +               * FIXME: currently we just set nritems to 0 if this is a leaf,
 +               * effectively ignoring the content. In a next step we could
  +               * trigger more readahead depending on the content, e.g.
 +               * fetch the checksums for the extents in the leaf.
 +               */
 +      } else {
 +              /*
 +               * this is the error case, the extent buffer has not been
 +               * read correctly. We won't access anything from it and
 +               * just cleanup our data structures. Effectively this will
 +               * cut the branch below this node from read ahead.
 +               */
 +              nritems = 0;
 +              generation = 0;
 +      }
 +
 +      for (i = 0; i < nritems; i++) {
 +              struct reada_extctl *rec;
 +              u64 n_gen;
 +              struct btrfs_key key;
 +              struct btrfs_key next_key;
 +
 +              btrfs_node_key_to_cpu(eb, &key, i);
 +              if (i + 1 < nritems)
 +                      btrfs_node_key_to_cpu(eb, &next_key, i + 1);
 +              else
 +                      next_key = re->top;
 +              bytenr = btrfs_node_blockptr(eb, i);
 +              n_gen = btrfs_node_ptr_generation(eb, i);
 +
 +              list_for_each_entry(rec, &list, list) {
 +                      struct reada_control *rc = rec->rc;
 +
 +                      /*
 +                       * if the generation doesn't match, just ignore this
 +                       * extctl. This will probably cut off a branch from
 +                       * prefetch. Alternatively one could start a new (sub-)
 +                       * prefetch for this branch, starting again from root.
 +                       * FIXME: move the generation check out of this loop
 +                       */
 +#ifdef DEBUG
 +                      if (rec->generation != generation) {
 +                              printk(KERN_DEBUG "generation mismatch for "
 +                                              "(%llu,%d,%llu) %llu != %llu\n",
 +                                     key.objectid, key.type, key.offset,
 +                                     rec->generation, generation);
 +                      }
 +#endif
 +                      if (rec->generation == generation &&
 +                          btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
 +                          btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
 +                              reada_add_block(rc, bytenr, &next_key,
 +                                              level - 1, n_gen);
 +              }
 +      }
 +      /*
 +       * free extctl records
 +       */
 +      while (!list_empty(&list)) {
 +              struct reada_control *rc;
 +              struct reada_extctl *rec;
 +
 +              rec = list_first_entry(&list, struct reada_extctl, list);
 +              list_del(&rec->list);
 +              rc = rec->rc;
 +              kfree(rec);
 +
 +              kref_get(&rc->refcnt);
 +              if (atomic_dec_and_test(&rc->elems)) {
 +                      kref_put(&rc->refcnt, reada_control_release);
 +                      wake_up(&rc->wait);
 +              }
 +              kref_put(&rc->refcnt, reada_control_release);
 +
 +              reada_extent_put(fs_info, re);  /* one ref for each entry */
 +      }
 +      reada_extent_put(fs_info, re);  /* our ref */
 +      if (for_dev)
 +              atomic_dec(&for_dev->reada_in_flight);
 +
 +      return 0;
 +}
 +
 +/*
  + * start is passed separately in case eb is NULL, which may be the case with
 + * failed I/O
 + */
 +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 +                       u64 start, int err)
 +{
 +      int ret;
 +
 +      ret = __readahead_hook(root, eb, start, err);
 +
 +      reada_start_machine(root->fs_info);
 +
 +      return ret;
 +}
 +
 +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 +                                        struct btrfs_device *dev, u64 logical,
-                                        struct btrfs_multi_bio *multi)
++                                        struct btrfs_bio *multi)
 +{
 +      int ret;
 +      int looped = 0;
 +      struct reada_zone *zone;
 +      struct btrfs_block_group_cache *cache = NULL;
 +      u64 start;
 +      u64 end;
 +      int i;
 +
 +again:
 +      zone = NULL;
 +      spin_lock(&fs_info->reada_lock);
 +      ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
 +                                   logical >> PAGE_CACHE_SHIFT, 1);
 +      if (ret == 1)
 +              kref_get(&zone->refcnt);
 +      spin_unlock(&fs_info->reada_lock);
 +
 +      if (ret == 1) {
 +              if (logical >= zone->start && logical < zone->end)
 +                      return zone;
 +              spin_lock(&fs_info->reada_lock);
 +              kref_put(&zone->refcnt, reada_zone_release);
 +              spin_unlock(&fs_info->reada_lock);
 +      }
 +
 +      if (looped)
 +              return NULL;
 +
 +      cache = btrfs_lookup_block_group(fs_info, logical);
 +      if (!cache)
 +              return NULL;
 +
 +      start = cache->key.objectid;
 +      end = start + cache->key.offset - 1;
 +      btrfs_put_block_group(cache);
 +
 +      zone = kzalloc(sizeof(*zone), GFP_NOFS);
 +      if (!zone)
 +              return NULL;
 +
 +      zone->start = start;
 +      zone->end = end;
 +      INIT_LIST_HEAD(&zone->list);
 +      spin_lock_init(&zone->lock);
 +      zone->locked = 0;
 +      kref_init(&zone->refcnt);
 +      zone->elems = 0;
 +      zone->device = dev; /* our device always sits at index 0 */
 +      for (i = 0; i < multi->num_stripes; ++i) {
 +              /* bounds have already been checked */
 +              zone->devs[i] = multi->stripes[i].dev;
 +      }
 +      zone->ndevs = multi->num_stripes;
 +
 +      spin_lock(&fs_info->reada_lock);
 +      ret = radix_tree_insert(&dev->reada_zones,
 +                              (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
 +                              zone);
 +      spin_unlock(&fs_info->reada_lock);
 +
 +      if (ret) {
 +              kfree(zone);
 +              looped = 1;
 +              goto again;
 +      }
 +
 +      return zone;
 +}
 +
 +static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 +                                            u64 logical,
 +                                            struct btrfs_key *top, int level)
 +{
 +      int ret;
 +      int looped = 0;
 +      struct reada_extent *re = NULL;
 +      struct btrfs_fs_info *fs_info = root->fs_info;
 +      struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
++      struct btrfs_bio *multi = NULL;
 +      struct btrfs_device *dev;
 +      u32 blocksize;
 +      u64 length;
 +      int nzones = 0;
 +      int i;
 +      unsigned long index = logical >> PAGE_CACHE_SHIFT;
 +
 +again:
 +      spin_lock(&fs_info->reada_lock);
 +      re = radix_tree_lookup(&fs_info->reada_tree, index);
 +      if (re)
 +              kref_get(&re->refcnt);
 +      spin_unlock(&fs_info->reada_lock);
 +
 +      if (re || looped)
 +              return re;
 +
 +      re = kzalloc(sizeof(*re), GFP_NOFS);
 +      if (!re)
 +              return NULL;
 +
 +      blocksize = btrfs_level_size(root, level);
 +      re->logical = logical;
 +      re->blocksize = blocksize;
 +      re->top = *top;
 +      INIT_LIST_HEAD(&re->extctl);
 +      spin_lock_init(&re->lock);
 +      kref_init(&re->refcnt);
 +
 +      /*
 +       * map block
 +       */
 +      length = blocksize;
 +      ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0);
 +      if (ret || !multi || length < blocksize)
 +              goto error;
 +
 +      if (multi->num_stripes > MAX_MIRRORS) {
 +              printk(KERN_ERR "btrfs readahead: more than %d copies not "
 +                              "supported\n", MAX_MIRRORS);
 +              goto error;
 +      }
 +
 +      for (nzones = 0; nzones < multi->num_stripes; ++nzones) {
 +              struct reada_zone *zone;
 +
 +              dev = multi->stripes[nzones].dev;
 +              zone = reada_find_zone(fs_info, dev, logical, multi);
 +              if (!zone)
 +                      break;
 +
 +              re->zones[nzones] = zone;
 +              spin_lock(&zone->lock);
 +              if (!zone->elems)
 +                      kref_get(&zone->refcnt);
 +              ++zone->elems;
 +              spin_unlock(&zone->lock);
 +              spin_lock(&fs_info->reada_lock);
 +              kref_put(&zone->refcnt, reada_zone_release);
 +              spin_unlock(&fs_info->reada_lock);
 +      }
 +      re->nzones = nzones;
 +      if (nzones == 0) {
 +              /* not a single zone found, error and out */
 +              goto error;
 +      }
 +
 +      /* insert extent in reada_tree + all per-device trees, all or nothing */
 +      spin_lock(&fs_info->reada_lock);
 +      ret = radix_tree_insert(&fs_info->reada_tree, index, re);
 +      if (ret) {
 +              spin_unlock(&fs_info->reada_lock);
 +              if (ret != -ENOMEM) {
 +                      /* someone inserted the extent in the meantime */
 +                      looped = 1;
 +              }
 +              goto error;
 +      }
 +      for (i = 0; i < nzones; ++i) {
 +              dev = multi->stripes[i].dev;
 +              ret = radix_tree_insert(&dev->reada_extents, index, re);
 +              if (ret) {
 +                      while (--i >= 0) {
 +                              dev = multi->stripes[i].dev;
 +                              BUG_ON(dev == NULL);
 +                              radix_tree_delete(&dev->reada_extents, index);
 +                      }
 +                      BUG_ON(fs_info == NULL);
 +                      radix_tree_delete(&fs_info->reada_tree, index);
 +                      spin_unlock(&fs_info->reada_lock);
 +                      goto error;
 +              }
 +      }
 +      spin_unlock(&fs_info->reada_lock);
 +
 +      return re;
 +
 +error:
 +      while (nzones) {
 +              struct reada_zone *zone;
 +
 +              --nzones;
 +              zone = re->zones[nzones];
 +              kref_get(&zone->refcnt);
 +              spin_lock(&zone->lock);
 +              --zone->elems;
 +              if (zone->elems == 0) {
 +                      /*
 +                       * no fs_info->reada_lock needed, as this can't be
 +                       * the last ref
 +                       */
 +                      kref_put(&zone->refcnt, reada_zone_release);
 +              }
 +              spin_unlock(&zone->lock);
 +
 +              spin_lock(&fs_info->reada_lock);
 +              kref_put(&zone->refcnt, reada_zone_release);
 +              spin_unlock(&fs_info->reada_lock);
 +      }
 +      kfree(re);
 +      if (looped)
 +              goto again;
 +      return NULL;
 +}
 +
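 +/*
 + * dummy release function: reada_extent_put() only needs the return value of
 + * kref_put() to detect that the refcount dropped to zero; the teardown is
 + * done by reada_extent_put() itself
 + */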
 +static void reada_kref_dummy(struct kref *kr)
 +{
 +}
 +
 +static void reada_extent_put(struct btrfs_fs_info *fs_info,
 +                           struct reada_extent *re)
 +{
 +      int i;
 +      unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
 +
 +      spin_lock(&fs_info->reada_lock);
 +      if (!kref_put(&re->refcnt, reada_kref_dummy)) {
 +              spin_unlock(&fs_info->reada_lock);
 +              return;
 +      }
 +
 +      radix_tree_delete(&fs_info->reada_tree, index);
 +      for (i = 0; i < re->nzones; ++i) {
 +              struct reada_zone *zone = re->zones[i];
 +
 +              radix_tree_delete(&zone->device->reada_extents, index);
 +      }
 +
 +      spin_unlock(&fs_info->reada_lock);
 +
 +      for (i = 0; i < re->nzones; ++i) {
 +              struct reada_zone *zone = re->zones[i];
 +
 +              kref_get(&zone->refcnt);
 +              spin_lock(&zone->lock);
 +              --zone->elems;
 +              if (zone->elems == 0) {
 +                      /* no fs_info->reada_lock needed, as this can't be
 +                       * the last ref */
 +                      kref_put(&zone->refcnt, reada_zone_release);
 +              }
 +              spin_unlock(&zone->lock);
 +
 +              spin_lock(&fs_info->reada_lock);
 +              kref_put(&zone->refcnt, reada_zone_release);
 +              spin_unlock(&fs_info->reada_lock);
 +      }
 +      if (re->scheduled_for)
 +              atomic_dec(&re->scheduled_for->reada_in_flight);
 +
 +      kfree(re);
 +}
 +
 +static void reada_zone_release(struct kref *kref)
 +{
 +      struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
 +
 +      radix_tree_delete(&zone->device->reada_zones,
 +                        zone->end >> PAGE_CACHE_SHIFT);
 +
 +      kfree(zone);
 +}
 +
 +static void reada_control_release(struct kref *kref)
 +{
 +      struct reada_control *rc = container_of(kref, struct reada_control,
 +                                              refcnt);
 +
 +      kfree(rc);
 +}
 +
 +static int reada_add_block(struct reada_control *rc, u64 logical,
 +                         struct btrfs_key *top, int level, u64 generation)
 +{
 +      struct btrfs_root *root = rc->root;
 +      struct reada_extent *re;
 +      struct reada_extctl *rec;
 +
 +      re = reada_find_extent(root, logical, top, level); /* takes one ref */
 +      if (!re)
 +              return -1;
 +
 +      rec = kzalloc(sizeof(*rec), GFP_NOFS);
 +      if (!rec) {
 +              reada_extent_put(root->fs_info, re);
 +              return -1;
 +      }
 +
 +      rec->rc = rc;
 +      rec->generation = generation;
 +      atomic_inc(&rc->elems);
 +
 +      spin_lock(&re->lock);
 +      list_add_tail(&rec->list, &re->extctl);
 +      spin_unlock(&re->lock);
 +
 +      /* leave the ref on the extent */
 +
 +      return 0;
 +}
 +
 +/*
 + * called with fs_info->reada_lock held
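 + *
 + * lock or unlock the peer zones, i.e. the zones covering the same logical
 + * range on the other devices, so that reada_pick_zone() prefers ranges
 + * that no other device is currently reading ahead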
 + */
 +static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
 +{
 +      int i;
 +      unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
 +
 +      for (i = 0; i < zone->ndevs; ++i) {
 +              struct reada_zone *peer;
 +              peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
 +              if (peer && peer->device != zone->device)
 +                      peer->locked = lock;
 +      }
 +}
 +
 +/*
 + * called with fs_info->reada_lock held
 + */
 +static int reada_pick_zone(struct btrfs_device *dev)
 +{
 +      struct reada_zone *top_zone = NULL;
 +      struct reada_zone *top_locked_zone = NULL;
 +      u64 top_elems = 0;
 +      u64 top_locked_elems = 0;
 +      unsigned long index = 0;
 +      int ret;
 +
 +      if (dev->reada_curr_zone) {
 +              reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
 +              kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
 +              dev->reada_curr_zone = NULL;
 +      }
 +      /* pick the zone with the most elements */
 +      while (1) {
 +              struct reada_zone *zone;
 +
 +              ret = radix_tree_gang_lookup(&dev->reada_zones,
 +                                           (void **)&zone, index, 1);
 +              if (ret == 0)
 +                      break;
 +              index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
 +              if (zone->locked) {
 +                      if (zone->elems > top_locked_elems) {
 +                              top_locked_elems = zone->elems;
 +                              top_locked_zone = zone;
 +                      }
 +              } else {
 +                      if (zone->elems > top_elems) {
 +                              top_elems = zone->elems;
 +                              top_zone = zone;
 +                      }
 +              }
 +      }
 +      if (top_zone)
 +              dev->reada_curr_zone = top_zone;
 +      else if (top_locked_zone)
 +              dev->reada_curr_zone = top_locked_zone;
 +      else
 +              return 0;
 +
 +      dev->reada_next = dev->reada_curr_zone->start;
 +      kref_get(&dev->reada_curr_zone->refcnt);
 +      reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
 +
 +      return 1;
 +}
 +
 +static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
 +                                 struct btrfs_device *dev)
 +{
 +      struct reada_extent *re = NULL;
 +      int mirror_num = 0;
 +      struct extent_buffer *eb = NULL;
 +      u64 logical;
 +      u32 blocksize;
 +      int ret;
 +      int i;
 +      int need_kick = 0;
 +
 +      spin_lock(&fs_info->reada_lock);
 +      if (dev->reada_curr_zone == NULL) {
 +              ret = reada_pick_zone(dev);
 +              if (!ret) {
 +                      spin_unlock(&fs_info->reada_lock);
 +                      return 0;
 +              }
 +      }
 +      /*
 +       * FIXME currently we issue the reads one extent at a time. If we have
 +       * a contiguous block of extents, we could also coalesce them or use
 +       * plugging to speed things up
 +       */
 +      ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
 +                                   dev->reada_next >> PAGE_CACHE_SHIFT, 1);
 +      if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
 +              ret = reada_pick_zone(dev);
 +              if (!ret) {
 +                      spin_unlock(&fs_info->reada_lock);
 +                      return 0;
 +              }
 +              re = NULL;
 +              ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
 +                                      dev->reada_next >> PAGE_CACHE_SHIFT, 1);
 +      }
 +      if (ret == 0) {
 +              spin_unlock(&fs_info->reada_lock);
 +              return 0;
 +      }
 +      dev->reada_next = re->logical + re->blocksize;
 +      kref_get(&re->refcnt);
 +
 +      spin_unlock(&fs_info->reada_lock);
 +
 +      /*
 +       * find mirror num
 +       */
 +      for (i = 0; i < re->nzones; ++i) {
 +              if (re->zones[i]->device == dev) {
 +                      mirror_num = i + 1;
 +                      break;
 +              }
 +      }
 +      logical = re->logical;
 +      blocksize = re->blocksize;
 +
 +      spin_lock(&re->lock);
 +      if (re->scheduled_for == NULL) {
 +              re->scheduled_for = dev;
 +              need_kick = 1;
 +      }
 +      spin_unlock(&re->lock);
 +
 +      reada_extent_put(fs_info, re);
 +
 +      if (!need_kick)
 +              return 0;
 +
 +      atomic_inc(&dev->reada_in_flight);
 +      ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
 +                       mirror_num, &eb);
 +      if (ret)
 +              __readahead_hook(fs_info->extent_root, NULL, logical, ret);
 +      else if (eb)
 +              __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
 +
 +      if (eb)
 +              free_extent_buffer(eb);
 +
 +      return 1;
 +
 +}
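Purely as an illustration of the FIXME above (one extent is submitted per call), and not part of this patch: a caller could batch several submissions for one device under a single block-layer plug. The wrapper name and the 'budget' parameter below are invented.

	static int reada_start_machine_dev_plugged(struct btrfs_fs_info *fs_info,
						   struct btrfs_device *dev, int budget)
	{
		struct blk_plug plug;
		int enqueued = 0;

		blk_start_plug(&plug);		/* hold back bios so they can merge */
		while (budget-- > 0 && reada_start_machine_dev(fs_info, dev))
			enqueued++;
		blk_finish_plug(&plug);		/* submit the batched readahead bios */

		return enqueued;
	}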
 +
 +static void reada_start_machine_worker(struct btrfs_work *work)
 +{
 +      struct reada_machine_work *rmw;
 +      struct btrfs_fs_info *fs_info;
 +
 +      rmw = container_of(work, struct reada_machine_work, work);
 +      fs_info = rmw->fs_info;
 +
 +      kfree(rmw);
 +
 +      __reada_start_machine(fs_info);
 +}
 +
 +static void __reada_start_machine(struct btrfs_fs_info *fs_info)
 +{
 +      struct btrfs_device *device;
 +      struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 +      u64 enqueued;
 +      u64 total = 0;
 +      int i;
 +
 +      do {
 +              enqueued = 0;
 +              list_for_each_entry(device, &fs_devices->devices, dev_list) {
 +                      if (atomic_read(&device->reada_in_flight) <
 +                          MAX_IN_FLIGHT)
 +                              enqueued += reada_start_machine_dev(fs_info,
 +                                                                  device);
 +              }
 +              total += enqueued;
 +      } while (enqueued && total < 10000);
 +
 +      if (enqueued == 0)
 +              return;
 +
 +      /*
 +       * If everything is already in the cache, this is effectively single
 +       * threaded. To a) not hold the caller for too long and b) to utilize
 +       * more cores, we stop the loop above after 10000 enqueued requests
 +       * and hand the rest over to worker threads. This distributes the
 +       * load across the cores.
 +       */
 +      for (i = 0; i < 2; ++i)
 +              reada_start_machine(fs_info);
 +}
 +
 +static void reada_start_machine(struct btrfs_fs_info *fs_info)
 +{
 +      struct reada_machine_work *rmw;
 +
 +      rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
 +      if (!rmw) {
 +              /* FIXME we cannot handle this properly right now */
 +              BUG();
 +      }
 +      rmw->work.func = reada_start_machine_worker;
 +      rmw->fs_info = fs_info;
 +
 +      btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
 +}
 +
 +#ifdef DEBUG
 +static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 +{
 +      struct btrfs_device *device;
 +      struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 +      unsigned long index;
 +      int ret;
 +      int i;
 +      int j;
 +      int cnt;
 +
 +      spin_lock(&fs_info->reada_lock);
 +      list_for_each_entry(device, &fs_devices->devices, dev_list) {
 +              printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
 +                      atomic_read(&device->reada_in_flight));
 +              index = 0;
 +              while (1) {
 +                      struct reada_zone *zone;
 +                      ret = radix_tree_gang_lookup(&device->reada_zones,
 +                                                   (void **)&zone, index, 1);
 +                      if (ret == 0)
 +                              break;
 +                      printk(KERN_DEBUG "  zone %llu-%llu elems %llu locked "
 +                              "%d devs", zone->start, zone->end, zone->elems,
 +                              zone->locked);
 +                      for (j = 0; j < zone->ndevs; ++j) {
 +                              printk(KERN_CONT " %lld",
 +                                      zone->devs[j]->devid);
 +                      }
 +                      if (device->reada_curr_zone == zone)
 +                              printk(KERN_CONT " curr off %llu",
 +                                      device->reada_next - zone->start);
 +                      printk(KERN_CONT "\n");
 +                      index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
 +              }
 +              cnt = 0;
 +              index = 0;
 +              while (all) {
 +                      struct reada_extent *re = NULL;
 +
 +                      ret = radix_tree_gang_lookup(&device->reada_extents,
 +                                                   (void **)&re, index, 1);
 +                      if (ret == 0)
 +                              break;
 +                      printk(KERN_DEBUG
 +                              "  re: logical %llu size %u empty %d for %lld",
 +                              re->logical, re->blocksize,
 +                              list_empty(&re->extctl), re->scheduled_for ?
 +                              re->scheduled_for->devid : -1);
 +
 +                      for (i = 0; i < re->nzones; ++i) {
 +                              printk(KERN_CONT " zone %llu-%llu devs",
 +                                      re->zones[i]->start,
 +                                      re->zones[i]->end);
 +                              for (j = 0; j < re->zones[i]->ndevs; ++j) {
 +                                      printk(KERN_CONT " %lld",
 +                                              re->zones[i]->devs[j]->devid);
 +                              }
 +                      }
 +                      printk(KERN_CONT "\n");
 +                      index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
 +                      if (++cnt > 15)
 +                              break;
 +              }
 +      }
 +
 +      index = 0;
 +      cnt = 0;
 +      while (all) {
 +              struct reada_extent *re = NULL;
 +
 +              ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
 +                                           index, 1);
 +              if (ret == 0)
 +                      break;
 +              if (!re->scheduled_for) {
 +                      index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
 +                      continue;
 +              }
 +              printk(KERN_DEBUG
 +                      "re: logical %llu size %u list empty %d for %lld",
 +                      re->logical, re->blocksize, list_empty(&re->extctl),
 +                      re->scheduled_for ? re->scheduled_for->devid : -1);
 +              for (i = 0; i < re->nzones; ++i) {
 +                      printk(KERN_CONT " zone %llu-%llu devs",
 +                              re->zones[i]->start,
 +                              re->zones[i]->end);
 +                      for (j = 0; j < re->zones[i]->ndevs; ++j) {
 +                              printk(KERN_CONT " %lld",
 +                                      re->zones[i]->devs[j]->devid);
 +                      }
 +              }
 +              printk(KERN_CONT "\n");
 +              index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
 +      }
 +      spin_unlock(&fs_info->reada_lock);
 +}
 +#endif
 +
 +/*
 + * interface
 + */
 +struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 +                      struct btrfs_key *key_start, struct btrfs_key *key_end)
 +{
 +      struct reada_control *rc;
 +      u64 start;
 +      u64 generation;
 +      int level;
 +      struct extent_buffer *node;
 +      static struct btrfs_key max_key = {
 +              .objectid = (u64)-1,
 +              .type = (u8)-1,
 +              .offset = (u64)-1
 +      };
 +
 +      rc = kzalloc(sizeof(*rc), GFP_NOFS);
 +      if (!rc)
 +              return ERR_PTR(-ENOMEM);
 +
 +      rc->root = root;
 +      rc->key_start = *key_start;
 +      rc->key_end = *key_end;
 +      atomic_set(&rc->elems, 0);
 +      init_waitqueue_head(&rc->wait);
 +      kref_init(&rc->refcnt);
 +      kref_get(&rc->refcnt); /* one ref for having elements */
 +
 +      node = btrfs_root_node(root);
 +      start = node->start;
 +      level = btrfs_header_level(node);
 +      generation = btrfs_header_generation(node);
 +      free_extent_buffer(node);
 +
 +      reada_add_block(rc, start, &max_key, level, generation);
 +
 +      reada_start_machine(root->fs_info);
 +
 +      return rc;
 +}
 +
 +#ifdef DEBUG
 +int btrfs_reada_wait(void *handle)
 +{
 +      struct reada_control *rc = handle;
 +
 +      while (atomic_read(&rc->elems)) {
 +              wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
 +                                 5 * HZ);
 +              dump_devs(rc->root->fs_info,
 +                        atomic_read(&rc->elems) < 10 ? 1 : 0);
 +      }
 +
 +      dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
 +
 +      kref_put(&rc->refcnt, reada_control_release);
 +
 +      return 0;
 +}
 +#else
 +int btrfs_reada_wait(void *handle)
 +{
 +      struct reada_control *rc = handle;
 +
 +      while (atomic_read(&rc->elems)) {
 +              wait_event(rc->wait, atomic_read(&rc->elems) == 0);
 +      }
 +
 +      kref_put(&rc->refcnt, reada_control_release);
 +
 +      return 0;
 +}
 +#endif
 +
 +void btrfs_reada_detach(void *handle)
 +{
 +      struct reada_control *rc = handle;
 +
 +      kref_put(&rc->refcnt, reada_control_release);
 +}
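Taken together, btrfs_reada_add(), btrfs_reada_wait() and btrfs_reada_detach() are the whole interface. A minimal usage sketch follows; the function and variable names are illustrative only, and the scrub changes further down use exactly this pattern to prefetch the extent and csum trees.

	static void prefetch_extent_range(struct btrfs_root *extent_root,
					  u64 first_bytenr, u64 last_bytenr)
	{
		struct btrfs_key key_start;
		struct btrfs_key key_end;
		struct reada_control *rc;

		key_start.objectid = first_bytenr;
		key_start.type = BTRFS_EXTENT_ITEM_KEY;
		key_start.offset = 0;
		key_end.objectid = last_bytenr;
		key_end.type = BTRFS_EXTENT_ITEM_KEY;
		key_end.offset = 0;

		rc = btrfs_reada_add(extent_root, &key_start, &key_end);
		if (IS_ERR(rc))
			return;		/* -ENOMEM, just skip the prefetch */

		btrfs_reada_wait(rc);	/* blocks until all readahead finished */
		/* or: btrfs_reada_detach(rc) to let it run in the background */
	}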
diff --combined fs/btrfs/scrub.c
index 5bc4ec827b3d67bd4edb14daa1533955dd85bdb9,eba42e5fd5fd4119fb01e41b7858dcbc253232ef..94cd3a19e9c8bf5a903f6c0dfc04c5f4b5b56788
   */
  
  #include <linux/blkdev.h>
+ #include <linux/ratelimit.h>
  #include "ctree.h"
  #include "volumes.h"
  #include "disk-io.h"
  #include "ordered-data.h"
+ #include "transaction.h"
+ #include "backref.h"
+ #include "extent_io.h"
  
  /*
   * This is only the first step towards a full-features scrub. It reads all
   * any can be found.
   *
   * Future enhancements:
 - *  - To enhance the performance, better read-ahead strategies for the
 - *    extent-tree can be employed.
   *  - In case an unrepairable extent is encountered, track which files are
   *    affected and report them
   *  - In case of a read error on files with nodatasum, map the file and read
   *    the extent to trigger a writeback of the good copy
   *  - track and record media errors, throw out bad devices
   *  - add a mode to also read unallocated space
 - *  - make the prefetch cancellable
   */
  
  struct scrub_bio;
@@@ -60,7 -67,7 +64,7 @@@ static void scrub_fixup(struct scrub_bi
  struct scrub_page {
        u64                     flags;  /* extent flags */
        u64                     generation;
-       u64                     mirror_num;
+       int                     mirror_num;
        int                     have_csum;
        u8                      csum[BTRFS_CSUM_SIZE];
  };
@@@ -84,6 -91,7 +88,7 @@@ struct scrub_dev 
        int                     first_free;
        int                     curr;
        atomic_t                in_flight;
+       atomic_t                fixup_cnt;
        spinlock_t              list_lock;
        wait_queue_head_t       list_wait;
        u16                     csum_size;
        spinlock_t              stat_lock;
  };
  
+ struct scrub_fixup_nodatasum {
+       struct scrub_dev        *sdev;
+       u64                     logical;
+       struct btrfs_root       *root;
+       struct btrfs_work       work;
+       int                     mirror_num;
+ };
+
+ struct scrub_warning {
+       struct btrfs_path       *path;
+       u64                     extent_item_size;
+       char                    *scratch_buf;
+       char                    *msg_buf;
+       const char              *errstr;
+       sector_t                sector;
+       u64                     logical;
+       struct btrfs_device     *dev;
+       int                     msg_bufsize;
+       int                     scratch_bufsize;
+ };
+
  static void scrub_free_csums(struct scrub_dev *sdev)
  {
        while (!list_empty(&sdev->csum_list)) {
@@@ -172,14 -201,15 +198,15 @@@ struct scrub_dev *scrub_setup_dev(struc
  
                if (i != SCRUB_BIOS_PER_DEV-1)
                        sdev->bios[i]->next_free = i + 1;
-                else
+               else
                        sdev->bios[i]->next_free = -1;
        }
        sdev->first_free = 0;
        sdev->curr = -1;
        atomic_set(&sdev->in_flight, 0);
+       atomic_set(&sdev->fixup_cnt, 0);
        atomic_set(&sdev->cancel_req, 0);
 -      sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
 +      sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
        INIT_LIST_HEAD(&sdev->csum_list);
  
        spin_lock_init(&sdev->list_lock);
@@@ -192,24 -222,361 +219,361 @@@ nomem
        return ERR_PTR(-ENOMEM);
  }
  
+ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+ {
+       u64 isize;
+       u32 nlink;
+       int ret;
+       int i;
+       struct extent_buffer *eb;
+       struct btrfs_inode_item *inode_item;
+       struct scrub_warning *swarn = ctx;
+       struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
+       struct inode_fs_paths *ipath = NULL;
+       struct btrfs_root *local_root;
+       struct btrfs_key root_key;
+       root_key.objectid = root;
+       root_key.type = BTRFS_ROOT_ITEM_KEY;
+       root_key.offset = (u64)-1;
+       local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+       if (IS_ERR(local_root)) {
+               ret = PTR_ERR(local_root);
+               goto err;
+       }
+       ret = inode_item_info(inum, 0, local_root, swarn->path);
+       if (ret) {
+               btrfs_release_path(swarn->path);
+               goto err;
+       }
+       eb = swarn->path->nodes[0];
+       inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
+                                       struct btrfs_inode_item);
+       isize = btrfs_inode_size(eb, inode_item);
+       nlink = btrfs_inode_nlink(eb, inode_item);
+       btrfs_release_path(swarn->path);
+       ipath = init_ipath(4096, local_root, swarn->path);
+       ret = paths_from_inode(inum, ipath);
+       if (ret < 0)
+               goto err;
+       /*
+        * we deliberately ignore the fact that ipath might have been too
+        * small to hold all of the paths here
+        */
+       for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+               printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+                       "%s, sector %llu, root %llu, inode %llu, offset %llu, "
+                       "length %llu, links %u (path: %s)\n", swarn->errstr,
+                       swarn->logical, swarn->dev->name,
+                       (unsigned long long)swarn->sector, root, inum, offset,
+                       min(isize - offset, (u64)PAGE_SIZE), nlink,
+                       ipath->fspath->str[i]);
+       free_ipath(ipath);
+       return 0;
+ err:
+       printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+               "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
+               "resolving failed with ret=%d\n", swarn->errstr,
+               swarn->logical, swarn->dev->name,
+               (unsigned long long)swarn->sector, root, inum, offset, ret);
+       free_ipath(ipath);
+       return 0;
+ }
+
+ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
+                               int ix)
+ {
+       struct btrfs_device *dev = sbio->sdev->dev;
+       struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+       struct btrfs_path *path;
+       struct btrfs_key found_key;
+       struct extent_buffer *eb;
+       struct btrfs_extent_item *ei;
+       struct scrub_warning swarn;
+       u32 item_size;
+       int ret;
+       u64 ref_root;
+       u8 ref_level;
+       unsigned long ptr = 0;
+       const int bufsize = 4096;
+       u64 extent_offset;
+       path = btrfs_alloc_path();
+       swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
+       swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
+       swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+       swarn.logical = sbio->logical + ix * PAGE_SIZE;
+       swarn.errstr = errstr;
+       swarn.dev = dev;
+       swarn.msg_bufsize = bufsize;
+       swarn.scratch_bufsize = bufsize;
+       if (!path || !swarn.scratch_buf || !swarn.msg_buf)
+               goto out;
+       ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+       if (ret < 0)
+               goto out;
+       extent_offset = swarn.logical - found_key.objectid;
+       swarn.extent_item_size = found_key.offset;
+       eb = path->nodes[0];
+       ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+       item_size = btrfs_item_size_nr(eb, path->slots[0]);
+       if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+               do {
+                       ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
+                                                       &ref_root, &ref_level);
+                       printk(KERN_WARNING "%s at logical %llu on dev %s, "
+                               "sector %llu: metadata %s (level %d) in tree "
+                               "%llu\n", errstr, swarn.logical, dev->name,
+                               (unsigned long long)swarn.sector,
+                               ref_level ? "node" : "leaf",
+                               ret < 0 ? -1 : ref_level,
+                               ret < 0 ? -1 : ref_root);
+               } while (ret != 1);
+       } else {
+               swarn.path = path;
+               iterate_extent_inodes(fs_info, path, found_key.objectid,
+                                       extent_offset,
+                                       scrub_print_warning_inode, &swarn);
+       }
+ out:
+       btrfs_free_path(path);
+       kfree(swarn.scratch_buf);
+       kfree(swarn.msg_buf);
+ }
+
+ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+ {
+       struct page *page = NULL;
+       unsigned long index;
+       struct scrub_fixup_nodatasum *fixup = ctx;
+       int ret;
+       int corrected = 0;
+       struct btrfs_key key;
+       struct inode *inode = NULL;
+       u64 end = offset + PAGE_SIZE - 1;
+       struct btrfs_root *local_root;
+       key.objectid = root;
+       key.type = BTRFS_ROOT_ITEM_KEY;
+       key.offset = (u64)-1;
+       local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
+       if (IS_ERR(local_root))
+               return PTR_ERR(local_root);
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.objectid = inum;
+       key.offset = 0;
+       inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+       index = offset >> PAGE_CACHE_SHIFT;
+       page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+       if (!page) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       if (PageUptodate(page)) {
+               struct btrfs_mapping_tree *map_tree;
+               if (PageDirty(page)) {
+                       /*
+                        * we need to write the data to the defect sector. the
+                        * data that was in that sector is not in memory,
+                        * because the page was modified. we must not write the
+                        * modified page to that sector.
+                        *
+                        * TODO: what could be done here: wait for the delalloc
+                        *       runner to write out that page (might involve
+                        *       COW) and see whether the sector is still
+                        *       referenced afterwards.
+                        *
+                        * For the time being, we'll treat this error as
+                        * uncorrectable, although there is a chance that a
+                        * later scrub will find the bad sector again and that
+                        * there's no dirty page in memory, then.
+                        */
+                       ret = -EIO;
+                       goto out;
+               }
+               map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+               ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+                                       fixup->logical, page,
+                                       fixup->mirror_num);
+               unlock_page(page);
+               corrected = !ret;
+       } else {
+               /*
+                * we need to get good data first. the general readpage path
+                * will call repair_io_failure for us, we just have to make
+                * sure we read the bad mirror.
+                */
+               ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+                                       EXTENT_DAMAGED, GFP_NOFS);
+               if (ret) {
+                       /* set_extent_bits should give proper error */
+                       WARN_ON(ret > 0);
+                       if (ret > 0)
+                               ret = -EFAULT;
+                       goto out;
+               }
+               ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+                                               btrfs_get_extent,
+                                               fixup->mirror_num);
+               wait_on_page_locked(page);
+               corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
+                                               end, EXTENT_DAMAGED, 0, NULL);
+               if (!corrected)
+                       clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+                                               EXTENT_DAMAGED, GFP_NOFS);
+       }
+ out:
+       if (page)
+               put_page(page);
+       if (inode)
+               iput(inode);
+       if (ret < 0)
+               return ret;
+       if (ret == 0 && corrected) {
+               /*
+                * we only need to call readpage for one of the inodes belonging
+                * to this extent. so make iterate_extent_inodes stop
+                */
+               return 1;
+       }
+       return -EIO;
+ }
+
+ static void scrub_fixup_nodatasum(struct btrfs_work *work)
+ {
+       int ret;
+       struct scrub_fixup_nodatasum *fixup;
+       struct scrub_dev *sdev;
+       struct btrfs_trans_handle *trans = NULL;
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_path *path;
+       int uncorrectable = 0;
+       fixup = container_of(work, struct scrub_fixup_nodatasum, work);
+       sdev = fixup->sdev;
+       fs_info = fixup->root->fs_info;
+       path = btrfs_alloc_path();
+       if (!path) {
+               spin_lock(&sdev->stat_lock);
+               ++sdev->stat.malloc_errors;
+               spin_unlock(&sdev->stat_lock);
+               uncorrectable = 1;
+               goto out;
+       }
+       trans = btrfs_join_transaction(fixup->root);
+       if (IS_ERR(trans)) {
+               uncorrectable = 1;
+               goto out;
+       }
+       /*
+        * the idea is to trigger a regular read through the standard path. we
+        * read a page from the (failed) logical address by specifying the
+        * corresponding copynum of the failed sector. thus, that readpage is
+        * expected to fail.
+        * that is the point where on-the-fly error correction will kick in
+        * (once it's finished) and rewrite the failed sector if a good copy
+        * can be found.
+        */
+       ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
+                                               path, scrub_fixup_readpage,
+                                               fixup);
+       if (ret < 0) {
+               uncorrectable = 1;
+               goto out;
+       }
+       WARN_ON(ret != 1);
+       spin_lock(&sdev->stat_lock);
+       ++sdev->stat.corrected_errors;
+       spin_unlock(&sdev->stat_lock);
+ out:
+       if (trans && !IS_ERR(trans))
+               btrfs_end_transaction(trans, fixup->root);
+       if (uncorrectable) {
+               spin_lock(&sdev->stat_lock);
+               ++sdev->stat.uncorrectable_errors;
+               spin_unlock(&sdev->stat_lock);
+               printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
+                                       "(nodatasum) error at logical %llu\n",
+                                       fixup->logical);
+       }
+       btrfs_free_path(path);
+       kfree(fixup);
+       /* see the caller for why we pretend to be paused in the scrub counters */
+       mutex_lock(&fs_info->scrub_lock);
+       atomic_dec(&fs_info->scrubs_running);
+       atomic_dec(&fs_info->scrubs_paused);
+       mutex_unlock(&fs_info->scrub_lock);
+       atomic_dec(&sdev->fixup_cnt);
+       wake_up(&fs_info->scrub_pause_wait);
+       wake_up(&sdev->list_wait);
+ }
+
  /*
   * scrub_recheck_error gets called when either verification of the page
   * failed or the bio failed to read, e.g. with EIO. In the latter case,
   * recheck_error gets called for every page in the bio, even though only
   * one may be bad
   */
- static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
+ static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
  {
+       struct scrub_dev *sdev = sbio->sdev;
+       u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                       DEFAULT_RATELIMIT_BURST);
        if (sbio->err) {
-               if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
-                                  (sbio->physical + ix * PAGE_SIZE) >> 9,
+               if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
                                   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
                        if (scrub_fixup_check(sbio, ix) == 0)
-                               return;
+                               return 0;
                }
+               if (__ratelimit(&_rs))
+                       scrub_print_warning("i/o error", sbio, ix);
+       } else {
+               if (__ratelimit(&_rs))
+                       scrub_print_warning("checksum error", sbio, ix);
        }
  
+       spin_lock(&sdev->stat_lock);
+       ++sdev->stat.read_errors;
+       spin_unlock(&sdev->stat_lock);
        scrub_fixup(sbio, ix);
+       return 1;
  }
  
  static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@@ -247,7 -614,8 +611,8 @@@ static void scrub_fixup(struct scrub_bi
        struct scrub_dev *sdev = sbio->sdev;
        struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-       struct btrfs_multi_bio *multi = NULL;
+       struct btrfs_bio *bbio = NULL;
+       struct scrub_fixup_nodatasum *fixup;
        u64 logical = sbio->logical + ix * PAGE_SIZE;
        u64 length;
        int i;
  
        if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
            (sbio->spag[ix].have_csum == 0)) {
+               fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+               if (!fixup)
+                       goto uncorrectable;
+               fixup->sdev = sdev;
+               fixup->logical = logical;
+               fixup->root = fs_info->extent_root;
+               fixup->mirror_num = sbio->spag[ix].mirror_num;
                /*
-                * nodatasum, don't try to fix anything
-                * FIXME: we can do better, open the inode and trigger a
-                * writeback
+                * increment scrubs_running to prevent cancel requests from
+                * completing as long as a fixup worker is running. we must also
+                * increment scrubs_paused to prevent deadlocking on pause
+                * requests used for transaction commits (as the worker uses a
+                * transaction context). it is safe to regard the fixup worker
+                * as paused for all practical matters. effectively, we only
+                * avoid cancellation requests from completing.
                 */
-               goto uncorrectable;
+               mutex_lock(&fs_info->scrub_lock);
+               atomic_inc(&fs_info->scrubs_running);
+               atomic_inc(&fs_info->scrubs_paused);
+               mutex_unlock(&fs_info->scrub_lock);
+               atomic_inc(&sdev->fixup_cnt);
+               fixup->work.func = scrub_fixup_nodatasum;
+               btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
+               return;
        }
  
        length = PAGE_SIZE;
        ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
-                             &multi, 0);
-       if (ret || !multi || length < PAGE_SIZE) {
+                             &bbio, 0);
+       if (ret || !bbio || length < PAGE_SIZE) {
                printk(KERN_ERR
                       "scrub_fixup: btrfs_map_block failed us for %llu\n",
                       (unsigned long long)logical);
                return;
        }
  
-       if (multi->num_stripes == 1)
+       if (bbio->num_stripes == 1)
                /* there aren't any replicas */
                goto uncorrectable;
  
        /*
         * first find a good copy
         */
-       for (i = 0; i < multi->num_stripes; ++i) {
-               if (i == sbio->spag[ix].mirror_num)
+       for (i = 0; i < bbio->num_stripes; ++i) {
+               if (i + 1 == sbio->spag[ix].mirror_num)
                        continue;
  
-               if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
-                                  multi->stripes[i].physical >> 9,
+               if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
+                                  bbio->stripes[i].physical >> 9,
                                   sbio->bio->bi_io_vec[ix].bv_page)) {
                        /* I/O-error, this is not a good copy */
                        continue;
                if (scrub_fixup_check(sbio, ix) == 0)
                        break;
        }
-       if (i == multi->num_stripes)
+       if (i == bbio->num_stripes)
                goto uncorrectable;
  
        if (!sdev->readonly) {
                }
        }
  
-       kfree(multi);
+       kfree(bbio);
        spin_lock(&sdev->stat_lock);
        ++sdev->stat.corrected_errors;
        spin_unlock(&sdev->stat_lock);
  
-       if (printk_ratelimit())
-               printk(KERN_ERR "btrfs: fixed up at %llu\n",
-                      (unsigned long long)logical);
+       printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
+                              (unsigned long long)logical);
        return;
  
  uncorrectable:
-       kfree(multi);
+       kfree(bbio);
        spin_lock(&sdev->stat_lock);
        ++sdev->stat.uncorrectable_errors;
        spin_unlock(&sdev->stat_lock);
  
-       if (printk_ratelimit())
-               printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
-                        (unsigned long long)logical);
+       printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
+                               "logical %llu\n", (unsigned long long)logical);
  }
  
  static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@@ -379,8 -763,14 +760,14 @@@ static void scrub_checksum(struct btrfs
        int ret;
  
        if (sbio->err) {
+               ret = 0;
                for (i = 0; i < sbio->count; ++i)
-                       scrub_recheck_error(sbio, i);
+                       ret |= scrub_recheck_error(sbio, i);
+               if (!ret) {
+                       spin_lock(&sdev->stat_lock);
+                       ++sdev->stat.unverified_errors;
+                       spin_unlock(&sdev->stat_lock);
+               }
  
                sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
                sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
                        bi->bv_offset = 0;
                        bi->bv_len = PAGE_SIZE;
                }
-               spin_lock(&sdev->stat_lock);
-               ++sdev->stat.read_errors;
-               spin_unlock(&sdev->stat_lock);
                goto out;
        }
        for (i = 0; i < sbio->count; ++i) {
                        WARN_ON(1);
                }
                kunmap_atomic(buffer, KM_USER0);
-               if (ret)
-                       scrub_recheck_error(sbio, i);
+               if (ret) {
+                       ret = scrub_recheck_error(sbio, i);
+                       if (!ret) {
+                               spin_lock(&sdev->stat_lock);
+                               ++sdev->stat.unverified_errors;
+                               spin_unlock(&sdev->stat_lock);
+                       }
+               }
        }
  
  out:
@@@ -601,7 -993,7 +990,7 @@@ nomem
  }
  
  static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
-                     u64 physical, u64 flags, u64 gen, u64 mirror_num,
+                     u64 physical, u64 flags, u64 gen, int mirror_num,
                      u8 *csum, int force)
  {
        struct scrub_bio *sbio;
@@@ -698,7 -1090,7 +1087,7 @@@ static int scrub_find_csum(struct scrub
  
  /* scrub extent tries to collect up to 64 kB for each bio */
  static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
-                       u64 physical, u64 flags, u64 gen, u64 mirror_num)
+                       u64 physical, u64 flags, u64 gen, int mirror_num)
  {
        int ret;
        u8 csum[BTRFS_CSUM_SIZE];
@@@ -738,16 -1130,13 +1127,16 @@@ static noinline_for_stack int scrub_str
        int slot;
        int i;
        u64 nstripes;
 -      int start_stripe;
        struct extent_buffer *l;
        struct btrfs_key key;
        u64 physical;
        u64 logical;
        u64 generation;
-       u64 mirror_num;
+       int mirror_num;
 +      struct reada_control *reada1;
 +      struct reada_control *reada2;
 +      struct btrfs_key key_start;
 +      struct btrfs_key key_end;
  
        u64 increment = map->stripe_len;
        u64 offset;
        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
                offset = map->stripe_len * num;
                increment = map->stripe_len * map->num_stripes;
-               mirror_num = 0;
+               mirror_num = 1;
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
                int factor = map->num_stripes / map->sub_stripes;
                offset = map->stripe_len * (num / map->sub_stripes);
                increment = map->stripe_len * factor;
-               mirror_num = num % map->sub_stripes;
+               mirror_num = num % map->sub_stripes + 1;
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
                increment = map->stripe_len;
-               mirror_num = num % map->num_stripes;
+               mirror_num = num % map->num_stripes + 1;
        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
                increment = map->stripe_len;
-               mirror_num = num % map->num_stripes;
+               mirror_num = num % map->num_stripes + 1;
        } else {
                increment = map->stripe_len;
-               mirror_num = 0;
+               mirror_num = 1;
        }
  
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
  
 -      path->reada = 2;
        path->search_commit_root = 1;
        path->skip_locking = 1;
  
        /*
 -       * find all extents for each stripe and just read them to get
 -       * them into the page cache
 -       * FIXME: we can do better. build a more intelligent prefetching
 +       * trigger the readahead for the extent tree and the csum tree and
 +       * wait for completion. During readahead, the scrub is officially
 +       * paused to not hold off transaction commits
         */
        logical = base + offset;
 -      physical = map->stripes[num].physical;
 -      ret = 0;
 -      for (i = 0; i < nstripes; ++i) {
 -              key.objectid = logical;
 -              key.type = BTRFS_EXTENT_ITEM_KEY;
 -              key.offset = (u64)0;
 -
 -              ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 -              if (ret < 0)
 -                      goto out_noplug;
  
 -              /*
 -               * we might miss half an extent here, but that doesn't matter,
 -               * as it's only the prefetch
 -               */
 -              while (1) {
 -                      l = path->nodes[0];
 -                      slot = path->slots[0];
 -                      if (slot >= btrfs_header_nritems(l)) {
 -                              ret = btrfs_next_leaf(root, path);
 -                              if (ret == 0)
 -                                      continue;
 -                              if (ret < 0)
 -                                      goto out_noplug;
 -
 -                              break;
 -                      }
 -                      btrfs_item_key_to_cpu(l, &key, slot);
 +      wait_event(sdev->list_wait,
 +                 atomic_read(&sdev->in_flight) == 0);
 +      atomic_inc(&fs_info->scrubs_paused);
 +      wake_up(&fs_info->scrub_pause_wait);
  
 -                      if (key.objectid >= logical + map->stripe_len)
 -                              break;
 +      /* FIXME it might be better to start readahead at commit root */
 +      key_start.objectid = logical;
 +      key_start.type = BTRFS_EXTENT_ITEM_KEY;
 +      key_start.offset = (u64)0;
 +      key_end.objectid = base + offset + nstripes * increment;
 +      key_end.type = BTRFS_EXTENT_ITEM_KEY;
 +      key_end.offset = (u64)0;
 +      reada1 = btrfs_reada_add(root, &key_start, &key_end);
 +
 +      key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 +      key_start.type = BTRFS_EXTENT_CSUM_KEY;
 +      key_start.offset = logical;
 +      key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 +      key_end.type = BTRFS_EXTENT_CSUM_KEY;
 +      key_end.offset = base + offset + nstripes * increment;
 +      reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
 +
 +      if (!IS_ERR(reada1))
 +              btrfs_reada_wait(reada1);
 +      if (!IS_ERR(reada2))
 +              btrfs_reada_wait(reada2);
  
 -                      path->slots[0]++;
 -              }
 -              btrfs_release_path(path);
 -              logical += increment;
 -              physical += map->stripe_len;
 -              cond_resched();
 +      mutex_lock(&fs_info->scrub_lock);
 +      while (atomic_read(&fs_info->scrub_pause_req)) {
 +              mutex_unlock(&fs_info->scrub_lock);
 +              wait_event(fs_info->scrub_pause_wait,
 +                 atomic_read(&fs_info->scrub_pause_req) == 0);
 +              mutex_lock(&fs_info->scrub_lock);
        }
 +      atomic_dec(&fs_info->scrubs_paused);
 +      mutex_unlock(&fs_info->scrub_lock);
 +      wake_up(&fs_info->scrub_pause_wait);
  
        /*
         * collect all data csums for the stripe to avoid seeking during
         * the scrub. This might currently (crc32) end up to be about 1MB
         */
 -      start_stripe = 0;
        blk_start_plug(&plug);
 -again:
 -      logical = base + offset + start_stripe * increment;
 -      for (i = start_stripe; i < nstripes; ++i) {
 -              ret = btrfs_lookup_csums_range(csum_root, logical,
 -                                             logical + map->stripe_len - 1,
 -                                             &sdev->csum_list, 1);
 -              if (ret)
 -                      goto out;
  
 -              logical += increment;
 -              cond_resched();
 -      }
        /*
         * now find all extents for each stripe and scrub them
         */
 -      logical = base + offset + start_stripe * increment;
 -      physical = map->stripes[num].physical + start_stripe * map->stripe_len;
 +      logical = base + offset;
 +      physical = map->stripes[num].physical;
        ret = 0;
 -      for (i = start_stripe; i < nstripes; ++i) {
 +      for (i = 0; i < nstripes; ++i) {
                /*
                 * canceled?
                 */
                        atomic_dec(&fs_info->scrubs_paused);
                        mutex_unlock(&fs_info->scrub_lock);
                        wake_up(&fs_info->scrub_pause_wait);
 -                      scrub_free_csums(sdev);
 -                      start_stripe = i;
 -                      goto again;
                }
  
 +              ret = btrfs_lookup_csums_range(csum_root, logical,
 +                                             logical + map->stripe_len - 1,
 +                                             &sdev->csum_list, 1);
 +              if (ret)
 +                      goto out;
 +
                key.objectid = logical;
                key.type = BTRFS_EXTENT_ITEM_KEY;
                key.offset = (u64)0;
@@@ -971,6 -1371,7 +1360,6 @@@ next
  
  out:
        blk_finish_plug(&plug);
 -out_noplug:
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
  }
@@@ -1241,10 -1642,11 +1630,11 @@@ int btrfs_scrub_dev(struct btrfs_root *
                ret = scrub_enumerate_chunks(sdev, start, end);
  
        wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
        atomic_dec(&fs_info->scrubs_running);
        wake_up(&fs_info->scrub_pause_wait);
  
+       wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
        if (progress)
                memcpy(progress, &sdev->stat, sizeof(*progress));
  
diff --combined fs/btrfs/volumes.c
index f1685a2b45c88a8471d56c12351ac674fff0ca10,18baac5a3f6c3114a8a70a5f063ccf6b5bd4990f..f8e2943101a11b43efeac758643d54e85abb9ee4
@@@ -366,14 -366,6 +366,14 @@@ static noinline int device_list_add(con
                }
                INIT_LIST_HEAD(&device->dev_alloc_list);
  
 +              /* init readahead state */
 +              spin_lock_init(&device->reada_lock);
 +              device->reada_curr_zone = NULL;
 +              atomic_set(&device->reada_in_flight, 0);
 +              device->reada_next = 0;
 +              INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
 +              INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
 +
                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);
@@@ -605,8 -597,10 +605,8 @@@ static int __btrfs_open_devices(struct 
                set_blocksize(bdev, 4096);
  
                bh = btrfs_read_dev_super(bdev);
 -              if (!bh) {
 -                      ret = -EINVAL;
 +              if (!bh)
                        goto error_close;
 -              }
  
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
@@@ -661,7 -655,7 +661,7 @@@ error
                continue;
        }
        if (fs_devices->open_devices == 0) {
 -              ret = -EIO;
 +              ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
@@@ -1019,13 -1013,8 +1019,13 @@@ static int btrfs_free_dev_extent(struc
        }
        BUG_ON(ret);
  
 -      if (device->bytes_used > 0)
 -              device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 +      if (device->bytes_used > 0) {
 +              u64 len = btrfs_dev_extent_length(leaf, extent);
 +              device->bytes_used -= len;
 +              spin_lock(&root->fs_info->free_chunk_lock);
 +              root->fs_info->free_chunk_space += len;
 +              spin_unlock(&root->fs_info->free_chunk_lock);
 +      }
        ret = btrfs_del_item(trans, root, path);
  
  out:
@@@ -1367,11 -1356,6 +1367,11 @@@ int btrfs_rm_device(struct btrfs_root *
        if (ret)
                goto error_undo;
  
 +      spin_lock(&root->fs_info->free_chunk_lock);
 +      root->fs_info->free_chunk_space -= device->total_bytes -
 +              device->bytes_used;
 +      spin_unlock(&root->fs_info->free_chunk_lock);
 +
        device->in_fs_metadata = 0;
        btrfs_scrub_cancel_dev(root, device);
  
        call_rcu(&device->rcu, free_device);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  
 -      num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 -      btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
 +      num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 +      btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
  
        if (cur_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
@@@ -1466,7 -1450,7 +1466,7 @@@ static int btrfs_prepare_sprout(struct 
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
 -      struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
 +      struct btrfs_super_block *disk_super = root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;
  
@@@ -1707,19 -1691,15 +1707,19 @@@ int btrfs_init_new_device(struct btrfs_
                root->fs_info->fs_devices->num_can_discard++;
        root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
  
 +      spin_lock(&root->fs_info->free_chunk_lock);
 +      root->fs_info->free_chunk_space += device->total_bytes;
 +      spin_unlock(&root->fs_info->free_chunk_lock);
 +
        if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                root->fs_info->fs_devices->rotating = 1;
  
 -      total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 -      btrfs_set_super_total_bytes(&root->fs_info->super_copy,
 +      total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
 +      btrfs_set_super_total_bytes(root->fs_info->super_copy,
                                    total_bytes + device->total_bytes);
  
 -      total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
 -      btrfs_set_super_num_devices(&root->fs_info->super_copy,
 +      total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
 +      btrfs_set_super_num_devices(root->fs_info->super_copy,
                                    total_bytes + 1);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  
@@@ -1810,7 -1790,7 +1810,7 @@@ static int __btrfs_grow_device(struct b
                      struct btrfs_device *device, u64 new_size)
  {
        struct btrfs_super_block *super_copy =
 -              &device->dev_root->fs_info->super_copy;
 +              device->dev_root->fs_info->super_copy;
        u64 old_total = btrfs_super_total_bytes(super_copy);
        u64 diff = new_size - device->total_bytes;
  
@@@ -1869,7 -1849,7 +1869,7 @@@ static int btrfs_free_chunk(struct btrf
  static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
                        chunk_offset)
  {
 -      struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 +      struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;
        u8 *ptr;
@@@ -2195,7 -2175,7 +2195,7 @@@ int btrfs_shrink_device(struct btrfs_de
        bool retried = false;
        struct extent_buffer *l;
        struct btrfs_key key;
 -      struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 +      struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        u64 old_total = btrfs_super_total_bytes(super_copy);
        u64 old_size = device->total_bytes;
        u64 diff = device->total_bytes - new_size;
        lock_chunks(root);
  
        device->total_bytes = new_size;
 -      if (device->writeable)
 +      if (device->writeable) {
                device->fs_devices->total_rw_bytes -= diff;
 +              spin_lock(&root->fs_info->free_chunk_lock);
 +              root->fs_info->free_chunk_space -= diff;
 +              spin_unlock(&root->fs_info->free_chunk_lock);
 +      }
        unlock_chunks(root);
  
  again:
                device->total_bytes = old_size;
                if (device->writeable)
                        device->fs_devices->total_rw_bytes += diff;
 +              spin_lock(&root->fs_info->free_chunk_lock);
 +              root->fs_info->free_chunk_space += diff;
 +              spin_unlock(&root->fs_info->free_chunk_lock);
                unlock_chunks(root);
                goto done;
        }
@@@ -2319,7 -2292,7 +2319,7 @@@ static int btrfs_add_system_chunk(struc
                           struct btrfs_key *key,
                           struct btrfs_chunk *chunk, int item_size)
  {
 -      struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 +      struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        struct btrfs_disk_key disk_key;
        u32 array_size;
        u8 *ptr;
@@@ -2642,11 -2615,6 +2642,11 @@@ static int __finish_chunk_alloc(struct 
                index++;
        }
  
 +      spin_lock(&extent_root->fs_info->free_chunk_lock);
 +      extent_root->fs_info->free_chunk_space -= (stripe_size *
 +                                                 map->num_stripes);
 +      spin_unlock(&extent_root->fs_info->free_chunk_lock);
 +
        index = 0;
        stripe = &chunk->stripe;
        while (index < map->num_stripes) {
@@@ -2880,7 -2848,7 +2880,7 @@@ static int find_live_mirror(struct map_
  
  static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                             u64 logical, u64 *length,
-                            struct btrfs_multi_bio **multi_ret,
+                            struct btrfs_bio **bbio_ret,
                             int mirror_num)
  {
        struct extent_map *em;
        int i;
        int num_stripes;
        int max_errors = 0;
-       struct btrfs_multi_bio *multi = NULL;
+       struct btrfs_bio *bbio = NULL;
  
-       if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
+       if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
                stripes_allocated = 1;
  again:
-       if (multi_ret) {
-               multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+       if (bbio_ret) {
+               bbio = kzalloc(btrfs_bio_size(stripes_allocated),
                                GFP_NOFS);
-               if (!multi)
+               if (!bbio)
                        return -ENOMEM;
  
-               atomic_set(&multi->error, 0);
+               atomic_set(&bbio->error, 0);
        }
  
        read_lock(&em_tree->lock);
        if (mirror_num > map->num_stripes)
                mirror_num = 0;
  
-       /* if our multi bio struct is too small, back off and try again */
+       /* if our btrfs_bio struct is too small, back off and try again */
        if (rw & REQ_WRITE) {
                if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
                                 BTRFS_BLOCK_GROUP_DUP)) {
                        stripes_required = map->num_stripes;
                }
        }
-       if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+       if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
            stripes_allocated < stripes_required) {
                stripes_allocated = map->num_stripes;
                free_extent_map(em);
-               kfree(multi);
+               kfree(bbio);
                goto again;
        }
        stripe_nr = offset;
                *length = em->len - offset;
        }
  
-       if (!multi_ret)
+       if (!bbio_ret)
                goto out;
  
        num_stripes = 1;
                        stripe_index = find_live_mirror(map, 0,
                                            map->num_stripes,
                                            current->pid % map->num_stripes);
+                       mirror_num = stripe_index + 1;
                }
  
        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-               if (rw & (REQ_WRITE | REQ_DISCARD))
+               if (rw & (REQ_WRITE | REQ_DISCARD)) {
                        num_stripes = map->num_stripes;
-               else if (mirror_num)
+               } else if (mirror_num) {
                        stripe_index = mirror_num - 1;
+               } else {
+                       mirror_num = 1;
+               }
  
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
                int factor = map->num_stripes / map->sub_stripes;
                        stripe_index = find_live_mirror(map, stripe_index,
                                              map->sub_stripes, stripe_index +
                                              current->pid % map->sub_stripes);
+                       mirror_num = stripe_index + 1;
                }
        } else {
                /*
                 * stripe_index is the number of our device in the stripe array
                 */
                stripe_index = do_div(stripe_nr, map->num_stripes);
+               mirror_num = stripe_index + 1;
        }
        BUG_ON(stripe_index >= map->num_stripes);
  
        if (rw & REQ_DISCARD) {
                for (i = 0; i < num_stripes; i++) {
-                       multi->stripes[i].physical =
+                       bbio->stripes[i].physical =
                                map->stripes[stripe_index].physical +
                                stripe_offset + stripe_nr * map->stripe_len;
-                       multi->stripes[i].dev = map->stripes[stripe_index].dev;
+                       bbio->stripes[i].dev = map->stripes[stripe_index].dev;
  
                        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
                                u64 stripes;
                                }
                                stripes = stripe_nr_end - 1 - j;
                                do_div(stripes, map->num_stripes);
-                               multi->stripes[i].length = map->stripe_len *
+                               bbio->stripes[i].length = map->stripe_len *
                                        (stripes - stripe_nr + 1);
  
                                if (i == 0) {
-                                       multi->stripes[i].length -=
+                                       bbio->stripes[i].length -=
                                                stripe_offset;
                                        stripe_offset = 0;
                                }
                                if (stripe_index == last_stripe)
-                                       multi->stripes[i].length -=
+                                       bbio->stripes[i].length -=
                                                stripe_end_offset;
                        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
                                u64 stripes;
                                }
                                stripes = stripe_nr_end - 1 - j;
                                do_div(stripes, factor);
-                               multi->stripes[i].length = map->stripe_len *
+                               bbio->stripes[i].length = map->stripe_len *
                                        (stripes - stripe_nr + 1);
  
                                if (i < map->sub_stripes) {
-                                       multi->stripes[i].length -=
+                                       bbio->stripes[i].length -=
                                                stripe_offset;
                                        if (i == map->sub_stripes - 1)
                                                stripe_offset = 0;
                                if (stripe_index >= last_stripe &&
                                    stripe_index <= (last_stripe +
                                                     map->sub_stripes - 1)) {
-                                       multi->stripes[i].length -=
+                                       bbio->stripes[i].length -=
                                                stripe_end_offset;
                                }
                        } else
-                               multi->stripes[i].length = *length;
+                               bbio->stripes[i].length = *length;
  
                        stripe_index++;
                        if (stripe_index == map->num_stripes) {
                }
        } else {
                for (i = 0; i < num_stripes; i++) {
-                       multi->stripes[i].physical =
+                       bbio->stripes[i].physical =
                                map->stripes[stripe_index].physical +
                                stripe_offset +
                                stripe_nr * map->stripe_len;
-                       multi->stripes[i].dev =
+                       bbio->stripes[i].dev =
                                map->stripes[stripe_index].dev;
                        stripe_index++;
                }
        }
-       if (multi_ret) {
-               *multi_ret = multi;
-               multi->num_stripes = num_stripes;
-               multi->max_errors = max_errors;
+       if (bbio_ret) {
+               *bbio_ret = bbio;
+               bbio->num_stripes = num_stripes;
+               bbio->max_errors = max_errors;
+               bbio->mirror_num = mirror_num;
        }
  out:
        free_extent_map(em);
  
  int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                      u64 logical, u64 *length,
-                     struct btrfs_multi_bio **multi_ret, int mirror_num)
+                     struct btrfs_bio **bbio_ret, int mirror_num)
  {
-       return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+       return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
                                 mirror_num);
  }
  
@@@ -3223,28 -3198,30 +3230,30 @@@ int btrfs_rmap_block(struct btrfs_mappi
        return 0;
  }
  
- static void end_bio_multi_stripe(struct bio *bio, int err)
+ static void btrfs_end_bio(struct bio *bio, int err)
  {
-       struct btrfs_multi_bio *multi = bio->bi_private;
+       struct btrfs_bio *bbio = bio->bi_private;
        int is_orig_bio = 0;
  
        if (err)
-               atomic_inc(&multi->error);
+               atomic_inc(&bbio->error);
  
-       if (bio == multi->orig_bio)
+       if (bio == bbio->orig_bio)
                is_orig_bio = 1;
  
-       if (atomic_dec_and_test(&multi->stripes_pending)) {
+       if (atomic_dec_and_test(&bbio->stripes_pending)) {
                if (!is_orig_bio) {
                        bio_put(bio);
-                       bio = multi->orig_bio;
+                       bio = bbio->orig_bio;
                }
-               bio->bi_private = multi->private;
-               bio->bi_end_io = multi->end_io;
+               bio->bi_private = bbio->private;
+               bio->bi_end_io = bbio->end_io;
+               bio->bi_bdev = (struct block_device *)
+                                       (unsigned long)bbio->mirror_num;
                /* only send an error to the higher layers if it is
                 * beyond the tolerance of the multi-bio
                 */
-               if (atomic_read(&multi->error) > multi->max_errors) {
+               if (atomic_read(&bbio->error) > bbio->max_errors) {
                        err = -EIO;
                } else if (err) {
                        /*
                        set_bit(BIO_UPTODATE, &bio->bi_flags);
                        err = 0;
                }
-               kfree(multi);
+               kfree(bbio);
  
                bio_endio(bio, err);
        } else if (!is_orig_bio) {
@@@ -3334,20 -3311,20 +3343,20 @@@ int btrfs_map_bio(struct btrfs_root *ro
        u64 logical = (u64)bio->bi_sector << 9;
        u64 length = 0;
        u64 map_length;
-       struct btrfs_multi_bio *multi = NULL;
        int ret;
        int dev_nr = 0;
        int total_devs = 1;
+       struct btrfs_bio *bbio = NULL;
  
        length = bio->bi_size;
        map_tree = &root->fs_info->mapping_tree;
        map_length = length;
  
-       ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+       ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
                              mirror_num);
        BUG_ON(ret);
  
-       total_devs = multi->num_stripes;
+       total_devs = bbio->num_stripes;
        if (map_length < length) {
                printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
                       "len %llu\n", (unsigned long long)logical,
                       (unsigned long long)map_length);
                BUG();
        }
-       multi->end_io = first_bio->bi_end_io;
-       multi->private = first_bio->bi_private;
-       multi->orig_bio = first_bio;
-       atomic_set(&multi->stripes_pending, multi->num_stripes);
+       bbio->orig_bio = first_bio;
+       bbio->private = first_bio->bi_private;
+       bbio->end_io = first_bio->bi_end_io;
+       atomic_set(&bbio->stripes_pending, bbio->num_stripes);
  
        while (dev_nr < total_devs) {
-               if (total_devs > 1) {
-                       if (dev_nr < total_devs - 1) {
-                               bio = bio_clone(first_bio, GFP_NOFS);
-                               BUG_ON(!bio);
-                       } else {
-                               bio = first_bio;
-                       }
-                       bio->bi_private = multi;
-                       bio->bi_end_io = end_bio_multi_stripe;
+               if (dev_nr < total_devs - 1) {
+                       bio = bio_clone(first_bio, GFP_NOFS);
+                       BUG_ON(!bio);
+               } else {
+                       bio = first_bio;
                }
-               bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
-               dev = multi->stripes[dev_nr].dev;
+               bio->bi_private = bbio;
+               bio->bi_end_io = btrfs_end_bio;
+               bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
+               dev = bbio->stripes[dev_nr].dev;
                if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
+                       pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+                                "(%s id %llu), size=%u\n", rw,
+                                (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+                                dev->name, dev->devid, bio->bi_size);
                        bio->bi_bdev = dev->bdev;
                        if (async_submit)
                                schedule_bio(root, dev, rw, bio);
                }
                dev_nr++;
        }
-       if (total_devs == 1)
-               kfree(multi);
        return 0;
  }
  
@@@ -3648,20 -3626,15 +3658,20 @@@ static int read_one_dev(struct btrfs_ro
        fill_device_from_item(leaf, dev_item, device);
        device->dev_root = root->fs_info->dev_root;
        device->in_fs_metadata = 1;
 -      if (device->writeable)
 +      if (device->writeable) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
 +              spin_lock(&root->fs_info->free_chunk_lock);
 +              root->fs_info->free_chunk_space += device->total_bytes -
 +                      device->bytes_used;
 +              spin_unlock(&root->fs_info->free_chunk_lock);
 +      }
        ret = 0;
        return ret;
  }
  
  int btrfs_read_sys_array(struct btrfs_root *root)
  {
 -      struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 +      struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        struct extent_buffer *sb;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;
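
Two things happen in the volumes.c portion above: btrfs_multi_bio becomes btrfs_bio with the mirror number recorded at mapping time, and btrfs_end_bio() hands that mirror number back to the original completion handler by stashing it in the pointer-sized bi_bdev field, cast through unsigned long. Here is a standalone sketch of that encoding trick; struct toy_bio and the helper names are illustrative, not kernel API.

#include <stdio.h>

/* toy model of reusing a pointer-sized field to carry a small integer */
struct toy_bio {
	void *bi_bdev;	/* normally a device pointer; reused at completion time */
};

static void stash_mirror(struct toy_bio *bio, int mirror_num)
{
	/* the value is small, so it round-trips through unsigned long safely */
	bio->bi_bdev = (void *)(unsigned long)mirror_num;
}

static int read_mirror(const struct toy_bio *bio)
{
	return (int)(unsigned long)bio->bi_bdev;
}

int main(void)
{
	struct toy_bio bio = { 0 };

	stash_mirror(&bio, 2);
	printf("completed on mirror %d\n", read_mirror(&bio));
	return 0;
}

The trick only works because bi_bdev is no longer needed as a real device pointer once the I/O has completed, and because the consumer on the other end (here read_mirror()) knows about the convention.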
diff --combined fs/btrfs/volumes.h
index 2a751246188ad713501d7da60f7810ea8517e47f,71f4f3f674955fd3edd7b95925fc66e98246ed64..ab5b1c49f3529e9e7e112649b98f22d0a923fb35
@@@ -92,14 -92,6 +92,14 @@@ struct btrfs_device 
        struct btrfs_work work;
        struct rcu_head rcu;
        struct work_struct rcu_work;
 +
 +      /* readahead state */
 +      spinlock_t reada_lock;
 +      atomic_t reada_in_flight;
 +      u64 reada_next;
 +      struct reada_zone *reada_curr_zone;
 +      struct radix_tree_root reada_zones;
 +      struct radix_tree_root reada_extents;
  };
  
  struct btrfs_fs_devices {
@@@ -144,7 -136,10 +144,10 @@@ struct btrfs_bio_stripe 
        u64 length; /* only used for discard mappings */
  };
  
- struct btrfs_multi_bio {
+ struct btrfs_bio;
+ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+ struct btrfs_bio {
        atomic_t stripes_pending;
        bio_end_io_t *end_io;
        struct bio *orig_bio;
        atomic_t error;
        int max_errors;
        int num_stripes;
+       int mirror_num;
        struct btrfs_bio_stripe stripes[];
  };
  
@@@ -179,7 -175,7 +183,7 @@@ struct map_lookup 
  int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length);
  
- #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+ #define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))
  
  int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           u64 chunk_offset, u64 start, u64 num_bytes);
  int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                    u64 logical, u64 *length,
-                   struct btrfs_multi_bio **multi_ret, int mirror_num);
+                   struct btrfs_bio **bbio_ret, int mirror_num);
  int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                     u64 chunk_start, u64 physical, u64 devid,
                     u64 **logical, int *naddrs, int *stripe_len);
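
The renamed struct in volumes.h keeps the flexible-array layout, and btrfs_bio_size(n) still computes the size of the header plus n stripes so both come from a single allocation. A userspace approximation of that pattern follows; toy_bbio and its fields are trimmed stand-ins, not the kernel struct.

#include <stdio.h>
#include <stdlib.h>

struct toy_stripe {
	unsigned long long physical;
};

/* header struct ending in a flexible array member, like struct btrfs_bio */
struct toy_bbio {
	int num_stripes;
	int mirror_num;
	struct toy_stripe stripes[];
};

#define toy_bbio_size(n) (sizeof(struct toy_bbio) + sizeof(struct toy_stripe) * (n))

int main(void)
{
	int n = 3;
	struct toy_bbio *bbio = calloc(1, toy_bbio_size(n));

	if (!bbio)
		return 1;
	bbio->num_stripes = n;
	for (int i = 0; i < n; i++)
		bbio->stripes[i].physical = (unsigned long long)i * 65536ULL;
	printf("allocated %zu bytes for %d stripes\n",
	       toy_bbio_size(n), bbio->num_stripes);
	free(bbio);
	return 0;
}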