git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
author: Linus Torvalds <torvalds@linux-foundation.org>
Tue, 9 Jul 2013 19:33:09 +0000 (12:33 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Tue, 9 Jul 2013 19:33:09 +0000 (12:33 -0700)
Pull btrfs update from Chris Mason:
 "These are the usual mixture of bugs, cleanups and performance fixes.
  Miao has some really nice tuning of our crc code as well as our
  transaction commits.

  Josef is peeling off more and more problems related to early enospc,
  and has a number of important bug fixes in here too"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (81 commits)
  Btrfs: wait ordered range before doing direct io
  Btrfs: only do the tree_mod_log_free_eb if this is our last ref
  Btrfs: hold the tree mod lock in __tree_mod_log_rewind
  Btrfs: make backref walking code handle skinny metadata
  Btrfs: fix crash regarding to ulist_add_merge
  Btrfs: fix several potential problems in copy_nocow_pages_for_inode
  Btrfs: cleanup the code of copy_nocow_pages_for_inode()
  Btrfs: fix oops when recovering the file data by scrub function
  Btrfs: make the chunk allocator completely tree lockless
  Btrfs: cleanup orphaned root orphan item
  Btrfs: fix wrong mirror number tuning
  Btrfs: cleanup redundant code in btrfs_submit_direct()
  Btrfs: remove btrfs_sector_sum structure
  Btrfs: check if we can nocow if we don't have data space
  Btrfs: stop using try_to_writeback_inodes_sb_nr to flush delalloc
  Btrfs: use a percpu to keep track of possibly pinned bytes
  Btrfs: check for actual acls rather than just xattrs when caching no acl
  Btrfs: move btrfs_truncate_page to btrfs_cont_expand instead of btrfs_truncate
  Btrfs: optimize reada_for_balance
  Btrfs: optimize read_block_for_search
  ...

1  2 
fs/btrfs/delayed-inode.c
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/volumes.c

diff --combined fs/btrfs/delayed-inode.c
index eb34438ddedbc8ca0377fd6410d831fb3e824f7e,5615eacc7e7f11eaa7501ace2bfc397250f6f29c..375510913fe744784f8f56966ed29693ee8e3612
@@@ -535,20 -535,6 +535,6 @@@ static struct btrfs_delayed_item *__btr
        return next;
  }
  
- static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
-                                                  u64 root_id)
- {
-       struct btrfs_key root_key;
-       if (root->objectid == root_id)
-               return root;
-       root_key.objectid = root_id;
-       root_key.type = BTRFS_ROOT_ITEM_KEY;
-       root_key.offset = (u64)-1;
-       return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
- }
  static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
                                               struct btrfs_root *root,
                                               struct btrfs_delayed_item *item)
@@@ -1681,7 -1667,8 +1667,7 @@@ int btrfs_should_delete_dir_index(struc
   * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
   *
   */
 -int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
 -                                  filldir_t filldir,
 +int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
                                    struct list_head *ins_list)
  {
        struct btrfs_dir_item *di;
        list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
                list_del(&curr->readdir_list);
  
 -              if (curr->key.offset < filp->f_pos) {
 +              if (curr->key.offset < ctx->pos) {
                        if (atomic_dec_and_test(&curr->refs))
                                kfree(curr);
                        continue;
                }
  
 -              filp->f_pos = curr->key.offset;
 +              ctx->pos = curr->key.offset;
  
                di = (struct btrfs_dir_item *)curr->data;
                name = (char *)(di + 1);
                d_type = btrfs_filetype_table[di->type];
                btrfs_disk_key_to_cpu(&location, &di->location);
  
 -              over = filldir(dirent, name, name_len, curr->key.offset,
 +              over = !dir_emit(ctx, name, name_len,
                               location.objectid, d_type);
  
                if (atomic_dec_and_test(&curr->refs))
diff --combined fs/btrfs/disk-io.c
index b0292b3ead54d1651ba47d7e9efcc567566dd1ed,3c2886ca7d8cbac3ffe353a0280a1831e53ec2ff..6b092a1c4e37bab47adb0e9fc35ae6ec3e6081f8
@@@ -1013,8 -1013,7 +1013,8 @@@ static int btree_releasepage(struct pag
        return try_release_extent_buffer(page);
  }
  
 -static void btree_invalidatepage(struct page *page, unsigned long offset)
 +static void btree_invalidatepage(struct page *page, unsigned int offset,
 +                               unsigned int length)
  {
        struct extent_io_tree *tree;
        tree = &BTRFS_I(page->mapping->host)->io_tree;
@@@ -1192,6 -1191,8 +1192,8 @@@ static void __setup_root(u32 nodesize, 
        root->objectid = objectid;
        root->last_trans = 0;
        root->highest_objectid = 0;
+       root->nr_delalloc_inodes = 0;
+       root->nr_ordered_extents = 0;
        root->name = NULL;
        root->inode_tree = RB_ROOT;
        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
  
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->root_list);
+       INIT_LIST_HEAD(&root->delalloc_inodes);
+       INIT_LIST_HEAD(&root->delalloc_root);
+       INIT_LIST_HEAD(&root->ordered_extents);
+       INIT_LIST_HEAD(&root->ordered_root);
        INIT_LIST_HEAD(&root->logged_list[0]);
        INIT_LIST_HEAD(&root->logged_list[1]);
        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
+       spin_lock_init(&root->delalloc_lock);
+       spin_lock_init(&root->ordered_extent_lock);
        spin_lock_init(&root->accounting_lock);
        spin_lock_init(&root->log_extents_lock[0]);
        spin_lock_init(&root->log_extents_lock[1]);
        atomic_set(&root->log_writers, 0);
        atomic_set(&root->log_batch, 0);
        atomic_set(&root->orphan_inodes, 0);
+       atomic_set(&root->refs, 1);
        root->log_transid = 0;
        root->last_log_commit = 0;
        extent_io_tree_init(&root->dirty_log_pages,
        spin_lock_init(&root->root_item_lock);
  }
  
- static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
-                                           struct btrfs_fs_info *fs_info,
-                                           u64 objectid,
-                                           struct btrfs_root *root)
- {
-       int ret;
-       u32 blocksize;
-       u64 generation;
-       __setup_root(tree_root->nodesize, tree_root->leafsize,
-                    tree_root->sectorsize, tree_root->stripesize,
-                    root, fs_info, objectid);
-       ret = btrfs_find_last_root(tree_root, objectid,
-                                  &root->root_item, &root->root_key);
-       if (ret > 0)
-               return -ENOENT;
-       else if (ret < 0)
-               return ret;
-       generation = btrfs_root_generation(&root->root_item);
-       blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
-       root->commit_root = NULL;
-       root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-                                    blocksize, generation);
-       if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
-               free_extent_buffer(root->node);
-               root->node = NULL;
-               return -EIO;
-       }
-       root->commit_root = btrfs_root_node(root);
-       return 0;
- }
  static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
  {
        struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@@ -1452,70 -1427,73 +1428,73 @@@ int btrfs_add_log_tree(struct btrfs_tra
        return 0;
  }
  
- struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
-                                              struct btrfs_key *location)
+ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+                                       struct btrfs_key *key)
  {
        struct btrfs_root *root;
        struct btrfs_fs_info *fs_info = tree_root->fs_info;
        struct btrfs_path *path;
-       struct extent_buffer *l;
        u64 generation;
        u32 blocksize;
-       int ret = 0;
-       int slot;
+       int ret;
  
-       root = btrfs_alloc_root(fs_info);
-       if (!root)
+       path = btrfs_alloc_path();
+       if (!path)
                return ERR_PTR(-ENOMEM);
-       if (location->offset == (u64)-1) {
-               ret = find_and_setup_root(tree_root, fs_info,
-                                         location->objectid, root);
-               if (ret) {
-                       kfree(root);
-                       return ERR_PTR(ret);
-               }
-               goto out;
+       root = btrfs_alloc_root(fs_info);
+       if (!root) {
+               ret = -ENOMEM;
+               goto alloc_fail;
        }
  
        __setup_root(tree_root->nodesize, tree_root->leafsize,
                     tree_root->sectorsize, tree_root->stripesize,
-                    root, fs_info, location->objectid);
+                    root, fs_info, key->objectid);
  
-       path = btrfs_alloc_path();
-       if (!path) {
-               kfree(root);
-               return ERR_PTR(-ENOMEM);
-       }
-       ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
-       if (ret == 0) {
-               l = path->nodes[0];
-               slot = path->slots[0];
-               btrfs_read_root_item(l, slot, &root->root_item);
-               memcpy(&root->root_key, location, sizeof(*location));
-       }
-       btrfs_free_path(path);
+       ret = btrfs_find_root(tree_root, key, path,
+                             &root->root_item, &root->root_key);
        if (ret) {
-               kfree(root);
                if (ret > 0)
                        ret = -ENOENT;
-               return ERR_PTR(ret);
+               goto find_fail;
        }
  
        generation = btrfs_root_generation(&root->root_item);
        blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
        root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
                                     blocksize, generation);
-       if (!root->node || !extent_buffer_uptodate(root->node)) {
-               ret = (!root->node) ? -ENOMEM : -EIO;
-               free_extent_buffer(root->node);
-               kfree(root);
-               return ERR_PTR(ret);
+       if (!root->node) {
+               ret = -ENOMEM;
+               goto find_fail;
+       } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+               ret = -EIO;
+               goto read_fail;
        }
        root->commit_root = btrfs_root_node(root);
  out:
-       if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+       btrfs_free_path(path);
+       return root;
+ read_fail:
+       free_extent_buffer(root->node);
+ find_fail:
+       kfree(root);
+ alloc_fail:
+       root = ERR_PTR(ret);
+       goto out;
+ }
+ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+                                     struct btrfs_key *location)
+ {
+       struct btrfs_root *root;
+       root = btrfs_read_tree_root(tree_root, location);
+       if (IS_ERR(root))
+               return root;
+       if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                root->ref_cows = 1;
                btrfs_check_and_init_root_item(&root->root_item);
        }
        return root;
  }
  
+ int btrfs_init_fs_root(struct btrfs_root *root)
+ {
+       int ret;
+       root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+       root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+                                       GFP_NOFS);
+       if (!root->free_ino_pinned || !root->free_ino_ctl) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+       btrfs_init_free_ino_ctl(root);
+       mutex_init(&root->fs_commit_mutex);
+       spin_lock_init(&root->cache_lock);
+       init_waitqueue_head(&root->cache_wait);
+       ret = get_anon_bdev(&root->anon_dev);
+       if (ret)
+               goto fail;
+       return 0;
+ fail:
+       kfree(root->free_ino_ctl);
+       kfree(root->free_ino_pinned);
+       return ret;
+ }
+ struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+                                       u64 root_id)
+ {
+       struct btrfs_root *root;
+       spin_lock(&fs_info->fs_roots_radix_lock);
+       root = radix_tree_lookup(&fs_info->fs_roots_radix,
+                                (unsigned long)root_id);
+       spin_unlock(&fs_info->fs_roots_radix_lock);
+       return root;
+ }
+ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+                        struct btrfs_root *root)
+ {
+       int ret;
+       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       if (ret)
+               return ret;
+       spin_lock(&fs_info->fs_roots_radix_lock);
+       ret = radix_tree_insert(&fs_info->fs_roots_radix,
+                               (unsigned long)root->root_key.objectid,
+                               root);
+       if (ret == 0)
+               root->in_radix = 1;
+       spin_unlock(&fs_info->fs_roots_radix_lock);
+       radix_tree_preload_end();
+       return ret;
+ }
  struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                                              struct btrfs_key *location)
  {
                return fs_info->quota_root ? fs_info->quota_root :
                                             ERR_PTR(-ENOENT);
  again:
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       root = radix_tree_lookup(&fs_info->fs_roots_radix,
-                                (unsigned long)location->objectid);
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       root = btrfs_lookup_fs_root(fs_info, location->objectid);
        if (root)
                return root;
  
-       root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+       root = btrfs_read_fs_root(fs_info->tree_root, location);
        if (IS_ERR(root))
                return root;
  
-       root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
-       root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
-                                       GFP_NOFS);
-       if (!root->free_ino_pinned || !root->free_ino_ctl) {
-               ret = -ENOMEM;
+       if (btrfs_root_refs(&root->root_item) == 0) {
+               ret = -ENOENT;
                goto fail;
        }
  
-       btrfs_init_free_ino_ctl(root);
-       mutex_init(&root->fs_commit_mutex);
-       spin_lock_init(&root->cache_lock);
-       init_waitqueue_head(&root->cache_wait);
-       ret = get_anon_bdev(&root->anon_dev);
+       ret = btrfs_init_fs_root(root);
        if (ret)
                goto fail;
  
-       if (btrfs_root_refs(&root->root_item) == 0) {
-               ret = -ENOENT;
-               goto fail;
-       }
        ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
        if (ret < 0)
                goto fail;
        if (ret == 0)
                root->orphan_item_inserted = 1;
  
-       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
-       if (ret)
-               goto fail;
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       ret = radix_tree_insert(&fs_info->fs_roots_radix,
-                               (unsigned long)root->root_key.objectid,
-                               root);
-       if (ret == 0)
-               root->in_radix = 1;
-       spin_unlock(&fs_info->fs_roots_radix_lock);
-       radix_tree_preload_end();
+       ret = btrfs_insert_fs_root(fs_info, root);
        if (ret) {
                if (ret == -EEXIST) {
                        free_fs_root(root);
                }
                goto fail;
        }
-       ret = btrfs_find_dead_roots(fs_info->tree_root,
-                                   root->root_key.objectid);
-       WARN_ON(ret);
        return root;
  fail:
        free_fs_root(root);
@@@ -1677,21 -1683,37 +1684,37 @@@ static void end_workqueue_fn(struct btr
  static int cleaner_kthread(void *arg)
  {
        struct btrfs_root *root = arg;
+       int again;
  
        do {
-               int again = 0;
-               if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
-                   down_read_trylock(&root->fs_info->sb->s_umount)) {
-                       if (mutex_trylock(&root->fs_info->cleaner_mutex)) {
-                               btrfs_run_delayed_iputs(root);
-                               again = btrfs_clean_one_deleted_snapshot(root);
-                               mutex_unlock(&root->fs_info->cleaner_mutex);
-                       }
-                       btrfs_run_defrag_inodes(root->fs_info);
-                       up_read(&root->fs_info->sb->s_umount);
+               again = 0;
+               /* Make the cleaner go to sleep early. */
+               if (btrfs_need_cleaner_sleep(root))
+                       goto sleep;
+               if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+                       goto sleep;
+               /*
+                * Avoid the problem that we change the status of the fs
+                * during the above check and trylock.
+                */
+               if (btrfs_need_cleaner_sleep(root)) {
+                       mutex_unlock(&root->fs_info->cleaner_mutex);
+                       goto sleep;
                }
  
+               btrfs_run_delayed_iputs(root);
+               again = btrfs_clean_one_deleted_snapshot(root);
+               mutex_unlock(&root->fs_info->cleaner_mutex);
+               /*
+                * The defragger has dealt with the R/O remount and umount,
+                * needn't do anything special here.
+                */
+               btrfs_run_defrag_inodes(root->fs_info);
+ sleep:
                if (!try_to_freeze() && !again) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (!kthread_should_stop())
@@@ -1725,7 -1747,7 +1748,7 @@@ static int transaction_kthread(void *ar
                }
  
                now = get_seconds();
-               if (!cur->blocked &&
+               if (cur->state < TRANS_STATE_BLOCKED &&
                    (now < cur->start_time || now - cur->start_time < 30)) {
                        spin_unlock(&root->fs_info->trans_lock);
                        delay = HZ * 5;
@@@ -2035,11 -2057,11 +2058,11 @@@ static void del_fs_roots(struct btrfs_f
                list_del(&gang[0]->root_list);
  
                if (gang[0]->in_radix) {
-                       btrfs_free_fs_root(fs_info, gang[0]);
+                       btrfs_drop_and_free_fs_root(fs_info, gang[0]);
                } else {
                        free_extent_buffer(gang[0]->node);
                        free_extent_buffer(gang[0]->commit_root);
-                       kfree(gang[0]);
+                       btrfs_put_fs_root(gang[0]);
                }
        }
  
                if (!ret)
                        break;
                for (i = 0; i < ret; i++)
-                       btrfs_free_fs_root(fs_info, gang[i]);
+                       btrfs_drop_and_free_fs_root(fs_info, gang[i]);
        }
  }
  
@@@ -2082,14 -2104,8 +2105,8 @@@ int open_ctree(struct super_block *sb
        int backup_index = 0;
  
        tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
-       extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
-       csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
        chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
-       dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
-       quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
-       if (!tree_root || !extent_root || !csum_root ||
-           !chunk_root || !dev_root || !quota_root) {
+       if (!tree_root || !chunk_root) {
                err = -ENOMEM;
                goto fail;
        }
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
-       INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+       INIT_LIST_HEAD(&fs_info->delalloc_roots);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
-       spin_lock_init(&fs_info->delalloc_lock);
+       spin_lock_init(&fs_info->delalloc_root_lock);
        spin_lock_init(&fs_info->trans_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
        fs_info->max_inline = 8192 * 1024;
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
-       fs_info->trans_no_join = 0;
        fs_info->free_chunk_space = 0;
        fs_info->tree_mod_log = RB_ROOT;
  
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
  
-       INIT_LIST_HEAD(&fs_info->ordered_extents);
-       spin_lock_init(&fs_info->ordered_extent_lock);
+       INIT_LIST_HEAD(&fs_info->ordered_roots);
+       spin_lock_init(&fs_info->ordered_root_lock);
        fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
                                        GFP_NOFS);
        if (!fs_info->delayed_root) {
        fs_info->qgroup_seq = 1;
        fs_info->quota_enabled = 0;
        fs_info->pending_quota_state = 0;
+       fs_info->qgroup_ulist = NULL;
        mutex_init(&fs_info->qgroup_rescan_lock);
  
        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@@ -2639,33 -2655,44 +2656,44 @@@ retry_root_backup
        btrfs_set_root_node(&tree_root->root_item, tree_root->node);
        tree_root->commit_root = btrfs_root_node(tree_root);
  
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
-       if (ret)
+       location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+       location.type = BTRFS_ROOT_ITEM_KEY;
+       location.offset = 0;
+       extent_root = btrfs_read_tree_root(tree_root, &location);
+       if (IS_ERR(extent_root)) {
+               ret = PTR_ERR(extent_root);
                goto recovery_tree_root;
+       }
        extent_root->track_dirty = 1;
+       fs_info->extent_root = extent_root;
  
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_DEV_TREE_OBJECTID, dev_root);
-       if (ret)
+       location.objectid = BTRFS_DEV_TREE_OBJECTID;
+       dev_root = btrfs_read_tree_root(tree_root, &location);
+       if (IS_ERR(dev_root)) {
+               ret = PTR_ERR(dev_root);
                goto recovery_tree_root;
+       }
        dev_root->track_dirty = 1;
+       fs_info->dev_root = dev_root;
+       btrfs_init_devices_late(fs_info);
  
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_CSUM_TREE_OBJECTID, csum_root);
-       if (ret)
+       location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+       csum_root = btrfs_read_tree_root(tree_root, &location);
+       if (IS_ERR(csum_root)) {
+               ret = PTR_ERR(csum_root);
                goto recovery_tree_root;
+       }
        csum_root->track_dirty = 1;
+       fs_info->csum_root = csum_root;
  
-       ret = find_and_setup_root(tree_root, fs_info,
-                                 BTRFS_QUOTA_TREE_OBJECTID, quota_root);
-       if (ret) {
-               kfree(quota_root);
-               quota_root = fs_info->quota_root = NULL;
-       } else {
+       location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+       quota_root = btrfs_read_tree_root(tree_root, &location);
+       if (!IS_ERR(quota_root)) {
                quota_root->track_dirty = 1;
                fs_info->quota_enabled = 1;
                fs_info->pending_quota_state = 1;
+               fs_info->quota_root = quota_root;
        }
  
        fs_info->generation = generation;
  
        location.objectid = BTRFS_FS_TREE_OBJECTID;
        location.type = BTRFS_ROOT_ITEM_KEY;
-       location.offset = (u64)-1;
+       location.offset = 0;
  
        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
-       if (!fs_info->fs_root)
-               goto fail_qgroup;
        if (IS_ERR(fs_info->fs_root)) {
                err = PTR_ERR(fs_info->fs_root);
                goto fail_qgroup;
                return ret;
        }
  
+       btrfs_qgroup_rescan_resume(fs_info);
        return 0;
  
  fail_qgroup:
@@@ -3259,7 -3286,7 +3287,7 @@@ int btrfs_calc_num_tolerated_disk_barri
                                            BTRFS_BLOCK_GROUP_RAID10)) {
                                                num_tolerated_disk_barrier_failures = 1;
                                        } else if (flags &
-                                                  BTRFS_BLOCK_GROUP_RAID5) {
+                                                  BTRFS_BLOCK_GROUP_RAID6) {
                                                num_tolerated_disk_barrier_failures = 2;
                                        }
                                }
@@@ -3367,7 -3394,9 +3395,9 @@@ int write_ctree_super(struct btrfs_tran
        return ret;
  }
  
- void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+ /* Drop a fs root from the radix tree and free it. */
+ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_root *root)
  {
        spin_lock(&fs_info->fs_roots_radix_lock);
        radix_tree_delete(&fs_info->fs_roots_radix,
@@@ -3398,7 -3427,12 +3428,12 @@@ static void free_fs_root(struct btrfs_r
        kfree(root->free_ino_ctl);
        kfree(root->free_ino_pinned);
        kfree(root->name);
-       kfree(root);
+       btrfs_put_fs_root(root);
+ }
+ void btrfs_free_fs_root(struct btrfs_root *root)
+ {
+       free_fs_root(root);
  }
  
  int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@@ -3654,7 -3688,7 +3689,7 @@@ static void btrfs_destroy_ordered_opera
        INIT_LIST_HEAD(&splice);
  
        mutex_lock(&root->fs_info->ordered_operations_mutex);
-       spin_lock(&root->fs_info->ordered_extent_lock);
+       spin_lock(&root->fs_info->ordered_root_lock);
  
        list_splice_init(&t->ordered_operations, &splice);
        while (!list_empty(&splice)) {
                                         ordered_operations);
  
                list_del_init(&btrfs_inode->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_extent_lock);
+               spin_unlock(&root->fs_info->ordered_root_lock);
  
                btrfs_invalidate_inodes(btrfs_inode->root);
  
-               spin_lock(&root->fs_info->ordered_extent_lock);
+               spin_lock(&root->fs_info->ordered_root_lock);
        }
  
-       spin_unlock(&root->fs_info->ordered_extent_lock);
+       spin_unlock(&root->fs_info->ordered_root_lock);
        mutex_unlock(&root->fs_info->ordered_operations_mutex);
  }
  
@@@ -3677,15 -3711,36 +3712,36 @@@ static void btrfs_destroy_ordered_exten
  {
        struct btrfs_ordered_extent *ordered;
  
-       spin_lock(&root->fs_info->ordered_extent_lock);
+       spin_lock(&root->ordered_extent_lock);
        /*
         * This will just short circuit the ordered completion stuff which will
         * make sure the ordered extent gets properly cleaned up.
         */
-       list_for_each_entry(ordered, &root->fs_info->ordered_extents,
+       list_for_each_entry(ordered, &root->ordered_extents,
                            root_extent_list)
                set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
-       spin_unlock(&root->fs_info->ordered_extent_lock);
+       spin_unlock(&root->ordered_extent_lock);
+ }
+ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
+ {
+       struct btrfs_root *root;
+       struct list_head splice;
+       INIT_LIST_HEAD(&splice);
+       spin_lock(&fs_info->ordered_root_lock);
+       list_splice_init(&fs_info->ordered_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                       ordered_root);
+               list_del_init(&root->ordered_root);
+               btrfs_destroy_ordered_extents(root);
+               cond_resched_lock(&fs_info->ordered_root_lock);
+       }
+       spin_unlock(&fs_info->ordered_root_lock);
  }
  
  int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
  
        while ((node = rb_first(&delayed_refs->root)) != NULL) {
                struct btrfs_delayed_ref_head *head = NULL;
+               bool pin_bytes = false;
  
                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
                atomic_set(&ref->refs, 1);
                        }
  
                        if (head->must_insert_reserved)
-                               btrfs_pin_extent(root, ref->bytenr,
-                                                ref->num_bytes, 1);
+                               pin_bytes = true;
                        btrfs_free_delayed_extent_op(head->extent_op);
                        delayed_refs->num_heads--;
                        if (list_empty(&head->cluster))
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-               if (head)
-                       mutex_unlock(&head->mutex);
                spin_unlock(&delayed_refs->lock);
+               if (head) {
+                       if (pin_bytes)
+                               btrfs_pin_extent(root, ref->bytenr,
+                                                ref->num_bytes, 1);
+                       mutex_unlock(&head->mutex);
+               }
                btrfs_put_delayed_ref(ref);
  
                cond_resched();
@@@ -3778,24 -3837,49 +3838,49 @@@ static void btrfs_destroy_delalloc_inod
  
        INIT_LIST_HEAD(&splice);
  
-       spin_lock(&root->fs_info->delalloc_lock);
-       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       spin_lock(&root->delalloc_lock);
+       list_splice_init(&root->delalloc_inodes, &splice);
  
        while (!list_empty(&splice)) {
-               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-                                   delalloc_inodes);
+               btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+                                              delalloc_inodes);
  
                list_del_init(&btrfs_inode->delalloc_inodes);
                clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
                          &btrfs_inode->runtime_flags);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&root->delalloc_lock);
  
                btrfs_invalidate_inodes(btrfs_inode->root);
  
-               spin_lock(&root->fs_info->delalloc_lock);
+               spin_lock(&root->delalloc_lock);
        }
  
-       spin_unlock(&root->fs_info->delalloc_lock);
+       spin_unlock(&root->delalloc_lock);
+ }
+ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+ {
+       struct btrfs_root *root;
+       struct list_head splice;
+       INIT_LIST_HEAD(&splice);
+       spin_lock(&fs_info->delalloc_root_lock);
+       list_splice_init(&fs_info->delalloc_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                        delalloc_root);
+               list_del_init(&root->delalloc_root);
+               root = btrfs_grab_fs_root(root);
+               BUG_ON(!root);
+               spin_unlock(&fs_info->delalloc_root_lock);
+               btrfs_destroy_delalloc_inodes(root);
+               btrfs_put_fs_root(root);
+               spin_lock(&fs_info->delalloc_root_lock);
+       }
+       spin_unlock(&fs_info->delalloc_root_lock);
  }
  
  static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@@ -3879,19 -3963,14 +3964,14 @@@ void btrfs_cleanup_one_transaction(stru
        btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
                                cur_trans->dirty_pages.dirty_bytes);
  
-       /* FIXME: cleanup wait for commit */
-       cur_trans->in_commit = 1;
-       cur_trans->blocked = 1;
+       cur_trans->state = TRANS_STATE_COMMIT_START;
        wake_up(&root->fs_info->transaction_blocked_wait);
  
        btrfs_evict_pending_snapshots(cur_trans);
  
-       cur_trans->blocked = 0;
+       cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&root->fs_info->transaction_wait);
  
-       cur_trans->commit_done = 1;
-       wake_up(&cur_trans->commit_wait);
        btrfs_destroy_delayed_inodes(root);
        btrfs_assert_delayed_root_empty(root);
  
        btrfs_destroy_pinned_extent(root,
                                    root->fs_info->pinned_extents);
  
+       cur_trans->state =TRANS_STATE_COMPLETED;
+       wake_up(&cur_trans->commit_wait);
        /*
        memset(cur_trans, 0, sizeof(*cur_trans));
        kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@@ -3915,7 -3997,7 +3998,7 @@@ static int btrfs_cleanup_transaction(st
  
        spin_lock(&root->fs_info->trans_lock);
        list_splice_init(&root->fs_info->trans_list, &list);
-       root->fs_info->trans_no_join = 1;
+       root->fs_info->running_transaction = NULL;
        spin_unlock(&root->fs_info->trans_lock);
  
        while (!list_empty(&list)) {
  
                btrfs_destroy_ordered_operations(t, root);
  
-               btrfs_destroy_ordered_extents(root);
+               btrfs_destroy_all_ordered_extents(root->fs_info);
  
                btrfs_destroy_delayed_refs(t, root);
  
-               /* FIXME: cleanup wait for commit */
-               t->in_commit = 1;
-               t->blocked = 1;
+               /*
+                *  FIXME: cleanup wait for commit
+                *  We needn't acquire the lock here, because we are during
+                *  the umount, there is no other task which will change it.
+                */
+               t->state = TRANS_STATE_COMMIT_START;
                smp_mb();
                if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
                        wake_up(&root->fs_info->transaction_blocked_wait);
  
                btrfs_evict_pending_snapshots(t);
  
-               t->blocked = 0;
+               t->state = TRANS_STATE_UNBLOCKED;
                smp_mb();
                if (waitqueue_active(&root->fs_info->transaction_wait))
                        wake_up(&root->fs_info->transaction_wait);
  
-               t->commit_done = 1;
-               smp_mb();
-               if (waitqueue_active(&t->commit_wait))
-                       wake_up(&t->commit_wait);
                btrfs_destroy_delayed_inodes(root);
                btrfs_assert_delayed_root_empty(root);
  
-               btrfs_destroy_delalloc_inodes(root);
-               spin_lock(&root->fs_info->trans_lock);
-               root->fs_info->running_transaction = NULL;
-               spin_unlock(&root->fs_info->trans_lock);
+               btrfs_destroy_all_delalloc_inodes(root->fs_info);
  
                btrfs_destroy_marked_extents(root, &t->dirty_pages,
                                             EXTENT_DIRTY);
                btrfs_destroy_pinned_extent(root,
                                            root->fs_info->pinned_extents);
  
+               t->state = TRANS_STATE_COMPLETED;
+               smp_mb();
+               if (waitqueue_active(&t->commit_wait))
+                       wake_up(&t->commit_wait);
                atomic_set(&t->use_count, 0);
                list_del_init(&t->list);
                memset(t, 0, sizeof(*t));
                kmem_cache_free(btrfs_transaction_cachep, t);
        }
  
-       spin_lock(&root->fs_info->trans_lock);
-       root->fs_info->trans_no_join = 0;
-       spin_unlock(&root->fs_info->trans_lock);
        mutex_unlock(&root->fs_info->transaction_kthread_mutex);
  
        return 0;
diff --combined fs/btrfs/extent_io.c
index 6bca9472f313cda2cb7ad1f230dda69bf4b1e8a9,f8586a957a020cc62591ddee1489d6f379308c89..583d98bd065ed83ca979a2786b59ae4342380c47
@@@ -77,10 -77,29 +77,29 @@@ void btrfs_leak_debug_check(void
                kmem_cache_free(extent_buffer_cache, eb);
        }
  }
+ #define btrfs_debug_check_extent_io_range(inode, start, end)          \
+       __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
+ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+               struct inode *inode, u64 start, u64 end)
+ {
+       u64 isize = i_size_read(inode);
+       if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+               printk_ratelimited(KERN_DEBUG
+                   "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+                               caller,
+                               (unsigned long long)btrfs_ino(inode),
+                               (unsigned long long)isize,
+                               (unsigned long long)start,
+                               (unsigned long long)end);
+       }
+ }
  #else
  #define btrfs_leak_debug_add(new, head)       do {} while (0)
  #define btrfs_leak_debug_del(entry)   do {} while (0)
  #define btrfs_leak_debug_check()      do {} while (0)
+ #define btrfs_debug_check_extent_io_range(c, s, e)    do {} while (0)
  #endif
  
  #define BUFFER_LRU_MAX 64
@@@ -522,6 -541,11 +541,11 @@@ int clear_extent_bit(struct extent_io_t
        int err;
        int clear = 0;
  
+       btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+       if (bits & EXTENT_DELALLOC)
+               bits |= EXTENT_NORESERVE;
        if (delete)
                bits |= ~EXTENT_CTLBITS;
        bits |= EXTENT_FIRST_DELALLOC;
@@@ -677,6 -701,8 +701,8 @@@ static void wait_extent_bit(struct exte
        struct extent_state *state;
        struct rb_node *node;
  
+       btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
        spin_lock(&tree->lock);
  again:
        while (1) {
@@@ -769,6 -795,8 +795,8 @@@ __set_extent_bit(struct extent_io_tree 
        u64 last_start;
        u64 last_end;
  
+       btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
        bits |= EXTENT_FIRST_DELALLOC;
  again:
        if (!prealloc && (mask & __GFP_WAIT)) {
@@@ -989,6 -1017,8 +1017,8 @@@ int convert_extent_bit(struct extent_io
        u64 last_start;
        u64 last_end;
  
+       btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
  again:
        if (!prealloc && (mask & __GFP_WAIT)) {
                prealloc = alloc_extent_state(mask);
@@@ -2450,11 -2480,12 +2480,12 @@@ static void end_bio_extent_readpage(str
                struct extent_state *cached = NULL;
                struct extent_state *state;
                struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+               struct inode *inode = page->mapping->host;
  
                pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
                         "mirror=%lu\n", (u64)bio->bi_sector, err,
                         io_bio->mirror_num);
-               tree = &BTRFS_I(page->mapping->host)->io_tree;
+               tree = &BTRFS_I(inode)->io_tree;
  
                /* We always issue full-page reads, but if some block
                 * in a page fails to read, blk_update_request() will
                unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
  
                if (uptodate) {
+                       loff_t i_size = i_size_read(inode);
+                       pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+                       unsigned offset;
+                       /* Zero out the end if this page straddles i_size */
+                       offset = i_size & (PAGE_CACHE_SIZE-1);
+                       if (page->index == end_index && offset)
+                               zero_user_segment(page, offset, PAGE_CACHE_SIZE);
                        SetPageUptodate(page);
                } else {
                        ClearPageUptodate(page);
@@@ -2643,7 -2682,8 +2682,7 @@@ static int submit_extent_page(int rw, s
                if (old_compressed)
                        contig = bio->bi_sector == sector;
                else
 -                      contig = bio->bi_sector + (bio->bi_size >> 9) ==
 -                              sector;
 +                      contig = bio_end_sector(bio) == sector;
  
                if (prev_bio_flags != bio_flags || !contig ||
                    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
@@@ -2957,7 -2997,7 +2996,7 @@@ static int __extent_writepage(struct pa
        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
        if (page->index > end_index ||
           (page->index == end_index && !pg_offset)) {
 -              page->mapping->a_ops->invalidatepage(page, 0);
 +              page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
                unlock_page(page);
                return 0;
        }
diff --combined fs/btrfs/file.c
index 89da56a58b635c9bf80197c0cf32e2dc3f698442,2d70849cec92b714476657d5a75441a47c4812d5..a005fe2c072ad0751254adba0fa4e04db10cc996
@@@ -24,7 -24,6 +24,7 @@@
  #include <linux/string.h>
  #include <linux/backing-dev.h>
  #include <linux/mpage.h>
 +#include <linux/aio.h>
  #include <linux/falloc.h>
  #include <linux/swap.h>
  #include <linux/writeback.h>
@@@ -309,10 -308,6 +309,6 @@@ static int __btrfs_run_defrag_inode(str
                ret = PTR_ERR(inode_root);
                goto cleanup;
        }
-       if (btrfs_root_refs(&inode_root->root_item) == 0) {
-               ret = -ENOENT;
-               goto cleanup;
-       }
  
        key.objectid = defrag->ino;
        btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@@ -1317,6 -1312,56 +1313,56 @@@ fail
  
  }
  
+ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+                                   size_t *write_bytes)
+ {
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_ordered_extent *ordered;
+       u64 lockstart, lockend;
+       u64 num_bytes;
+       int ret;
+       lockstart = round_down(pos, root->sectorsize);
+       lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+       while (1) {
+               lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+               ordered = btrfs_lookup_ordered_range(inode, lockstart,
+                                                    lockend - lockstart + 1);
+               if (!ordered) {
+                       break;
+               }
+               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+               btrfs_start_ordered_extent(inode, ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+       }
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+               return PTR_ERR(trans);
+       }
+       num_bytes = lockend - lockstart + 1;
+       ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
+                              NULL);
+       btrfs_end_transaction(trans, root);
+       if (ret <= 0) {
+               ret = 0;
+       } else {
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                EXTENT_DIRTY | EXTENT_DELALLOC |
+                                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+                                NULL, GFP_NOFS);
+               *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+       }
+       unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+       return ret;
+ }
  static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                               struct iov_iter *i,
                                               loff_t pos)
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct page **pages = NULL;
+       u64 release_bytes = 0;
        unsigned long first_index;
        size_t num_written = 0;
        int nrptrs;
        int ret = 0;
+       bool only_release_metadata = false;
        bool force_page_uptodate = false;
  
        nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
                                         offset);
                size_t num_pages = (write_bytes + offset +
                                    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+               size_t reserve_bytes;
                size_t dirty_pages;
                size_t copied;
  
                        break;
                }
  
-               ret = btrfs_delalloc_reserve_space(inode,
-                                       num_pages << PAGE_CACHE_SHIFT);
+               reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+               ret = btrfs_check_data_free_space(inode, reserve_bytes);
+               if (ret == -ENOSPC &&
+                   (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+                                             BTRFS_INODE_PREALLOC))) {
+                       ret = check_can_nocow(inode, pos, &write_bytes);
+                       if (ret > 0) {
+                               only_release_metadata = true;
+                               /*
+                                * our prealloc extent may be smaller than
+                                * write_bytes, so scale down.
+                                */
+                               num_pages = (write_bytes + offset +
+                                            PAGE_CACHE_SIZE - 1) >>
+                                       PAGE_CACHE_SHIFT;
+                               reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+                               ret = 0;
+                       } else {
+                               ret = -ENOSPC;
+                       }
+               }
                if (ret)
                        break;
  
+               ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+               if (ret) {
+                       if (!only_release_metadata)
+                               btrfs_free_reserved_data_space(inode,
+                                                              reserve_bytes);
+                       break;
+               }
+               release_bytes = reserve_bytes;
                /*
                 * This is going to setup the pages array with the number of
                 * pages we want, so we don't really need to worry about the
                ret = prepare_pages(root, file, pages, num_pages,
                                    pos, first_index, write_bytes,
                                    force_page_uptodate);
-               if (ret) {
-                       btrfs_delalloc_release_space(inode,
-                                       num_pages << PAGE_CACHE_SHIFT);
+               if (ret)
                        break;
-               }
  
                copied = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, i);
                 * managed to copy.
                 */
                if (num_pages > dirty_pages) {
+                       release_bytes = (num_pages - dirty_pages) <<
+                               PAGE_CACHE_SHIFT;
                        if (copied > 0) {
                                spin_lock(&BTRFS_I(inode)->lock);
                                BTRFS_I(inode)->outstanding_extents++;
                                spin_unlock(&BTRFS_I(inode)->lock);
                        }
-                       btrfs_delalloc_release_space(inode,
-                                       (num_pages - dirty_pages) <<
-                                       PAGE_CACHE_SHIFT);
+                       if (only_release_metadata)
+                               btrfs_delalloc_release_metadata(inode,
+                                                               release_bytes);
+                       else
+                               btrfs_delalloc_release_space(inode,
+                                                            release_bytes);
                }
  
+               release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
                if (copied > 0) {
                        ret = btrfs_dirty_pages(root, inode, pages,
                                                dirty_pages, pos, copied,
                                                NULL);
                        if (ret) {
-                               btrfs_delalloc_release_space(inode,
-                                       dirty_pages << PAGE_CACHE_SHIFT);
                                btrfs_drop_pages(pages, num_pages);
                                break;
                        }
                }
  
+               release_bytes = 0;
                btrfs_drop_pages(pages, num_pages);
  
+               if (only_release_metadata && copied > 0) {
+                       u64 lockstart = round_down(pos, root->sectorsize);
+                       u64 lockend = lockstart +
+                               (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+                       set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                      lockend, EXTENT_NORESERVE, NULL,
+                                      NULL, GFP_NOFS);
+                       only_release_metadata = false;
+               }
                cond_resched();
  
                balance_dirty_pages_ratelimited(inode->i_mapping);
  
        kfree(pages);
  
+       if (release_bytes) {
+               if (only_release_metadata)
+                       btrfs_delalloc_release_metadata(inode, release_bytes);
+               else
+                       btrfs_delalloc_release_space(inode, release_bytes);
+       }
        return num_written ? num_written : ret;
  }
  
@@@ -1518,6 -1616,8 +1617,6 @@@ static ssize_t btrfs_file_aio_write(str
        size_t count, ocount;
        bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
  
 -      sb_start_write(inode->i_sb);
 -
        mutex_lock(&inode->i_mutex);
  
        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
        if (sync)
                atomic_dec(&BTRFS_I(inode)->sync_writers);
  out:
 -      sb_end_write(inode->i_sb);
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
  }
@@@ -2175,12 -2276,6 +2274,6 @@@ static long btrfs_fallocate(struct fil
                        goto out_reserve_fail;
        }
  
-       /*
-        * wait for ordered IO before we have any locks.  We'll loop again
-        * below with the locks held.
-        */
-       btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
        mutex_lock(&inode->i_mutex);
        ret = inode_newsize_ok(inode, alloc_end);
        if (ret)
                                        alloc_start);
                if (ret)
                        goto out;
+       } else {
+               /*
+                * If we are fallocating from the end of the file onward we
+                * need to zero out the end of the page if i_size lands in the
+                * middle of a page.
+                */
+               ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+               if (ret)
+                       goto out;
        }
  
+       /*
+        * wait for ordered IO before we have any locks.  We'll loop again
+        * below with the locks held.
+        */
+       btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
        locked_end = alloc_end - 1;
        while (1) {
                struct btrfs_ordered_extent *ordered;
@@@ -2425,7 -2535,20 +2533,7 @@@ static loff_t btrfs_file_llseek(struct 
                }
        }
  
 -      if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
 -              offset = -EINVAL;
 -              goto out;
 -      }
 -      if (offset > inode->i_sb->s_maxbytes) {
 -              offset = -EINVAL;
 -              goto out;
 -      }
 -
 -      /* Special lock needed here? */
 -      if (offset != file->f_pos) {
 -              file->f_pos = offset;
 -              file->f_version = 0;
 -      }
 +      offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
  out:
        mutex_unlock(&inode->i_mutex);
        return offset;
index 2750b50235269d2304ce45a906b2c5b43dae8b5c,75172853d7139a0d136d95abc420df4ab2e816ea..b21a3cd667d8cc656878b8d462aa7cd45ebc8435
@@@ -213,7 -213,7 +213,7 @@@ int btrfs_check_trunc_cache_free_space(
        else
                ret = 0;
        spin_unlock(&rsv->lock);
-       return 0;
+       return ret;
  }
  
  int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@@ -3150,6 -3150,8 +3150,8 @@@ again
        return 0;
  }
  
+ #define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
  /*
   * This test just does basic sanity checking, making sure we can add an exten
   * entry and remove space from either end and the middle, and make sure we can
@@@ -3159,63 -3161,63 +3161,63 @@@ static int test_extents(struct btrfs_bl
  {
        int ret = 0;
  
-       printk(KERN_ERR "Running extent only tests\n");
+       test_msg("Running extent only tests\n");
  
        /* First just make sure we can remove an entire entry */
        ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error adding initial extents %d\n", ret);
+               test_msg("Error adding initial extents %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing extent %d\n", ret);
+               test_msg("Error removing extent %d\n", ret);
                return ret;
        }
  
        if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-               printk(KERN_ERR "Full remove left some lingering space\n");
+               test_msg("Full remove left some lingering space\n");
                return -1;
        }
  
        /* Ok edge and middle cases now */
        ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error adding half extent %d\n", ret);
+               test_msg("Error adding half extent %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing tail end %d\n", ret);
+               test_msg("Error removing tail end %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing front end %d\n", ret);
+               test_msg("Error removing front end %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
        if (ret) {
-               printk(KERN_ERR "Error removing middle piece %d\n", ret);
 -              test_msg("Error removing middle peice %d\n", ret);
++              test_msg("Error removing middle piece %d\n", ret);
                return ret;
        }
  
        if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-               printk(KERN_ERR "Still have space at the front\n");
+               test_msg("Still have space at the front\n");
                return -1;
        }
  
        if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
-               printk(KERN_ERR "Still have space in the middle\n");
+               test_msg("Still have space in the middle\n");
                return -1;
        }
  
        if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
-               printk(KERN_ERR "Still have space at the end\n");
+               test_msg("Still have space at the end\n");
                return -1;
        }
  
@@@ -3230,34 -3232,34 +3232,34 @@@ static int test_bitmaps(struct btrfs_bl
        u64 next_bitmap_offset;
        int ret;
  
-       printk(KERN_ERR "Running bitmap only tests\n");
+       test_msg("Running bitmap only tests\n");
  
        ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret);
+               test_msg("Couldn't create a bitmap entry %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing bitmap full range %d\n", ret);
+               test_msg("Error removing bitmap full range %d\n", ret);
                return ret;
        }
  
        if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-               printk(KERN_ERR "Left some space in bitmap\n");
+               test_msg("Left some space in bitmap\n");
                return -1;
        }
  
        ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret);
+               test_msg("Couldn't add to our bitmap entry %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret);
+               test_msg("Couldn't remove middle chunk %d\n", ret);
                return ret;
        }
  
        ret = add_free_space_entry(cache, next_bitmap_offset -
                                   (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add space that straddles two bitmaps"
-                      " %d\n", ret);
+               test_msg("Couldn't add space that straddles two bitmaps %d\n",
+                               ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, next_bitmap_offset -
                                      (1 * 1024 * 1024), 2 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+               test_msg("Couldn't remove overlapping space %d\n", ret);
                return ret;
        }
  
        if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
                         2 * 1024 * 1024)) {
-               printk(KERN_ERR "Left some space when removing overlapping\n");
+               test_msg("Left some space when removing overlapping\n");
                return -1;
        }
  
@@@ -3300,7 -3302,7 +3302,7 @@@ static int test_bitmaps_and_extents(str
        u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
        int ret;
  
-       printk(KERN_ERR "Running bitmap and extent tests\n");
+       test_msg("Running bitmap and extent tests\n");
  
        /*
         * First let's do something simple, an extent at the same offset as the
         */
        ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret);
+               test_msg("Couldn't create bitmap entry %d\n", ret);
                return ret;
        }
  
        ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+               test_msg("Couldn't add extent entry %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove extent entry %d\n", ret);
+               test_msg("Couldn't remove extent entry %d\n", ret);
                return ret;
        }
  
        if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-               printk(KERN_ERR "Left remnants after our remove\n");
+               test_msg("Left remnants after our remove\n");
                return -1;
        }
  
        /* Now to add back the extent entry and remove from the bitmap */
        ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret);
+               test_msg("Couldn't re-add extent entry %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret);
+               test_msg("Couldn't remove from bitmap %d\n", ret);
                return ret;
        }
  
        if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
-               printk(KERN_ERR "Left remnants in the bitmap\n");
+               test_msg("Left remnants in the bitmap\n");
                return -1;
        }
  
         */
        ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret);
+               test_msg("Couldn't add to a bitmap %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+               test_msg("Couldn't remove overlapping space %d\n", ret);
                return ret;
        }
  
        if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
-               printk(KERN_ERR "Left over peices after removing "
-                      "overlapping\n");
+               test_msg("Left over peices after removing overlapping\n");
                return -1;
        }
  
        /* Now with the extent entry offset into the bitmap */
        ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret);
+               test_msg("Couldn't add space to the bitmap %d\n", ret);
                return ret;
        }
  
        ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret);
+               test_msg("Couldn't add extent to the cache %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Problem removing overlapping space %d\n", ret);
+               test_msg("Problem removing overlapping space %d\n", ret);
                return ret;
        }
  
        if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
-               printk(KERN_ERR "Left something behind when removing space");
+               test_msg("Left something behind when removing space");
                return -1;
        }
  
        ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
                                   4 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add bitmap %d\n", ret);
+               test_msg("Couldn't add bitmap %d\n", ret);
                return ret;
        }
  
        ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
                                   5 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+               test_msg("Couldn't add extent entry %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
                                      5 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Failed to free our space %d\n", ret);
+               test_msg("Failed to free our space %d\n", ret);
                return ret;
        }
  
        if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
                         5 * 1024 * 1024)) {
-               printk(KERN_ERR "Left stuff over\n");
+               test_msg("Left stuff over\n");
                return -1;
        }
  
         */
        ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
        if (ret) {
-               printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret);
+               test_msg("Couldn't add bitmap entry %d\n", ret);
                return ret;
        }
  
        ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
        if (ret) {
-               printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+               test_msg("Couldn't add extent entry %d\n", ret);
                return ret;
        }
  
        ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
        if (ret) {
-               printk(KERN_ERR "Error removing bitmap and extent "
-                      "overlapping %d\n", ret);
+               test_msg("Error removing bitmap and extent overlapping %d\n", ret);
                return ret;
        }
  
@@@ -3469,11 -3469,11 +3469,11 @@@ void btrfs_test_free_space_cache(void
  {
        struct btrfs_block_group_cache *cache;
  
-       printk(KERN_ERR "Running btrfs free space cache tests\n");
+       test_msg("Running btrfs free space cache tests\n");
  
        cache = init_test_block_group();
        if (!cache) {
-               printk(KERN_ERR "Couldn't run the tests\n");
+               test_msg("Couldn't run the tests\n");
                return;
        }
  
@@@ -3487,6 -3487,9 +3487,9 @@@ out
        __btrfs_remove_free_space_cache(cache->free_space_ctl);
        kfree(cache->free_space_ctl);
        kfree(cache);
-       printk(KERN_ERR "Free space cache tests finished\n");
+       test_msg("Free space cache tests finished\n");
  }
- #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+ #undef test_msg
+ #else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+ void btrfs_test_free_space_cache(void) {}
+ #endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --combined fs/btrfs/inode.c
index 4f9d16b70d3d87da9dd6e3cae926dbaaf4fa3345,55dda871437fdae659ec710cd7d4a03125104165..6d1b93c8aafb8a4d7b832cab8585ebf1ac1ced42
@@@ -32,7 -32,6 +32,7 @@@
  #include <linux/writeback.h>
  #include <linux/statfs.h>
  #include <linux/compat.h>
 +#include <linux/aio.h>
  #include <linux/bit_spinlock.h>
  #include <linux/xattr.h>
  #include <linux/posix_acl.h>
@@@ -42,6 -41,7 +42,7 @@@
  #include <linux/mount.h>
  #include <linux/btrfs.h>
  #include <linux/blkdev.h>
+ #include <linux/posix_acl_xattr.h>
  #include "compat.h"
  #include "ctree.h"
  #include "disk-io.h"
@@@ -57,6 -57,7 +58,7 @@@
  #include "free-space-cache.h"
  #include "inode-map.h"
  #include "backref.h"
+ #include "hash.h"
  
  struct btrfs_iget_args {
        u64 ino;
@@@ -701,8 -702,12 +703,12 @@@ retry
                        async_extent->nr_pages = 0;
                        async_extent->pages = NULL;
  
-                       if (ret == -ENOSPC)
+                       if (ret == -ENOSPC) {
+                               unlock_extent(io_tree, async_extent->start,
+                                             async_extent->start +
+                                             async_extent->ram_size - 1);
                                goto retry;
+                       }
                        goto out_free;
                }
  
@@@ -1529,6 -1534,46 +1535,46 @@@ static void btrfs_merge_extent_hook(str
        spin_unlock(&BTRFS_I(inode)->lock);
  }
  
+ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+                                     struct inode *inode)
+ {
+       spin_lock(&root->delalloc_lock);
+       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+                             &root->delalloc_inodes);
+               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                       &BTRFS_I(inode)->runtime_flags);
+               root->nr_delalloc_inodes++;
+               if (root->nr_delalloc_inodes == 1) {
+                       spin_lock(&root->fs_info->delalloc_root_lock);
+                       BUG_ON(!list_empty(&root->delalloc_root));
+                       list_add_tail(&root->delalloc_root,
+                                     &root->fs_info->delalloc_roots);
+                       spin_unlock(&root->fs_info->delalloc_root_lock);
+               }
+       }
+       spin_unlock(&root->delalloc_lock);
+ }
+ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+                                    struct inode *inode)
+ {
+       spin_lock(&root->delalloc_lock);
+       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                         &BTRFS_I(inode)->runtime_flags);
+               root->nr_delalloc_inodes--;
+               if (!root->nr_delalloc_inodes) {
+                       spin_lock(&root->fs_info->delalloc_root_lock);
+                       BUG_ON(list_empty(&root->delalloc_root));
+                       list_del_init(&root->delalloc_root);
+                       spin_unlock(&root->fs_info->delalloc_root_lock);
+               }
+       }
+       spin_unlock(&root->delalloc_lock);
+ }
  /*
   * extent_io.c set_bit_hook, used to track delayed allocation
   * bytes in this file, and to maintain the list of inodes that
@@@ -1561,16 -1606,8 +1607,8 @@@ static void btrfs_set_bit_hook(struct i
                spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes += len;
                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                        &BTRFS_I(inode)->runtime_flags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-                                             &root->fs_info->delalloc_inodes);
-                               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                       &BTRFS_I(inode)->runtime_flags);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-               }
+                                        &BTRFS_I(inode)->runtime_flags))
+                       btrfs_add_delalloc_inodes(root, inode);
                spin_unlock(&BTRFS_I(inode)->lock);
        }
  }
@@@ -1604,7 -1641,7 +1642,7 @@@ static void btrfs_clear_bit_hook(struc
                        btrfs_delalloc_release_metadata(inode, len);
  
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-                   && do_list)
+                   && do_list && !(state->state & EXTENT_NORESERVE))
                        btrfs_free_reserved_data_space(inode, len);
  
                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
                BTRFS_I(inode)->delalloc_bytes -= len;
                if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
                    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                            &BTRFS_I(inode)->runtime_flags)) {
-                       spin_lock(&root->fs_info->delalloc_lock);
-                       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
-                               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                         &BTRFS_I(inode)->runtime_flags);
-                       }
-                       spin_unlock(&root->fs_info->delalloc_lock);
-               }
+                            &BTRFS_I(inode)->runtime_flags))
+                       btrfs_del_delalloc_inode(root, inode);
                spin_unlock(&BTRFS_I(inode)->lock);
        }
  }
@@@ -2263,11 -2293,6 +2294,6 @@@ static noinline int relink_extent_backr
                        return 0;
                return PTR_ERR(root);
        }
-       if (btrfs_root_refs(&root->root_item) == 0) {
-               srcu_read_unlock(&fs_info->subvol_srcu, index);
-               /* parse ENOENT to 0 */
-               return 0;
-       }
  
        /* step 2: get inode */
        key.objectid = backref->inum;
@@@ -3215,13 -3240,16 +3241,16 @@@ int btrfs_orphan_cleanup(struct btrfs_r
                        /* 1 for the orphan item deletion. */
                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
+                               iput(inode);
                                ret = PTR_ERR(trans);
                                goto out;
                        }
                        ret = btrfs_orphan_add(trans, inode);
                        btrfs_end_transaction(trans, root);
-                       if (ret)
+                       if (ret) {
+                               iput(inode);
                                goto out;
+                       }
  
                        ret = btrfs_truncate(inode);
                        if (ret)
@@@ -3274,8 -3302,17 +3303,17 @@@ static noinline int acls_after_inode_it
  {
        u32 nritems = btrfs_header_nritems(leaf);
        struct btrfs_key found_key;
+       static u64 xattr_access = 0;
+       static u64 xattr_default = 0;
        int scanned = 0;
  
+       if (!xattr_access) {
+               xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+                                       strlen(POSIX_ACL_XATTR_ACCESS));
+               xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+                                       strlen(POSIX_ACL_XATTR_DEFAULT));
+       }
        slot++;
        while (slot < nritems) {
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                        return 0;
  
                /* we found an xattr, assume we've got an acl */
-               if (found_key.type == BTRFS_XATTR_ITEM_KEY)
-                       return 1;
+               if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+                       if (found_key.offset == xattr_access ||
+                           found_key.offset == xattr_default)
+                               return 1;
+               }
  
                /*
                 * we found a key greater than an xattr key, there can't
@@@ -3660,53 -3700,20 +3701,20 @@@ int btrfs_unlink_inode(struct btrfs_tra
        }
        return ret;
  }
-               
- /* helper to check if there is any shared block in the path */
- static int check_path_shared(struct btrfs_root *root,
-                            struct btrfs_path *path)
- {
-       struct extent_buffer *eb;
-       int level;
-       u64 refs = 1;
-       for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
-               int ret;
-               if (!path->nodes[level])
-                       break;
-               eb = path->nodes[level];
-               if (!btrfs_block_can_be_shared(root, eb))
-                       continue;
-               ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
-                                              &refs, NULL);
-               if (refs > 1)
-                       return 1;
-       }
-       return 0;
- }
  
  /*
   * helper to start transaction for unlink and rmdir.
   *
-  * unlink and rmdir are special in btrfs, they do not always free space.
-  * so in enospc case, we should make sure they will free space before
-  * allowing them to use the global metadata reservation.
+  * unlink and rmdir are special in btrfs, they do not always free space, so
+  * if we cannot make our reservations the normal way try and see if there is
+  * plenty of slack room in the global reserve to migrate, otherwise we cannot
+  * allow the unlink to occur.
   */
- static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
-                                                      struct dentry *dentry)
+ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
  {
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
-       struct btrfs_path *path;
-       struct btrfs_dir_item *di;
-       struct inode *inode = dentry->d_inode;
-       u64 index;
-       int check_link = 1;
-       int err = -ENOSPC;
        int ret;
-       u64 ino = btrfs_ino(inode);
-       u64 dir_ino = btrfs_ino(dir);
  
        /*
         * 1 for the possible orphan item
        if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
                return trans;
  
-       if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
-               return ERR_PTR(-ENOSPC);
-       /* check if there is someone else holds reference */
-       if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
-               return ERR_PTR(-ENOSPC);
-       if (atomic_read(&inode->i_count) > 2)
-               return ERR_PTR(-ENOSPC);
-       if (xchg(&root->fs_info->enospc_unlink, 1))
-               return ERR_PTR(-ENOSPC);
-       path = btrfs_alloc_path();
-       if (!path) {
-               root->fs_info->enospc_unlink = 0;
-               return ERR_PTR(-ENOMEM);
-       }
+       if (PTR_ERR(trans) == -ENOSPC) {
+               u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
  
-       /* 1 for the orphan item */
-       trans = btrfs_start_transaction(root, 1);
-       if (IS_ERR(trans)) {
-               btrfs_free_path(path);
-               root->fs_info->enospc_unlink = 0;
-               return trans;
-       }
-       path->skip_locking = 1;
-       path->search_commit_root = 1;
-       ret = btrfs_lookup_inode(trans, root, path,
-                               &BTRFS_I(dir)->location, 0);
-       if (ret < 0) {
-               err = ret;
-               goto out;
-       }
-       if (ret == 0) {
-               if (check_path_shared(root, path))
-                       goto out;
-       } else {
-               check_link = 0;
-       }
-       btrfs_release_path(path);
-       ret = btrfs_lookup_inode(trans, root, path,
-                               &BTRFS_I(inode)->location, 0);
-       if (ret < 0) {
-               err = ret;
-               goto out;
-       }
-       if (ret == 0) {
-               if (check_path_shared(root, path))
-                       goto out;
-       } else {
-               check_link = 0;
-       }
-       btrfs_release_path(path);
-       if (ret == 0 && S_ISREG(inode->i_mode)) {
-               ret = btrfs_lookup_file_extent(trans, root, path,
-                                              ino, (u64)-1, 0);
-               if (ret < 0) {
-                       err = ret;
-                       goto out;
+               trans = btrfs_start_transaction(root, 0);
+               if (IS_ERR(trans))
+                       return trans;
+               ret = btrfs_cond_migrate_bytes(root->fs_info,
+                                              &root->fs_info->trans_block_rsv,
+                                              num_bytes, 5);
+               if (ret) {
+                       btrfs_end_transaction(trans, root);
+                       return ERR_PTR(ret);
                }
-               BUG_ON(ret == 0); /* Corruption */
-               if (check_path_shared(root, path))
-                       goto out;
-               btrfs_release_path(path);
-       }
-       if (!check_link) {
-               err = 0;
-               goto out;
-       }
-       di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
-                               dentry->d_name.name, dentry->d_name.len, 0);
-       if (IS_ERR(di)) {
-               err = PTR_ERR(di);
-               goto out;
-       }
-       if (di) {
-               if (check_path_shared(root, path))
-                       goto out;
-       } else {
-               err = 0;
-               goto out;
-       }
-       btrfs_release_path(path);
-       ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
-                                       dentry->d_name.len, ino, dir_ino, 0,
-                                       &index);
-       if (ret) {
-               err = ret;
-               goto out;
-       }
-       if (check_path_shared(root, path))
-               goto out;
-       btrfs_release_path(path);
-       /*
-        * This is a commit root search, if we can lookup inode item and other
-        * relative items in the commit root, it means the transaction of
-        * dir/file creation has been committed, and the dir index item that we
-        * delay to insert has also been inserted into the commit root. So
-        * we needn't worry about the delayed insertion of the dir index item
-        * here.
-        */
-       di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
-                               dentry->d_name.name, dentry->d_name.len, 0);
-       if (IS_ERR(di)) {
-               err = PTR_ERR(di);
-               goto out;
-       }
-       BUG_ON(ret == -ENOENT);
-       if (check_path_shared(root, path))
-               goto out;
-       err = 0;
- out:
-       btrfs_free_path(path);
-       /* Migrate the orphan reservation over */
-       if (!err)
-               err = btrfs_block_rsv_migrate(trans->block_rsv,
-                               &root->fs_info->global_block_rsv,
-                               trans->bytes_reserved);
-       if (err) {
-               btrfs_end_transaction(trans, root);
-               root->fs_info->enospc_unlink = 0;
-               return ERR_PTR(err);
-       }
-       trans->block_rsv = &root->fs_info->global_block_rsv;
-       return trans;
- }
- static void __unlink_end_trans(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root)
- {
-       if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
-               btrfs_block_rsv_release(root, trans->block_rsv,
-                                       trans->bytes_reserved);
                trans->block_rsv = &root->fs_info->trans_block_rsv;
-               BUG_ON(!root->fs_info->enospc_unlink);
-               root->fs_info->enospc_unlink = 0;
+               trans->bytes_reserved = num_bytes;
        }
-       btrfs_end_transaction(trans, root);
+       return trans;
  }
  
  static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        struct inode *inode = dentry->d_inode;
        int ret;
  
-       trans = __unlink_start_trans(dir, dentry);
+       trans = __unlink_start_trans(dir);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
  
        }
  
  out:
-       __unlink_end_trans(trans, root);
+       btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
        return ret;
  }
@@@ -3995,7 -3867,7 +3868,7 @@@ static int btrfs_rmdir(struct inode *di
        if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
                return -EPERM;
  
-       trans = __unlink_start_trans(dir, dentry);
+       trans = __unlink_start_trans(dir);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
  
        if (!err)
                btrfs_i_size_write(inode, 0);
  out:
-       __unlink_end_trans(trans, root);
+       btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
  
        return err;
@@@ -4395,6 -4267,15 +4268,15 @@@ int btrfs_cont_expand(struct inode *ino
        u64 hole_size;
        int err = 0;
  
+       /*
+        * If our size started in the middle of a page we need to zero out the
+        * rest of the page before we expand the i_size, otherwise we could
+        * expose stale data.
+        */
+       err = btrfs_truncate_page(inode, oldsize, 0, 0);
+       if (err)
+               return err;
        if (size <= hole_start)
                return 0;
  
@@@ -4822,11 -4703,6 +4704,6 @@@ static int fixup_tree_root_location(str
                goto out;
        }
  
-       if (btrfs_root_refs(&new_root->root_item) == 0) {
-               err = -ENOENT;
-               goto out;
-       }
        *sub_root = new_root;
        location->objectid = btrfs_root_dirid(&new_root->root_item);
        location->type = BTRFS_INODE_ITEM_KEY;
@@@ -5092,8 -4968,10 +4969,10 @@@ struct inode *btrfs_lookup_dentry(struc
                if (!(inode->i_sb->s_flags & MS_RDONLY))
                        ret = btrfs_orphan_cleanup(sub_root);
                up_read(&root->fs_info->cleanup_work_sem);
-               if (ret)
+               if (ret) {
+                       iput(inode);
                        inode = ERR_PTR(ret);
+               }
        }
  
        return inode;
@@@ -5137,9 -5015,10 +5016,9 @@@ unsigned char btrfs_filetype_table[] = 
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
  };
  
 -static int btrfs_real_readdir(struct file *filp, void *dirent,
 -                            filldir_t filldir)
 +static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
  {
 -      struct inode *inode = file_inode(filp);
 +      struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_item *item;
        struct btrfs_dir_item *di;
        char tmp_name[32];
        char *name_ptr;
        int name_len;
 -      int is_curr = 0;        /* filp->f_pos points to the current index? */
 +      int is_curr = 0;        /* ctx->pos points to the current index? */
  
        /* FIXME, use a real flag for deciding about the key type */
        if (root->fs_info->tree_root == root)
                key_type = BTRFS_DIR_ITEM_KEY;
  
 -      /* special case for "." */
 -      if (filp->f_pos == 0) {
 -              over = filldir(dirent, ".", 1,
 -                             filp->f_pos, btrfs_ino(inode), DT_DIR);
 -              if (over)
 -                      return 0;
 -              filp->f_pos = 1;
 -      }
 -      /* special case for .., just use the back ref */
 -      if (filp->f_pos == 1) {
 -              u64 pino = parent_ino(filp->f_path.dentry);
 -              over = filldir(dirent, "..", 2,
 -                             filp->f_pos, pino, DT_DIR);
 -              if (over)
 -                      return 0;
 -              filp->f_pos = 2;
 -      }
 +      if (!dir_emit_dots(file, ctx))
 +              return 0;
 +
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        }
  
        btrfs_set_key_type(&key, key_type);
 -      key.offset = filp->f_pos;
 +      key.offset = ctx->pos;
        key.objectid = btrfs_ino(inode);
  
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                        break;
                if (btrfs_key_type(&found_key) != key_type)
                        break;
 -              if (found_key.offset < filp->f_pos)
 +              if (found_key.offset < ctx->pos)
                        goto next;
                if (key_type == BTRFS_DIR_INDEX_KEY &&
                    btrfs_should_delete_dir_index(&del_list,
                                                  found_key.offset))
                        goto next;
  
 -              filp->f_pos = found_key.offset;
 +              ctx->pos = found_key.offset;
                is_curr = 1;
  
                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
                                over = 0;
                                goto skip;
                        }
 -                      over = filldir(dirent, name_ptr, name_len,
 -                                     found_key.offset, location.objectid,
 -                                     d_type);
 +                      over = !dir_emit(ctx, name_ptr, name_len,
 +                                     location.objectid, d_type);
  
  skip:
                        if (name_ptr != tmp_name)
@@@ -5279,8 -5173,9 +5158,8 @@@ next
  
        if (key_type == BTRFS_DIR_INDEX_KEY) {
                if (is_curr)
 -                      filp->f_pos++;
 -              ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
 -                                                    &ins_list);
 +                      ctx->pos++;
 +              ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
                if (ret)
                        goto nopos;
        }
                 * 32-bit glibc will use getdents64, but then strtol -
                 * so the last number we can serve is this.
                 */
 -              filp->f_pos = 0x7fffffff;
 +              ctx->pos = 0x7fffffff;
        else
 -              filp->f_pos++;
 +              ctx->pos++;
  nopos:
        ret = 0;
  err:
   * returns 1 when the nocow is safe, < 1 on error, 0 if the
   * block must be cow'd
   */
- static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
-                                     struct inode *inode, u64 offset, u64 *len,
-                                     u64 *orig_start, u64 *orig_block_len,
-                                     u64 *ram_bytes)
+ noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+                             struct inode *inode, u64 offset, u64 *len,
+                             u64 *orig_start, u64 *orig_block_len,
+                             u64 *ram_bytes)
  {
        struct btrfs_path *path;
        int ret;
        u64 num_bytes;
        int slot;
        int found_type;
+       bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
                /* not a regular extent, must cow */
                goto out;
        }
+       if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+               goto out;
        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+       if (disk_bytenr == 0)
+               goto out;
+       if (btrfs_file_extent_compression(leaf, fi) ||
+           btrfs_file_extent_encryption(leaf, fi) ||
+           btrfs_file_extent_other_encoding(leaf, fi))
+               goto out;
        backref_offset = btrfs_file_extent_offset(leaf, fi);
  
-       *orig_start = key.offset - backref_offset;
-       *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
-       *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+       if (orig_start) {
+               *orig_start = key.offset - backref_offset;
+               *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+               *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+       }
  
        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-       if (extent_end < offset + *len) {
-               /* extent doesn't include our full range, must cow */
-               goto out;
-       }
  
        if (btrfs_extent_readonly(root, disk_bytenr))
                goto out;
@@@ -6813,8 -6718,8 +6702,8 @@@ static int btrfs_get_blocks_direct(stru
                if (IS_ERR(trans))
                        goto must_cow;
  
-               if (can_nocow_odirect(trans, inode, start, &len, &orig_start,
-                                     &orig_block_len, &ram_bytes) == 1) {
+               if (can_nocow_extent(trans, inode, start, &len, &orig_start,
+                                    &orig_block_len, &ram_bytes) == 1) {
                        if (type == BTRFS_ORDERED_PREALLOC) {
                                free_extent_map(em);
                                em = create_pinned_em(inode, start, len,
@@@ -7243,7 -7148,6 +7132,6 @@@ static void btrfs_submit_direct(int rw
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_dio_private *dip;
-       struct bio_vec *bvec = dio_bio->bi_io_vec;
        struct bio *io_bio;
        int skip_sum;
        int write = rw & REQ_WRITE;
        }
  
        dip->private = dio_bio->bi_private;
-       io_bio->bi_private = dio_bio->bi_private;
        dip->inode = inode;
        dip->logical_offset = file_offset;
-       dip->bytes = 0;
-       do {
-               dip->bytes += bvec->bv_len;
-               bvec++;
-       } while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
+       dip->bytes = dio_bio->bi_size;
        dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
        io_bio->bi_private = dip;
        dip->errors = 0;
@@@ -7373,8 -7270,16 +7254,16 @@@ static ssize_t btrfs_direct_IO(int rw, 
        atomic_inc(&inode->i_dio_count);
        smp_mb__after_atomic_inc();
  
+       /*
+        * The generic stuff only does filemap_write_and_wait_range, which isn't
+        * enough if we've written compressed pages to this area, so we need to
+        * call btrfs_wait_ordered_range to make absolutely sure that any
+        * outstanding dirty pages are on disk.
+        */
+       count = iov_length(iov, nr_segs);
+       btrfs_wait_ordered_range(inode, offset, count);
        if (rw & WRITE) {
-               count = iov_length(iov, nr_segs);
                /*
                 * If the write DIO is beyond the EOF, we need update
                 * the isize, but it is protected by i_mutex. So we can
@@@ -7493,8 -7398,7 +7382,8 @@@ static int btrfs_releasepage(struct pag
        return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
  }
  
 -static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 +static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 +                               unsigned int length)
  {
        struct inode *inode = page->mapping->host;
        struct extent_io_tree *tree;
@@@ -7694,16 -7598,12 +7583,12 @@@ static int btrfs_truncate(struct inode 
  {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_block_rsv *rsv;
-       int ret;
+       int ret = 0;
        int err = 0;
        struct btrfs_trans_handle *trans;
        u64 mask = root->sectorsize - 1;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
  
-       ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
-       if (ret)
-               return ret;
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
  
@@@ -7961,9 -7861,9 +7846,9 @@@ void btrfs_destroy_inode(struct inode *
         */
        smp_mb();
        if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-               spin_lock(&root->fs_info->ordered_extent_lock);
+               spin_lock(&root->fs_info->ordered_root_lock);
                list_del_init(&BTRFS_I(inode)->ordered_operations);
-               spin_unlock(&root->fs_info->ordered_extent_lock);
+               spin_unlock(&root->fs_info->ordered_root_lock);
        }
  
        if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@@ -8333,7 -8233,7 +8218,7 @@@ void btrfs_wait_and_free_delalloc_work(
   * some fairly slow code that needs optimization. This walks the list
   * of all the inodes with pending delalloc and forces them to disk.
   */
- int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
  {
        struct btrfs_inode *binode;
        struct inode *inode;
        struct list_head splice;
        int ret = 0;
  
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
        INIT_LIST_HEAD(&works);
        INIT_LIST_HEAD(&splice);
  
-       spin_lock(&root->fs_info->delalloc_lock);
-       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       spin_lock(&root->delalloc_lock);
+       list_splice_init(&root->delalloc_inodes, &splice);
        while (!list_empty(&splice)) {
                binode = list_entry(splice.next, struct btrfs_inode,
                                    delalloc_inodes);
  
-               list_del_init(&binode->delalloc_inodes);
+               list_move_tail(&binode->delalloc_inodes,
+                              &root->delalloc_inodes);
                inode = igrab(&binode->vfs_inode);
                if (!inode) {
-                       clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-                                 &binode->runtime_flags);
+                       cond_resched_lock(&root->delalloc_lock);
                        continue;
                }
-               list_add_tail(&binode->delalloc_inodes,
-                             &root->fs_info->delalloc_inodes);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&root->delalloc_lock);
  
                work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
                if (unlikely(!work)) {
                                   &work->work);
  
                cond_resched();
-               spin_lock(&root->fs_info->delalloc_lock);
+               spin_lock(&root->delalloc_lock);
        }
-       spin_unlock(&root->fs_info->delalloc_lock);
+       spin_unlock(&root->delalloc_lock);
  
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
                btrfs_wait_and_free_delalloc_work(work);
        }
+       return 0;
+ out:
+       list_for_each_entry_safe(work, next, &works, list) {
+               list_del_init(&work->list);
+               btrfs_wait_and_free_delalloc_work(work);
+       }
+       if (!list_empty_careful(&splice)) {
+               spin_lock(&root->delalloc_lock);
+               list_splice_tail(&splice, &root->delalloc_inodes);
+               spin_unlock(&root->delalloc_lock);
+       }
+       return ret;
+ }
+ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+ {
+       int ret;
  
-       /* the filemap_flush will queue IO into the worker threads, but
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       ret = __start_delalloc_inodes(root, delay_iput);
+       /*
+        * the filemap_flush will queue IO into the worker threads, but
         * we have to make sure the IO is actually started and that
         * ordered extents get created before we return
         */
                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
        }
        atomic_dec(&root->fs_info->async_submit_draining);
-       return 0;
- out:
-       list_for_each_entry_safe(work, next, &works, list) {
-               list_del_init(&work->list);
-               btrfs_wait_and_free_delalloc_work(work);
+       return ret;
+ }
+ int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+                                   int delay_iput)
+ {
+       struct btrfs_root *root;
+       struct list_head splice;
+       int ret;
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       INIT_LIST_HEAD(&splice);
+       spin_lock(&fs_info->delalloc_root_lock);
+       list_splice_init(&fs_info->delalloc_roots, &splice);
+       while (!list_empty(&splice)) {
+               root = list_first_entry(&splice, struct btrfs_root,
+                                       delalloc_root);
+               root = btrfs_grab_fs_root(root);
+               BUG_ON(!root);
+               list_move_tail(&root->delalloc_root,
+                              &fs_info->delalloc_roots);
+               spin_unlock(&fs_info->delalloc_root_lock);
+               ret = __start_delalloc_inodes(root, delay_iput);
+               btrfs_put_fs_root(root);
+               if (ret)
+                       goto out;
+               spin_lock(&fs_info->delalloc_root_lock);
        }
+       spin_unlock(&fs_info->delalloc_root_lock);
  
+       atomic_inc(&fs_info->async_submit_draining);
+       while (atomic_read(&fs_info->nr_async_submits) ||
+             atomic_read(&fs_info->async_delalloc_pages)) {
+               wait_event(fs_info->async_submit_wait,
+                  (atomic_read(&fs_info->nr_async_submits) == 0 &&
+                   atomic_read(&fs_info->async_delalloc_pages) == 0));
+       }
+       atomic_dec(&fs_info->async_submit_draining);
+       return 0;
+ out:
        if (!list_empty_careful(&splice)) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_lock(&fs_info->delalloc_root_lock);
+               list_splice_tail(&splice, &fs_info->delalloc_roots);
+               spin_unlock(&fs_info->delalloc_root_lock);
        }
        return ret;
  }
@@@ -8715,7 -8669,7 +8654,7 @@@ static const struct inode_operations bt
  static const struct file_operations btrfs_dir_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
 -      .readdir        = btrfs_real_readdir,
 +      .iterate        = btrfs_real_readdir,
        .unlocked_ioctl = btrfs_ioctl,
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_ioctl,
diff --combined fs/btrfs/ioctl.c
index cd7e96c73cb71df0589f1866346ab5ff2714eb96,0e17a30f39a2f38798394fc4bca1e73c7ee1875e..238a05545ee2230629fc850191f348b94cadd8cf
@@@ -555,6 -555,12 +555,12 @@@ static int create_snapshot(struct btrfs
        if (!root->ref_cows)
                return -EINVAL;
  
+       ret = btrfs_start_delalloc_inodes(root, 0);
+       if (ret)
+               return ret;
+       btrfs_wait_ordered_extents(root, 0);
        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
        if (!pending_snapshot)
                return -ENOMEM;
@@@ -2354,14 -2360,6 +2360,6 @@@ static long btrfs_ioctl_rm_dev(struct f
        if (ret)
                return ret;
  
-       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
-                       1)) {
-               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-               mnt_drop_write_file(file);
-               return -EINVAL;
-       }
-       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
        }
  
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-       ret = btrfs_rm_device(root, vol_args->name);
  
-       kfree(vol_args);
- out:
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+               goto out;
+       }
+       mutex_lock(&root->fs_info->volume_mutex);
+       ret = btrfs_rm_device(root, vol_args->name);
        mutex_unlock(&root->fs_info->volume_mutex);
        atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ out:
+       kfree(vol_args);
        mnt_drop_write_file(file);
        return ret;
  }
@@@ -2480,6 -2486,7 +2486,7 @@@ static noinline long btrfs_ioctl_clone(
        int ret;
        u64 len = olen;
        u64 bs = root->fs_info->sb->s_blocksize;
+       int same_inode = 0;
  
        /*
         * TODO:
  
        ret = -EINVAL;
        if (src == inode)
-               goto out_fput;
+               same_inode = 1;
  
        /* the src must be open for reading */
        if (!(src_file.file->f_mode & FMODE_READ))
        }
        path->reada = 2;
  
-       if (inode < src) {
-               mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-               mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+       if (!same_inode) {
+               if (inode < src) {
+                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+                       mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+               } else {
+                       mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+                       mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+               }
        } else {
-               mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-               mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+               mutex_lock(&src->i_mutex);
        }
  
        /* determine range to clone */
            !IS_ALIGNED(destoff, bs))
                goto out_unlock;
  
+       /* verify if ranges are overlapped within the same file */
+       if (same_inode) {
+               if (destoff + len > off && destoff < off + len)
+                       goto out_unlock;
+       }
        if (destoff > inode->i_size) {
                ret = btrfs_cont_expand(inode, inode->i_size, destoff);
                if (ret)
@@@ -2846,7 -2863,8 +2863,8 @@@ out
        unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
  out_unlock:
        mutex_unlock(&src->i_mutex);
-       mutex_unlock(&inode->i_mutex);
+       if (!same_inode)
+               mutex_unlock(&inode->i_mutex);
        vfree(buf);
        btrfs_free_path(path);
  out_fput:
@@@ -2951,11 -2969,6 +2969,6 @@@ static long btrfs_ioctl_default_subvol(
                goto out;
        }
  
-       if (btrfs_root_refs(&new_root->root_item) == 0) {
-               ret = -ENOENT;
-               goto out;
-       }
        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
@@@ -3719,9 -3732,6 +3732,6 @@@ static long btrfs_ioctl_quota_ctl(struc
                break;
        }
  
-       if (copy_to_user(arg, sa, sizeof(*sa)))
-               ret = -EFAULT;
        err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
        if (err && !ret)
                ret = err;
@@@ -3881,7 -3891,7 +3891,7 @@@ drop_write
  
  static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 +      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
        struct btrfs_ioctl_quota_rescan_args *qsa;
        int ret;
  
@@@ -3914,7 -3924,7 +3924,7 @@@ drop_write
  
  static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 +      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
        struct btrfs_ioctl_quota_rescan_args *qsa;
        int ret = 0;
  
        return ret;
  }
  
+ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+ {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       return btrfs_qgroup_wait_for_completion(root->fs_info);
+ }
  static long btrfs_ioctl_set_received_subvol(struct file *file,
                                            void __user *arg)
  {
@@@ -4020,7 -4040,7 +4040,7 @@@ out
  
  static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 +      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
        const char *label = root->fs_info->super_copy->label;
        size_t len = strnlen(label, BTRFS_LABEL_SIZE);
        int ret;
  
  static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 +      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
        struct btrfs_super_block *super_block = root->fs_info->super_copy;
        struct btrfs_trans_handle *trans;
        char label[BTRFS_LABEL_SIZE];
@@@ -4179,6 -4199,8 +4199,8 @@@ long btrfs_ioctl(struct file *file, uns
                return btrfs_ioctl_quota_rescan(file, argp);
        case BTRFS_IOC_QUOTA_RESCAN_STATUS:
                return btrfs_ioctl_quota_rescan_status(file, argp);
+       case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+               return btrfs_ioctl_quota_rescan_wait(file, argp);
        case BTRFS_IOC_DEV_REPLACE:
                return btrfs_ioctl_dev_replace(root, argp);
        case BTRFS_IOC_GET_FSLABEL:
diff --combined fs/btrfs/volumes.c
index 8bffb9174afba04d8375b96f754256b68ff9b4ef,b2d1eacc07c99684f3611b7dc7084b7d2cfdc1d2..78b871753cb61e099abdfca27a0e316c37c329ee
@@@ -982,6 -982,35 +982,35 @@@ out
        return ret;
  }
  
+ static int contains_pending_extent(struct btrfs_trans_handle *trans,
+                                  struct btrfs_device *device,
+                                  u64 *start, u64 len)
+ {
+       struct extent_map *em;
+       int ret = 0;
+       list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+               struct map_lookup *map;
+               int i;
+               map = (struct map_lookup *)em->bdev;
+               for (i = 0; i < map->num_stripes; i++) {
+                       if (map->stripes[i].dev != device)
+                               continue;
+                       if (map->stripes[i].physical >= *start + len ||
+                           map->stripes[i].physical + em->orig_block_len <=
+                           *start)
+                               continue;
+                       *start = map->stripes[i].physical +
+                               em->orig_block_len;
+                       ret = 1;
+               }
+       }
+       return ret;
+ }
  /*
   * find_free_dev_extent - find free space in the specified device
   * @device:   the device which we search the free space in
   * But if we don't find suitable free space, it is used to store the size of
   * the max free space.
   */
- int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+ int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                        struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
  {
        struct btrfs_key key;
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
  
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+ again:
        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;
  
        if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
                ret = -ENOSPC;
-               goto error;
+               goto out;
        }
  
-       path = btrfs_alloc_path();
-       if (!path) {
-               ret = -ENOMEM;
-               goto error;
-       }
        path->reada = 2;
+       path->search_commit_root = 1;
+       path->skip_locking = 1;
  
        key.objectid = device->devid;
        key.offset = search_start;
                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;
  
+                       /*
+                        * Have to check before we set max_hole_start, otherwise
+                        * we could end up sending back this offset anyway.
+                        */
+                       if (contains_pending_extent(trans, device,
+                                                   &search_start,
+                                                   hole_size))
+                               hole_size = 0;
                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
@@@ -1124,6 -1164,11 +1164,11 @@@ next
                max_hole_size = hole_size;
        }
  
+       if (contains_pending_extent(trans, device, &search_start, hole_size)) {
+               btrfs_release_path(path);
+               goto again;
+       }
        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
  
  out:
        btrfs_free_path(path);
- error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
  }
  
- static noinline int find_next_chunk(struct btrfs_root *root,
-                                   u64 objectid, u64 *offset)
+ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_path *path;
-       int ret;
-       struct btrfs_key key;
-       struct btrfs_chunk *chunk;
-       struct btrfs_key found_key;
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-       key.objectid = objectid;
-       key.offset = (u64)-1;
-       key.type = BTRFS_CHUNK_ITEM_KEY;
-       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-       if (ret < 0)
-               goto error;
-       BUG_ON(ret == 0); /* Corruption */
+       struct extent_map_tree *em_tree;
+       struct extent_map *em;
+       struct rb_node *n;
+       u64 ret = 0;
  
-       ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
-       if (ret) {
-               *offset = 0;
-       } else {
-               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                     path->slots[0]);
-               if (found_key.objectid != objectid)
-                       *offset = 0;
-               else {
-                       chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
-                                              struct btrfs_chunk);
-                       *offset = found_key.offset +
-                               btrfs_chunk_length(path->nodes[0], chunk);
-               }
+       em_tree = &fs_info->mapping_tree.map_tree;
+       read_lock(&em_tree->lock);
+       n = rb_last(&em_tree->map);
+       if (n) {
+               em = rb_entry(n, struct extent_map, rb_node);
+               ret = em->start + em->len;
        }
-       ret = 0;
- error:
-       btrfs_free_path(path);
+       read_unlock(&em_tree->lock);
        return ret;
  }
  
@@@ -1462,31 -1481,23 +1481,23 @@@ int btrfs_rm_device(struct btrfs_root *
        btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
  
        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
-               printk(KERN_ERR "btrfs: unable to go below four devices "
-                      "on raid10\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
                goto out;
        }
  
        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
-               printk(KERN_ERR "btrfs: unable to go below two "
-                      "devices on raid1\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
                goto out;
        }
  
        if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
            root->fs_info->fs_devices->rw_devices <= 2) {
-               printk(KERN_ERR "btrfs: unable to go below two "
-                      "devices on raid5\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
                goto out;
        }
        if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
            root->fs_info->fs_devices->rw_devices <= 3) {
-               printk(KERN_ERR "btrfs: unable to go below three "
-                      "devices on raid6\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
                goto out;
        }
  
                bh = NULL;
                disk_super = NULL;
                if (!device) {
-                       printk(KERN_ERR "btrfs: no missing devices found to "
-                              "remove\n");
+                       ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
                        goto out;
                }
        } else {
        }
  
        if (device->is_tgtdev_for_dev_replace) {
-               pr_err("btrfs: unable to remove the dev_replace target dev\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_TGT_REPLACE;
                goto error_brelse;
        }
  
        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-               printk(KERN_ERR "btrfs: unable to remove the only writeable "
-                      "device\n");
-               ret = -EINVAL;
+               ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
                goto error_brelse;
        }
  
@@@ -3295,10 -3302,7 +3302,7 @@@ int btrfs_resume_balance_async(struct b
        }
  
        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
-       if (IS_ERR(tsk))
-               return PTR_ERR(tsk);
-       return 0;
+       return PTR_RET(tsk);
  }
  
  int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@@ -3681,10 -3685,8 +3685,8 @@@ static void check_raid56_incompat_flag(
  }
  
  static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *extent_root,
-                              struct map_lookup **map_ret,
-                              u64 *num_bytes_out, u64 *stripe_size_out,
-                              u64 start, u64 type)
+                              struct btrfs_root *extent_root, u64 start,
+                              u64 type)
  {
        struct btrfs_fs_info *info = extent_root->fs_info;
        struct btrfs_fs_devices *fs_devices = info->fs_devices;
                if (total_avail == 0)
                        continue;
  
-               ret = find_free_dev_extent(device,
+               ret = find_free_dev_extent(trans, device,
                                           max_stripe_size * dev_stripes,
                                           &dev_offset, &max_avail);
                if (ret && ret != -ENOSPC)
        map->type = type;
        map->sub_stripes = sub_stripes;
  
-       *map_ret = map;
        num_bytes = stripe_size * data_stripes;
  
-       *stripe_size_out = stripe_size;
-       *num_bytes_out = num_bytes;
        trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
  
        em = alloc_extent_map();
        em->len = num_bytes;
        em->block_start = 0;
        em->block_len = em->len;
+       em->orig_block_len = stripe_size;
  
        em_tree = &extent_root->fs_info->mapping_tree.map_tree;
        write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em, 0);
+       if (!ret) {
+               list_add_tail(&em->list, &trans->transaction->pending_chunks);
+               atomic_inc(&em->refs);
+       }
        write_unlock(&em_tree->lock);
        if (ret) {
                free_extent_map(em);
                goto error;
        }
  
-       for (i = 0; i < map->num_stripes; ++i) {
-               struct btrfs_device *device;
-               u64 dev_offset;
-               device = map->stripes[i].dev;
-               dev_offset = map->stripes[i].physical;
-               ret = btrfs_alloc_dev_extent(trans, device,
-                               info->chunk_root->root_key.objectid,
-                               BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-                               start, dev_offset, stripe_size);
-               if (ret)
-                       goto error_dev_extent;
-       }
        ret = btrfs_make_block_group(trans, extent_root, 0, type,
                                     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
                                     start, num_bytes);
-       if (ret) {
-               i = map->num_stripes - 1;
-               goto error_dev_extent;
-       }
+       if (ret)
+               goto error_del_extent;
  
        free_extent_map(em);
        check_raid56_incompat_flag(extent_root->fs_info, type);
        kfree(devices_info);
        return 0;
  
- error_dev_extent:
-       for (; i >= 0; i--) {
-               struct btrfs_device *device;
-               int err;
-               device = map->stripes[i].dev;
-               err = btrfs_free_dev_extent(trans, device, start);
-               if (err) {
-                       btrfs_abort_transaction(trans, extent_root, err);
-                       break;
-               }
-       }
+ error_del_extent:
        write_lock(&em_tree->lock);
        remove_extent_mapping(em_tree, em);
        write_unlock(&em_tree->lock);
@@@ -3986,33 -3961,68 +3961,68 @@@ error
        return ret;
  }
  
- static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
                                struct btrfs_root *extent_root,
-                               struct map_lookup *map, u64 chunk_offset,
-                               u64 chunk_size, u64 stripe_size)
+                               u64 chunk_offset, u64 chunk_size)
  {
-       u64 dev_offset;
        struct btrfs_key key;
        struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
        struct btrfs_device *device;
        struct btrfs_chunk *chunk;
        struct btrfs_stripe *stripe;
-       size_t item_size = btrfs_chunk_item_size(map->num_stripes);
-       int index = 0;
+       struct extent_map_tree *em_tree;
+       struct extent_map *em;
+       struct map_lookup *map;
+       size_t item_size;
+       u64 dev_offset;
+       u64 stripe_size;
+       int i = 0;
        int ret;
  
+       em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+       read_unlock(&em_tree->lock);
+       if (!em) {
+               btrfs_crit(extent_root->fs_info, "unable to find logical "
+                          "%Lu len %Lu", chunk_offset, chunk_size);
+               return -EINVAL;
+       }
+       if (em->start != chunk_offset || em->len != chunk_size) {
+               btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+                         " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+                         chunk_size, em->start, em->len);
+               free_extent_map(em);
+               return -EINVAL;
+       }
+       map = (struct map_lookup *)em->bdev;
+       item_size = btrfs_chunk_item_size(map->num_stripes);
+       stripe_size = em->orig_block_len;
        chunk = kzalloc(item_size, GFP_NOFS);
-       if (!chunk)
-               return -ENOMEM;
+       if (!chunk) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       for (i = 0; i < map->num_stripes; i++) {
+               device = map->stripes[i].dev;
+               dev_offset = map->stripes[i].physical;
  
-       index = 0;
-       while (index < map->num_stripes) {
-               device = map->stripes[index].dev;
                device->bytes_used += stripe_size;
                ret = btrfs_update_device(trans, device);
                if (ret)
-                       goto out_free;
-               index++;
+                       goto out;
+               ret = btrfs_alloc_dev_extent(trans, device,
+                                            chunk_root->root_key.objectid,
+                                            BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+                                            chunk_offset, dev_offset,
+                                            stripe_size);
+               if (ret)
+                       goto out;
        }
  
        spin_lock(&extent_root->fs_info->free_chunk_lock);
                                                   map->num_stripes);
        spin_unlock(&extent_root->fs_info->free_chunk_lock);
  
-       index = 0;
        stripe = &chunk->stripe;
-       while (index < map->num_stripes) {
-               device = map->stripes[index].dev;
-               dev_offset = map->stripes[index].physical;
+       for (i = 0; i < map->num_stripes; i++) {
+               device = map->stripes[i].dev;
+               dev_offset = map->stripes[i].physical;
  
                btrfs_set_stack_stripe_devid(stripe, device->devid);
                btrfs_set_stack_stripe_offset(stripe, dev_offset);
                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
                stripe++;
-               index++;
        }
  
        btrfs_set_stack_chunk_length(chunk, chunk_size);
        key.offset = chunk_offset;
  
        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
        if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                /*
                 * TODO: Cleanup of inserted chunk root in case of
                                             item_size);
        }
  
- out_free:
+ out:
        kfree(chunk);
+       free_extent_map(em);
        return ret;
  }
  
@@@ -4074,27 -4082,9 +4082,9 @@@ int btrfs_alloc_chunk(struct btrfs_tran
                      struct btrfs_root *extent_root, u64 type)
  {
        u64 chunk_offset;
-       u64 chunk_size;
-       u64 stripe_size;
-       struct map_lookup *map;
-       struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
-       int ret;
-       ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-                             &chunk_offset);
-       if (ret)
-               return ret;
  
-       ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
-                                 &stripe_size, chunk_offset, type);
-       if (ret)
-               return ret;
-       ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
-                                  chunk_size, stripe_size);
-       if (ret)
-               return ret;
-       return 0;
+       chunk_offset = find_next_chunk(extent_root->fs_info);
+       return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
  }
  
  static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
  {
        u64 chunk_offset;
        u64 sys_chunk_offset;
-       u64 chunk_size;
-       u64 sys_chunk_size;
-       u64 stripe_size;
-       u64 sys_stripe_size;
        u64 alloc_profile;
-       struct map_lookup *map;
-       struct map_lookup *sys_map;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *extent_root = fs_info->extent_root;
        int ret;
  
-       ret = find_next_chunk(fs_info->chunk_root,
-                             BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
-       if (ret)
-               return ret;
+       chunk_offset = find_next_chunk(fs_info);
        alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
-       ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
-                                 &stripe_size, chunk_offset, alloc_profile);
+       ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+                                 alloc_profile);
        if (ret)
                return ret;
  
-       sys_chunk_offset = chunk_offset + chunk_size;
+       sys_chunk_offset = find_next_chunk(root->fs_info);
        alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
-       ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
-                                 &sys_chunk_size, &sys_stripe_size,
-                                 sys_chunk_offset, alloc_profile);
+       ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+                                 alloc_profile);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                goto out;
        }
  
        ret = btrfs_add_device(trans, fs_info->chunk_root, device);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               goto out;
-       }
-       /*
-        * Modifying chunk tree needs allocating new blocks from both
-        * system block group and metadata block group. So we only can
-        * do operations require modifying the chunk tree after both
-        * block groups were created.
-        */
-       ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
-                                  chunk_size, stripe_size);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               goto out;
-       }
-       ret = __finish_chunk_alloc(trans, extent_root, sys_map,
-                                  sys_chunk_offset, sys_chunk_size,
-                                  sys_stripe_size);
        if (ret)
                btrfs_abort_transaction(trans, root, ret);
  out:
        return ret;
  }
  
@@@ -4435,9 -4390,6 +4390,6 @@@ static int __btrfs_map_block(struct btr
        map = (struct map_lookup *)em->bdev;
        offset = logical - em->start;
  
-       if (mirror_num > map->num_stripes)
-               mirror_num = 0;
        stripe_len = map->stripe_len;
        stripe_nr = offset;
        /*
@@@ -5164,7 -5116,7 +5116,7 @@@ static int bio_size_ok(struct block_dev
        }
  
        prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
 -      if ((bio->bi_size >> 9) > max_sectors)
 +      if (bio_sectors(bio) > max_sectors)
                return 0;
  
        if (!q->merge_bvec_fn)
@@@ -5367,7 -5319,6 +5319,6 @@@ static struct btrfs_device *add_missing
                return NULL;
        list_add(&device->dev_list,
                 &fs_devices->devices);
-       device->dev_root = root->fs_info->dev_root;
        device->devid = devid;
        device->work.func = pending_bios_fn;
        device->fs_devices = fs_devices;
@@@ -5593,7 -5544,6 +5544,6 @@@ static int read_one_dev(struct btrfs_ro
        }
  
        fill_device_from_item(leaf, dev_item, device);
-       device->dev_root = root->fs_info->dev_root;
        device->in_fs_metadata = 1;
        if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
@@@ -5751,6 -5701,17 +5701,17 @@@ error
        return ret;
  }
  
+ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+ {
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list)
+               device->dev_root = fs_info->dev_root;
+       mutex_unlock(&fs_devices->device_list_mutex);
+ }
  static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
  {
        int i;