Btrfs: mark delayed refs as for cow

[karo-tx-linux.git] / fs / btrfs / extent-tree.c
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index 782eb3ea8edfe7ac99a585129a7c0bdaf817bf05..dc8b9a834596470522f02b070039543cf6ead0a8 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                              struct btrfs_root *root,
                              int load_cache_only)
  {
+       DEFINE_WAIT(wait);
         struct btrfs_fs_info *fs_info = cache->fs_info;
         struct btrfs_caching_control *caching_ctl;
         int ret = 0;
  
-       smp_mb();
-       if (cache->cached != BTRFS_CACHE_NO)
+       caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
+       BUG_ON(!caching_ctl);
+
+       INIT_LIST_HEAD(&caching_ctl->list);
+       mutex_init(&caching_ctl->mutex);
+       init_waitqueue_head(&caching_ctl->wait);
+       caching_ctl->block_group = cache;
+       caching_ctl->progress = cache->key.objectid;
+       atomic_set(&caching_ctl->count, 1);
+       caching_ctl->work.func = caching_thread;
+
+       spin_lock(&cache->lock);
+       /*
+        * This should be a rare occasion, but this could happen I think in the
+        * case where one thread starts to load the space cache info, and then
+        * some other thread starts a transaction commit which tries to do an
+        * allocation while the other thread is still loading the space cache
+        * info.  The previous loop should have kept us from choosing this block
+        * group, but if we've moved to the state where we will wait on caching
+        * block groups we need to first check if we're doing a fast load here,
+        * so we can wait for it to finish, otherwise we could end up allocating
+        * from a block group whose cache gets evicted for one reason or
+        * another.
+        */
+       while (cache->cached == BTRFS_CACHE_FAST) {
+               struct btrfs_caching_control *ctl;
+
+               ctl = cache->caching_ctl;
+               atomic_inc(&ctl->count);
+               prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+               spin_unlock(&cache->lock);
+
+               schedule();
+
+               finish_wait(&ctl->wait, &wait);
+               put_caching_control(ctl);
+               spin_lock(&cache->lock);
+       }
+
+       if (cache->cached != BTRFS_CACHE_NO) {
+               spin_unlock(&cache->lock);
+               kfree(caching_ctl);
                 return 0;
+       }
+       WARN_ON(cache->caching_ctl);
+       cache->caching_ctl = caching_ctl;
+       cache->cached = BTRFS_CACHE_FAST;
+       spin_unlock(&cache->lock);
  
         /*
          * We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
         if (trans && (!trans->transaction->in_commit) &&
             (root && root != root->fs_info->tree_root) &&
             btrfs_test_opt(root, SPACE_CACHE)) {
-               spin_lock(&cache->lock);
-               if (cache->cached != BTRFS_CACHE_NO) {
-                       spin_unlock(&cache->lock);
-                       return 0;
-               }
-               cache->cached = BTRFS_CACHE_STARTED;
-               spin_unlock(&cache->lock);
-
                 ret = load_free_space_cache(fs_info, cache);
  
                 spin_lock(&cache->lock);
                 if (ret == 1) {
+                       cache->caching_ctl = NULL;
                         cache->cached = BTRFS_CACHE_FINISHED;
                         cache->last_byte_to_unpin = (u64)-1;
                 } else {
-                       cache->cached = BTRFS_CACHE_NO;
+                       if (load_cache_only) {
+                               cache->caching_ctl = NULL;
+                               cache->cached = BTRFS_CACHE_NO;
+                       } else {
+                               cache->cached = BTRFS_CACHE_STARTED;
+                       }
                 }
                 spin_unlock(&cache->lock);
+               wake_up(&caching_ctl->wait);
                 if (ret == 1) {
+                       put_caching_control(caching_ctl);
                         free_excluded_extents(fs_info->extent_root, cache);
                         return 0;
                 }
+       } else {
+               /*
+                * We are not going to do the fast caching, set cached to the
+                * appropriate value and wake up any waiters.
+                */
+               spin_lock(&cache->lock);
+               if (load_cache_only) {
+                       cache->caching_ctl = NULL;
+                       cache->cached = BTRFS_CACHE_NO;
+               } else {
+                       cache->cached = BTRFS_CACHE_STARTED;
+               }
+               spin_unlock(&cache->lock);
+               wake_up(&caching_ctl->wait);
         }
  
-       if (load_cache_only)
-               return 0;
-
-       caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-       BUG_ON(!caching_ctl);
-
-       INIT_LIST_HEAD(&caching_ctl->list);
-       mutex_init(&caching_ctl->mutex);
-       init_waitqueue_head(&caching_ctl->wait);
-       caching_ctl->block_group = cache;
-       caching_ctl->progress = cache->key.objectid;
-       /* one for caching kthread, one for caching block group list */
-       atomic_set(&caching_ctl->count, 2);
-       caching_ctl->work.func = caching_thread;
-
-       spin_lock(&cache->lock);
-       if (cache->cached != BTRFS_CACHE_NO) {
-               spin_unlock(&cache->lock);
-               kfree(caching_ctl);
+       if (load_cache_only) {
+               put_caching_control(caching_ctl);
                 return 0;
         }
-       cache->caching_ctl = caching_ctl;
-       cache->cached = BTRFS_CACHE_STARTED;
-       spin_unlock(&cache->lock);
  
         down_write(&fs_info->extent_commit_sem);
+       atomic_inc(&caching_ctl->count);
         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
         up_write(&fs_info->extent_commit_sem);
  
@@ -1788,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
  {
         int ret;
         u64 discarded_bytes = 0;
-       struct btrfs_multi_bio *multi = NULL;
+       struct btrfs_bio *bbio = NULL;
  
  
         /* Tell the block device(s) that the sectors can be discarded */
         ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
-                             bytenr, &num_bytes, &multi, 0);
+                             bytenr, &num_bytes, &bbio, 0);
         if (!ret) {
-               struct btrfs_bio_stripe *stripe = multi->stripes;
+               struct btrfs_bio_stripe *stripe = bbio->stripes;
                 int i;
  
  
-               for (i = 0; i < multi->num_stripes; i++, stripe++) {
+               for (i = 0; i < bbio->num_stripes; i++, stripe++) {
                         if (!stripe->dev->can_discard)
                                 continue;
  
@@ -1818,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                          */
                         ret = 0;
                 }
-               kfree(multi);
+               kfree(bbio);
         }
  
         if (actual_bytes)
@@ -1831,20 +1872,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
  int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset)
+                        u64 root_objectid, u64 owner, u64 offset, int for_cow)
  {
         int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
                root_objectid == BTRFS_TREE_LOG_OBJECTID);
  
         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                         parent, root_objectid, (int)owner,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
         } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                         parent, root_objectid, owner, offset,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
         }
         return ret;
  }
@@ -2364,7 +2409,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
         extent_op->update_key = 0;
         extent_op->is_data = is_data ? 1 : 0;
  
-       ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+       ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+                                         num_bytes, extent_op);
         if (ret)
                 kfree(extent_op);
         return ret;
@@ -2549,7 +2595,7 @@ out:
  static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct extent_buffer *buf,
-                          int full_backref, int inc)
+                          int full_backref, int inc, int for_cow)
  {
         u64 bytenr;
         u64 num_bytes;
@@ -2562,7 +2608,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
         int level;
         int ret = 0;
         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64);
+                           u64, u64, u64, u64, u64, u64, int);
  
         ref_root = btrfs_header_owner(buf);
         nritems = btrfs_header_nritems(buf);
@@ -2599,14 +2645,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                         key.offset -= btrfs_file_extent_offset(buf, fi);
                         ret = process_func(trans, root, bytenr, num_bytes,
                                            parent, ref_root, key.objectid,
-                                          key.offset);
+                                          key.offset, for_cow);
                         if (ret)
                                 goto fail;
                 } else {
                         bytenr = btrfs_node_blockptr(buf, i);
                         num_bytes = btrfs_level_size(root, level - 1);
                         ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, level - 1, 0);
+                                          parent, ref_root, level - 1, 0,
+                                          for_cow);
                         if (ret)
                                 goto fail;
                 }
@@ -2618,15 +2665,15 @@ fail:
  }
  
  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
  {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
  }
  
  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
  {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
  }
  
  static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3334,12 +3381,12 @@ out:
  /*
   * shrink metadata reservation for delalloc
   */
-static int shrink_delalloc(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, u64 to_reclaim,
+static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
                            bool wait_ordered)
  {
         struct btrfs_block_rsv *block_rsv;
         struct btrfs_space_info *space_info;
+       struct btrfs_trans_handle *trans;
         u64 reserved;
         u64 max_reclaim;
         u64 reclaimed = 0;
@@ -3348,6 +3395,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
         int loops = 0;
         unsigned long progress;
  
+       trans = (struct btrfs_trans_handle *)current->journal_info;
         block_rsv = &root->fs_info->delalloc_block_rsv;
         space_info = block_rsv->space_info;
  
@@ -3417,6 +3465,60 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
         return reclaimed >= to_reclaim;
  }
  
+/**
+ * may_commit_transaction - possibly commit the transaction if it's ok to
+ * @root - the root we're allocating for
+ * @bytes - the number of bytes we want to reserve
+ * @force - force the commit
+ *
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does.  Otherwise it
+ * will return -ENOSPC, or -EAGAIN if the task already holds a transaction.
+ */
+static int may_commit_transaction(struct btrfs_root *root,
+                                 struct btrfs_space_info *space_info,
+                                 u64 bytes, int force)
+{
+       struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+       struct btrfs_trans_handle *trans;
+
+       trans = (struct btrfs_trans_handle *)current->journal_info;
+       if (trans)
+               return -EAGAIN;
+
+       if (force)
+               goto commit;
+
+       /* See if there is enough pinned space to make this reservation */
+       spin_lock(&space_info->lock);
+       if (space_info->bytes_pinned >= bytes) {
+               spin_unlock(&space_info->lock);
+               goto commit;
+       }
+       spin_unlock(&space_info->lock);
+
+       /*
+        * See if there is some space in the delayed insertion reservation for
+        * this reservation.
+        */
+       if (space_info != delayed_rsv->space_info)
+               return -ENOSPC;
+
+       spin_lock(&delayed_rsv->lock);
+       if (delayed_rsv->size < bytes) {
+               spin_unlock(&delayed_rsv->lock);
+               return -ENOSPC;
+       }
+       spin_unlock(&delayed_rsv->lock);
+
+commit:
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans))
+               return -ENOSPC;
+
+       return btrfs_commit_transaction(trans, root);
+}
+
  /**
   * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
   * @root - the root we're allocating for
@@ -3436,7 +3538,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
                                   u64 orig_bytes, int flush)
  {
         struct btrfs_space_info *space_info = block_rsv->space_info;
-       struct btrfs_trans_handle *trans;
         u64 used;
         u64 num_bytes = orig_bytes;
         int retries = 0;
@@ -3445,7 +3546,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
         bool flushing = false;
         bool wait_ordered = false;
  
-       trans = (struct btrfs_trans_handle *)current->journal_info;
  again:
         ret = 0;
         spin_lock(&space_info->lock);
@@ -3461,7 +3561,7 @@ again:
                  * deadlock since we are waiting for the flusher to finish, but
                  * hold the current transaction open.
                  */
-               if (trans)
+               if (current->journal_info)
                         return -EAGAIN;
                 ret = wait_event_interruptible(space_info->wait,
                                                !space_info->flush);
@@ -3517,12 +3617,16 @@ again:
                  */
                 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
                 do_div(avail, 10);
-               if (space_info->bytes_pinned >= avail && flush && !trans &&
-                   !committed) {
+               if (space_info->bytes_pinned >= avail && flush && !committed) {
                         space_info->flush = 1;
                         flushing = true;
                         spin_unlock(&space_info->lock);
-                       goto commit;
+                       ret = may_commit_transaction(root, space_info,
+                                                    orig_bytes, 1);
+                       if (ret)
+                               goto out;
+                       committed = true;
+                       goto again;
                 }
  
                 spin_lock(&root->fs_info->free_chunk_lock);
@@ -3575,7 +3679,7 @@ again:
          * We do synchronous shrinking since we don't actually unreserve
          * metadata until after the IO is completed.
          */
-       ret = shrink_delalloc(trans, root, num_bytes, wait_ordered);
+       ret = shrink_delalloc(root, num_bytes, wait_ordered);
         if (ret < 0)
                 goto out;
  
@@ -3592,21 +3696,12 @@ again:
                 goto again;
         }
  
-       ret = -EAGAIN;
-       if (trans)
-               goto out;
-
-commit:
         ret = -ENOSPC;
         if (committed)
                 goto out;
  
-       trans = btrfs_join_transaction(root);
-       if (IS_ERR(trans))
-               goto out;
-       ret = btrfs_commit_transaction(trans, root);
+       ret = may_commit_transaction(root, space_info, orig_bytes, 0);
         if (!ret) {
-               trans = NULL;
                 committed = true;
                 goto again;
         }
@@ -3748,16 +3843,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
         kfree(rsv);
  }
  
-int btrfs_block_rsv_add(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes)
+static inline int __block_rsv_add(struct btrfs_root *root,
+                                 struct btrfs_block_rsv *block_rsv,
+                                 u64 num_bytes, int flush)
  {
         int ret;
  
         if (num_bytes == 0)
                 return 0;
  
-       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
         if (!ret) {
                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
                 return 0;
@@ -3766,6 +3861,20 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
         return ret;
  }
  
+int btrfs_block_rsv_add(struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv,
+                       u64 num_bytes)
+{
+       return __block_rsv_add(root, block_rsv, num_bytes, 1);
+}
+
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+                               struct btrfs_block_rsv *block_rsv,
+                               u64 num_bytes)
+{
+       return __block_rsv_add(root, block_rsv, num_bytes, 0);
+}
+
  int btrfs_block_rsv_check(struct btrfs_root *root,
                           struct btrfs_block_rsv *block_rsv, int min_factor)
  {
@@ -3784,9 +3893,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
         return ret;
  }
  
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-                         struct btrfs_block_rsv *block_rsv,
-                         u64 min_reserved)
+static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
+                                          struct btrfs_block_rsv *block_rsv,
+                                          u64 min_reserved, int flush)
  {
         u64 num_bytes = 0;
         int ret = -ENOSPC;
@@ -3805,7 +3914,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
         if (!ret)
                 return 0;
  
-       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+       ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
         if (!ret) {
                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
                 return 0;
@@ -3814,6 +3923,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
         return ret;
  }
  
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv,
+                          u64 min_reserved)
+{
+       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
+}
+
+int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
+                                  struct btrfs_block_rsv *block_rsv,
+                                  u64 min_reserved)
+{
+       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
+}
+
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                             struct btrfs_block_rsv *dst_rsv,
                             u64 num_bytes)
@@ -3914,6 +4037,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
         fs_info->delalloc_block_rsv.space_info = space_info;
         fs_info->trans_block_rsv.space_info = space_info;
         fs_info->empty_block_rsv.space_info = space_info;
+       fs_info->delayed_block_rsv.space_info = space_info;
  
         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3933,6 +4057,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
         WARN_ON(fs_info->chunk_block_rsv.size > 0);
         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+       WARN_ON(fs_info->delayed_block_rsv.size > 0);
+       WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
  }
  
  void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3994,23 +4120,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
   */
  static unsigned drop_outstanding_extent(struct inode *inode)
  {
+       unsigned drop_inode_space = 0;
         unsigned dropped_extents = 0;
  
         BUG_ON(!BTRFS_I(inode)->outstanding_extents);
         BTRFS_I(inode)->outstanding_extents--;
  
+       if (BTRFS_I(inode)->outstanding_extents == 0 &&
+           BTRFS_I(inode)->delalloc_meta_reserved) {
+               drop_inode_space = 1;
+               BTRFS_I(inode)->delalloc_meta_reserved = 0;
+       }
+
         /*
          * If we have more or the same amount of outsanding extents than we have
          * reserved then we need to leave the reserved extents count alone.
          */
         if (BTRFS_I(inode)->outstanding_extents >=
             BTRFS_I(inode)->reserved_extents)
-               return 0;
+               return drop_inode_space;
  
         dropped_extents = BTRFS_I(inode)->reserved_extents -
                 BTRFS_I(inode)->outstanding_extents;
         BTRFS_I(inode)->reserved_extents -= dropped_extents;
-       return dropped_extents;
+       return dropped_extents + drop_inode_space;
  }
  
  /**
@@ -4096,9 +4229,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                 nr_extents = BTRFS_I(inode)->outstanding_extents -
                         BTRFS_I(inode)->reserved_extents;
                 BTRFS_I(inode)->reserved_extents += nr_extents;
+       }
  
-               to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+       /*
+        * Add an item to reserve for updating the inode when we complete the
+        * delalloc io.
+        */
+       if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+               nr_extents++;
+               BTRFS_I(inode)->delalloc_meta_reserved = 1;
         }
+
+       to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
         spin_unlock(&BTRFS_I(inode)->lock);
  
@@ -4801,16 +4943,17 @@ out:
  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct extent_buffer *buf,
-                          u64 parent, int last_ref)
+                          u64 parent, int last_ref, int for_cow)
  {
         struct btrfs_block_group_cache *cache = NULL;
         int ret;
  
         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
-                                               parent, root->root_key.objectid,
-                                               btrfs_header_level(buf),
-                                               BTRFS_DROP_DELAYED_REF, NULL);
+               ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+                                       buf->start, buf->len,
+                                       parent, root->root_key.objectid,
+                                       btrfs_header_level(buf),
+                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
                 BUG_ON(ret);
         }
  
@@ -4845,12 +4988,12 @@ out:
         btrfs_put_block_group(cache);
  }
  
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
-                     struct btrfs_root *root,
-                     u64 bytenr, u64 num_bytes, u64 parent,
-                     u64 root_objectid, u64 owner, u64 offset)
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                     u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+                     u64 owner, u64 offset, int for_cow)
  {
         int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
  
         /*
          * tree log blocks never actually go into the extent allocation
@@ -4862,14 +5005,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
                 ret = 0;
         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                         parent, root_objectid, (int)owner,
-                                       BTRFS_DROP_DELAYED_REF, NULL);
+                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
                 BUG_ON(ret);
         } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
-                                       parent, root_objectid, owner,
-                                       offset, BTRFS_DROP_DELAYED_REF, NULL);
+               ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+                                               num_bytes,
+                                               parent, root_objectid, owner,
+                                               offset, BTRFS_DROP_DELAYED_REF,
+                                               NULL, for_cow);
                 BUG_ON(ret);
         }
         return ret;
@@ -5096,13 +5242,15 @@ search:
                 }
  
  have_block_group:
-               if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+               cached = block_group_cache_done(block_group);
+               if (unlikely(!cached)) {
                         u64 free_percent;
  
+                       found_uncached_bg = true;
                         ret = cache_block_group(block_group, trans,
                                                 orig_root, 1);
                         if (block_group->cached == BTRFS_CACHE_FINISHED)
-                               goto have_block_group;
+                               goto alloc;
  
                         free_percent = btrfs_block_group_used(&block_group->item);
                         free_percent *= 100;
@@ -5124,7 +5272,6 @@ have_block_group:
                                                         orig_root, 0);
                                 BUG_ON(ret);
                         }
-                       found_uncached_bg = true;
  
                         /*
                          * If loop is set for cached only, try the next block
@@ -5134,17 +5281,14 @@ have_block_group:
                                 goto loop;
                 }
  
-               cached = block_group_cache_done(block_group);
-               if (unlikely(!cached))
-                       found_uncached_bg = true;
-
+alloc:
                 if (unlikely(block_group->ro))
                         goto loop;
  
                 spin_lock(&block_group->free_space_ctl->tree_lock);
                 if (cached &&
                     block_group->free_space_ctl->free_space <
-                   num_bytes + empty_size) {
+                   num_bytes + empty_cluster + empty_size) {
                         spin_unlock(&block_group->free_space_ctl->tree_lock);
                         goto loop;
                 }
@@ -5165,12 +5309,10 @@ have_block_group:
                          * people trying to start a new cluster
                          */
                         spin_lock(&last_ptr->refill_lock);
-                       if (last_ptr->block_group &&
-                           (last_ptr->block_group->ro ||
-                           !block_group_bits(last_ptr->block_group, data))) {
-                               offset = 0;
+                       if (!last_ptr->block_group ||
+                           last_ptr->block_group->ro ||
+                           !block_group_bits(last_ptr->block_group, data))
                                 goto refill_cluster;
-                       }
  
                         offset = btrfs_alloc_from_cluster(block_group, last_ptr,
                                                  num_bytes, search_start);
@@ -5221,7 +5363,7 @@ refill_cluster:
                         /* allocate a cluster in this block group */
                         ret = btrfs_find_space_cluster(trans, root,
                                                block_group, last_ptr,
-                                              offset, num_bytes,
+                                              search_start, num_bytes,
                                                empty_cluster + empty_size);
                         if (ret == 0) {
                                 /*
@@ -5694,9 +5836,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  
         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
  
-       ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
-                                        0, root_objectid, owner, offset,
-                                        BTRFS_ADD_DELAYED_EXTENT, NULL);
+       ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+                                        ins->offset, 0,
+                                        root_objectid, owner, offset,
+                                        BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
         return ret;
  }
  
@@ -5866,7 +6009,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root, u32 blocksize,
                                         u64 parent, u64 root_objectid,
                                         struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size)
+                                       u64 hint, u64 empty_size, int for_cow)
  {
         struct btrfs_key ins;
         struct btrfs_block_rsv *block_rsv;
@@ -5910,10 +6053,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                 extent_op->update_flags = 1;
                 extent_op->is_data = 0;
  
-               ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+               ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+                                       ins.objectid,
                                         ins.offset, parent, root_objectid,
                                         level, BTRFS_ADD_DELAYED_EXTENT,
-                                       extent_op);
+                                       extent_op, for_cow);
                 BUG_ON(ret);
         }
         return buf;
@@ -5930,6 +6074,7 @@ struct walk_control {
         int keep_locks;
         int reada_slot;
         int reada_count;
+       int for_reloc;
  };
  
  #define DROP_REFERENCE 1
@@ -6068,9 +6213,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
         /* wc->stage == UPDATE_BACKREF */
         if (!(wc->flags[level] & flag)) {
                 BUG_ON(!path->locks[level]);
-               ret = btrfs_inc_ref(trans, root, eb, 1);
+               ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
                 BUG_ON(ret);
-               ret = btrfs_dec_ref(trans, root, eb, 0);
+               ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
                 BUG_ON(ret);
                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
                                                   eb->len, flag, 0);
@@ -6214,7 +6359,7 @@ skip:
                 }
  
                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-                                       root->root_key.objectid, level - 1, 0);
+                               root->root_key.objectid, level - 1, 0, 0);
                 BUG_ON(ret);
         }
         btrfs_tree_unlock(next);
@@ -6288,9 +6433,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
         if (wc->refs[level] == 1) {
                 if (level == 0) {
                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-                               ret = btrfs_dec_ref(trans, root, eb, 1);
+                               ret = btrfs_dec_ref(trans, root, eb, 1,
+                                                   wc->for_reloc);
                         else
-                               ret = btrfs_dec_ref(trans, root, eb, 0);
+                               ret = btrfs_dec_ref(trans, root, eb, 0,
+                                                   wc->for_reloc);
                         BUG_ON(ret);
                 }
                 /* make block locked assertion in clean_tree_block happy */
@@ -6317,7 +6464,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                                btrfs_header_owner(path->nodes[level + 1]));
         }
  
-       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
  out:
         wc->refs[level] = 0;
         wc->flags[level] = 0;
@@ -6401,7 +6548,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
   * blocks are properly updated.
   */
  void btrfs_drop_snapshot(struct btrfs_root *root,
-                        struct btrfs_block_rsv *block_rsv, int update_ref)
+                        struct btrfs_block_rsv *block_rsv, int update_ref,
+                        int for_reloc)
  {
         struct btrfs_path *path;
         struct btrfs_trans_handle *trans;
@@ -6489,6 +6637,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
         wc->stage = DROP_REFERENCE;
         wc->update_ref = update_ref;
         wc->keep_locks = 0;
+       wc->for_reloc = for_reloc;
         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
  
         while (1) {
@@ -6573,6 +6722,7 @@ out:
   * drop subtree rooted at tree block 'node'.
   *
   * NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
   */
  int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
@@ -6617,6 +6767,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
         wc->stage = DROP_REFERENCE;
         wc->update_ref = 0;
         wc->keep_locks = 1;
+       wc->for_reloc = 1;
         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
  
         while (1) {