/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc);
static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                 u64 num_bytes, int reserve, int sinfo);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
                                struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
        smp_mb();
        return cache->cached == BTRFS_CACHE_FINISHED;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
        return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
        atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
        if (atomic_dec_and_test(&cache->count)) {
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
                WARN_ON(cache->reserved_pinned > 0);
                kfree(cache);
        }
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
                                struct btrfs_block_group_cache *block_group)
{
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct btrfs_block_group_cache *cache;

        spin_lock(&info->block_group_cache_lock);
        p = &info->block_group_cache_tree.rb_node;

        while (*p) {
                parent = *p;
                cache = rb_entry(parent, struct btrfs_block_group_cache,
                                 cache_node);
                if (block_group->key.objectid < cache->key.objectid) {
                        p = &(*p)->rb_left;
                } else if (block_group->key.objectid > cache->key.objectid) {
                        p = &(*p)->rb_right;
                } else {
                        spin_unlock(&info->block_group_cache_lock);
                        return -EEXIST;
                }
        }

        rb_link_node(&block_group->cache_node, parent, p);
        rb_insert_color(&block_group->cache_node,
                        &info->block_group_cache_tree);
        spin_unlock(&info->block_group_cache_lock);

        return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                              int contains)
{
        struct btrfs_block_group_cache *cache, *ret = NULL;
        struct rb_node *n;
        u64 end, start;

        spin_lock(&info->block_group_cache_lock);
        n = info->block_group_cache_tree.rb_node;

        while (n) {
                cache = rb_entry(n, struct btrfs_block_group_cache,
                                 cache_node);
                end = cache->key.objectid + cache->key.offset - 1;
                start = cache->key.objectid;

                if (bytenr < start) {
                        if (!contains && (!ret || start < ret->key.objectid))
                                ret = cache;
                        n = n->rb_left;
                } else if (bytenr > start) {
                        if (contains && bytenr <= end) {
                                ret = cache;
                                break;
                        }
                        n = n->rb_right;
                } else {
                        ret = cache;
                        break;
                }
        }
        if (ret)
                btrfs_get_block_group(ret);
        spin_unlock(&info->block_group_cache_lock);

        return ret;
}

static int add_excluded_extent(struct btrfs_root *root,
                               u64 start, u64 num_bytes)
{
        u64 end = start + num_bytes - 1;
        set_extent_bits(&root->fs_info->freed_extents[0],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        set_extent_bits(&root->fs_info->freed_extents[1],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
                                  struct btrfs_block_group_cache *cache)
{
        u64 start, end;

        start = cache->key.objectid;
        end = start + cache->key.offset - 1;

        clear_extent_bits(&root->fs_info->freed_extents[0],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
        clear_extent_bits(&root->fs_info->freed_extents[1],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
                                 struct btrfs_block_group_cache *cache)
{
        u64 bytenr;
        u64 *logical;
        int stripe_len;
        int i, nr, ret;

        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
                cache->bytes_super += stripe_len;
                ret = add_excluded_extent(root, cache->key.objectid,
                                          stripe_len);
                BUG_ON(ret);
        }

        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
                                       cache->key.objectid, bytenr,
                                       0, &logical, &nr, &stripe_len);
                BUG_ON(ret);

                while (nr--) {
                        cache->bytes_super += stripe_len;
                        ret = add_excluded_extent(root, logical[nr],
                                                  stripe_len);
                        BUG_ON(ret);
                }

                kfree(logical);
        }
        return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
        struct btrfs_caching_control *ctl;

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_STARTED) {
                spin_unlock(&cache->lock);
                return NULL;
        }

        ctl = cache->caching_ctl;
        atomic_inc(&ctl->count);
        spin_unlock(&cache->lock);
        return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
        if (atomic_dec_and_test(&ctl->count))
                kfree(ctl);
}

/*
 * This is only called by cache_block_group.  Since we could have freed
 * extents, we need to check pinned_extents for any extents that can't be
 * used yet, because their free space will not be released until the
 * transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
{
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;

        while (start < end) {
                ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY | EXTENT_UPTODATE);
                if (ret)
                        break;

                if (extent_start <= start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret);
                        start = extent_end + 1;
                } else {
                        break;
                }
        }

        if (start < end) {
                size = end - start;
                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret);
        }

        return total_added;
}
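
/*
 * Illustrative walk-through of the loop above (hypothetical addresses,
 * not taken from real metadata): caching the range [0, 100MB) while
 * [30MB, 40MB] is still pinned first adds [0, 30MB) as free space, then
 * resumes the scan at 40MB + 1; the final check adds the remaining tail
 * [40MB + 1, 100MB) once no more pinned bits are found.
 */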

static int caching_kthread(void *data)
{
        struct btrfs_block_group_cache *block_group = data;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        exclude_super_stripes(extent_root, block_group);
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->bytes_readonly += block_group->bytes_super;
        spin_unlock(&block_group->space_info->lock);

        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since it's read-only.
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = 2;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
again:
        mutex_lock(&caching_ctl->mutex);
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->extent_commit_sem);

        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                smp_mb();
                if (fs_info->closing > 1) {
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        ret = find_next_key(path, 0, &key);
                        if (ret)
                                break;

                        caching_ctl->progress = last;
                        btrfs_release_path(extent_root, path);
                        up_read(&fs_info->extent_commit_sem);
                        mutex_unlock(&caching_ctl->mutex);
                        if (btrfs_transaction_in_commit(fs_info))
                                schedule_timeout(1);
                        else
                                cond_resched();
                        goto again;
                }

                if (key.objectid < block_group->key.objectid) {
                        path->slots[0]++;
                        continue;
                }

                if (key.objectid >= block_group->key.objectid +
                    block_group->key.offset)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY) {
                        total_found += add_new_free_space(block_group,
                                                          fs_info, last,
                                                          key.objectid);
                        last = key.objectid + key.offset;

                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
                                wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        }
        ret = 0;

        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
        caching_ctl->progress = (u64)-1;

        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);

err:
        btrfs_free_path(path);
        up_read(&fs_info->extent_commit_sem);

        free_excluded_extents(extent_root, block_group);

        mutex_unlock(&caching_ctl->mutex);
        wake_up(&caching_ctl->wait);

        put_caching_control(caching_ctl);
        atomic_dec(&block_group->space_info->caching_threads);
        btrfs_put_block_group(block_group);

        return 0;
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
                             struct btrfs_trans_handle *trans,
                             int load_cache_only)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
        struct task_struct *tsk;
        int ret = 0;

        smp_mb();
        if (cache->cached != BTRFS_CACHE_NO)
                return 0;

        /*
         * We can't do the read from the on-disk cache during a commit
         * since we need to have the normal tree locking.
         */
        if (!trans->transaction->in_commit) {
                spin_lock(&cache->lock);
                if (cache->cached != BTRFS_CACHE_NO) {
                        spin_unlock(&cache->lock);
                        return 0;
                }
                cache->cached = BTRFS_CACHE_STARTED;
                spin_unlock(&cache->lock);

                ret = load_free_space_cache(fs_info, cache);

                spin_lock(&cache->lock);
                if (ret == 1) {
                        cache->cached = BTRFS_CACHE_FINISHED;
                        cache->last_byte_to_unpin = (u64)-1;
                } else {
                        cache->cached = BTRFS_CACHE_NO;
                }
                spin_unlock(&cache->lock);
                if (ret == 1)
                        return 0;
        }

        if (load_cache_only)
                return 0;

        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
        BUG_ON(!caching_ctl);

        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
        /* one for caching kthread, one for caching block group list */
        atomic_set(&caching_ctl->count, 2);

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_NO) {
                spin_unlock(&cache->lock);
                kfree(caching_ctl);
                return 0;
        }
        cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_STARTED;
        spin_unlock(&cache->lock);

        down_write(&fs_info->extent_commit_sem);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        up_write(&fs_info->extent_commit_sem);

        atomic_inc(&cache->space_info->caching_threads);
        btrfs_get_block_group(cache);

        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
                          cache->key.objectid);
        if (IS_ERR(tsk)) {
                ret = PTR_ERR(tsk);
                printk(KERN_ERR "error running thread %d\n", ret);
                BUG();
        }

        return ret;
}
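
/*
 * Summary of the caching states used above, as implemented in this file:
 * a block group starts in BTRFS_CACHE_NO; cache_block_group() moves it
 * to BTRFS_CACHE_STARTED, either while reading the on-disk free space
 * cache or when kicking off caching_kthread(); it reaches
 * BTRFS_CACHE_FINISHED once the free space is fully populated.
 */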

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 0);

        return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 1);

        return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
                                                  u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
                 BTRFS_BLOCK_GROUP_METADATA;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags == flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

static u64 div_factor(u64 num, int factor)
{
        if (factor == 10)
                return num;
        num *= factor;
        do_div(num, 10);
        return num;
}

u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
{
        struct btrfs_block_group_cache *cache;
        u64 used;
        u64 last = max(search_hint, search_start);
        u64 group_start = 0;
        int full_search = 0;
        int factor = 9;
        int wrapped = 0;
again:
        while (1) {
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
                if (!cache)
                        break;

                spin_lock(&cache->lock);
                last = cache->key.objectid + cache->key.offset;
                used = btrfs_block_group_used(&cache->item);

                if ((full_search || !cache->ro) &&
                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
                        if (used + cache->pinned + cache->reserved <
                            div_factor(cache->key.offset, factor)) {
                                group_start = cache->key.objectid;
                                spin_unlock(&cache->lock);
                                btrfs_put_block_group(cache);
                                goto found;
                        }
                }
                spin_unlock(&cache->lock);
                btrfs_put_block_group(cache);
                cond_resched();
        }
        if (!wrapped) {
                last = search_start;
                wrapped = 1;
                goto again;
        }
        if (!full_search && factor < 10) {
                last = search_start;
                full_search = 1;
                factor = 10;
                goto again;
        }
found:
        return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        BUG_ON(!path);
        key.objectid = start;
        key.offset = len;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
                                0, 0);
        btrfs_free_path(path);
        return ret;
}

/*
 * helper function to look up the reference count and flags of an extent.
 *
 * The head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  The head
 * node may also store the extent flags to set.  This way you can check
 * to see what the reference count and extent flags would be once all of
 * the delayed refs have been processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 bytenr,
                             u64 num_bytes, u64 *refs, u64 *flags)
{
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u32 item_size;
        u64 num_refs;
        u64 extent_flags;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;
        if (!trans) {
                path->skip_locking = 1;
                path->search_commit_root = 1;
        }
again:
        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
                                &key, path, 0, 0);
        if (ret < 0)
                goto out_free;

        if (ret == 0) {
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                if (item_size >= sizeof(*ei)) {
                        ei = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_extent_item);
                        num_refs = btrfs_extent_refs(leaf, ei);
                        extent_flags = btrfs_extent_flags(leaf, ei);
                } else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                        struct btrfs_extent_item_v0 *ei0;
                        BUG_ON(item_size != sizeof(*ei0));
                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_item_v0);
                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
                        /* FIXME: this isn't correct for data */
                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
                        BUG();
#endif
                }
                BUG_ON(num_refs == 0);
        } else {
                num_refs = 0;
                extent_flags = 0;
                ret = 0;
        }

        if (!trans)
                goto out;

        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(trans, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
                        atomic_inc(&head->node.refs);
                        spin_unlock(&delayed_refs->lock);

                        btrfs_release_path(root->fs_info->extent_root, path);

                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
                        btrfs_put_delayed_ref(&head->node);
                        goto again;
                }
                if (head->extent_op && head->extent_op->update_flags)
                        extent_flags |= head->extent_op->flags_to_set;
                else
                        BUG_ON(num_refs == 0);

                num_refs += head->node.ref_mod;
                mutex_unlock(&head->mutex);
        }
        spin_unlock(&delayed_refs->lock);
out:
        WARN_ON(num_refs == 0);
        if (refs)
                *refs = num_refs;
        if (flags)
                *flags = extent_flags;
out_free:
        btrfs_free_path(path);
        return ret;
}
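
/*
 * Numeric example (made-up values): if the extent item on disk records
 * 2 references and the delayed ref head for that bytenr carries
 * ref_mod == -1, the function above reports *refs == 1, i.e. the count
 * the extent tree will show once the delayed refs are run.
 */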

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees; the location of the tree block is
 * recorded in the back refs.  Full back refs are actually generic, and
 * can be used in all the cases implicit back refs are used.  The major
 * shortcoming of full back refs is their overhead: every time a tree
 * block gets COWed, we have to update the back ref entries for all the
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction,
 * the only way to drop a reference to it is to COW it.  So we can detect
 * the event that a tree block loses its owner tree's reference and do
 * the back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for the
 * pointers in the block.  Remove these full back refs and add implicit
 * back refs for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * the pointers in the block.  Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts.  The
 * original implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer
 * in the new block and increase the lower level extents' reference
 * counts.
 *
 * Back Reference Key composition:
 *
 * The key objectid corresponds to the first byte in the extent, and the
 * key type is used to differentiate between types of back refs.  The
 * meaning of the key offset depends on the type of back ref.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used,
 * with the fields filled in as:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * consist only of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * the level of the tree block is required.  This information is stored
 * in the tree block info structure.
 */
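
/*
 * Worked example of the key layout described above (all numbers are
 * hypothetical): a data extent at bytenr 13631488 referenced by root 5,
 * inode 257, file offset 0 gets the implicit back ref key
 *
 *     (13631488, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while the full (shared) form, keyed by a parent leaf at 29360128, is
 *
 *     (13631488, BTRFS_SHARED_DATA_REF_KEY, 29360128)
 */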

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  u64 owner, u32 extra_size)
{
        struct btrfs_extent_item *item;
        struct btrfs_extent_item_v0 *ei0;
        struct btrfs_extent_ref_v0 *ref0;
        struct btrfs_tree_block_info *bi;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u32 new_size = sizeof(*item);
        u64 refs;
        int ret;

        leaf = path->nodes[0];
        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_extent_item_v0);
        refs = btrfs_extent_refs_v0(leaf, ei0);

        if (owner == (u64)-1) {
                while (1) {
                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
                                BUG_ON(ret > 0);
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0]);
                        BUG_ON(key.objectid != found_key.objectid);
                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
                                path->slots[0]++;
                                continue;
                        }
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_ref_v0);
                        owner = btrfs_ref_objectid_v0(leaf, ref0);
                        break;
                }
        }
        btrfs_release_path(root, path);

        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);

        new_size -= sizeof(*ei0);
        ret = btrfs_search_slot(trans, root, &key, path,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
        BUG_ON(ret);

        ret = btrfs_extend_item(trans, root, path, new_size);
        BUG_ON(ret);

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, item, refs);
        /* FIXME: get real generation */
        btrfs_set_extent_generation(leaf, item, 0);
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_set_extent_flags(leaf, item,
                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
                bi = (struct btrfs_tree_block_info *)(item + 1);
                /* FIXME: get first key of the block */
                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
                btrfs_set_tree_block_level(leaf, bi, (int)owner);
        } else {
                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
        u32 high_crc = ~(u32)0;
        u32 low_crc = ~(u32)0;
        __le64 lenum;

        lenum = cpu_to_le64(root_objectid);
        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

        return ((u64)high_crc << 31) ^ (u64)low_crc;
}
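
/*
 * Sketch of how this hash is consumed (values are assumptions, not
 * taken from real metadata): both lookup_extent_data_ref() and
 * insert_extent_data_ref() below set
 *
 *     key.offset = hash_extent_data_ref(root_objectid, owner, offset);
 *
 * so a ref from root 5, inode 257, offset 0 is always found under the
 * same key offset.  Hash collisions are possible, which is why the
 * lookup still calls match_extent_data_ref() on each candidate item.
 */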

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
                                     struct btrfs_extent_data_ref *ref)
{
        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
                                    btrfs_extent_data_ref_objectid(leaf, ref),
                                    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
                                 struct btrfs_extent_data_ref *ref,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
            btrfs_extent_data_ref_offset(leaf, ref) != offset)
                return 0;
        return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid,
                                           u64 owner, u64 offset)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref;
        struct extent_buffer *leaf;
        u32 nritems;
        int ret;
        int recow;
        int err = -ENOENT;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
        }
again:
        recow = 0;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0) {
                err = ret;
                goto fail;
        }

        if (parent) {
                if (!ret)
                        return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                btrfs_release_path(root, path);
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0) {
                        err = ret;
                        goto fail;
                }
                if (!ret)
                        return 0;
#endif
                goto fail;
        }

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
        while (1) {
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                err = ret;
                        if (ret)
                                goto fail;

                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != bytenr ||
                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
                        goto fail;

                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);

                if (match_extent_data_ref(leaf, ref, root_objectid,
                                          owner, offset)) {
                        if (recow) {
                                btrfs_release_path(root, path);
                                goto again;
                        }
                        err = 0;
                        break;
                }
                path->slots[0]++;
        }
fail:
        return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid, u64 owner,
                                           u64 offset, int refs_to_add)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        u32 size;
        u32 num_refs;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
                size = sizeof(struct btrfs_shared_data_ref);
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
                size = sizeof(struct btrfs_extent_data_ref);
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
        if (ret && ret != -EEXIST)
                goto fail;

        leaf = path->nodes[0];
        if (parent) {
                struct btrfs_shared_data_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_shared_data_ref);
                if (ret == 0) {
                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
                }
        } else {
                struct btrfs_extent_data_ref *ref;
                while (ret == -EEXIST) {
                        ref = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_data_ref);
                        if (match_extent_data_ref(leaf, ref, root_objectid,
                                                  owner, offset))
                                break;
                        btrfs_release_path(root, path);
                        key.offset++;
                        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                                      size);
                        if (ret && ret != -EEXIST)
                                goto fail;

                        leaf = path->nodes[0];
                }
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);
                if (ret == 0) {
                        btrfs_set_extent_data_ref_root(leaf, ref,
                                                       root_objectid);
                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
                }
        }
        btrfs_mark_buffer_dirty(leaf);
        ret = 0;
fail:
        btrfs_release_path(root, path);
        return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           int refs_to_drop)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref1 = NULL;
        struct btrfs_shared_data_ref *ref2 = NULL;
        struct extent_buffer *leaf;
        u32 num_refs = 0;
        int ret = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                BUG();
        }

        BUG_ON(num_refs < refs_to_drop);
        num_refs -= refs_to_drop;

        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
        } else {
                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                else {
                        struct btrfs_extent_ref_v0 *ref0;
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_extent_ref_v0);
                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
                }
#endif
                btrfs_mark_buffer_dirty(leaf);
        }
        return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_data_ref *ref1;
        struct btrfs_shared_data_ref *ref2;
        u32 num_refs = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (iref) {
                if (btrfs_extent_inline_ref_type(leaf, iref) ==
                    BTRFS_EXTENT_DATA_REF_KEY) {
                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
                } else {
                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
                }
        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                WARN_ON(1);
        }
        return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (ret == -ENOENT && parent) {
                btrfs_release_path(root, path);
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0)
                        ret = -ENOENT;
        }
#endif
        return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
        btrfs_release_path(root, path);
        return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
        int type;
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                if (parent > 0)
                        type = BTRFS_SHARED_BLOCK_REF_KEY;
                else
                        type = BTRFS_TREE_BLOCK_REF_KEY;
        } else {
                if (parent > 0)
                        type = BTRFS_SHARED_DATA_REF_KEY;
                else
                        type = BTRFS_EXTENT_DATA_REF_KEY;
        }
        return type;
}
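
/*
 * Quick reference for the mapping above: tree blocks (owner <
 * BTRFS_FIRST_FREE_OBJECTID) take SHARED_BLOCK_REF when keyed by a
 * parent block or TREE_BLOCK_REF when keyed by the owner root; data
 * extents likewise take SHARED_DATA_REF or EXTENT_DATA_REF.
 */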

static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
{
        for (; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
                        break;
                if (path->slots[level] + 1 >=
                    btrfs_header_nritems(path->nodes[level]))
                        continue;
                if (level == 0)
                        btrfs_item_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                else
                        btrfs_node_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                return 0;
        }
        return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *       items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int insert)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        u64 flags;
        u64 item_size;
        unsigned long ptr;
        unsigned long end;
        int extra_size;
        int type;
        int want;
        int ret;
        int err = 0;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;

        want = extent_ref_type(parent, owner);
        if (insert) {
                extra_size = btrfs_extent_inline_ref_size(want);
                path->keep_locks = 1;
        } else
                extra_size = -1;
        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
        if (ret < 0) {
                err = ret;
                goto out;
        }
        BUG_ON(ret);

        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (item_size < sizeof(*ei)) {
                if (!insert) {
                        err = -ENOENT;
                        goto out;
                }
                ret = convert_extent_item_v0(trans, root, path, owner,
                                             extra_size);
                if (ret < 0) {
                        err = ret;
                        goto out;
                }
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
        }
#endif
        BUG_ON(item_size < sizeof(*ei));

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;

        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        } else {
                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
        }

        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
                        WARN_ON(ptr > end);
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_extent_inline_ref_type(leaf, iref);
                if (want < type)
                        break;
                if (want > type) {
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }

                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *dref;
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        if (match_extent_data_ref(leaf, dref, root_objectid,
                                                  owner, offset)) {
                                err = 0;
                                break;
                        }
                        if (hash_extent_data_ref_item(leaf, dref) <
                            hash_extent_data_ref(root_objectid, owner, offset))
                                break;
                } else {
                        u64 ref_offset;
                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                        if (parent > 0) {
                                if (parent == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < parent)
                                        break;
                        } else {
                                if (root_objectid == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < root_objectid)
1462                                         break;
1463                         }
1464                 }
1465                 ptr += btrfs_extent_inline_ref_size(type);
1466         }
1467         if (err == -ENOENT && insert) {
1468                 if (item_size + extra_size >=
1469                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1470                         err = -EAGAIN;
1471                         goto out;
1472                 }
1473                 /*
1474                  * To add a new inline back ref, we have to make sure
1475                  * there is no corresponding back ref item.
1476                  * For simplicity, we just do not add a new inline back
1477                  * ref if there is any kind of item for this block.
1478                  */
1479                 if (find_next_key(path, 0, &key) == 0 &&
1480                     key.objectid == bytenr &&
1481                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1482                         err = -EAGAIN;
1483                         goto out;
1484                 }
1485         }
1486         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1487 out:
1488         if (insert) {
1489                 path->keep_locks = 0;
1490                 btrfs_unlock_up_safe(path, 1);
1491         }
1492         return err;
1493 }
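
/*
 * illustrative caller pattern for lookup_inline_extent_backref() above
 * (a sketch, not part of the original source):
 *
 *	ret = lookup_inline_extent_backref(trans, root, path, &iref, bytenr,
 *					   num_bytes, parent, root_objectid,
 *					   owner, offset, 1);
 *	if (ret == 0)
 *		update the inline ref that *ref_ret points to;
 *	else if (ret == -ENOENT)
 *		insert a new inline ref at *ref_ret;
 *	else if (ret == -EAGAIN)
 *		fall back to a separate keyed backref item;
 *
 * insert_inline_extent_backref() below implements the first two cases,
 * and __btrfs_inc_extent_ref() implements the -EAGAIN fallback.
 */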
1494
1495 /*
1496  * helper to add new inline back ref
1497  */
1498 static noinline_for_stack
1499 int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1500                                 struct btrfs_root *root,
1501                                 struct btrfs_path *path,
1502                                 struct btrfs_extent_inline_ref *iref,
1503                                 u64 parent, u64 root_objectid,
1504                                 u64 owner, u64 offset, int refs_to_add,
1505                                 struct btrfs_delayed_extent_op *extent_op)
1506 {
1507         struct extent_buffer *leaf;
1508         struct btrfs_extent_item *ei;
1509         unsigned long ptr;
1510         unsigned long end;
1511         unsigned long item_offset;
1512         u64 refs;
1513         int size;
1514         int type;
1515         int ret;
1516
1517         leaf = path->nodes[0];
1518         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1519         item_offset = (unsigned long)iref - (unsigned long)ei;
1520
1521         type = extent_ref_type(parent, owner);
1522         size = btrfs_extent_inline_ref_size(type);
1523
1524         ret = btrfs_extend_item(trans, root, path, size);
1525         BUG_ON(ret);
1526
1527         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1528         refs = btrfs_extent_refs(leaf, ei);
1529         refs += refs_to_add;
1530         btrfs_set_extent_refs(leaf, ei, refs);
1531         if (extent_op)
1532                 __run_delayed_extent_op(extent_op, leaf, ei);
1533
1534         ptr = (unsigned long)ei + item_offset;
1535         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1536         if (ptr < end - size)
1537                 memmove_extent_buffer(leaf, ptr + size, ptr,
1538                                       end - size - ptr);
1539
1540         iref = (struct btrfs_extent_inline_ref *)ptr;
1541         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1542         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1543                 struct btrfs_extent_data_ref *dref;
1544                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1545                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1546                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1547                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1548                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1549         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1550                 struct btrfs_shared_data_ref *sref;
1551                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1552                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1553                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1554         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1555                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1556         } else {
1557                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1558         }
1559         btrfs_mark_buffer_dirty(leaf);
1560         return 0;
1561 }
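
/*
 * layout sketch for setup_inline_extent_backref() above (illustrative):
 *
 *	before:   [extent item][tree block info?][ref A][ref C]
 *	extend:   [extent item][tree block info?][ref A][ref C][hole]
 *	memmove:  [extent item][tree block info?][ref A][hole][ref C]
 *	write:    [extent item][tree block info?][ref A][ref B][ref C]
 *
 * the hole is opened at the offset lookup_inline_extent_backref()
 * returned, so the inline refs stay sorted.
 */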
1562
1563 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1564                                  struct btrfs_root *root,
1565                                  struct btrfs_path *path,
1566                                  struct btrfs_extent_inline_ref **ref_ret,
1567                                  u64 bytenr, u64 num_bytes, u64 parent,
1568                                  u64 root_objectid, u64 owner, u64 offset)
1569 {
1570         int ret;
1571
1572         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1573                                            bytenr, num_bytes, parent,
1574                                            root_objectid, owner, offset, 0);
1575         if (ret != -ENOENT)
1576                 return ret;
1577
1578         btrfs_release_path(root, path);
1579         *ref_ret = NULL;
1580
1581         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1582                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1583                                             root_objectid);
1584         } else {
1585                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1586                                              root_objectid, owner, offset);
1587         }
1588         return ret;
1589 }
1590
1591 /*
1592  * helper to update/remove inline back ref
1593  */
1594 static noinline_for_stack
1595 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1596                                  struct btrfs_root *root,
1597                                  struct btrfs_path *path,
1598                                  struct btrfs_extent_inline_ref *iref,
1599                                  int refs_to_mod,
1600                                  struct btrfs_delayed_extent_op *extent_op)
1601 {
1602         struct extent_buffer *leaf;
1603         struct btrfs_extent_item *ei;
1604         struct btrfs_extent_data_ref *dref = NULL;
1605         struct btrfs_shared_data_ref *sref = NULL;
1606         unsigned long ptr;
1607         unsigned long end;
1608         u32 item_size;
1609         int size;
1610         int type;
1611         int ret;
1612         u64 refs;
1613
1614         leaf = path->nodes[0];
1615         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1616         refs = btrfs_extent_refs(leaf, ei);
1617         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1618         refs += refs_to_mod;
1619         btrfs_set_extent_refs(leaf, ei, refs);
1620         if (extent_op)
1621                 __run_delayed_extent_op(extent_op, leaf, ei);
1622
1623         type = btrfs_extent_inline_ref_type(leaf, iref);
1624
1625         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1626                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1627                 refs = btrfs_extent_data_ref_count(leaf, dref);
1628         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1629                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1630                 refs = btrfs_shared_data_ref_count(leaf, sref);
1631         } else {
1632                 refs = 1;
1633                 BUG_ON(refs_to_mod != -1);
1634         }
1635
1636         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1637         refs += refs_to_mod;
1638
1639         if (refs > 0) {
1640                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1641                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1642                 else
1643                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1644         } else {
1645                 size = btrfs_extent_inline_ref_size(type);
1646                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1647                 ptr = (unsigned long)iref;
1648                 end = (unsigned long)ei + item_size;
1649                 if (ptr + size < end)
1650                         memmove_extent_buffer(leaf, ptr, ptr + size,
1651                                               end - ptr - size);
1652                 item_size -= size;
1653                 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1654                 BUG_ON(ret);
1655         }
1656         btrfs_mark_buffer_dirty(leaf);
1657         return 0;
1658 }
1659
1660 static noinline_for_stack
1661 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1662                                  struct btrfs_root *root,
1663                                  struct btrfs_path *path,
1664                                  u64 bytenr, u64 num_bytes, u64 parent,
1665                                  u64 root_objectid, u64 owner,
1666                                  u64 offset, int refs_to_add,
1667                                  struct btrfs_delayed_extent_op *extent_op)
1668 {
1669         struct btrfs_extent_inline_ref *iref;
1670         int ret;
1671
1672         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1673                                            bytenr, num_bytes, parent,
1674                                            root_objectid, owner, offset, 1);
1675         if (ret == 0) {
1676                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1677                 ret = update_inline_extent_backref(trans, root, path, iref,
1678                                                    refs_to_add, extent_op);
1679         } else if (ret == -ENOENT) {
1680                 ret = setup_inline_extent_backref(trans, root, path, iref,
1681                                                   parent, root_objectid,
1682                                                   owner, offset, refs_to_add,
1683                                                   extent_op);
1684         }
1685         return ret;
1686 }
1687
1688 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1689                                  struct btrfs_root *root,
1690                                  struct btrfs_path *path,
1691                                  u64 bytenr, u64 parent, u64 root_objectid,
1692                                  u64 owner, u64 offset, int refs_to_add)
1693 {
1694         int ret;
1695         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1696                 BUG_ON(refs_to_add != 1);
1697                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1698                                             parent, root_objectid);
1699         } else {
1700                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1701                                              parent, root_objectid,
1702                                              owner, offset, refs_to_add);
1703         }
1704         return ret;
1705 }
1706
1707 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1708                                  struct btrfs_root *root,
1709                                  struct btrfs_path *path,
1710                                  struct btrfs_extent_inline_ref *iref,
1711                                  int refs_to_drop, int is_data)
1712 {
1713         int ret;
1714
1715         BUG_ON(!is_data && refs_to_drop != 1);
1716         if (iref) {
1717                 ret = update_inline_extent_backref(trans, root, path, iref,
1718                                                    -refs_to_drop, NULL);
1719         } else if (is_data) {
1720                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1721         } else {
1722                 ret = btrfs_del_item(trans, root, path);
1723         }
1724         return ret;
1725 }
1726
1727 static void btrfs_issue_discard(struct block_device *bdev,
1728                                 u64 start, u64 len)
1729 {
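        /* blkdev_issue_discard() takes 512-byte sectors, hence the >> 9 */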
1730         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1731                         BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1732 }
1733
1734 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1735                                 u64 num_bytes)
1736 {
1737         int ret;
1738         u64 map_length = num_bytes;
1739         struct btrfs_multi_bio *multi = NULL;
1740
1741         if (!btrfs_test_opt(root, DISCARD))
1742                 return 0;
1743
1744         /* Tell the block device(s) that the sectors can be discarded */
1745         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1746                               bytenr, &map_length, &multi, 0);
1747         if (!ret) {
1748                 struct btrfs_bio_stripe *stripe = multi->stripes;
1749                 int i;
1750
1751                 if (map_length > num_bytes)
1752                         map_length = num_bytes;
1753
1754                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1755                         btrfs_issue_discard(stripe->dev->bdev,
1756                                             stripe->physical,
1757                                             map_length);
1758                 }
1759                 kfree(multi);
1760         }
1761
1762         return ret;
1763 }
1764
1765 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1766                          struct btrfs_root *root,
1767                          u64 bytenr, u64 num_bytes, u64 parent,
1768                          u64 root_objectid, u64 owner, u64 offset)
1769 {
1770         int ret;
1771         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1772                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1773
1774         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1775                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1776                                         parent, root_objectid, (int)owner,
1777                                         BTRFS_ADD_DELAYED_REF, NULL);
1778         } else {
1779                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1780                                         parent, root_objectid, owner, offset,
1781                                         BTRFS_ADD_DELAYED_REF, NULL);
1782         }
1783         return ret;
1784 }
1785
1786 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1787                                   struct btrfs_root *root,
1788                                   u64 bytenr, u64 num_bytes,
1789                                   u64 parent, u64 root_objectid,
1790                                   u64 owner, u64 offset, int refs_to_add,
1791                                   struct btrfs_delayed_extent_op *extent_op)
1792 {
1793         struct btrfs_path *path;
1794         struct extent_buffer *leaf;
1795         struct btrfs_extent_item *item;
1796         u64 refs;
1797         int ret;
1798         int err = 0;
1799
1800         path = btrfs_alloc_path();
1801         if (!path)
1802                 return -ENOMEM;
1803
1804         path->reada = 1;
1805         path->leave_spinning = 1;
1806         /* this will set up the path even if it fails to insert the back ref */
1807         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1808                                            path, bytenr, num_bytes, parent,
1809                                            root_objectid, owner, offset,
1810                                            refs_to_add, extent_op);
1811         if (ret == 0)
1812                 goto out;
1813
1814         if (ret != -EAGAIN) {
1815                 err = ret;
1816                 goto out;
1817         }
1818
1819         leaf = path->nodes[0];
1820         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1821         refs = btrfs_extent_refs(leaf, item);
1822         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1823         if (extent_op)
1824                 __run_delayed_extent_op(extent_op, leaf, item);
1825
1826         btrfs_mark_buffer_dirty(leaf);
1827         btrfs_release_path(root->fs_info->extent_root, path);
1828
1829         path->reada = 1;
1830         path->leave_spinning = 1;
1831
1832         /* now insert the actual backref */
1833         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1834                                     path, bytenr, parent, root_objectid,
1835                                     owner, offset, refs_to_add);
1836         BUG_ON(ret);
1837 out:
1838         btrfs_free_path(path);
1839         return err;
1840 }
1841
1842 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1843                                 struct btrfs_root *root,
1844                                 struct btrfs_delayed_ref_node *node,
1845                                 struct btrfs_delayed_extent_op *extent_op,
1846                                 int insert_reserved)
1847 {
1848         int ret = 0;
1849         struct btrfs_delayed_data_ref *ref;
1850         struct btrfs_key ins;
1851         u64 parent = 0;
1852         u64 ref_root = 0;
1853         u64 flags = 0;
1854
1855         ins.objectid = node->bytenr;
1856         ins.offset = node->num_bytes;
1857         ins.type = BTRFS_EXTENT_ITEM_KEY;
1858
1859         ref = btrfs_delayed_node_to_data_ref(node);
1860         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1861                 parent = ref->parent;
1862         else
1863                 ref_root = ref->root;
1864
1865         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1866                 if (extent_op) {
1867                         BUG_ON(extent_op->update_key);
1868                         flags |= extent_op->flags_to_set;
1869                 }
1870                 ret = alloc_reserved_file_extent(trans, root,
1871                                                  parent, ref_root, flags,
1872                                                  ref->objectid, ref->offset,
1873                                                  &ins, node->ref_mod);
1874         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1875                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1876                                              node->num_bytes, parent,
1877                                              ref_root, ref->objectid,
1878                                              ref->offset, node->ref_mod,
1879                                              extent_op);
1880         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1881                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1882                                           node->num_bytes, parent,
1883                                           ref_root, ref->objectid,
1884                                           ref->offset, node->ref_mod,
1885                                           extent_op);
1886         } else {
1887                 BUG();
1888         }
1889         return ret;
1890 }
1891
1892 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1893                                     struct extent_buffer *leaf,
1894                                     struct btrfs_extent_item *ei)
1895 {
1896         u64 flags = btrfs_extent_flags(leaf, ei);
1897         if (extent_op->update_flags) {
1898                 flags |= extent_op->flags_to_set;
1899                 btrfs_set_extent_flags(leaf, ei, flags);
1900         }
1901
1902         if (extent_op->update_key) {
1903                 struct btrfs_tree_block_info *bi;
1904                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1905                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1906                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1907         }
1908 }
1909
1910 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1911                                  struct btrfs_root *root,
1912                                  struct btrfs_delayed_ref_node *node,
1913                                  struct btrfs_delayed_extent_op *extent_op)
1914 {
1915         struct btrfs_key key;
1916         struct btrfs_path *path;
1917         struct btrfs_extent_item *ei;
1918         struct extent_buffer *leaf;
1919         u32 item_size;
1920         int ret;
1921         int err = 0;
1922
1923         path = btrfs_alloc_path();
1924         if (!path)
1925                 return -ENOMEM;
1926
1927         key.objectid = node->bytenr;
1928         key.type = BTRFS_EXTENT_ITEM_KEY;
1929         key.offset = node->num_bytes;
1930
1931         path->reada = 1;
1932         path->leave_spinning = 1;
1933         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1934                                 path, 0, 1);
1935         if (ret < 0) {
1936                 err = ret;
1937                 goto out;
1938         }
1939         if (ret > 0) {
1940                 err = -EIO;
1941                 goto out;
1942         }
1943
1944         leaf = path->nodes[0];
1945         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1946 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1947         if (item_size < sizeof(*ei)) {
1948                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1949                                              path, (u64)-1, 0);
1950                 if (ret < 0) {
1951                         err = ret;
1952                         goto out;
1953                 }
1954                 leaf = path->nodes[0];
1955                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1956         }
1957 #endif
1958         BUG_ON(item_size < sizeof(*ei));
1959         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1960         __run_delayed_extent_op(extent_op, leaf, ei);
1961
1962         btrfs_mark_buffer_dirty(leaf);
1963 out:
1964         btrfs_free_path(path);
1965         return err;
1966 }
1967
1968 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1969                                 struct btrfs_root *root,
1970                                 struct btrfs_delayed_ref_node *node,
1971                                 struct btrfs_delayed_extent_op *extent_op,
1972                                 int insert_reserved)
1973 {
1974         int ret = 0;
1975         struct btrfs_delayed_tree_ref *ref;
1976         struct btrfs_key ins;
1977         u64 parent = 0;
1978         u64 ref_root = 0;
1979
1980         ins.objectid = node->bytenr;
1981         ins.offset = node->num_bytes;
1982         ins.type = BTRFS_EXTENT_ITEM_KEY;
1983
1984         ref = btrfs_delayed_node_to_tree_ref(node);
1985         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1986                 parent = ref->parent;
1987         else
1988                 ref_root = ref->root;
1989
1990         BUG_ON(node->ref_mod != 1);
1991         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1992                 BUG_ON(!extent_op || !extent_op->update_flags ||
1993                        !extent_op->update_key);
1994                 ret = alloc_reserved_tree_block(trans, root,
1995                                                 parent, ref_root,
1996                                                 extent_op->flags_to_set,
1997                                                 &extent_op->key,
1998                                                 ref->level, &ins);
1999         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2000                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2001                                              node->num_bytes, parent, ref_root,
2002                                              ref->level, 0, 1, extent_op);
2003         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2004                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2005                                           node->num_bytes, parent, ref_root,
2006                                           ref->level, 0, 1, extent_op);
2007         } else {
2008                 BUG();
2009         }
2010         return ret;
2011 }
2012
2013 /* helper function to actually process a single delayed ref entry */
2014 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2015                                struct btrfs_root *root,
2016                                struct btrfs_delayed_ref_node *node,
2017                                struct btrfs_delayed_extent_op *extent_op,
2018                                int insert_reserved)
2019 {
2020         int ret;
2021         if (btrfs_delayed_ref_is_head(node)) {
2022                 struct btrfs_delayed_ref_head *head;
2023                 /*
2024                  * we've hit the end of the chain and we were supposed
2025                  * to insert this extent into the tree.  But it got
2026                  * deleted before we ever needed to insert it, so all
2027                  * we have to do is clean up the accounting.
2028                  */
2029                 BUG_ON(extent_op);
2030                 head = btrfs_delayed_node_to_head(node);
2031                 if (insert_reserved) {
2032                         btrfs_pin_extent(root, node->bytenr,
2033                                          node->num_bytes, 1);
2034                         if (head->is_data) {
2035                                 ret = btrfs_del_csums(trans, root,
2036                                                       node->bytenr,
2037                                                       node->num_bytes);
2038                                 BUG_ON(ret);
2039                         }
2040                 }
2041                 mutex_unlock(&head->mutex);
2042                 return 0;
2043         }
2044
2045         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2046             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2047                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2048                                            insert_reserved);
2049         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2050                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2051                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2052                                            insert_reserved);
2053         else
2054                 BUG();
2055         return ret;
2056 }
2057
2058 static noinline struct btrfs_delayed_ref_node *
2059 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2060 {
2061         struct rb_node *node;
2062         struct btrfs_delayed_ref_node *ref;
2063         int action = BTRFS_ADD_DELAYED_REF;
2064 again:
2065         /*
2066          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2067          * this prevents ref count from going down to zero when
2068          * there are still pending delayed refs.
2069          */
2070         node = rb_prev(&head->node.rb_node);
2071         while (1) {
2072                 if (!node)
2073                         break;
2074                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2075                                 rb_node);
2076                 if (ref->bytenr != head->node.bytenr)
2077                         break;
2078                 if (ref->action == action)
2079                         return ref;
2080                 node = rb_prev(node);
2081         }
2082         if (action == BTRFS_ADD_DELAYED_REF) {
2083                 action = BTRFS_DROP_DELAYED_REF;
2084                 goto again;
2085         }
2086         return NULL;
2087 }
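
/*
 * example of why ADDs are selected first (illustrative): if an extent
 * has ref count 1 and the chain holds both a pending +1 and a pending
 * -1, applying the -1 first would take the count to zero and free the
 * extent while a reference is still queued.  running the ADD first
 * keeps the count positive until every pending ref has been applied.
 */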
2088
2089 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2090                                        struct btrfs_root *root,
2091                                        struct list_head *cluster)
2092 {
2093         struct btrfs_delayed_ref_root *delayed_refs;
2094         struct btrfs_delayed_ref_node *ref;
2095         struct btrfs_delayed_ref_head *locked_ref = NULL;
2096         struct btrfs_delayed_extent_op *extent_op;
2097         int ret;
2098         int count = 0;
2099         int must_insert_reserved = 0;
2100
2101         delayed_refs = &trans->transaction->delayed_refs;
2102         while (1) {
2103                 if (!locked_ref) {
2104                         /* pick a new head ref from the cluster list */
2105                         if (list_empty(cluster))
2106                                 break;
2107
2108                         locked_ref = list_entry(cluster->next,
2109                                      struct btrfs_delayed_ref_head, cluster);
2110
2111                         /* grab the lock that says we are going to process
2112                          * all the refs for this head */
2113                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2114
2115                         /*
2116                          * we may have dropped the spin lock to get the head
2117                          * mutex lock, and that might have given someone else
2118                          * time to free the head.  If that's true, it has been
2119                          * removed from our list and we can move on.
2120                          */
2121                         if (ret == -EAGAIN) {
2122                                 locked_ref = NULL;
2123                                 count++;
2124                                 continue;
2125                         }
2126                 }
2127
2128                 /*
2129                  * record the must insert reserved flag before we
2130                  * drop the spin lock.
2131                  */
2132                 must_insert_reserved = locked_ref->must_insert_reserved;
2133                 locked_ref->must_insert_reserved = 0;
2134
2135                 extent_op = locked_ref->extent_op;
2136                 locked_ref->extent_op = NULL;
2137
2138                 /*
2139                  * locked_ref is the head node, so we have to go one
2140                  * node back for any delayed ref updates
2141                  */
2142                 ref = select_delayed_ref(locked_ref);
2143                 if (!ref) {
2144                         /* All delayed refs have been processed, go ahead
2145                          * and send the head node to run_one_delayed_ref,
2146                          * so that any accounting fixes can happen.
2147                          */
2148                         ref = &locked_ref->node;
2149
2150                         if (extent_op && must_insert_reserved) {
2151                                 kfree(extent_op);
2152                                 extent_op = NULL;
2153                         }
2154
2155                         if (extent_op) {
2156                                 spin_unlock(&delayed_refs->lock);
2157
2158                                 ret = run_delayed_extent_op(trans, root,
2159                                                             ref, extent_op);
2160                                 BUG_ON(ret);
2161                                 kfree(extent_op);
2162
2163                                 cond_resched();
2164                                 spin_lock(&delayed_refs->lock);
2165                                 continue;
2166                         }
2167
2168                         list_del_init(&locked_ref->cluster);
2169                         locked_ref = NULL;
2170                 }
2171
2172                 ref->in_tree = 0;
2173                 rb_erase(&ref->rb_node, &delayed_refs->root);
2174                 delayed_refs->num_entries--;
2175
2176                 spin_unlock(&delayed_refs->lock);
2177
2178                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2179                                           must_insert_reserved);
2180                 BUG_ON(ret);
2181
2182                 btrfs_put_delayed_ref(ref);
2183                 kfree(extent_op);
2184                 count++;
2185
2186                 cond_resched();
2187                 spin_lock(&delayed_refs->lock);
2188         }
2189         return count;
2190 }
2191
2192 /*
2193  * this starts processing the delayed reference count updates and
2194  * extent insertions we have queued up so far.  count can be
2195  * 0, which means to process everything in the tree at the start
2196  * of the run (but not newly added entries), or it can be some target
2197  * number you'd like to process.
2198  */
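/*
 * usage sketch (illustrative, not from the original source):
 *
 *	btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 *		flush everything, including refs added while running
 *	btrfs_run_delayed_refs(trans, root, 0);
 *		process what was queued at the start of the run
 *	btrfs_run_delayed_refs(trans, root, n);
 *		process roughly n entries, for throttling
 */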
2199 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2200                            struct btrfs_root *root, unsigned long count)
2201 {
2202         struct rb_node *node;
2203         struct btrfs_delayed_ref_root *delayed_refs;
2204         struct btrfs_delayed_ref_node *ref;
2205         struct list_head cluster;
2206         int ret;
2207         int run_all = count == (unsigned long)-1;
2208         int run_most = 0;
2209
2210         if (root == root->fs_info->extent_root)
2211                 root = root->fs_info->tree_root;
2212
2213         delayed_refs = &trans->transaction->delayed_refs;
2214         INIT_LIST_HEAD(&cluster);
2215 again:
2216         spin_lock(&delayed_refs->lock);
2217         if (count == 0) {
2218                 count = delayed_refs->num_entries * 2;
2219                 run_most = 1;
2220         }
2221         while (1) {
2222                 if (!(run_all || run_most) &&
2223                     delayed_refs->num_heads_ready < 64)
2224                         break;
2225
2226                 /*
2227                  * go find something we can process in the rbtree.  We start at
2228                  * the beginning of the tree, and then build a cluster
2229                  * of refs to process starting at the first one we are able to
2230                  * lock
2231                  */
2232                 ret = btrfs_find_ref_cluster(trans, &cluster,
2233                                              delayed_refs->run_delayed_start);
2234                 if (ret)
2235                         break;
2236
2237                 ret = run_clustered_refs(trans, root, &cluster);
2238                 BUG_ON(ret < 0);
2239
2240                 count -= min_t(unsigned long, ret, count);
2241
2242                 if (count == 0)
2243                         break;
2244         }
2245
2246         if (run_all) {
2247                 node = rb_first(&delayed_refs->root);
2248                 if (!node)
2249                         goto out;
2250                 count = (unsigned long)-1;
2251
2252                 while (node) {
2253                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2254                                        rb_node);
2255                         if (btrfs_delayed_ref_is_head(ref)) {
2256                                 struct btrfs_delayed_ref_head *head;
2257
2258                                 head = btrfs_delayed_node_to_head(ref);
2259                                 atomic_inc(&ref->refs);
2260
2261                                 spin_unlock(&delayed_refs->lock);
2262                                 mutex_lock(&head->mutex);
2263                                 mutex_unlock(&head->mutex);
2264
2265                                 btrfs_put_delayed_ref(ref);
2266                                 cond_resched();
2267                                 goto again;
2268                         }
2269                         node = rb_next(node);
2270                 }
2271                 spin_unlock(&delayed_refs->lock);
2272                 schedule_timeout(1);
2273                 goto again;
2274         }
2275 out:
2276         spin_unlock(&delayed_refs->lock);
2277         return 0;
2278 }
2279
2280 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2281                                 struct btrfs_root *root,
2282                                 u64 bytenr, u64 num_bytes, u64 flags,
2283                                 int is_data)
2284 {
2285         struct btrfs_delayed_extent_op *extent_op;
2286         int ret;
2287
2288         extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2289         if (!extent_op)
2290                 return -ENOMEM;
2291
2292         extent_op->flags_to_set = flags;
2293         extent_op->update_flags = 1;
2294         extent_op->update_key = 0;
2295         extent_op->is_data = is_data ? 1 : 0;
2296
2297         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2298         if (ret)
2299                 kfree(extent_op);
2300         return ret;
2301 }
2302
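/*
 * summary (added for clarity): returns 0 when the only pending delayed
 * ref for this extent is a data ref from this root/objectid/offset,
 * 1 when some other delayed ref exists, -ENOENT when no delayed head
 * is queued, and -EAGAIN when the head mutex was contended and the
 * caller should retry.
 */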
2303 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2304                                       struct btrfs_root *root,
2305                                       struct btrfs_path *path,
2306                                       u64 objectid, u64 offset, u64 bytenr)
2307 {
2308         struct btrfs_delayed_ref_head *head;
2309         struct btrfs_delayed_ref_node *ref;
2310         struct btrfs_delayed_data_ref *data_ref;
2311         struct btrfs_delayed_ref_root *delayed_refs;
2312         struct rb_node *node;
2313         int ret;
2314
2315         ret = -ENOENT;
2316         delayed_refs = &trans->transaction->delayed_refs;
2317         spin_lock(&delayed_refs->lock);
2318         head = btrfs_find_delayed_ref_head(trans, bytenr);
2319         if (!head)
2320                 goto out;
2321
2322         if (!mutex_trylock(&head->mutex)) {
2323                 atomic_inc(&head->node.refs);
2324                 spin_unlock(&delayed_refs->lock);
2325
2326                 btrfs_release_path(root->fs_info->extent_root, path);
2327
2328                 mutex_lock(&head->mutex);
2329                 mutex_unlock(&head->mutex);
2330                 btrfs_put_delayed_ref(&head->node);
2331                 return -EAGAIN;
2332         }
2333
2334         node = rb_prev(&head->node.rb_node);
2335         if (!node)
2336                 goto out_unlock;
2337
2338         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2339
2340         if (ref->bytenr != bytenr)
2341                 goto out_unlock;
2342
2343         ret = 1;
2344         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2345                 goto out_unlock;
2346
2347         data_ref = btrfs_delayed_node_to_data_ref(ref);
2348
2349         node = rb_prev(node);
2350         if (node) {
2351                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2352                 if (ref->bytenr == bytenr)
2353                         goto out_unlock;
2354         }
2355
2356         if (data_ref->root != root->root_key.objectid ||
2357             data_ref->objectid != objectid || data_ref->offset != offset)
2358                 goto out_unlock;
2359
2360         ret = 0;
2361 out_unlock:
2362         mutex_unlock(&head->mutex);
2363 out:
2364         spin_unlock(&delayed_refs->lock);
2365         return ret;
2366 }
2367
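/*
 * summary (added for clarity): returns 0 when the committed extent item
 * carries exactly one inline data ref matching this root/objectid/offset
 * and the extent is newer than the last snapshot; 1 when another on-disk
 * reference may exist; -ENOENT when no matching extent item was found.
 */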
2368 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2369                                         struct btrfs_root *root,
2370                                         struct btrfs_path *path,
2371                                         u64 objectid, u64 offset, u64 bytenr)
2372 {
2373         struct btrfs_root *extent_root = root->fs_info->extent_root;
2374         struct extent_buffer *leaf;
2375         struct btrfs_extent_data_ref *ref;
2376         struct btrfs_extent_inline_ref *iref;
2377         struct btrfs_extent_item *ei;
2378         struct btrfs_key key;
2379         u32 item_size;
2380         int ret;
2381
2382         key.objectid = bytenr;
2383         key.offset = (u64)-1;
2384         key.type = BTRFS_EXTENT_ITEM_KEY;
2385
2386         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2387         if (ret < 0)
2388                 goto out;
2389         BUG_ON(ret == 0);
2390
2391         ret = -ENOENT;
2392         if (path->slots[0] == 0)
2393                 goto out;
2394
2395         path->slots[0]--;
2396         leaf = path->nodes[0];
2397         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2398
2399         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2400                 goto out;
2401
2402         ret = 1;
2403         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2404 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2405         if (item_size < sizeof(*ei)) {
2406                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2407                 goto out;
2408         }
2409 #endif
2410         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2411
2412         if (item_size != sizeof(*ei) +
2413             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2414                 goto out;
2415
2416         if (btrfs_extent_generation(leaf, ei) <=
2417             btrfs_root_last_snapshot(&root->root_item))
2418                 goto out;
2419
2420         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2421         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2422             BTRFS_EXTENT_DATA_REF_KEY)
2423                 goto out;
2424
2425         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2426         if (btrfs_extent_refs(leaf, ei) !=
2427             btrfs_extent_data_ref_count(leaf, ref) ||
2428             btrfs_extent_data_ref_root(leaf, ref) !=
2429             root->root_key.objectid ||
2430             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2431             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2432                 goto out;
2433
2434         ret = 0;
2435 out:
2436         return ret;
2437 }
2438
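/*
 * combined check (summary added for clarity): returns 0 only when
 * neither the committed tree nor the pending delayed refs show a
 * reference to this data extent from outside 'root'; a positive
 * return means a cross reference may exist.
 */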
2439 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2440                           struct btrfs_root *root,
2441                           u64 objectid, u64 offset, u64 bytenr)
2442 {
2443         struct btrfs_path *path;
2444         int ret;
2445         int ret2;
2446
2447         path = btrfs_alloc_path();
2448         if (!path)
2449                 return -ENOMEM;
2450
2451         do {
2452                 ret = check_committed_ref(trans, root, path, objectid,
2453                                           offset, bytenr);
2454                 if (ret && ret != -ENOENT)
2455                         goto out;
2456
2457                 ret2 = check_delayed_ref(trans, root, path, objectid,
2458                                          offset, bytenr);
2459         } while (ret2 == -EAGAIN);
2460
2461         if (ret2 && ret2 != -ENOENT) {
2462                 ret = ret2;
2463                 goto out;
2464         }
2465
2466         if (ret != -ENOENT || ret2 != -ENOENT)
2467                 ret = 0;
2468 out:
2469         btrfs_free_path(path);
2470         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2471                 WARN_ON(ret > 0);
2472         return ret;
2473 }
2474
2475 #if 0
2476 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2477                     struct extent_buffer *buf, u32 nr_extents)
2478 {
2479         struct btrfs_key key;
2480         struct btrfs_file_extent_item *fi;
2481         u64 root_gen;
2482         u32 nritems;
2483         int i;
2484         int level;
2485         int ret = 0;
2486         int shared = 0;
2487
2488         if (!root->ref_cows)
2489                 return 0;
2490
2491         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2492                 shared = 0;
2493                 root_gen = root->root_key.offset;
2494         } else {
2495                 shared = 1;
2496                 root_gen = trans->transid - 1;
2497         }
2498
2499         level = btrfs_header_level(buf);
2500         nritems = btrfs_header_nritems(buf);
2501
2502         if (level == 0) {
2503                 struct btrfs_leaf_ref *ref;
2504                 struct btrfs_extent_info *info;
2505
2506                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
2507                 if (!ref) {
2508                         ret = -ENOMEM;
2509                         goto out;
2510                 }
2511
2512                 ref->root_gen = root_gen;
2513                 ref->bytenr = buf->start;
2514                 ref->owner = btrfs_header_owner(buf);
2515                 ref->generation = btrfs_header_generation(buf);
2516                 ref->nritems = nr_extents;
2517                 info = ref->extents;
2518
2519                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
2520                         u64 disk_bytenr;
2521                         btrfs_item_key_to_cpu(buf, &key, i);
2522                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2523                                 continue;
2524                         fi = btrfs_item_ptr(buf, i,
2525                                             struct btrfs_file_extent_item);
2526                         if (btrfs_file_extent_type(buf, fi) ==
2527                             BTRFS_FILE_EXTENT_INLINE)
2528                                 continue;
2529                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2530                         if (disk_bytenr == 0)
2531                                 continue;
2532
2533                         info->bytenr = disk_bytenr;
2534                         info->num_bytes =
2535                                 btrfs_file_extent_disk_num_bytes(buf, fi);
2536                         info->objectid = key.objectid;
2537                         info->offset = key.offset;
2538                         info++;
2539                 }
2540
2541                 ret = btrfs_add_leaf_ref(root, ref, shared);
2542                 if (ret == -EEXIST && shared) {
2543                         struct btrfs_leaf_ref *old;
2544                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2545                         BUG_ON(!old);
2546                         btrfs_remove_leaf_ref(root, old);
2547                         btrfs_free_leaf_ref(root, old);
2548                         ret = btrfs_add_leaf_ref(root, ref, shared);
2549                 }
2550                 WARN_ON(ret);
2551                 btrfs_free_leaf_ref(root, ref);
2552         }
2553 out:
2554         return ret;
2555 }
2556
2557 /* when a block goes through cow, we update the reference counts of
2558  * everything that block points to.  The internal pointers of the block
2559  * can be in just about any order, and it is likely to have clusters of
2560  * things that are close together and clusters of things that are not.
2561  *
2562  * To help reduce the seeks that come with updating all of these reference
2563  * counts, sort them by byte number before actual updates are done.
2564  *
2565  * struct refsort is used to match byte number to slot in the btree block.
2566  * we sort based on the byte number and then use the slot to actually
2567  * find the item.
2568  *
2569  * struct refsort is smaller than struct btrfs_item and smaller than
2570  * struct btrfs_key_ptr.  Since we're currently limited to the page size
2571  * for a btree block, there's no way for a kmalloc of refsorts for a
2572  * single node to be bigger than a page.
2573  */
2574 struct refsort {
2575         u64 bytenr;
2576         u32 slot;
2577 };
2578
2579 /*
2580  * for passing into sort()
2581  */
2582 static int refsort_cmp(const void *a_void, const void *b_void)
2583 {
2584         const struct refsort *a = a_void;
2585         const struct refsort *b = b_void;
2586
2587         if (a->bytenr < b->bytenr)
2588                 return -1;
2589         if (a->bytenr > b->bytenr)
2590                 return 1;
2591         return 0;
2592 }
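
/*
 * usage sketch (illustrative): callers filled an array of refsorts,
 * one per pointer in the block, then sorted it by bytenr before doing
 * the reference updates:
 *
 *	sort(sorted, nritems, sizeof(struct refsort), refsort_cmp, NULL);
 */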
2593 #endif
2594
2595 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2596                            struct btrfs_root *root,
2597                            struct extent_buffer *buf,
2598                            int full_backref, int inc)
2599 {
2600         u64 bytenr;
2601         u64 num_bytes;
2602         u64 parent;
2603         u64 ref_root;
2604         u32 nritems;
2605         struct btrfs_key key;
2606         struct btrfs_file_extent_item *fi;
2607         int i;
2608         int level;
2609         int ret = 0;
2610         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2611                             u64, u64, u64, u64, u64, u64);
2612
2613         ref_root = btrfs_header_owner(buf);
2614         nritems = btrfs_header_nritems(buf);
2615         level = btrfs_header_level(buf);
2616
2617         if (!root->ref_cows && level == 0)
2618                 return 0;
2619
2620         if (inc)
2621                 process_func = btrfs_inc_extent_ref;
2622         else
2623                 process_func = btrfs_free_extent;
2624
2625         if (full_backref)
2626                 parent = buf->start;
2627         else
2628                 parent = 0;
2629
2630         for (i = 0; i < nritems; i++) {
2631                 if (level == 0) {
2632                         btrfs_item_key_to_cpu(buf, &key, i);
2633                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2634                                 continue;
2635                         fi = btrfs_item_ptr(buf, i,
2636                                             struct btrfs_file_extent_item);
2637                         if (btrfs_file_extent_type(buf, fi) ==
2638                             BTRFS_FILE_EXTENT_INLINE)
2639                                 continue;
2640                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2641                         if (bytenr == 0)
2642                                 continue;
2643
2644                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2645                         key.offset -= btrfs_file_extent_offset(buf, fi);
2646                         ret = process_func(trans, root, bytenr, num_bytes,
2647                                            parent, ref_root, key.objectid,
2648                                            key.offset);
2649                         if (ret)
2650                                 goto fail;
2651                 } else {
2652                         bytenr = btrfs_node_blockptr(buf, i);
2653                         num_bytes = btrfs_level_size(root, level - 1);
2654                         ret = process_func(trans, root, bytenr, num_bytes,
2655                                            parent, ref_root, level - 1, 0);
2656                         if (ret)
2657                                 goto fail;
2658                 }
2659         }
2660         return 0;
2661 fail:
2662         BUG();
2663         return ret;
2664 }
2665
2666 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2667                   struct extent_buffer *buf, int full_backref)
2668 {
2669         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2670 }
2671
2672 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2673                   struct extent_buffer *buf, int full_backref)
2674 {
2675         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2676 }
2677
2678 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2679                                  struct btrfs_root *root,
2680                                  struct btrfs_path *path,
2681                                  struct btrfs_block_group_cache *cache)
2682 {
2683         int ret;
2684         struct btrfs_root *extent_root = root->fs_info->extent_root;
2685         unsigned long bi;
2686         struct extent_buffer *leaf;
2687
2688         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2689         if (ret < 0)
2690                 goto fail;
2691         BUG_ON(ret);
2692
2693         leaf = path->nodes[0];
2694         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2695         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2696         btrfs_mark_buffer_dirty(leaf);
2697         btrfs_release_path(extent_root, path);
2698 fail:
2699         if (ret)
2700                 return ret;
2701         return 0;
2703 }
2704
2705 static struct btrfs_block_group_cache *
2706 next_block_group(struct btrfs_root *root,
2707                  struct btrfs_block_group_cache *cache)
2708 {
2709         struct rb_node *node;
2710         spin_lock(&root->fs_info->block_group_cache_lock);
2711         node = rb_next(&cache->cache_node);
2712         btrfs_put_block_group(cache);
2713         if (node) {
2714                 cache = rb_entry(node, struct btrfs_block_group_cache,
2715                                  cache_node);
2716                 btrfs_get_block_group(cache);
2717         } else
2718                 cache = NULL;
2719         spin_unlock(&root->fs_info->block_group_cache_lock);
2720         return cache;
2721 }
2722
2723 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2724                             struct btrfs_trans_handle *trans,
2725                             struct btrfs_path *path)
2726 {
2727         struct btrfs_root *root = block_group->fs_info->tree_root;
2728         struct inode *inode = NULL;
2729         u64 alloc_hint = 0;
2730         int num_pages = 0;
2731         int retries = 0;
2732         int ret = 0;
2733
        /*
         * If this block group is smaller than 100 megs, don't bother
         * caching it on disk.
         */
2738         if (block_group->key.offset < (100 * 1024 * 1024)) {
2739                 spin_lock(&block_group->lock);
2740                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2741                 spin_unlock(&block_group->lock);
2742                 return 0;
2743         }
2744
2745 again:
2746         inode = lookup_free_space_inode(root, block_group, path);
2747         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2748                 ret = PTR_ERR(inode);
2749                 btrfs_release_path(root, path);
2750                 goto out;
2751         }
2752
2753         if (IS_ERR(inode)) {
2754                 BUG_ON(retries);
2755                 retries++;
2756
2757                 if (block_group->ro)
2758                         goto out_free;
2759
2760                 ret = create_free_space_inode(root, trans, block_group, path);
2761                 if (ret)
2762                         goto out_free;
2763                 goto again;
2764         }
2765
        /*
         * Set the generation to 0 so that if anything goes wrong from here
         * on out we know not to trust this cache the next time we load it.
         */
2771         BTRFS_I(inode)->generation = 0;
2772         ret = btrfs_update_inode(trans, root, inode);
2773         WARN_ON(ret);
2774
2775         if (i_size_read(inode) > 0) {
2776                 ret = btrfs_truncate_free_space_cache(root, trans, path,
2777                                                       inode);
2778                 if (ret)
2779                         goto out_put;
2780         }
2781
2782         spin_lock(&block_group->lock);
2783         if (block_group->cached != BTRFS_CACHE_FINISHED) {
2784                 spin_unlock(&block_group->lock);
2785                 goto out_put;
2786         }
2787         spin_unlock(&block_group->lock);
2788
2789         num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2790         if (!num_pages)
2791                 num_pages = 1;
2792
        /*
         * Just to make absolutely sure we have enough space, we preallocate
         * 16 pages worth of space for each gigabyte of block group.  In
         * practice we ought to use at most 8, but we need extra space so we
         * can add our header and have a terminator between the extents and
         * the bitmaps.
         */
2800         num_pages *= 16;
2801         num_pages *= PAGE_CACHE_SIZE;
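        /*
         * Worked example (illustrative): a one gigabyte block group gives
         * num_pages = 1, scaled up to 16 pages; with 4K pages that is 64K
         * preallocated for the free space cache file.
         */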
2802
2803         ret = btrfs_check_data_free_space(inode, num_pages);
2804         if (ret)
2805                 goto out_put;
2806
2807         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2808                                               num_pages, num_pages,
2809                                               &alloc_hint);
2810         btrfs_free_reserved_data_space(inode, num_pages);
2811 out_put:
2812         iput(inode);
2813 out_free:
2814         btrfs_release_path(root, path);
2815 out:
2816         spin_lock(&block_group->lock);
2817         if (ret)
2818                 block_group->disk_cache_state = BTRFS_DC_ERROR;
2819         else
2820                 block_group->disk_cache_state = BTRFS_DC_SETUP;
2821         spin_unlock(&block_group->lock);
2822
2823         return ret;
2824 }
2825
2826 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2827                                    struct btrfs_root *root)
2828 {
2829         struct btrfs_block_group_cache *cache;
2830         int err = 0;
2831         struct btrfs_path *path;
2832         u64 last = 0;
2833
2834         path = btrfs_alloc_path();
2835         if (!path)
2836                 return -ENOMEM;
2837
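        /*
         * Three passes follow: first set up a free space cache inode for
         * every block group still in DC_CLEAR, then write out each dirty
         * block group item (restarting if new DC_CLEAR groups appear), and
         * finally write the cache contents for groups left in
         * DC_NEED_WRITE.
         */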
2838 again:
2839         while (1) {
2840                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2841                 while (cache) {
2842                         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2843                                 break;
2844                         cache = next_block_group(root, cache);
2845                 }
2846                 if (!cache) {
2847                         if (last == 0)
2848                                 break;
2849                         last = 0;
2850                         continue;
2851                 }
2852                 err = cache_save_setup(cache, trans, path);
2853                 last = cache->key.objectid + cache->key.offset;
2854                 btrfs_put_block_group(cache);
2855         }
2856
2857         while (1) {
2858                 if (last == 0) {
2859                         err = btrfs_run_delayed_refs(trans, root,
2860                                                      (unsigned long)-1);
2861                         BUG_ON(err);
2862                 }
2863
2864                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2865                 while (cache) {
2866                         if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2867                                 btrfs_put_block_group(cache);
2868                                 goto again;
2869                         }
2870
2871                         if (cache->dirty)
2872                                 break;
2873                         cache = next_block_group(root, cache);
2874                 }
2875                 if (!cache) {
2876                         if (last == 0)
2877                                 break;
2878                         last = 0;
2879                         continue;
2880                 }
2881
2882                 if (cache->disk_cache_state == BTRFS_DC_SETUP)
2883                         cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2884                 cache->dirty = 0;
2885                 last = cache->key.objectid + cache->key.offset;
2886
2887                 err = write_one_cache_group(trans, root, path, cache);
2888                 BUG_ON(err);
2889                 btrfs_put_block_group(cache);
2890         }
2891
2892         while (1) {
                /*
                 * This shouldn't be needed since we're just marking our
                 * preallocated extent as written, but it can't hurt, so do
                 * it just in case.
                 */
2898                 if (last == 0) {
2899                         err = btrfs_run_delayed_refs(trans, root,
2900                                                      (unsigned long)-1);
2901                         BUG_ON(err);
2902                 }
2903
2904                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2905                 while (cache) {
2906                         /*
2907                          * Really this shouldn't happen, but it could if we
2908                          * couldn't write the entire preallocated extent and
2909                          * splitting the extent resulted in a new block.
2910                          */
2911                         if (cache->dirty) {
2912                                 btrfs_put_block_group(cache);
2913                                 goto again;
2914                         }
2915                         if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2916                                 break;
2917                         cache = next_block_group(root, cache);
2918                 }
2919                 if (!cache) {
2920                         if (last == 0)
2921                                 break;
2922                         last = 0;
2923                         continue;
2924                 }
2925
2926                 btrfs_write_out_cache(root, trans, cache, path);
2927
2928                 /*
2929                  * If we didn't have an error then the cache state is still
2930                  * NEED_WRITE, so we can set it to WRITTEN.
2931                  */
2932                 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2933                         cache->disk_cache_state = BTRFS_DC_WRITTEN;
2934                 last = cache->key.objectid + cache->key.offset;
2935                 btrfs_put_block_group(cache);
2936         }
2937
2938         btrfs_free_path(path);
2939         return 0;
2940 }
2941
2942 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2943 {
2944         struct btrfs_block_group_cache *block_group;
2945         int readonly = 0;
2946
2947         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2948         if (!block_group || block_group->ro)
2949                 readonly = 1;
2950         if (block_group)
2951                 btrfs_put_block_group(block_group);
2952         return readonly;
2953 }
2954
2955 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2956                              u64 total_bytes, u64 bytes_used,
2957                              struct btrfs_space_info **space_info)
2958 {
2959         struct btrfs_space_info *found;
2960         int i;
2961         int factor;
2962
2963         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2964                      BTRFS_BLOCK_GROUP_RAID10))
2965                 factor = 2;
2966         else
2967                 factor = 1;
2968
2969         found = __find_space_info(info, flags);
2970         if (found) {
2971                 spin_lock(&found->lock);
2972                 found->total_bytes += total_bytes;
2973                 found->bytes_used += bytes_used;
2974                 found->disk_used += bytes_used * factor;
2975                 found->full = 0;
2976                 spin_unlock(&found->lock);
2977                 *space_info = found;
2978                 return 0;
2979         }
2980         found = kzalloc(sizeof(*found), GFP_NOFS);
2981         if (!found)
2982                 return -ENOMEM;
2983
2984         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2985                 INIT_LIST_HEAD(&found->block_groups[i]);
2986         init_rwsem(&found->groups_sem);
2987         spin_lock_init(&found->lock);
2988         found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2989                                 BTRFS_BLOCK_GROUP_SYSTEM |
2990                                 BTRFS_BLOCK_GROUP_METADATA);
2991         found->total_bytes = total_bytes;
2992         found->bytes_used = bytes_used;
2993         found->disk_used = bytes_used * factor;
2994         found->bytes_pinned = 0;
2995         found->bytes_reserved = 0;
2996         found->bytes_readonly = 0;
2997         found->bytes_may_use = 0;
2998         found->full = 0;
2999         found->force_alloc = 0;
3000         *space_info = found;
3001         list_add_rcu(&found->list, &info->space_info);
3002         atomic_set(&found->caching_threads, 0);
3003         return 0;
3004 }
3005
3006 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3007 {
3008         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
3009                                    BTRFS_BLOCK_GROUP_RAID1 |
3010                                    BTRFS_BLOCK_GROUP_RAID10 |
3011                                    BTRFS_BLOCK_GROUP_DUP);
3012         if (extra_flags) {
3013                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3014                         fs_info->avail_data_alloc_bits |= extra_flags;
3015                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3016                         fs_info->avail_metadata_alloc_bits |= extra_flags;
3017                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3018                         fs_info->avail_system_alloc_bits |= extra_flags;
3019         }
3020 }
3021
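/*
 * Drop the profile bits that the current number of writable devices cannot
 * satisfy, then pick a single profile when several are still set: DUP
 * loses to RAID1/RAID10, RAID1 loses to RAID10, and RAID0 loses to any of
 * them.  For example, RAID10|RAID1|DUP on a two-device filesystem reduces
 * to RAID1.
 */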
3022 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3023 {
3024         u64 num_devices = root->fs_info->fs_devices->rw_devices;
3025
3026         if (num_devices == 1)
3027                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3028         if (num_devices < 4)
3029                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3030
3031         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3032             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3033                       BTRFS_BLOCK_GROUP_RAID10))) {
3034                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
3035         }
3036
3037         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3038             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3039                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3040         }
3041
        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
            (flags & (BTRFS_BLOCK_GROUP_RAID1 |
                      BTRFS_BLOCK_GROUP_RAID10 |
                      BTRFS_BLOCK_GROUP_DUP)))
                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3047         return flags;
3048 }
3049
3050 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3051 {
3052         if (flags & BTRFS_BLOCK_GROUP_DATA)
3053                 flags |= root->fs_info->avail_data_alloc_bits &
3054                          root->fs_info->data_alloc_profile;
3055         else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3056                 flags |= root->fs_info->avail_system_alloc_bits &
3057                          root->fs_info->system_alloc_profile;
3058         else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3059                 flags |= root->fs_info->avail_metadata_alloc_bits &
3060                          root->fs_info->metadata_alloc_profile;
3061         return btrfs_reduce_alloc_profile(root, flags);
3062 }
3063
3064 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3065 {
3066         u64 flags;
3067
3068         if (data)
3069                 flags = BTRFS_BLOCK_GROUP_DATA;
3070         else if (root == root->fs_info->chunk_root)
3071                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3072         else
3073                 flags = BTRFS_BLOCK_GROUP_METADATA;
3074
3075         return get_alloc_profile(root, flags);
3076 }
3077
3078 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3079 {
3080         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3081                                                        BTRFS_BLOCK_GROUP_DATA);
3082 }
3083
/*
 * Check the space info that this inode allocates from to make sure we
 * have enough space for the requested number of bytes.
 */
3088 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3089 {
3090         struct btrfs_space_info *data_sinfo;
3091         struct btrfs_root *root = BTRFS_I(inode)->root;
3092         u64 used;
3093         int ret = 0, committed = 0, alloc_chunk = 1;
3094
3095         /* make sure bytes are sectorsize aligned */
3096         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
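        /* e.g. with a 4K sectorsize, a 5000 byte request rounds up to 8192 */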
3097
3098         if (root == root->fs_info->tree_root) {
3099                 alloc_chunk = 0;
3100                 committed = 1;
3101         }
3102
3103         data_sinfo = BTRFS_I(inode)->space_info;
3104         if (!data_sinfo)
3105                 goto alloc;
3106
3107 again:
3108         /* make sure we have enough space to handle the data first */
3109         spin_lock(&data_sinfo->lock);
3110         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3111                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3112                 data_sinfo->bytes_may_use;
3113
3114         if (used + bytes > data_sinfo->total_bytes) {
3115                 struct btrfs_trans_handle *trans;
3116
3117                 /*
3118                  * if we don't have enough free bytes in this space then we need
3119                  * to alloc a new chunk.
3120                  */
3121                 if (!data_sinfo->full && alloc_chunk) {
3122                         u64 alloc_target;
3123
3124                         data_sinfo->force_alloc = 1;
3125                         spin_unlock(&data_sinfo->lock);
3126 alloc:
3127                         alloc_target = btrfs_get_alloc_profile(root, 1);
3128                         trans = btrfs_join_transaction(root, 1);
3129                         if (IS_ERR(trans))
3130                                 return PTR_ERR(trans);
3131
3132                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3133                                              bytes + 2 * 1024 * 1024,
3134                                              alloc_target, 0);
3135                         btrfs_end_transaction(trans, root);
3136                         if (ret < 0)
3137                                 return ret;
3138
3139                         if (!data_sinfo) {
3140                                 btrfs_set_inode_space_info(root, inode);
3141                                 data_sinfo = BTRFS_I(inode)->space_info;
3142                         }
3143                         goto again;
3144                 }
3145                 spin_unlock(&data_sinfo->lock);
3146
3147                 /* commit the current transaction and try again */
3148                 if (!committed && !root->fs_info->open_ioctl_trans) {
3149                         committed = 1;
3150                         trans = btrfs_join_transaction(root, 1);
3151                         if (IS_ERR(trans))
3152                                 return PTR_ERR(trans);
3153                         ret = btrfs_commit_transaction(trans, root);
3154                         if (ret)
3155                                 return ret;
3156                         goto again;
3157                 }
3158
3159 #if 0 /* I hope we never need this code again, just in case */
3160                 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3161                        "%llu bytes_reserved, " "%llu bytes_pinned, "
3162                        "%llu bytes_readonly, %llu may use %llu total\n",
3163                        (unsigned long long)bytes,
3164                        (unsigned long long)data_sinfo->bytes_used,
3165                        (unsigned long long)data_sinfo->bytes_reserved,
3166                        (unsigned long long)data_sinfo->bytes_pinned,
3167                        (unsigned long long)data_sinfo->bytes_readonly,
3168                        (unsigned long long)data_sinfo->bytes_may_use,
3169                        (unsigned long long)data_sinfo->total_bytes);
3170 #endif
3171                 return -ENOSPC;
3172         }
3173         data_sinfo->bytes_may_use += bytes;
3174         BTRFS_I(inode)->reserved_bytes += bytes;
3175         spin_unlock(&data_sinfo->lock);
3176
3177         return 0;
3178 }
3179
/*
 * Called when we are clearing a delalloc extent from the inode's io_tree,
 * or when there was an error for whatever reason after calling
 * btrfs_check_data_free_space.
 */
3185 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3186 {
3187         struct btrfs_root *root = BTRFS_I(inode)->root;
3188         struct btrfs_space_info *data_sinfo;
3189
3190         /* make sure bytes are sectorsize aligned */
3191         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3192
3193         data_sinfo = BTRFS_I(inode)->space_info;
3194         spin_lock(&data_sinfo->lock);
3195         data_sinfo->bytes_may_use -= bytes;
3196         BTRFS_I(inode)->reserved_bytes -= bytes;
3197         spin_unlock(&data_sinfo->lock);
3198 }
3199
3200 static void force_metadata_allocation(struct btrfs_fs_info *info)
3201 {
3202         struct list_head *head = &info->space_info;
3203         struct btrfs_space_info *found;
3204
3205         rcu_read_lock();
3206         list_for_each_entry_rcu(found, head, list) {
3207                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3208                         found->force_alloc = 1;
3209         }
3210         rcu_read_unlock();
3211 }
3212
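/*
 * Heuristic: a new chunk is only worth allocating once this space is both
 * within 256MB of full and more than 80% (div_factor 8) consumed by used,
 * reserved and the pending allocation.
 */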
3213 static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3214                               u64 alloc_bytes)
3215 {
3216         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3217
3218         if (sinfo->bytes_used + sinfo->bytes_reserved +
3219             alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3220                 return 0;
3221
3222         if (sinfo->bytes_used + sinfo->bytes_reserved +
3223             alloc_bytes < div_factor(num_bytes, 8))
3224                 return 0;
3225
3226         return 1;
3227 }
3228
3229 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3230                           struct btrfs_root *extent_root, u64 alloc_bytes,
3231                           u64 flags, int force)
3232 {
3233         struct btrfs_space_info *space_info;
3234         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3235         int ret = 0;
3236
3237         mutex_lock(&fs_info->chunk_mutex);
3238
3239         flags = btrfs_reduce_alloc_profile(extent_root, flags);
3240
3241         space_info = __find_space_info(extent_root->fs_info, flags);
3242         if (!space_info) {
3243                 ret = update_space_info(extent_root->fs_info, flags,
3244                                         0, 0, &space_info);
3245                 BUG_ON(ret);
3246         }
3247         BUG_ON(!space_info);
3248
3249         spin_lock(&space_info->lock);
3250         if (space_info->force_alloc)
3251                 force = 1;
3252         if (space_info->full) {
3253                 spin_unlock(&space_info->lock);
3254                 goto out;
3255         }
3256
3257         if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3258                 spin_unlock(&space_info->lock);
3259                 goto out;
3260         }
3261         spin_unlock(&space_info->lock);
3262
3263         /*
3264          * if we're doing a data chunk, go ahead and make sure that
3265          * we keep a reasonable number of metadata chunks allocated in the
3266          * FS as well.
3267          */
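        /*
         * For example, metadata_ratio == 8 forces a metadata chunk
         * allocation on every 8th data chunk allocation.
         */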
3268         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3269                 fs_info->data_chunk_allocations++;
3270                 if (!(fs_info->data_chunk_allocations %
3271                       fs_info->metadata_ratio))
3272                         force_metadata_allocation(fs_info);
3273         }
3274
3275         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3276         spin_lock(&space_info->lock);
3277         if (ret)
3278                 space_info->full = 1;
3279         else
3280                 ret = 1;
3281         space_info->force_alloc = 0;
3282         spin_unlock(&space_info->lock);
3283 out:
3284         mutex_unlock(&extent_root->fs_info->chunk_mutex);
3285         return ret;
3286 }
3287
3288 static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3289                                 struct btrfs_root *root,
3290                                 struct btrfs_space_info *sinfo, u64 num_bytes)
3291 {
3292         int ret;
3293         int end_trans = 0;
3294
3295         if (sinfo->full)
3296                 return 0;
3297
3298         spin_lock(&sinfo->lock);
3299         ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3300         spin_unlock(&sinfo->lock);
3301         if (!ret)
3302                 return 0;
3303
3304         if (!trans) {
3305                 trans = btrfs_join_transaction(root, 1);
3306                 BUG_ON(IS_ERR(trans));
3307                 end_trans = 1;
3308         }
3309
3310         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3311                              num_bytes + 2 * 1024 * 1024,
3312                              get_alloc_profile(root, sinfo->flags), 0);
3313
3314         if (end_trans)
3315                 btrfs_end_transaction(trans, root);
3316
3317         return ret == 1 ? 1 : 0;
3318 }
3319
3320 /*
3321  * shrink metadata reservation for delalloc
3322  */
3323 static int shrink_delalloc(struct btrfs_trans_handle *trans,
3324                            struct btrfs_root *root, u64 to_reclaim)
3325 {
3326         struct btrfs_block_rsv *block_rsv;
3327         u64 reserved;
3328         u64 max_reclaim;
3329         u64 reclaimed = 0;
3330         int pause = 1;
3331         int ret;
3332
3333         block_rsv = &root->fs_info->delalloc_block_rsv;
3334         spin_lock(&block_rsv->lock);
3335         reserved = block_rsv->reserved;
3336         spin_unlock(&block_rsv->lock);
3337
3338         if (reserved == 0)
3339                 return 0;
3340
3341         max_reclaim = min(reserved, to_reclaim);
3342
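        /*
         * Kick delalloc flushing and watch the reservation shrink.  When
         * nothing was flushed, back off exponentially (1 jiffy, then 2,
         * 4, ...), capped at HZ/10 so we don't spin.
         */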
3343         while (1) {
3344                 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3345                 if (!ret) {
3346                         __set_current_state(TASK_INTERRUPTIBLE);
3347                         schedule_timeout(pause);
3348                         pause <<= 1;
3349                         if (pause > HZ / 10)
3350                                 pause = HZ / 10;
3351                 } else {
3352                         pause = 1;
3353                 }
3354
3355                 spin_lock(&block_rsv->lock);
3356                 if (reserved > block_rsv->reserved)
3357                         reclaimed = reserved - block_rsv->reserved;
3358                 reserved = block_rsv->reserved;
3359                 spin_unlock(&block_rsv->lock);
3360
3361                 if (reserved == 0 || reclaimed >= max_reclaim)
3362                         break;
3363
3364                 if (trans && trans->transaction->blocked)
3365                         return -EAGAIN;
3366         }
3367         return reclaimed >= to_reclaim;
3368 }
3369
3370 static int should_retry_reserve(struct btrfs_trans_handle *trans,
3371                                 struct btrfs_root *root,
3372                                 struct btrfs_block_rsv *block_rsv,
3373                                 u64 num_bytes, int *retries)
3374 {
3375         struct btrfs_space_info *space_info = block_rsv->space_info;
3376         int ret;
3377
3378         if ((*retries) > 2)
3379                 return -ENOSPC;
3380
3381         ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3382         if (ret)
3383                 return 1;
3384
3385         if (trans && trans->transaction->in_commit)
3386                 return -ENOSPC;
3387
3388         ret = shrink_delalloc(trans, root, num_bytes);
3389         if (ret)
3390                 return ret;
3391
3392         spin_lock(&space_info->lock);
3393         if (space_info->bytes_pinned < num_bytes)
3394                 ret = 1;
3395         spin_unlock(&space_info->lock);
3396         if (ret)
3397                 return -ENOSPC;
3398
3399         (*retries)++;
3400
3401         if (trans)
3402                 return -EAGAIN;
3403
3404         trans = btrfs_join_transaction(root, 1);
3405         BUG_ON(IS_ERR(trans));
3406         ret = btrfs_commit_transaction(trans, root);
3407         BUG_ON(ret);
3408
3409         return 1;
3410 }
3411
3412 static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3413                                   u64 num_bytes)
3414 {
3415         struct btrfs_space_info *space_info = block_rsv->space_info;
3416         u64 unused;
3417         int ret = -ENOSPC;
3418
3419         spin_lock(&space_info->lock);
3420         unused = space_info->bytes_used + space_info->bytes_reserved +
3421                  space_info->bytes_pinned + space_info->bytes_readonly;
3422
3423         if (unused < space_info->total_bytes)
3424                 unused = space_info->total_bytes - unused;
3425         else
3426                 unused = 0;
3427
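        /*
         * A priority 10 reservation may take any available space.  Lower
         * priorities only get a proportional share: with nothing reserved
         * yet, a priority 6 rsv can claim at most 60% of the unused space
         * (num_bytes * 10 <= unused * priority).
         */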
3428         if (unused >= num_bytes) {
3429                 if (block_rsv->priority >= 10) {
3430                         space_info->bytes_reserved += num_bytes;
3431                         ret = 0;
3432                 } else {
3433                         if ((unused + block_rsv->reserved) *
3434                             block_rsv->priority >=
3435                             (num_bytes + block_rsv->reserved) * 10) {
3436                                 space_info->bytes_reserved += num_bytes;
3437                                 ret = 0;
3438                         }
3439                 }
3440         }
3441         spin_unlock(&space_info->lock);
3442
3443         return ret;
3444 }
3445
3446 static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3447                                              struct btrfs_root *root)
3448 {
3449         struct btrfs_block_rsv *block_rsv;
3450         if (root->ref_cows)
3451                 block_rsv = trans->block_rsv;
3452         else
3453                 block_rsv = root->block_rsv;
3454
3455         if (!block_rsv)
3456                 block_rsv = &root->fs_info->empty_block_rsv;
3457
3458         return block_rsv;
3459 }
3460
3461 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3462                                u64 num_bytes)
3463 {
3464         int ret = -ENOSPC;
3465         spin_lock(&block_rsv->lock);
3466         if (block_rsv->reserved >= num_bytes) {
3467                 block_rsv->reserved -= num_bytes;
3468                 if (block_rsv->reserved < block_rsv->size)
3469                         block_rsv->full = 0;
3470                 ret = 0;
3471         }
3472         spin_unlock(&block_rsv->lock);
3473         return ret;
3474 }
3475
3476 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3477                                 u64 num_bytes, int update_size)
3478 {
3479         spin_lock(&block_rsv->lock);
3480         block_rsv->reserved += num_bytes;
3481         if (update_size)
3482                 block_rsv->size += num_bytes;
3483         else if (block_rsv->reserved >= block_rsv->size)
3484                 block_rsv->full = 1;
3485         spin_unlock(&block_rsv->lock);
3486 }
3487
3488 void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3489                              struct btrfs_block_rsv *dest, u64 num_bytes)
3490 {
3491         struct btrfs_space_info *space_info = block_rsv->space_info;
3492
3493         spin_lock(&block_rsv->lock);
3494         if (num_bytes == (u64)-1)
3495                 num_bytes = block_rsv->size;
3496         block_rsv->size -= num_bytes;
3497         if (block_rsv->reserved >= block_rsv->size) {
3498                 num_bytes = block_rsv->reserved - block_rsv->size;
3499                 block_rsv->reserved = block_rsv->size;
3500                 block_rsv->full = 1;
3501         } else {
3502                 num_bytes = 0;
3503         }
3504         spin_unlock(&block_rsv->lock);
3505
3506         if (num_bytes > 0) {
3507                 if (dest) {
3508                         block_rsv_add_bytes(dest, num_bytes, 0);
3509                 } else {
3510                         spin_lock(&space_info->lock);
3511                         space_info->bytes_reserved -= num_bytes;
3512                         spin_unlock(&space_info->lock);
3513                 }
3514         }
3515 }
3516
3517 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3518                                    struct btrfs_block_rsv *dst, u64 num_bytes)
3519 {
3520         int ret;
3521
3522         ret = block_rsv_use_bytes(src, num_bytes);
3523         if (ret)
3524                 return ret;
3525
3526         block_rsv_add_bytes(dst, num_bytes, 1);
3527         return 0;
3528 }
3529
3530 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3531 {
3532         memset(rsv, 0, sizeof(*rsv));
3533         spin_lock_init(&rsv->lock);
3534         atomic_set(&rsv->usage, 1);
3535         rsv->priority = 6;
3536         INIT_LIST_HEAD(&rsv->list);
3537 }
3538
3539 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3540 {
3541         struct btrfs_block_rsv *block_rsv;
3542         struct btrfs_fs_info *fs_info = root->fs_info;
3543         u64 alloc_target;
3544
3545         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3546         if (!block_rsv)
3547                 return NULL;
3548
3549         btrfs_init_block_rsv(block_rsv);
3550
3551         alloc_target = btrfs_get_alloc_profile(root, 0);
3552         block_rsv->space_info = __find_space_info(fs_info,
3553                                                   BTRFS_BLOCK_GROUP_METADATA);
3554
3555         return block_rsv;
3556 }
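
#if 0
/*
 * Illustrative usage sketch (not built): the typical lifetime of a
 * caller-owned block_rsv using the helpers in this file.  The function
 * name is hypothetical.
 */
static void block_rsv_usage_sketch(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
{
        struct btrfs_block_rsv *rsv = btrfs_alloc_block_rsv(root);
        int retries = 0;

        if (!rsv)
                return;
        if (!btrfs_block_rsv_add(trans, root, rsv, 1024 * 1024, &retries))
                btrfs_block_rsv_release(root, rsv, 1024 * 1024);
        btrfs_free_block_rsv(root, rsv);
}
#endif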
3557
3558 void btrfs_free_block_rsv(struct btrfs_root *root,
3559                           struct btrfs_block_rsv *rsv)
3560 {
3561         if (rsv && atomic_dec_and_test(&rsv->usage)) {
3562                 btrfs_block_rsv_release(root, rsv, (u64)-1);
3563                 if (!rsv->durable)
3564                         kfree(rsv);
3565         }
3566 }
3567
/*
 * Make the block_rsv struct able to capture freed space.
 * The captured space will be re-added to the block_rsv struct
 * after the transaction commits.
 */
3573 void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3574                                  struct btrfs_block_rsv *block_rsv)
3575 {
3576         block_rsv->durable = 1;
3577         mutex_lock(&fs_info->durable_block_rsv_mutex);
3578         list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3579         mutex_unlock(&fs_info->durable_block_rsv_mutex);
3580 }
3581
3582 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3583                         struct btrfs_root *root,
3584                         struct btrfs_block_rsv *block_rsv,
3585                         u64 num_bytes, int *retries)
3586 {
3587         int ret;
3588
3589         if (num_bytes == 0)
3590                 return 0;
3591 again:
3592         ret = reserve_metadata_bytes(block_rsv, num_bytes);
3593         if (!ret) {
3594                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3595                 return 0;
3596         }
3597
3598         ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3599         if (ret > 0)
3600                 goto again;
3601
3602         return ret;
3603 }
3604
3605 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3606                           struct btrfs_root *root,
3607                           struct btrfs_block_rsv *block_rsv,
3608                           u64 min_reserved, int min_factor)
3609 {
3610         u64 num_bytes = 0;
3611         int commit_trans = 0;
3612         int ret = -ENOSPC;
3613
3614         if (!block_rsv)
3615                 return 0;
3616
3617         spin_lock(&block_rsv->lock);
3618         if (min_factor > 0)
3619                 num_bytes = div_factor(block_rsv->size, min_factor);
3620         if (min_reserved > num_bytes)
3621                 num_bytes = min_reserved;
3622
3623         if (block_rsv->reserved >= num_bytes) {
3624                 ret = 0;
3625         } else {
3626                 num_bytes -= block_rsv->reserved;
3627                 if (block_rsv->durable &&
3628                     block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3629                         commit_trans = 1;
3630         }
3631         spin_unlock(&block_rsv->lock);
3632         if (!ret)
3633                 return 0;
3634
3635         if (block_rsv->refill_used) {
3636                 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3637                 if (!ret) {
3638                         block_rsv_add_bytes(block_rsv, num_bytes, 0);
3639                         return 0;
3640                 }
3641         }
3642
3643         if (commit_trans) {
3644                 if (trans)
3645                         return -EAGAIN;
3646
3647                 trans = btrfs_join_transaction(root, 1);
3648                 BUG_ON(IS_ERR(trans));
                ret = btrfs_commit_transaction(trans, root);
                return ret;
3651         }
3652
3653         WARN_ON(1);
        printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
3655                 block_rsv->size, block_rsv->reserved,
3656                 block_rsv->freed[0], block_rsv->freed[1]);
3657
3658         return -ENOSPC;
3659 }
3660
3661 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3662                             struct btrfs_block_rsv *dst_rsv,
3663                             u64 num_bytes)
3664 {
3665         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3666 }
3667
3668 void btrfs_block_rsv_release(struct btrfs_root *root,
3669                              struct btrfs_block_rsv *block_rsv,
3670                              u64 num_bytes)
3671 {
3672         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3673         if (global_rsv->full || global_rsv == block_rsv ||
3674             block_rsv->space_info != global_rsv->space_info)
3675                 global_rsv = NULL;
3676         block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3677 }
3678
/*
 * Helper to calculate the size of the global block reservation.
 * The desired value is the sum of the space used by the extent tree,
 * the checksum tree and the root tree.
 */
3684 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3685 {
3686         struct btrfs_space_info *sinfo;
3687         u64 num_bytes;
3688         u64 meta_used;
3689         u64 data_used;
3690         int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3691 #if 0
3692         /*
         * per-tree used space accounting can be inaccurate, so we
         * can't rely on it.
3695          */
3696         spin_lock(&fs_info->extent_root->accounting_lock);
3697         num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3698         spin_unlock(&fs_info->extent_root->accounting_lock);
3699
3700         spin_lock(&fs_info->csum_root->accounting_lock);
3701         num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3702         spin_unlock(&fs_info->csum_root->accounting_lock);
3703
3704         spin_lock(&fs_info->tree_root->accounting_lock);
3705         num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3706         spin_unlock(&fs_info->tree_root->accounting_lock);
3707 #endif
3708         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3709         spin_lock(&sinfo->lock);
3710         data_used = sinfo->bytes_used;
3711         spin_unlock(&sinfo->lock);
3712
3713         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3714         spin_lock(&sinfo->lock);
3715         meta_used = sinfo->bytes_used;
3716         spin_unlock(&sinfo->lock);
3717
3718         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3719                     csum_size * 2;
3720         num_bytes += div64_u64(data_used + meta_used, 50);
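        /*
         * Illustrative numbers: 100GB of data with 4K blocks and 4-byte
         * crc32c csums yields roughly 200MB for the two csum copies, plus
         * 2% of data + metadata, capped at a third of the metadata in use.
         */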
3721
3722         if (num_bytes * 3 > meta_used)
3723                 num_bytes = div64_u64(meta_used, 3);
3724
3725         return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3726 }
3727
3728 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3729 {
3730         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3731         struct btrfs_space_info *sinfo = block_rsv->space_info;
3732         u64 num_bytes;
3733
3734         num_bytes = calc_global_metadata_size(fs_info);
3735
3736         spin_lock(&block_rsv->lock);
3737         spin_lock(&sinfo->lock);
3738
3739         block_rsv->size = num_bytes;
3740
3741         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3742                     sinfo->bytes_reserved + sinfo->bytes_readonly;
3743
3744         if (sinfo->total_bytes > num_bytes) {
3745                 num_bytes = sinfo->total_bytes - num_bytes;
3746                 block_rsv->reserved += num_bytes;
3747                 sinfo->bytes_reserved += num_bytes;
3748         }
3749
3750         if (block_rsv->reserved >= block_rsv->size) {
3751                 num_bytes = block_rsv->reserved - block_rsv->size;
3752                 sinfo->bytes_reserved -= num_bytes;
3753                 block_rsv->reserved = block_rsv->size;
3754                 block_rsv->full = 1;
3755         }
3756 #if 0
        printk(KERN_INFO "global block rsv size %llu reserved %llu\n",
3758                 block_rsv->size, block_rsv->reserved);
3759 #endif
3760         spin_unlock(&sinfo->lock);
3761         spin_unlock(&block_rsv->lock);
3762 }
3763
3764 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3765 {
3766         struct btrfs_space_info *space_info;
3767
3768         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3769         fs_info->chunk_block_rsv.space_info = space_info;
3770         fs_info->chunk_block_rsv.priority = 10;
3771
3772         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3773         fs_info->global_block_rsv.space_info = space_info;
3774         fs_info->global_block_rsv.priority = 10;
3775         fs_info->global_block_rsv.refill_used = 1;
3776         fs_info->delalloc_block_rsv.space_info = space_info;
3777         fs_info->trans_block_rsv.space_info = space_info;
3778         fs_info->empty_block_rsv.space_info = space_info;
3779         fs_info->empty_block_rsv.priority = 10;
3780
3781         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3782         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3783         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3784         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3785         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3786
3787         btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3788
3789         btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3790
3791         update_global_block_rsv(fs_info);
3792 }
3793
3794 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3795 {
3796         block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3797         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3798         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3799         WARN_ON(fs_info->trans_block_rsv.size > 0);
3800         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3801         WARN_ON(fs_info->chunk_block_rsv.size > 0);
3802         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3803 }
3804
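/*
 * Rough worst case for modifying one item: CoW one leaf plus one node at
 * each remaining tree level, padded by a factor of three.  Illustrative
 * numbers: 4K leaves/nodes and BTRFS_MAX_LEVEL == 8 give 96K per item.
 */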
3805 static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3806 {
3807         return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3808                 3 * num_items;
3809 }
3810
3811 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3812                                  struct btrfs_root *root,
3813                                  int num_items, int *retries)
3814 {
3815         u64 num_bytes;
3816         int ret;
3817
3818         if (num_items == 0 || root->fs_info->chunk_root == root)
3819                 return 0;
3820
3821         num_bytes = calc_trans_metadata_size(root, num_items);
3822         ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3823                                   num_bytes, retries);
3824         if (!ret) {
3825                 trans->bytes_reserved += num_bytes;
3826                 trans->block_rsv = &root->fs_info->trans_block_rsv;
3827         }
3828         return ret;
3829 }
3830
3831 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3832                                   struct btrfs_root *root)
3833 {
3834         if (!trans->bytes_reserved)
3835                 return;
3836
3837         BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3838         btrfs_block_rsv_release(root, trans->block_rsv,
3839                                 trans->bytes_reserved);
3840         trans->bytes_reserved = 0;
3841 }
3842
3843 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3844                                   struct inode *inode)
3845 {
3846         struct btrfs_root *root = BTRFS_I(inode)->root;
3847         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3848         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3849
        /*
         * One for deleting the orphan item, one for updating the inode and
         * two for calling btrfs_truncate_inode_items.
         *
         * btrfs_truncate_inode_items is a delete operation; it frees
         * more space than it uses in most cases.  So two units of
         * metadata space should be enough for calling it many times.
         * If all of the metadata space is used, we can commit the
         * transaction and use the space it freed.
         */
3860         u64 num_bytes = calc_trans_metadata_size(root, 4);
3861         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3862 }
3863
3864 void btrfs_orphan_release_metadata(struct inode *inode)
3865 {
3866         struct btrfs_root *root = BTRFS_I(inode)->root;
3867         u64 num_bytes = calc_trans_metadata_size(root, 4);
3868         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3869 }
3870
3871 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3872                                 struct btrfs_pending_snapshot *pending)
3873 {
3874         struct btrfs_root *root = pending->root;
3875         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3876         struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3877         /*
3878          * two for root back/forward refs, two for directory entries
3879          * and one for root of the snapshot.
3880          */
3881         u64 num_bytes = calc_trans_metadata_size(root, 5);
3882         dst_rsv->space_info = src_rsv->space_info;
3883         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3884 }
3885
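/*
 * Crude heuristic: reserve one byte of csum metadata for every eight
 * bytes of data, e.g. a 1MB delalloc range reserves 128K.
 */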
3886 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3887 {
        return num_bytes >> 3;
3889 }
3890
3891 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3892 {
3893         struct btrfs_root *root = BTRFS_I(inode)->root;
3894         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3895         u64 to_reserve;
3896         int nr_extents;
3897         int retries = 0;
3898         int ret;
3899
        if (btrfs_transaction_in_commit(root->fs_info)) {
                /* set the task state so schedule_timeout() actually sleeps */
                __set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_timeout(1);
        }
3902
3903         num_bytes = ALIGN(num_bytes, root->sectorsize);
3904 again:
3905         spin_lock(&BTRFS_I(inode)->accounting_lock);
3906         nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3907         if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3908                 nr_extents -= BTRFS_I(inode)->reserved_extents;
3909                 to_reserve = calc_trans_metadata_size(root, nr_extents);
3910         } else {
3911                 nr_extents = 0;
3912                 to_reserve = 0;
3913         }
3914
3915         to_reserve += calc_csum_metadata_size(inode, num_bytes);
3916         ret = reserve_metadata_bytes(block_rsv, to_reserve);
3917         if (ret) {
3918                 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3919                 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3920                                            &retries);
3921                 if (ret > 0)
3922                         goto again;
3923                 return ret;
3924         }
3925
3926         BTRFS_I(inode)->reserved_extents += nr_extents;
3927         atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3928         spin_unlock(&BTRFS_I(inode)->accounting_lock);
3929
3930         block_rsv_add_bytes(block_rsv, to_reserve, 1);
3931
3932         if (block_rsv->size > 512 * 1024 * 1024)
3933                 shrink_delalloc(NULL, root, to_reserve);
3934
3935         return 0;
3936 }
3937
3938 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3939 {
3940         struct btrfs_root *root = BTRFS_I(inode)->root;
3941         u64 to_free;
3942         int nr_extents;
3943
3944         num_bytes = ALIGN(num_bytes, root->sectorsize);
3945         atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3946
3947         spin_lock(&BTRFS_I(inode)->accounting_lock);
3948         nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3949         if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3950                 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3951                 BTRFS_I(inode)->reserved_extents -= nr_extents;
3952         } else {
3953                 nr_extents = 0;
3954         }
3955         spin_unlock(&BTRFS_I(inode)->accounting_lock);
3956
3957         to_free = calc_csum_metadata_size(inode, num_bytes);
3958         if (nr_extents > 0)
3959                 to_free += calc_trans_metadata_size(root, nr_extents);
3960
3961         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3962                                 to_free);
3963 }
3964
3965 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3966 {
3967         int ret;
3968
3969         ret = btrfs_check_data_free_space(inode, num_bytes);
3970         if (ret)
3971                 return ret;
3972
3973         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3974         if (ret) {
3975                 btrfs_free_reserved_data_space(inode, num_bytes);
3976                 return ret;
3977         }
3978
3979         return 0;
3980 }
3981
3982 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3983 {
3984         btrfs_delalloc_release_metadata(inode, num_bytes);
3985         btrfs_free_reserved_data_space(inode, num_bytes);
3986 }
3987
3988 static int update_block_group(struct btrfs_trans_handle *trans,
3989                               struct btrfs_root *root,
3990                               u64 bytenr, u64 num_bytes, int alloc)
3991 {
3992         struct btrfs_block_group_cache *cache = NULL;
3993         struct btrfs_fs_info *info = root->fs_info;
3994         u64 total = num_bytes;
3995         u64 old_val;
3996         u64 byte_in_group;
3997         int factor;
3998
3999         /* block accounting for super block */
4000         spin_lock(&info->delalloc_lock);
4001         old_val = btrfs_super_bytes_used(&info->super_copy);
4002         if (alloc)
4003                 old_val += num_bytes;
4004         else
4005                 old_val -= num_bytes;
4006         btrfs_set_super_bytes_used(&info->super_copy, old_val);
4007         spin_unlock(&info->delalloc_lock);
4008
4009         while (total) {
4010                 cache = btrfs_lookup_block_group(info, bytenr);
4011                 if (!cache)
4012                         return -1;
4013                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4014                                     BTRFS_BLOCK_GROUP_RAID1 |
4015                                     BTRFS_BLOCK_GROUP_RAID10))
4016                         factor = 2;
4017                 else
4018                         factor = 1;
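                /*
                 * DUP/RAID1/RAID10 keep two copies of everything, so each
                 * logical byte consumes two bytes of raw disk space; hence
                 * the factor of 2 applied to disk_used below.
                 */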
4019                 /*
4020                  * If this block group has free space cache written out, we
4021                  * need to make sure to load it if we are removing space.  This
4022                  * is because we need the unpinning stage to actually add the
4023                  * space back to the block group, otherwise we will leak space.
4024                  */
4025                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4026                         cache_block_group(cache, trans, 1);
4027
4028                 byte_in_group = bytenr - cache->key.objectid;
4029                 WARN_ON(byte_in_group > cache->key.offset);
4030
4031                 spin_lock(&cache->space_info->lock);
4032                 spin_lock(&cache->lock);
4033
4034                 if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
4035                     cache->disk_cache_state < BTRFS_DC_CLEAR)
4036                         cache->disk_cache_state = BTRFS_DC_CLEAR;
4037
4038                 cache->dirty = 1;
4039                 old_val = btrfs_block_group_used(&cache->item);
4040                 num_bytes = min(total, cache->key.offset - byte_in_group);
4041                 if (alloc) {
4042                         old_val += num_bytes;
4043                         btrfs_set_block_group_used(&cache->item, old_val);
4044                         cache->reserved -= num_bytes;
4045                         cache->space_info->bytes_reserved -= num_bytes;
4046                         cache->space_info->bytes_used += num_bytes;
4047                         cache->space_info->disk_used += num_bytes * factor;
4048                         spin_unlock(&cache->lock);
4049                         spin_unlock(&cache->space_info->lock);
4050                 } else {
4051                         old_val -= num_bytes;
4052                         btrfs_set_block_group_used(&cache->item, old_val);
4053                         cache->pinned += num_bytes;
4054                         cache->space_info->bytes_pinned += num_bytes;
4055                         cache->space_info->bytes_used -= num_bytes;
4056                         cache->space_info->disk_used -= num_bytes * factor;
4057                         spin_unlock(&cache->lock);
4058                         spin_unlock(&cache->space_info->lock);
4059
4060                         set_extent_dirty(info->pinned_extents,
4061                                          bytenr, bytenr + num_bytes - 1,
4062                                          GFP_NOFS | __GFP_NOFAIL);
4063                 }
4064                 btrfs_put_block_group(cache);
4065                 total -= num_bytes;
4066                 bytenr += num_bytes;
4067         }
4068         return 0;
4069 }
4070
4071 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4072 {
4073         struct btrfs_block_group_cache *cache;
4074         u64 bytenr;
4075
4076         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4077         if (!cache)
4078                 return 0;
4079
4080         bytenr = cache->key.objectid;
4081         btrfs_put_block_group(cache);
4082
4083         return bytenr;
4084 }
4085
4086 static int pin_down_extent(struct btrfs_root *root,
4087                            struct btrfs_block_group_cache *cache,
4088                            u64 bytenr, u64 num_bytes, int reserved)
4089 {
4090         spin_lock(&cache->space_info->lock);
4091         spin_lock(&cache->lock);
4092         cache->pinned += num_bytes;
4093         cache->space_info->bytes_pinned += num_bytes;
4094         if (reserved) {
4095                 cache->reserved -= num_bytes;
4096                 cache->space_info->bytes_reserved -= num_bytes;
4097         }
4098         spin_unlock(&cache->lock);
4099         spin_unlock(&cache->space_info->lock);
4100
4101         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4102                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4103         return 0;
4104 }
4105
/*
 * This function must be called within a transaction.
 */
4109 int btrfs_pin_extent(struct btrfs_root *root,
4110                      u64 bytenr, u64 num_bytes, int reserved)
4111 {
4112         struct btrfs_block_group_cache *cache;
4113
4114         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4115         BUG_ON(!cache);
4116
4117         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4118
4119         btrfs_put_block_group(cache);
4120         return 0;
4121 }
4122
/*
 * Update the size of reserved extents.  This function may return -EAGAIN
 * when the block group has been marked read-only, which can happen if
 * 'reserve' is true or 'sinfo' is false.
 */
4127 static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4128                                  u64 num_bytes, int reserve, int sinfo)
4129 {
4130         int ret = 0;
4131         if (sinfo) {
4132                 struct btrfs_space_info *space_info = cache->space_info;
4133                 spin_lock(&space_info->lock);
4134                 spin_lock(&cache->lock);
4135                 if (reserve) {
4136                         if (cache->ro) {
4137                                 ret = -EAGAIN;
4138                         } else {
4139                                 cache->reserved += num_bytes;
4140                                 space_info->bytes_reserved += num_bytes;
4141                         }
4142                 } else {
4143                         if (cache->ro)
4144                                 space_info->bytes_readonly += num_bytes;
4145                         cache->reserved -= num_bytes;
4146                         space_info->bytes_reserved -= num_bytes;
4147                 }
4148                 spin_unlock(&cache->lock);
4149                 spin_unlock(&space_info->lock);
4150         } else {
4151                 spin_lock(&cache->lock);
4152                 if (cache->ro) {
4153                         ret = -EAGAIN;
4154                 } else {
4155                         if (reserve)
4156                                 cache->reserved += num_bytes;
4157                         else
4158                                 cache->reserved -= num_bytes;
4159                 }
4160                 spin_unlock(&cache->lock);
4161         }
4162         return ret;
4163 }
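/*
 * Behaviour summary for update_reserved_bytes(), derived from the code
 * above:
 *
 *   reserve sinfo cache->ro  effect
 *   ------- ----- ---------  ------------------------------------------
 *      1      1       1      -EAGAIN, nothing changes
 *      1      1       0      cache and space_info reserved += num_bytes
 *      0      1      any     reserved -= num_bytes; a ro group also
 *                            bumps bytes_readonly
 *     any     0       1      -EAGAIN, nothing changes
 *     any     0       0      only cache->reserved is adjusted
 */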
4164
4165 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4166                                 struct btrfs_root *root)
4167 {
4168         struct btrfs_fs_info *fs_info = root->fs_info;
4169         struct btrfs_caching_control *next;
4170         struct btrfs_caching_control *caching_ctl;
4171         struct btrfs_block_group_cache *cache;
4172
4173         down_write(&fs_info->extent_commit_sem);
4174
4175         list_for_each_entry_safe(caching_ctl, next,
4176                                  &fs_info->caching_block_groups, list) {
4177                 cache = caching_ctl->block_group;
4178                 if (block_group_cache_done(cache)) {
4179                         cache->last_byte_to_unpin = (u64)-1;
4180                         list_del_init(&caching_ctl->list);
4181                         put_caching_control(caching_ctl);
4182                 } else {
4183                         cache->last_byte_to_unpin = caching_ctl->progress;
4184                 }
4185         }
4186
4187         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4188                 fs_info->pinned_extents = &fs_info->freed_extents[1];
4189         else
4190                 fs_info->pinned_extents = &fs_info->freed_extents[0];
4191
4192         up_write(&fs_info->extent_commit_sem);
4193
4194         update_global_block_rsv(fs_info);
4195         return 0;
4196 }
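/*
 * Note: fs_info->freed_extents[] acts as a double buffer.  Swapping
 * pinned_extents between the two trees here means extents pinned by the
 * next transaction accumulate in the other tree, while
 * btrfs_finish_extent_commit() below unpins the tree that belonged to
 * the transaction being committed.
 */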
4197
4198 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4199 {
4200         struct btrfs_fs_info *fs_info = root->fs_info;
4201         struct btrfs_block_group_cache *cache = NULL;
4202         u64 len;
4203
4204         while (start <= end) {
4205                 if (!cache ||
4206                     start >= cache->key.objectid + cache->key.offset) {
4207                         if (cache)
4208                                 btrfs_put_block_group(cache);
4209                         cache = btrfs_lookup_block_group(fs_info, start);
4210                         BUG_ON(!cache);
4211                 }
4212
4213                 len = cache->key.objectid + cache->key.offset - start;
4214                 len = min(len, end + 1 - start);
4215
4216                 if (start < cache->last_byte_to_unpin) {
4217                         len = min(len, cache->last_byte_to_unpin - start);
4218                         btrfs_add_free_space(cache, start, len);
4219                 }
4220
4221                 start += len;
4222
4223                 spin_lock(&cache->space_info->lock);
4224                 spin_lock(&cache->lock);
4225                 cache->pinned -= len;
4226                 cache->space_info->bytes_pinned -= len;
4227                 if (cache->ro) {
4228                         cache->space_info->bytes_readonly += len;
4229                 } else if (cache->reserved_pinned > 0) {
4230                         len = min(len, cache->reserved_pinned);
4231                         cache->reserved_pinned -= len;
4232                         cache->space_info->bytes_reserved += len;
4233                 }
4234                 spin_unlock(&cache->lock);
4235                 spin_unlock(&cache->space_info->lock);
4236         }
4237
4238         if (cache)
4239                 btrfs_put_block_group(cache);
4240         return 0;
4241 }
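/*
 * Worked example for the clamping above (illustrative numbers): with a
 * block group covering [16M, 32M), start = 30M and end = 35M, the first
 * iteration uses len = min(32M - 30M, 35M + 1 - 30M) = 2M, then start
 * advances to 32M and the next block group is looked up.
 */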
4242
4243 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4244                                struct btrfs_root *root)
4245 {
4246         struct btrfs_fs_info *fs_info = root->fs_info;
4247         struct extent_io_tree *unpin;
4248         struct btrfs_block_rsv *block_rsv;
4249         struct btrfs_block_rsv *next_rsv;
4250         u64 start;
4251         u64 end;
4252         int idx;
4253         int ret;
4254
4255         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4256                 unpin = &fs_info->freed_extents[1];
4257         else
4258                 unpin = &fs_info->freed_extents[0];
4259
4260         while (1) {
4261                 ret = find_first_extent_bit(unpin, 0, &start, &end,
4262                                             EXTENT_DIRTY);
4263                 if (ret)
4264                         break;
4265
4266                 ret = btrfs_discard_extent(root, start, end + 1 - start);
4267
4268                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4269                 unpin_extent_range(root, start, end);
4270                 cond_resched();
4271         }
4272
4273         mutex_lock(&fs_info->durable_block_rsv_mutex);
4274         list_for_each_entry_safe(block_rsv, next_rsv,
4275                                  &fs_info->durable_block_rsv_list, list) {
4276
4277                 idx = trans->transid & 0x1;
4278                 if (block_rsv->freed[idx] > 0) {
4279                         block_rsv_add_bytes(block_rsv,
4280                                             block_rsv->freed[idx], 0);
4281                         block_rsv->freed[idx] = 0;
4282                 }
4283                 if (atomic_read(&block_rsv->usage) == 0) {
4284                         btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4285
4286                         if (block_rsv->freed[0] == 0 &&
4287                             block_rsv->freed[1] == 0) {
4288                                 list_del_init(&block_rsv->list);
4289                                 kfree(block_rsv);
4290                         }
4291                 } else {
4292                         btrfs_block_rsv_release(root, block_rsv, 0);
4293                 }
4294         }
4295         mutex_unlock(&fs_info->durable_block_rsv_mutex);
4296
4297         return 0;
4298 }
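/*
 * Note: block_rsv->freed[] is indexed by (transid & 0x1), mirroring the
 * two freed_extents trees: bytes recorded in freed[] while transaction N
 * was running are credited back to the reservation here as N commits,
 * while the other slot accumulates frees from the transaction that is
 * still running.
 */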
4299
4300 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4301                                 struct btrfs_root *root,
4302                                 u64 bytenr, u64 num_bytes, u64 parent,
4303                                 u64 root_objectid, u64 owner_objectid,
4304                                 u64 owner_offset, int refs_to_drop,
4305                                 struct btrfs_delayed_extent_op *extent_op)
4306 {
4307         struct btrfs_key key;
4308         struct btrfs_path *path;
4309         struct btrfs_fs_info *info = root->fs_info;
4310         struct btrfs_root *extent_root = info->extent_root;
4311         struct extent_buffer *leaf;
4312         struct btrfs_extent_item *ei;
4313         struct btrfs_extent_inline_ref *iref;
4314         int ret;
4315         int is_data;
4316         int extent_slot = 0;
4317         int found_extent = 0;
4318         int num_to_del = 1;
4319         u32 item_size;
4320         u64 refs;
4321
4322         path = btrfs_alloc_path();
4323         if (!path)
4324                 return -ENOMEM;
4325
4326         path->reada = 1;
4327         path->leave_spinning = 1;
4328
4329         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4330         BUG_ON(!is_data && refs_to_drop != 1);
4331
4332         ret = lookup_extent_backref(trans, extent_root, path, &iref,
4333                                     bytenr, num_bytes, parent,
4334                                     root_objectid, owner_objectid,
4335                                     owner_offset);
4336         if (ret == 0) {
4337                 extent_slot = path->slots[0];
4338                 while (extent_slot >= 0) {
4339                         btrfs_item_key_to_cpu(path->nodes[0], &key,
4340                                               extent_slot);
4341                         if (key.objectid != bytenr)
4342                                 break;
4343                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4344                             key.offset == num_bytes) {
4345                                 found_extent = 1;
4346                                 break;
4347                         }
4348                         if (path->slots[0] - extent_slot > 5)
4349                                 break;
4350                         extent_slot--;
4351                 }
4352 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4353                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4354                 if (found_extent && item_size < sizeof(*ei))
4355                         found_extent = 0;
4356 #endif
4357                 if (!found_extent) {
4358                         BUG_ON(iref);
4359                         ret = remove_extent_backref(trans, extent_root, path,
4360                                                     NULL, refs_to_drop,
4361                                                     is_data);
4362                         BUG_ON(ret);
4363                         btrfs_release_path(extent_root, path);
4364                         path->leave_spinning = 1;
4365
4366                         key.objectid = bytenr;
4367                         key.type = BTRFS_EXTENT_ITEM_KEY;
4368                         key.offset = num_bytes;
4369
4370                         ret = btrfs_search_slot(trans, extent_root,
4371                                                 &key, path, -1, 1);
4372                         if (ret) {
4373                                 printk(KERN_ERR "umm, got %d back from search"
4374                                        ", was looking for %llu\n", ret,
4375                                        (unsigned long long)bytenr);
4376                                 btrfs_print_leaf(extent_root, path->nodes[0]);
4377                         }
4378                         BUG_ON(ret);
4379                         extent_slot = path->slots[0];
4380                 }
4381         } else {
4382                 btrfs_print_leaf(extent_root, path->nodes[0]);
4383                 WARN_ON(1);
4384                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
4385                        "parent %llu root %llu  owner %llu offset %llu\n",
4386                        (unsigned long long)bytenr,
4387                        (unsigned long long)parent,
4388                        (unsigned long long)root_objectid,
4389                        (unsigned long long)owner_objectid,
4390                        (unsigned long long)owner_offset);
4391         }
4392
4393         leaf = path->nodes[0];
4394         item_size = btrfs_item_size_nr(leaf, extent_slot);
4395 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4396         if (item_size < sizeof(*ei)) {
4397                 BUG_ON(found_extent || extent_slot != path->slots[0]);
4398                 ret = convert_extent_item_v0(trans, extent_root, path,
4399                                              owner_objectid, 0);
4400                 BUG_ON(ret < 0);
4401
4402                 btrfs_release_path(extent_root, path);
4403                 path->leave_spinning = 1;
4404
4405                 key.objectid = bytenr;
4406                 key.type = BTRFS_EXTENT_ITEM_KEY;
4407                 key.offset = num_bytes;
4408
4409                 ret = btrfs_search_slot(trans, extent_root, &key, path,
4410                                         -1, 1);
4411                 if (ret) {
4412                         printk(KERN_ERR "umm, got %d back from search"
4413                                ", was looking for %llu\n", ret,
4414                                (unsigned long long)bytenr);
4415                         btrfs_print_leaf(extent_root, path->nodes[0]);
4416                 }
4417                 BUG_ON(ret);
4418                 extent_slot = path->slots[0];
4419                 leaf = path->nodes[0];
4420                 item_size = btrfs_item_size_nr(leaf, extent_slot);
4421         }
4422 #endif
4423         BUG_ON(item_size < sizeof(*ei));
4424         ei = btrfs_item_ptr(leaf, extent_slot,
4425                             struct btrfs_extent_item);
4426         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4427                 struct btrfs_tree_block_info *bi;
4428                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4429                 bi = (struct btrfs_tree_block_info *)(ei + 1);
4430                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4431         }
4432
4433         refs = btrfs_extent_refs(leaf, ei);
4434         BUG_ON(refs < refs_to_drop);
4435         refs -= refs_to_drop;
4436
4437         if (refs > 0) {
4438                 if (extent_op)
4439                         __run_delayed_extent_op(extent_op, leaf, ei);
4440                 /*
4441                  * In the case of an inline back ref, the reference count
4442                  * will be updated by remove_extent_backref
4443                  */
4444                 if (iref) {
4445                         BUG_ON(!found_extent);
4446                 } else {
4447                         btrfs_set_extent_refs(leaf, ei, refs);
4448                         btrfs_mark_buffer_dirty(leaf);
4449                 }
4450                 if (found_extent) {
4451                         ret = remove_extent_backref(trans, extent_root, path,
4452                                                     iref, refs_to_drop,
4453                                                     is_data);
4454                         BUG_ON(ret);
4455                 }
4456         } else {
4457                 if (found_extent) {
4458                         BUG_ON(is_data && refs_to_drop !=
4459                                extent_data_ref_count(root, path, iref));
4460                         if (iref) {
4461                                 BUG_ON(path->slots[0] != extent_slot);
4462                         } else {
4463                                 BUG_ON(path->slots[0] != extent_slot + 1);
4464                                 path->slots[0] = extent_slot;
4465                                 num_to_del = 2;
4466                         }
4467                 }
4468
4469                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4470                                       num_to_del);
4471                 BUG_ON(ret);
4472                 btrfs_release_path(extent_root, path);
4473
4474                 if (is_data) {
4475                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
4476                         BUG_ON(ret);
4477                 } else {
4478                         invalidate_mapping_pages(info->btree_inode->i_mapping,
4479                              bytenr >> PAGE_CACHE_SHIFT,
4480                              (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
4481                 }
4482
4483                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
4484                 BUG_ON(ret);
4485         }
4486         btrfs_free_path(path);
4487         return ret;
4488 }
4489
4490 /*
4491  * when we free a block, it is possible (and likely) that we free the last
4492  * delayed ref for that extent as well.  This searches the delayed ref tree for
4493  * a given extent, and if there are no other delayed refs to be processed, it
4494  * removes it from the tree.
4495  */
4496 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4497                                       struct btrfs_root *root, u64 bytenr)
4498 {
4499         struct btrfs_delayed_ref_head *head;
4500         struct btrfs_delayed_ref_root *delayed_refs;
4501         struct btrfs_delayed_ref_node *ref;
4502         struct rb_node *node;
4503         int ret = 0;
4504
4505         delayed_refs = &trans->transaction->delayed_refs;
4506         spin_lock(&delayed_refs->lock);
4507         head = btrfs_find_delayed_ref_head(trans, bytenr);
4508         if (!head)
4509                 goto out;
4510
4511         node = rb_prev(&head->node.rb_node);
4512         if (!node)
4513                 goto out;
4514
4515         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
4516
4517         /* there are still entries for this ref, we can't drop it */
4518         if (ref->bytenr == bytenr)
4519                 goto out;
4520
4521         if (head->extent_op) {
4522                 if (!head->must_insert_reserved)
4523                         goto out;
4524                 kfree(head->extent_op);
4525                 head->extent_op = NULL;
4526         }
4527
4528         /*
4529          * waiting for the lock here would deadlock.  If someone else has it
4530  * locked, they are already in the process of dropping it anyway.
4531          */
4532         if (!mutex_trylock(&head->mutex))
4533                 goto out;
4534
4535         /*
4536          * at this point we have a head with no other entries.  Go
4537          * ahead and process it.
4538          */
4539         head->node.in_tree = 0;
4540         rb_erase(&head->node.rb_node, &delayed_refs->root);
4541
4542         delayed_refs->num_entries--;
4543
4544         /*
4545          * we don't take a ref on the node because we're removing it from the
4546          * tree, so we just steal the ref the tree was holding.
4547          */
4548         delayed_refs->num_heads--;
4549         if (list_empty(&head->cluster))
4550                 delayed_refs->num_heads_ready--;
4551
4552         list_del_init(&head->cluster);
4553         spin_unlock(&delayed_refs->lock);
4554
4555         BUG_ON(head->extent_op);
4556         if (head->must_insert_reserved)
4557                 ret = 1;
4558
4559         mutex_unlock(&head->mutex);
4560         btrfs_put_delayed_ref(&head->node);
4561         return ret;
4562 out:
4563         spin_unlock(&delayed_refs->lock);
4564         return 0;
4565 }
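/*
 * Note: the return value above is 1 only when the delayed ref head was
 * removed and had must_insert_reserved set, i.e. the extent never made
 * it into the extent tree; btrfs_free_tree_block() uses that to try to
 * reclaim the space directly instead of pinning it.
 */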
4566
4567 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4568                            struct btrfs_root *root,
4569                            struct extent_buffer *buf,
4570                            u64 parent, int last_ref)
4571 {
4572         struct btrfs_block_rsv *block_rsv;
4573         struct btrfs_block_group_cache *cache = NULL;
4574         int ret;
4575
4576         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4577                 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4578                                                 parent, root->root_key.objectid,
4579                                                 btrfs_header_level(buf),
4580                                                 BTRFS_DROP_DELAYED_REF, NULL);
4581                 BUG_ON(ret);
4582         }
4583
4584         if (!last_ref)
4585                 return;
4586
4587         block_rsv = get_block_rsv(trans, root);
4588         cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4589         if (block_rsv->space_info != cache->space_info)
4590                 goto out;
4591
4592         if (btrfs_header_generation(buf) == trans->transid) {
4593                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4594                         ret = check_ref_cleanup(trans, root, buf->start);
4595                         if (!ret)
4596                                 goto pin;
4597                 }
4598
4599                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4600                         pin_down_extent(root, cache, buf->start, buf->len, 1);
4601                         goto pin;
4602                 }
4603
4604                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4605
4606                 btrfs_add_free_space(cache, buf->start, buf->len);
4607                 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4608                 if (ret == -EAGAIN) {
4609                         /* block group became read-only */
4610                         update_reserved_bytes(cache, buf->len, 0, 1);
4611                         goto out;
4612                 }
4613
4614                 ret = 1;
4615                 spin_lock(&block_rsv->lock);
4616                 if (block_rsv->reserved < block_rsv->size) {
4617                         block_rsv->reserved += buf->len;
4618                         ret = 0;
4619                 }
4620                 spin_unlock(&block_rsv->lock);
4621
4622                 if (ret) {
4623                         spin_lock(&cache->space_info->lock);
4624                         cache->space_info->bytes_reserved -= buf->len;
4625                         spin_unlock(&cache->space_info->lock);
4626                 }
4627                 goto out;
4628         }
4629 pin:
4630         if (block_rsv->durable && !cache->ro) {
4631                 ret = 0;
4632                 spin_lock(&cache->lock);
4633                 if (!cache->ro) {
4634                         cache->reserved_pinned += buf->len;
4635                         ret = 1;
4636                 }
4637                 spin_unlock(&cache->lock);
4638
4639                 if (ret) {
4640                         spin_lock(&block_rsv->lock);
4641                         block_rsv->freed[trans->transid & 0x1] += buf->len;
4642                         spin_unlock(&block_rsv->lock);
4643                 }
4644         }
4645 out:
4646         btrfs_put_block_group(cache);
4647 }
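/*
 * Note: a tree block allocated in this same transaction (generation ==
 * transid) and never written can be returned to the free space cache
 * immediately; anything already written, or from an earlier transaction,
 * is pinned instead so its old contents stay intact until the commit
 * finishes.
 */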
4648
4649 int btrfs_free_extent(struct btrfs_trans_handle *trans,
4650                       struct btrfs_root *root,
4651                       u64 bytenr, u64 num_bytes, u64 parent,
4652                       u64 root_objectid, u64 owner, u64 offset)
4653 {
4654         int ret;
4655
4656         /*
4657          * tree log blocks never actually go into the extent allocation
4658          * tree, just update pinning info and exit early.
4659          */
4660         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4661                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4662                 /* unlocks the pinned mutex */
4663                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
4664                 ret = 0;
4665         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4666                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4667                                         parent, root_objectid, (int)owner,
4668                                         BTRFS_DROP_DELAYED_REF, NULL);
4669                 BUG_ON(ret);
4670         } else {
4671                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4672                                         parent, root_objectid, owner,
4673                                         offset, BTRFS_DROP_DELAYED_REF, NULL);
4674                 BUG_ON(ret);
4675         }
4676         return ret;
4677 }
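/*
 * Note: the three cases above are (1) tree log blocks, pinned directly
 * since they never get extent tree entries, (2) tree blocks (owner below
 * BTRFS_FIRST_FREE_OBJECTID), queued as delayed tree refs, and (3) file
 * data extents, queued as delayed data refs.
 */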
4678
4679 static u64 stripe_align(struct btrfs_root *root, u64 val)
4680 {
4681         u64 mask = ((u64)root->stripesize - 1);
4682         u64 ret = (val + mask) & ~mask;
4683         return ret;
4684 }
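/*
 * Example (illustrative numbers): with stripesize = 64K, mask = 0xffff,
 * so stripe_align(root, 0x12345) = (0x12345 + 0xffff) & ~0xffff
 * = 0x20000, i.e. values are rounded up to the next stripe boundary.
 */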
4685
4686 /*
4687  * when we wait for progress in the block group caching, it's because
4688  * our allocation attempt failed at least once.  So, we must sleep
4689  * and let some progress happen before we try again.
4690  *
4691  * This function will sleep at least once waiting for new free space to
4692  * show up, and then it will check the block group free space numbers
4693  * for our min num_bytes.  Another option is to have it go ahead
4694  * and look in the rbtree for a free extent of a given size, but this
4695  * is a good start.
4696  */
4697 static noinline int
4698 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4699                                 u64 num_bytes)
4700 {
4701         struct btrfs_caching_control *caching_ctl;
4702         DEFINE_WAIT(wait);
4703
4704         caching_ctl = get_caching_control(cache);
4705         if (!caching_ctl)
4706                 return 0;
4707
4708         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4709                    (cache->free_space >= num_bytes));
4710
4711         put_caching_control(caching_ctl);
4712         return 0;
4713 }
4714
4715 static noinline int
4716 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4717 {
4718         struct btrfs_caching_control *caching_ctl;
4719         DEFINE_WAIT(wait);
4720
4721         caching_ctl = get_caching_control(cache);
4722         if (!caching_ctl)
4723                 return 0;
4724
4725         wait_event(caching_ctl->wait, block_group_cache_done(cache));
4726
4727         put_caching_control(caching_ctl);
4728         return 0;
4729 }
4730
4731 static int get_block_group_index(struct btrfs_block_group_cache *cache)
4732 {
4733         int index;
4734         if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4735                 index = 0;
4736         else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4737                 index = 1;
4738         else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4739                 index = 2;
4740         else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4741                 index = 3;
4742         else
4743                 index = 4;
4744         return index;
4745 }
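/*
 * Note: the index returned above selects one of the per-RAID-level lists
 * in space_info->block_groups[]; find_free_extent() walks those lists in
 * index order (RAID10, RAID1, DUP, RAID0, then single) and moves on to
 * the next index when a full pass over one list finds nothing.
 */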
4746
4747 enum btrfs_loop_type {
4748         LOOP_FIND_IDEAL = 0,
4749         LOOP_CACHING_NOWAIT = 1,
4750         LOOP_CACHING_WAIT = 2,
4751         LOOP_ALLOC_CHUNK = 3,
4752         LOOP_NO_EMPTY_SIZE = 4,
4753 };
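/*
 * Note: find_free_extent() starts at LOOP_FIND_IDEAL and escalates one
 * stage at a time whenever a pass over every raid index comes up empty;
 * the comment block near the bottom of that function describes what each
 * stage is allowed to do.
 */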
4754
4755 /*
4756  * walks the btree of allocated extents and finds a hole of a given size.
4757  * The key ins is changed to record the hole:
4758  * ins->objectid == block start
4759  * ins->flags = BTRFS_EXTENT_ITEM_KEY
4760  * ins->offset == number of blocks
4761  * Any available blocks before search_start are skipped.
4762  */
4763 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4764                                      struct btrfs_root *orig_root,
4765                                      u64 num_bytes, u64 empty_size,
4766                                      u64 search_start, u64 search_end,
4767                                      u64 hint_byte, struct btrfs_key *ins,
4768                                      int data)
4769 {
4770         int ret = 0;
4771         struct btrfs_root *root = orig_root->fs_info->extent_root;
4772         struct btrfs_free_cluster *last_ptr = NULL;
4773         struct btrfs_block_group_cache *block_group = NULL;
4774         int empty_cluster = 2 * 1024 * 1024;
4775         int allowed_chunk_alloc = 0;
4776         int done_chunk_alloc = 0;
4777         struct btrfs_space_info *space_info;
4778         int last_ptr_loop = 0;
4779         int loop = 0;
4780         int index = 0;
4781         bool found_uncached_bg = false;
4782         bool failed_cluster_refill = false;
4783         bool failed_alloc = false;
4784         u64 ideal_cache_percent = 0;
4785         u64 ideal_cache_offset = 0;
4786
4787         WARN_ON(num_bytes < root->sectorsize);
4788         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
4789         ins->objectid = 0;
4790         ins->offset = 0;
4791
4792         space_info = __find_space_info(root->fs_info, data);
4793         if (!space_info) {
4794                 printk(KERN_ERR "No space info for %d\n", data);
4795                 return -ENOSPC;
4796         }
4797
4798         if (orig_root->ref_cows || empty_size)
4799                 allowed_chunk_alloc = 1;
4800
4801         if (data & BTRFS_BLOCK_GROUP_METADATA) {
4802                 last_ptr = &root->fs_info->meta_alloc_cluster;
4803                 if (!btrfs_test_opt(root, SSD))
4804                         empty_cluster = 64 * 1024;
4805         }
4806
4807         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
4808                 last_ptr = &root->fs_info->data_alloc_cluster;
4809         }
4810
4811         if (last_ptr) {
4812                 spin_lock(&last_ptr->lock);
4813                 if (last_ptr->block_group)
4814                         hint_byte = last_ptr->window_start;
4815                 spin_unlock(&last_ptr->lock);
4816         }
4817
4818         search_start = max(search_start, first_logical_byte(root, 0));
4819         search_start = max(search_start, hint_byte);
4820
4821         if (!last_ptr)
4822                 empty_cluster = 0;
4823
4824         if (search_start == hint_byte) {
4825 ideal_cache:
4826                 block_group = btrfs_lookup_block_group(root->fs_info,
4827                                                        search_start);
4828                 /*
4829                  * we don't want to use the block group if it doesn't match our
4830                  * allocation bits, or if it's not cached.
4831                  *
4832                  * However, if we are re-searching with an ideal block group
4833                  * picked out, then we don't care that the block group is cached.
4834                  */
4835                 if (block_group && block_group_bits(block_group, data) &&
4836                     (block_group->cached != BTRFS_CACHE_NO ||
4837                      search_start == ideal_cache_offset)) {
4838                         down_read(&space_info->groups_sem);
4839                         if (list_empty(&block_group->list) ||
4840                             block_group->ro) {
4841                                 /*
4842                                  * someone is removing this block group,
4843                                  * we can't jump into the have_block_group
4844                                  * target because our list pointers are not
4845                                  * valid
4846                                  */
4847                                 btrfs_put_block_group(block_group);
4848                                 up_read(&space_info->groups_sem);
4849                         } else {
4850                                 index = get_block_group_index(block_group);
4851                                 goto have_block_group;
4852                         }
4853                 } else if (block_group) {
4854                         btrfs_put_block_group(block_group);
4855                 }
4856         }
4857 search:
4858         down_read(&space_info->groups_sem);
4859         list_for_each_entry(block_group, &space_info->block_groups[index],
4860                             list) {
4861                 u64 offset;
4862                 int cached;
4863
4864                 btrfs_get_block_group(block_group);
4865                 search_start = block_group->key.objectid;
4866
4867 have_block_group:
4868                 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4869                         u64 free_percent;
4870
4871                         ret = cache_block_group(block_group, trans, 1);
4872                         if (block_group->cached == BTRFS_CACHE_FINISHED)
4873                                 goto have_block_group;
4874
4875                         free_percent = btrfs_block_group_used(&block_group->item);
4876                         free_percent *= 100;
4877                         free_percent = div64_u64(free_percent,
4878                                                  block_group->key.offset);
4879                         free_percent = 100 - free_percent;
4880                         if (free_percent > ideal_cache_percent &&
4881                             likely(!block_group->ro)) {
4882                                 ideal_cache_offset = block_group->key.objectid;
4883                                 ideal_cache_percent = free_percent;
4884                         }
4885
4886                         /*
4887                          * We only want to start kthread caching if we are at
4888                          * the point where we will wait for caching to make
4889                          * progress, or if our ideal search is over and we've
4890                          * found somebody to start caching.
4891                          */
4892                         if (loop > LOOP_CACHING_NOWAIT ||
4893                             (loop > LOOP_FIND_IDEAL &&
4894                              atomic_read(&space_info->caching_threads) < 2)) {
4895                                 ret = cache_block_group(block_group, trans, 0);
4896                                 BUG_ON(ret);
4897                         }
4898                         found_uncached_bg = true;
4899
4900                         /*
4901                          * If loop is set for cached only, try the next block
4902                          * group.
4903                          */
4904                         if (loop == LOOP_FIND_IDEAL)
4905                                 goto loop;
4906                 }
4907
4908                 cached = block_group_cache_done(block_group);
4909                 if (unlikely(!cached))
4910                         found_uncached_bg = true;
4911
4912                 if (unlikely(block_group->ro))
4913                         goto loop;
4914
4915                 /*
4916                  * Ok, we want to try to use the cluster allocator, so let's look
4917                  * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
4918                  * have tried the cluster allocator plenty of times at this
4919                  * point and not have found anything, so we are likely way too
4920                  * fragmented for the clustering stuff to find anything, so let's
4921                  * just skip it and let the allocator find whatever block it can
4922                  * find.
4923                  */
4924                 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
4925                         /*
4926                          * the refill lock keeps out other
4927                          * people trying to start a new cluster
4928                          */
4929                         spin_lock(&last_ptr->refill_lock);
4930                         if (last_ptr->block_group &&
4931                             (last_ptr->block_group->ro ||
4932                             !block_group_bits(last_ptr->block_group, data))) {
4933                                 offset = 0;
4934                                 goto refill_cluster;
4935                         }
4936
4937                         offset = btrfs_alloc_from_cluster(block_group, last_ptr,
4938                                                  num_bytes, search_start);
4939                         if (offset) {
4940                                 /* we have a block, we're done */
4941                                 spin_unlock(&last_ptr->refill_lock);
4942                                 goto checks;
4943                         }
4944
4945                         spin_lock(&last_ptr->lock);
4946                         /*
4947                          * whoops, this cluster doesn't actually point to
4948                          * this block group.  Get a ref on the block
4949                          * group it does point to and try again.
4950                          */
4951                         if (!last_ptr_loop && last_ptr->block_group &&
4952                             last_ptr->block_group != block_group) {
4953
4954                                 btrfs_put_block_group(block_group);
4955                                 block_group = last_ptr->block_group;
4956                                 btrfs_get_block_group(block_group);
4957                                 spin_unlock(&last_ptr->lock);
4958                                 spin_unlock(&last_ptr->refill_lock);
4959
4960                                 last_ptr_loop = 1;
4961                                 search_start = block_group->key.objectid;
4962                                 /*
4963                                  * we know this block group is properly
4964                                  * in the list because
4965                                  * btrfs_remove_block_group drops the
4966                                  * cluster before it removes the block
4967                                  * group from the list
4968                                  */
4969                                 goto have_block_group;
4970                         }
4971                         spin_unlock(&last_ptr->lock);
4972 refill_cluster:
4973                         /*
4974                          * this cluster didn't work out, free it and
4975                          * start over
4976                          */
4977                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
4978
4979                         last_ptr_loop = 0;
4980
4981                         /* allocate a cluster in this block group */
4982                         ret = btrfs_find_space_cluster(trans, root,
4983                                                block_group, last_ptr,
4984                                                offset, num_bytes,
4985                                                empty_cluster + empty_size);
4986                         if (ret == 0) {
4987                                 /*
4988                                  * now pull our allocation out of this
4989                                  * cluster
4990                                  */
4991                                 offset = btrfs_alloc_from_cluster(block_group,
4992                                                   last_ptr, num_bytes,
4993                                                   search_start);
4994                                 if (offset) {
4995                                         /* we found one, proceed */
4996                                         spin_unlock(&last_ptr->refill_lock);
4997                                         goto checks;
4998                                 }
4999                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
5000                                    && !failed_cluster_refill) {
5001                                 spin_unlock(&last_ptr->refill_lock);
5002
5003                                 failed_cluster_refill = true;
5004                                 wait_block_group_cache_progress(block_group,
5005                                        num_bytes + empty_cluster + empty_size);
5006                                 goto have_block_group;
5007                         }
5008
5009                         /*
5010                          * at this point we either didn't find a cluster
5011                          * or we weren't able to allocate a block from our
5012                          * cluster.  Free the cluster we've been trying
5013                          * to use, and go to the next block group
5014                          */
5015                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5016                         spin_unlock(&last_ptr->refill_lock);
5017                         goto loop;
5018                 }
5019
5020                 offset = btrfs_find_space_for_alloc(block_group, search_start,
5021                                                     num_bytes, empty_size);
5022                 /*
5023                  * If we didn't find a chunk, and we haven't failed on this
5024                  * block group before, and this block group is in the middle of
5025                  * caching and we are ok with waiting, then go ahead and wait
5026                  * for progress to be made, and set failed_alloc to true.
5027                  *
5028                  * If failed_alloc is true then we've already waited on this
5029                  * block group once and should move on to the next block group.
5030                  */
5031                 if (!offset && !failed_alloc && !cached &&
5032                     loop > LOOP_CACHING_NOWAIT) {
5033                         wait_block_group_cache_progress(block_group,
5034                                                 num_bytes + empty_size);
5035                         failed_alloc = true;
5036                         goto have_block_group;
5037                 } else if (!offset) {
5038                         goto loop;
5039                 }
5040 checks:
5041                 search_start = stripe_align(root, offset);
5042                 /* move on to the next group */
5043                 if (search_start + num_bytes >= search_end) {
5044                         btrfs_add_free_space(block_group, offset, num_bytes);
5045                         goto loop;
5046                 }
5047
5048                 /* move on to the next group */
5049                 if (search_start + num_bytes >
5050                     block_group->key.objectid + block_group->key.offset) {
5051                         btrfs_add_free_space(block_group, offset, num_bytes);
5052                         goto loop;
5053                 }
5054
5055                 ins->objectid = search_start;
5056                 ins->offset = num_bytes;
5057
5058                 if (offset < search_start)
5059                         btrfs_add_free_space(block_group, offset,
5060                                              search_start - offset);
5061                 BUG_ON(offset > search_start);
5062
5063                 ret = update_reserved_bytes(block_group, num_bytes, 1,
5064                                             (data & BTRFS_BLOCK_GROUP_DATA));
5065                 if (ret == -EAGAIN) {
5066                         btrfs_add_free_space(block_group, offset, num_bytes);
5067                         goto loop;
5068                 }
5069
5070                 /* we are all good, let's return */
5071                 ins->objectid = search_start;
5072                 ins->offset = num_bytes;
5073
5074                 if (offset < search_start)
5075                         btrfs_add_free_space(block_group, offset,
5076                                              search_start - offset);
5077                 BUG_ON(offset > search_start);
5078                 break;
5079 loop:
5080                 failed_cluster_refill = false;
5081                 failed_alloc = false;
5082                 BUG_ON(index != get_block_group_index(block_group));
5083                 btrfs_put_block_group(block_group);
5084         }
5085         up_read(&space_info->groups_sem);
5086
5087         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5088                 goto search;
5089
5090         /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
5091          *                      for them to make caching progress.  Also
5092          *                      determine the best possible bg to cache
5093          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5094          *                      caching kthreads as we move along
5095          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5096          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5097          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5098          *                      again
5099          */
5100         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
5101             (found_uncached_bg || empty_size || empty_cluster ||
5102              allowed_chunk_alloc)) {
5103                 index = 0;
5104                 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5105                         found_uncached_bg = false;
5106                         loop++;
5107                         if (!ideal_cache_percent &&
5108                             atomic_read(&space_info->caching_threads))
5109                                 goto search;
5110
5111                         /*
5112                          * One of the following two things has happened so far:
5113                          *
5114                          * 1) We found an ideal block group for caching that
5115                          * is mostly full and will cache quickly, so we might
5116                          * as well wait for it.
5117                          *
5118                          * 2) We searched for cached only and we didn't find
5119                          * anything, and we didn't start any caching kthreads
5120                          * either, so chances are we will loop through and
5121                          * start a couple caching kthreads, and then come back
5122                          * around and just wait for them.  This will be slower
5123                          * because we will have 2 caching kthreads reading at
5124                          * the same time when we could have just started one
5125                          * and waited for it to get far enough to give us an
5126                          * allocation, so go ahead and go to the wait caching
5127                          * loop.
5128                          */
5129                         loop = LOOP_CACHING_WAIT;
5130                         search_start = ideal_cache_offset;
5131                         ideal_cache_percent = 0;
5132                         goto ideal_cache;
5133                 } else if (loop == LOOP_FIND_IDEAL) {
5134                         /*
5135                          * Didn't find an uncached bg, wait on anything we find
5136                          * next.
5137                          */
5138                         loop = LOOP_CACHING_WAIT;
5139                         goto search;
5140                 }
5141
5142                 if (loop < LOOP_CACHING_WAIT) {
5143                         loop++;
5144                         goto search;
5145                 }
5146
5147                 if (loop == LOOP_ALLOC_CHUNK) {
5148                         empty_size = 0;
5149                         empty_cluster = 0;
5150                 }
5151
5152                 if (allowed_chunk_alloc) {
5153                         ret = do_chunk_alloc(trans, root, num_bytes +
5154                                              2 * 1024 * 1024, data, 1);
5155                         allowed_chunk_alloc = 0;
5156                         done_chunk_alloc = 1;
5157                 } else if (!done_chunk_alloc) {
5158                         space_info->force_alloc = 1;
5159                 }
5160
5161                 if (loop < LOOP_NO_EMPTY_SIZE) {
5162                         loop++;
5163                         goto search;
5164                 }
5165                 ret = -ENOSPC;
5166         } else if (!ins->objectid) {
5167                 ret = -ENOSPC;
5168         }
5169
5170         /* we found what we needed */
5171         if (ins->objectid) {
5172                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
5173                         trans->block_group = block_group->key.objectid;
5174
5175                 btrfs_put_block_group(block_group);
5176                 ret = 0;
5177         }
5178
5179         return ret;
5180 }
5181
5182 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5183                             int dump_block_groups)
5184 {
5185         struct btrfs_block_group_cache *cache;
5186         int index = 0;
5187
5188         spin_lock(&info->lock);
5189         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
5190                (unsigned long long)(info->total_bytes - info->bytes_used -
5191                                     info->bytes_pinned - info->bytes_reserved -
5192                                     info->bytes_readonly),
5193                (info->full) ? "" : "not ");
5194         printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5195                "reserved=%llu, may_use=%llu, readonly=%llu\n",
5196                (unsigned long long)info->total_bytes,
5197                (unsigned long long)info->bytes_used,
5198                (unsigned long long)info->bytes_pinned,
5199                (unsigned long long)info->bytes_reserved,
5200                (unsigned long long)info->bytes_may_use,
5201                (unsigned long long)info->bytes_readonly);
5202         spin_unlock(&info->lock);
5203
5204         if (!dump_block_groups)
5205                 return;
5206
5207         down_read(&info->groups_sem);
5208 again:
5209         list_for_each_entry(cache, &info->block_groups[index], list) {
5210                 spin_lock(&cache->lock);
5211                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
5212                        "%llu pinned %llu reserved\n",
5213                        (unsigned long long)cache->key.objectid,
5214                        (unsigned long long)cache->key.offset,
5215                        (unsigned long long)btrfs_block_group_used(&cache->item),
5216                        (unsigned long long)cache->pinned,
5217                        (unsigned long long)cache->reserved);
5218                 btrfs_dump_free_space(cache, bytes);
5219                 spin_unlock(&cache->lock);
5220         }
5221         if (++index < BTRFS_NR_RAID_TYPES)
5222                 goto again;
5223         up_read(&info->groups_sem);
5224 }
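/*
 * Note: the "free" figure printed above is computed as total_bytes -
 * bytes_used - bytes_pinned - bytes_reserved - bytes_readonly, so space
 * pinned for the committing transaction or reserved by in-flight
 * allocations is not reported as free.
 */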
5225
5226 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5227                          struct btrfs_root *root,
5228                          u64 num_bytes, u64 min_alloc_size,
5229                          u64 empty_size, u64 hint_byte,
5230                          u64 search_end, struct btrfs_key *ins,
5231                          u64 data)
5232 {
5233         int ret;
5234         u64 search_start = 0;
5235
5236         data = btrfs_get_alloc_profile(root, data);
5237 again:
5238         /*
5239          * the only place that sets empty_size is btrfs_realloc_node, which
5240          * is not called recursively on allocations
5241          */
5242         if (empty_size || root->ref_cows)
5243                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5244                                      num_bytes + 2 * 1024 * 1024, data, 0);
5245
5246         WARN_ON(num_bytes < root->sectorsize);
5247         ret = find_free_extent(trans, root, num_bytes, empty_size,
5248                                search_start, search_end, hint_byte,
5249                                ins, data);
5250
5251         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
5252                 num_bytes = num_bytes >> 1;
5253                 num_bytes = num_bytes & ~(root->sectorsize - 1);
5254                 num_bytes = max(num_bytes, min_alloc_size);
5255                 do_chunk_alloc(trans, root->fs_info->extent_root,
5256                                num_bytes, data, 1);
5257                 goto again;
5258         }
5259         if (ret == -ENOSPC) {
5260                 struct btrfs_space_info *sinfo;
5261
5262                 sinfo = __find_space_info(root->fs_info, data);
5263                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
5264                        "wanted %llu\n", (unsigned long long)data,
5265                        (unsigned long long)num_bytes);
5266                 dump_space_info(sinfo, num_bytes, 1);
5267         }
5268
5269         return ret;
5270 }
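/*
 * Retry example (illustrative numbers): with num_bytes = 1M,
 * min_alloc_size = 256K and sectorsize = 4K, an -ENOSPC from
 * find_free_extent() retries with 512K and then 256K; once num_bytes has
 * been clamped down to min_alloc_size, a further -ENOSPC is final and
 * dump_space_info() reports the state of the space_info.
 */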
5271
5272 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5273 {
5274         struct btrfs_block_group_cache *cache;
5275         int ret = 0;
5276
5277         cache = btrfs_lookup_block_group(root->fs_info, start);
5278         if (!cache) {
5279                 printk(KERN_ERR "Unable to find block group for %llu\n",
5280                        (unsigned long long)start);
5281                 return -ENOSPC;
5282         }
5283
5284         ret = btrfs_discard_extent(root, start, len);
5285
5286         btrfs_add_free_space(cache, start, len);
5287         update_reserved_bytes(cache, len, 0, 1);
5288         btrfs_put_block_group(cache);
5289
5290         return ret;
5291 }
5292
5293 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5294                                       struct btrfs_root *root,
5295                                       u64 parent, u64 root_objectid,
5296                                       u64 flags, u64 owner, u64 offset,
5297                                       struct btrfs_key *ins, int ref_mod)
5298 {
5299         int ret;
5300         struct btrfs_fs_info *fs_info = root->fs_info;
5301         struct btrfs_extent_item *extent_item;
5302         struct btrfs_extent_inline_ref *iref;
5303         struct btrfs_path *path;
5304         struct extent_buffer *leaf;
5305         int type;
5306         u32 size;
5307
5308         if (parent > 0)
5309                 type = BTRFS_SHARED_DATA_REF_KEY;
5310         else
5311                 type = BTRFS_EXTENT_DATA_REF_KEY;
5312
5313         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5314
5315         path = btrfs_alloc_path();
5316         BUG_ON(!path);
5317
5318         path->leave_spinning = 1;
5319         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5320                                       ins, size);
5321         BUG_ON(ret);
5322
5323         leaf = path->nodes[0];
5324         extent_item = btrfs_item_ptr(leaf, path->slots[0],
5325                                      struct btrfs_extent_item);
5326         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
5327         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5328         btrfs_set_extent_flags(leaf, extent_item,
5329                                flags | BTRFS_EXTENT_FLAG_DATA);
5330
5331         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
5332         btrfs_set_extent_inline_ref_type(leaf, iref, type);
5333         if (parent > 0) {
5334                 struct btrfs_shared_data_ref *ref;
5335                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
5336                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5337                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
5338         } else {
5339                 struct btrfs_extent_data_ref *ref;
5340                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
5341                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
5342                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
5343                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
5344                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
5345         }
5346
5347         btrfs_mark_buffer_dirty(path->nodes[0]);
5348         btrfs_free_path(path);
5349
5350         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5351         if (ret) {
5352                 printk(KERN_ERR "btrfs update block group failed for %llu "
5353                        "%llu\n", (unsigned long long)ins->objectid,
5354                        (unsigned long long)ins->offset);
5355                 BUG();
5356         }
5357         return ret;
5358 }
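/*
 * Item layout sketch for the insertion above:
 *
 *   parent > 0:  extent_item, inline ref (SHARED_DATA_REF, offset =
 *                parent), then a shared_data_ref count
 *   parent == 0: extent_item, inline ref (EXTENT_DATA_REF) whose payload
 *                is a btrfs_extent_data_ref (root, objectid, offset,
 *                count)
 */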
5359
5360 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5361                                      struct btrfs_root *root,
5362                                      u64 parent, u64 root_objectid,
5363                                      u64 flags, struct btrfs_disk_key *key,
5364                                      int level, struct btrfs_key *ins)
5365 {
5366         int ret;
5367         struct btrfs_fs_info *fs_info = root->fs_info;
5368         struct btrfs_extent_item *extent_item;
5369         struct btrfs_tree_block_info *block_info;
5370         struct btrfs_extent_inline_ref *iref;
5371         struct btrfs_path *path;
5372         struct extent_buffer *leaf;
5373         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5374
5375         path = btrfs_alloc_path();
5376         BUG_ON(!path);
5377
5378         path->leave_spinning = 1;
5379         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5380                                       ins, size);
5381         BUG_ON(ret);
5382
5383         leaf = path->nodes[0];
5384         extent_item = btrfs_item_ptr(leaf, path->slots[0],
5385                                      struct btrfs_extent_item);
5386         btrfs_set_extent_refs(leaf, extent_item, 1);
5387         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5388         btrfs_set_extent_flags(leaf, extent_item,
5389                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5390         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
5391
5392         btrfs_set_tree_block_key(leaf, block_info, key);
5393         btrfs_set_tree_block_level(leaf, block_info, level);
5394
5395         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
5396         if (parent > 0) {
5397                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
5398                 btrfs_set_extent_inline_ref_type(leaf, iref,
5399                                                  BTRFS_SHARED_BLOCK_REF_KEY);
5400                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5401         } else {
5402                 btrfs_set_extent_inline_ref_type(leaf, iref,
5403                                                  BTRFS_TREE_BLOCK_REF_KEY);
5404                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
5405         }
5406
5407         btrfs_mark_buffer_dirty(leaf);
5408         btrfs_free_path(path);
5409
5410         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5411         if (ret) {
5412                 printk(KERN_ERR "btrfs update block group failed for %llu "
5413                        "%llu\n", (unsigned long long)ins->objectid,
5414                        (unsigned long long)ins->offset);
5415                 BUG();
5416         }
5417         return ret;
5418 }
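
/*
 * Illustrative layout for the tree block case above (a sketch): a
 * btrfs_tree_block_info sits between the extent item and the inline
 * reference, matching the 'size' computed at the top of the function:
 *
 *	[btrfs_extent_item][btrfs_tree_block_info][inline ref]
 *	 refs, generation,   key + level            SHARED_BLOCK_REF_KEY
 *	 flags | TREE_BLOCK                          (offset = parent) or
 *	                                            TREE_BLOCK_REF_KEY
 *	                                             (offset = root_objectid)
 */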
5419
5420 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5421                                      struct btrfs_root *root,
5422                                      u64 root_objectid, u64 owner,
5423                                      u64 offset, struct btrfs_key *ins)
5424 {
5425         int ret;
5426
5427         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5428
5429         ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
5430                                          0, root_objectid, owner, offset,
5431                                          BTRFS_ADD_DELAYED_EXTENT, NULL);
5432         return ret;
5433 }
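
/*
 * Illustrative usage (a sketch, not in the original; 'num_bytes',
 * 'inode_objectid' and 'file_pos' are hypothetical): the ins key is the
 * one filled in by the extent allocator, objectid = start bytenr,
 * offset = length in bytes:
 *
 *	struct btrfs_key ins;
 *	int ret;
 *
 *	ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
 *				   0, 0, (u64)-1, &ins, 1);
 *	if (ret == 0)
 *		ret = btrfs_alloc_reserved_file_extent(trans, root,
 *					root->root_key.objectid,
 *					inode_objectid, file_pos, &ins);
 */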
5434
5435 /*
5436  * this is used by the tree logging recovery code.  It records that
5437  * an extent has been allocated and makes sure to clear the free
5438  * space cache bits as well
5439  */
5440 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5441                                    struct btrfs_root *root,
5442                                    u64 root_objectid, u64 owner, u64 offset,
5443                                    struct btrfs_key *ins)
5444 {
5445         int ret;
5446         struct btrfs_block_group_cache *block_group;
5447         struct btrfs_caching_control *caching_ctl;
5448         u64 start = ins->objectid;
5449         u64 num_bytes = ins->offset;
5450
5451         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5452         cache_block_group(block_group, trans, 0);
5453         caching_ctl = get_caching_control(block_group);
5454
5455         if (!caching_ctl) {
5456                 BUG_ON(!block_group_cache_done(block_group));
5457                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5458                 BUG_ON(ret);
5459         } else {
5460                 mutex_lock(&caching_ctl->mutex);
5461
5462                 if (start >= caching_ctl->progress) {
5463                         ret = add_excluded_extent(root, start, num_bytes);
5464                         BUG_ON(ret);
5465                 } else if (start + num_bytes <= caching_ctl->progress) {
5466                         ret = btrfs_remove_free_space(block_group,
5467                                                       start, num_bytes);
5468                         BUG_ON(ret);
5469                 } else {
5470                         num_bytes = caching_ctl->progress - start;
5471                         ret = btrfs_remove_free_space(block_group,
5472                                                       start, num_bytes);
5473                         BUG_ON(ret);
5474
5475                         start = caching_ctl->progress;
5476                         num_bytes = ins->objectid + ins->offset -
5477                                     caching_ctl->progress;
5478                         ret = add_excluded_extent(root, start, num_bytes);
5479                         BUG_ON(ret);
5480                 }
5481
5482                 mutex_unlock(&caching_ctl->mutex);
5483                 put_caching_control(caching_ctl);
5484         }
5485
5486         ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5487         BUG_ON(ret);
5488         btrfs_put_block_group(block_group);
5489         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
5490                                          0, owner, offset, ins, 1);
5491         return ret;
5492 }
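
/*
 * Sketch of the three cases handled above (illustrative only), with
 * 'progress' being how far the caching thread has scanned the group:
 *
 *	start >= progress:             exclude the whole extent
 *	start + num_bytes <= progress: remove it all from free space
 *	otherwise (straddling):
 *
 *	   start          progress        start + num_bytes
 *	     |----------------|----------------|
 *	     remove from         exclude until
 *	     free space cache    caching finishes
 */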
5493
5494 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5495                                             struct btrfs_root *root,
5496                                             u64 bytenr, u32 blocksize,
5497                                             int level)
5498 {
5499         struct extent_buffer *buf;
5500
5501         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
5502         if (!buf)
5503                 return ERR_PTR(-ENOMEM);
5504         btrfs_set_header_generation(buf, trans->transid);
5505         btrfs_set_buffer_lockdep_class(buf, level);
5506         btrfs_tree_lock(buf);
5507         clean_tree_block(trans, root, buf);
5508
5509         btrfs_set_lock_blocking(buf);
5510         btrfs_set_buffer_uptodate(buf);
5511
5512         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
5513                 /*
5514                  * we allow two log transactions at a time; use different
5515                  * EXTENT bits to differentiate dirty pages.
5516                  */
5517                 if (root->log_transid % 2 == 0)
5518                         set_extent_dirty(&root->dirty_log_pages, buf->start,
5519                                         buf->start + buf->len - 1, GFP_NOFS);
5520                 else
5521                         set_extent_new(&root->dirty_log_pages, buf->start,
5522                                         buf->start + buf->len - 1, GFP_NOFS);
5523         } else {
5524                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
5525                          buf->start + buf->len - 1, GFP_NOFS);
5526         }
5527         trans->blocks_used++;
5528         /* this returns a buffer locked for blocking */
5529         return buf;
5530 }
5531
5532 static struct btrfs_block_rsv *
5533 use_block_rsv(struct btrfs_trans_handle *trans,
5534               struct btrfs_root *root, u32 blocksize)
5535 {
5536         struct btrfs_block_rsv *block_rsv;
5537         int ret;
5538
5539         block_rsv = get_block_rsv(trans, root);
5540
5541         if (block_rsv->size == 0) {
5542                 ret = reserve_metadata_bytes(block_rsv, blocksize);
5543                 if (ret)
5544                         return ERR_PTR(ret);
5545                 return block_rsv;
5546         }
5547
5548         ret = block_rsv_use_bytes(block_rsv, blocksize);
5549         if (!ret)
5550                 return block_rsv;
5551
5552         WARN_ON(1);
5553         printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
5554                 block_rsv->size, block_rsv->reserved,
5555                 block_rsv->freed[0], block_rsv->freed[1]);
5556
5557         return ERR_PTR(-ENOSPC);
5558 }
5559
5560 static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5561 {
5562         block_rsv_add_bytes(block_rsv, blocksize, 0);
5563         block_rsv_release_bytes(block_rsv, NULL, 0);
5564 }
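
/*
 * Illustrative pairing of the two helpers above (a sketch mirroring
 * btrfs_alloc_free_block() below): a failed allocation must hand the
 * bytes back, otherwise the reservation leaks.
 *
 *	block_rsv = use_block_rsv(trans, root, blocksize);
 *	if (IS_ERR(block_rsv))
 *		return ERR_CAST(block_rsv);
 *
 *	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
 *				   0, 0, (u64)-1, &ins, 0);
 *	if (ret) {
 *		unuse_block_rsv(block_rsv, blocksize);
 *		return ERR_PTR(ret);
 *	}
 */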
5565
5566 /*
5567  * finds a free extent and does all the dirty work required for
5568  * allocation, setting up the tree buffer for the first block of
5569  * the extent.
5570  *
5571  * returns the tree buffer or an ERR_PTR on failure.
5572  */
5573 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
5574                                         struct btrfs_root *root, u32 blocksize,
5575                                         u64 parent, u64 root_objectid,
5576                                         struct btrfs_disk_key *key, int level,
5577                                         u64 hint, u64 empty_size)
5578 {
5579         struct btrfs_key ins;
5580         struct btrfs_block_rsv *block_rsv;
5581         struct extent_buffer *buf;
5582         u64 flags = 0;
5583         int ret;
5584
5585
5586         block_rsv = use_block_rsv(trans, root, blocksize);
5587         if (IS_ERR(block_rsv))
5588                 return ERR_CAST(block_rsv);
5589
5590         ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5591                                    empty_size, hint, (u64)-1, &ins, 0);
5592         if (ret) {
5593                 unuse_block_rsv(block_rsv, blocksize);
5594                 return ERR_PTR(ret);
5595         }
5596
5597         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
5598                                     blocksize, level);
5599         BUG_ON(IS_ERR(buf));
5600
5601         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5602                 if (parent == 0)
5603                         parent = ins.objectid;
5604                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5605         } else
5606                 BUG_ON(parent > 0);
5607
5608         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5609                 struct btrfs_delayed_extent_op *extent_op;
5610                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5611                 BUG_ON(!extent_op);
5612                 if (key)
5613                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
5614                 else
5615                         memset(&extent_op->key, 0, sizeof(extent_op->key));
5616                 extent_op->flags_to_set = flags;
5617                 extent_op->update_key = 1;
5618                 extent_op->update_flags = 1;
5619                 extent_op->is_data = 0;
5620
5621                 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5622                                         ins.offset, parent, root_objectid,
5623                                         level, BTRFS_ADD_DELAYED_EXTENT,
5624                                         extent_op);
5625                 BUG_ON(ret);
5626         }
5627         return buf;
5628 }
5629
5630 struct walk_control {
5631         u64 refs[BTRFS_MAX_LEVEL];
5632         u64 flags[BTRFS_MAX_LEVEL];
5633         struct btrfs_key update_progress;
5634         int stage;
5635         int level;
5636         int shared_level;
5637         int update_ref;
5638         int keep_locks;
5639         int reada_slot;
5640         int reada_count;
5641 };
5642
5643 #define DROP_REFERENCE  1
5644 #define UPDATE_BACKREF  2
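
/*
 * Illustrative note (not in the original): a walk begins in
 * DROP_REFERENCE and only switches to UPDATE_BACKREF when
 * do_walk_down() meets a shared block whose backrefs must be
 * rewritten; walk_up_proc() switches back once the shared subtree
 * has been processed. Typical setup, as in btrfs_drop_snapshot():
 *
 *	wc->stage = DROP_REFERENCE;
 *	wc->shared_level = -1;
 *	wc->update_ref = update_ref;
 *	wc->keep_locks = 0;
 *	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 */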
5645
5646 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5647                                      struct btrfs_root *root,
5648                                      struct walk_control *wc,
5649                                      struct btrfs_path *path)
5650 {
5651         u64 bytenr;
5652         u64 generation;
5653         u64 refs;
5654         u64 flags;
5655         u64 last = 0;
5656         u32 nritems;
5657         u32 blocksize;
5658         struct btrfs_key key;
5659         struct extent_buffer *eb;
5660         int ret;
5661         int slot;
5662         int nread = 0;
5663
5664         if (path->slots[wc->level] < wc->reada_slot) {
5665                 wc->reada_count = wc->reada_count * 2 / 3;
5666                 wc->reada_count = max(wc->reada_count, 2);
5667         } else {
5668                 wc->reada_count = wc->reada_count * 3 / 2;
5669                 wc->reada_count = min_t(int, wc->reada_count,
5670                                         BTRFS_NODEPTRS_PER_BLOCK(root));
5671         }
5672
5673         eb = path->nodes[wc->level];
5674         nritems = btrfs_header_nritems(eb);
5675         blocksize = btrfs_level_size(root, wc->level - 1);
5676
5677         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5678                 if (nread >= wc->reada_count)
5679                         break;
5680
5681                 cond_resched();
5682                 bytenr = btrfs_node_blockptr(eb, slot);
5683                 generation = btrfs_node_ptr_generation(eb, slot);
5684
5685                 if (slot == path->slots[wc->level])
5686                         goto reada;
5687
5688                 if (wc->stage == UPDATE_BACKREF &&
5689                     generation <= root->root_key.offset)
5690                         continue;
5691
5692                 /* We don't lock the tree block, it's OK to be racy here */
5693                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5694                                                &refs, &flags);
5695                 BUG_ON(ret);
5696                 BUG_ON(refs == 0);
5697
5698                 if (wc->stage == DROP_REFERENCE) {
5699                         if (refs == 1)
5700                                 goto reada;
5701
5702                         if (wc->level == 1 &&
5703                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5704                                 continue;
5705                         if (!wc->update_ref ||
5706                             generation <= root->root_key.offset)
5707                                 continue;
5708                         btrfs_node_key_to_cpu(eb, &key, slot);
5709                         ret = btrfs_comp_cpu_keys(&key,
5710                                                   &wc->update_progress);
5711                         if (ret < 0)
5712                                 continue;
5713                 } else {
5714                         if (wc->level == 1 &&
5715                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5716                                 continue;
5717                 }
5718 reada:
5719                 ret = readahead_tree_block(root, bytenr, blocksize,
5720                                            generation);
5721                 if (ret)
5722                         break;
5723                 last = bytenr + blocksize;
5724                 nread++;
5725         }
5726         wc->reada_slot = slot;
5727 }
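
/*
 * Worked example of the window adaptation above (illustrative): the
 * count is scaled to 2/3 of its value when the caller is still behind
 * the previous readahead slot and to 3/2 otherwise, so starting from
 * 12 a shrink yields 8 and a growth yields 18, clamped to the range
 * [2, BTRFS_NODEPTRS_PER_BLOCK(root)].
 */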
5728
5729 /*
5730  * helper to process a tree block while walking down the tree.
5731  *
5732  * when wc->stage == UPDATE_BACKREF, this function updates
5733  * back refs for pointers in the block.
5734  *
5735  * NOTE: return value 1 means we should stop walking down.
5736  */
5737 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5738                                    struct btrfs_root *root,
5739                                    struct btrfs_path *path,
5740                                    struct walk_control *wc, int lookup_info)
5741 {
5742         int level = wc->level;
5743         struct extent_buffer *eb = path->nodes[level];
5744         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5745         int ret;
5746
5747         if (wc->stage == UPDATE_BACKREF &&
5748             btrfs_header_owner(eb) != root->root_key.objectid)
5749                 return 1;
5750
5751         /*
5752          * when the reference count of a tree block is 1, it won't increase
5753          * again. once the full backref flag is set, we never clear it.
5754          */
5755         if (lookup_info &&
5756             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5757              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5758                 BUG_ON(!path->locks[level]);
5759                 ret = btrfs_lookup_extent_info(trans, root,
5760                                                eb->start, eb->len,
5761                                                &wc->refs[level],
5762                                                &wc->flags[level]);
5763                 BUG_ON(ret);
5764                 BUG_ON(wc->refs[level] == 0);
5765         }
5766
5767         if (wc->stage == DROP_REFERENCE) {
5768                 if (wc->refs[level] > 1)
5769                         return 1;
5770
5771                 if (path->locks[level] && !wc->keep_locks) {
5772                         btrfs_tree_unlock(eb);
5773                         path->locks[level] = 0;
5774                 }
5775                 return 0;
5776         }
5777
5778         /* wc->stage == UPDATE_BACKREF */
5779         if (!(wc->flags[level] & flag)) {
5780                 BUG_ON(!path->locks[level]);
5781                 ret = btrfs_inc_ref(trans, root, eb, 1);
5782                 BUG_ON(ret);
5783                 ret = btrfs_dec_ref(trans, root, eb, 0);
5784                 BUG_ON(ret);
5785                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
5786                                                   eb->len, flag, 0);
5787                 BUG_ON(ret);
5788                 wc->flags[level] |= flag;
5789         }
5790
5791         /*
5792          * the block is shared by multiple trees, so it's not good to
5793          * keep the tree lock
5794          */
5795         if (path->locks[level] && level > 0) {
5796                 btrfs_tree_unlock(eb);
5797                 path->locks[level] = 0;
5798         }
5799         return 0;
5800 }
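
/*
 * A note on the UPDATE_BACKREF branch above (illustrative): the
 * btrfs_inc_ref(..., 1) / btrfs_dec_ref(..., 0) pair converts the
 * refs that this block's pointers hold on the blocks below it from
 * root-keyed refs into full backrefs keyed on this block's bytenr;
 * setting BTRFS_BLOCK_FLAG_FULL_BACKREF on the extent item then
 * records that the conversion happened, so it is never repeated.
 */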
5801
5802 /*
5803  * helper to process a tree block pointer.
5804  *
5805  * when wc->stage == DROP_REFERENCE, this function checks the
5806  * reference count of the block pointed to. if the block is
5807  * shared and we need to update back refs for the subtree rooted
5808  * at the block, this function changes wc->stage to
5809  * UPDATE_BACKREF. if the block is shared and there is no need
5810  * to update backrefs, this function drops the reference to
5811  * the block.
5812  *
5813  * NOTE: return value 1 means we should stop walking down.
5814  */
5815 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5816                                  struct btrfs_root *root,
5817                                  struct btrfs_path *path,
5818                                  struct walk_control *wc, int *lookup_info)
5819 {
5820         u64 bytenr;
5821         u64 generation;
5822         u64 parent;
5823         u32 blocksize;
5824         struct btrfs_key key;
5825         struct extent_buffer *next;
5826         int level = wc->level;
5827         int reada = 0;
5828         int ret = 0;
5829
5830         generation = btrfs_node_ptr_generation(path->nodes[level],
5831                                                path->slots[level]);
5832         /*
5833          * if the lower level block was created before the snapshot
5834          * was created, we know there is no need to update back refs
5835          * for the subtree
5836          */
5837         if (wc->stage == UPDATE_BACKREF &&
5838             generation <= root->root_key.offset) {
5839                 *lookup_info = 1;
5840                 return 1;
5841         }
5842
5843         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5844         blocksize = btrfs_level_size(root, level - 1);
5845
5846         next = btrfs_find_tree_block(root, bytenr, blocksize);
5847         if (!next) {
5848                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5849                 if (!next)
5850                         return -ENOMEM;
5851                 reada = 1;
5852         }
5853         btrfs_tree_lock(next);
5854         btrfs_set_lock_blocking(next);
5855
5856         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5857                                        &wc->refs[level - 1],
5858                                        &wc->flags[level - 1]);
5859         BUG_ON(ret);
5860         BUG_ON(wc->refs[level - 1] == 0);
5861         *lookup_info = 0;
5862
5863         if (wc->stage == DROP_REFERENCE) {
5864                 if (wc->refs[level - 1] > 1) {
5865                         if (level == 1 &&
5866                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5867                                 goto skip;
5868
5869                         if (!wc->update_ref ||
5870                             generation <= root->root_key.offset)
5871                                 goto skip;
5872
5873                         btrfs_node_key_to_cpu(path->nodes[level], &key,
5874                                               path->slots[level]);
5875                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5876                         if (ret < 0)
5877                                 goto skip;
5878
5879                         wc->stage = UPDATE_BACKREF;
5880                         wc->shared_level = level - 1;
5881                 }
5882         } else {
5883                 if (level == 1 &&
5884                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5885                         goto skip;
5886         }
5887
5888         if (!btrfs_buffer_uptodate(next, generation)) {
5889                 btrfs_tree_unlock(next);
5890                 free_extent_buffer(next);
5891                 next = NULL;
5892                 *lookup_info = 1;
5893         }
5894
5895         if (!next) {
5896                 if (reada && level == 1)
5897                         reada_walk_down(trans, root, wc, path);
5898                 next = read_tree_block(root, bytenr, blocksize, generation);
5899                 btrfs_tree_lock(next);
5900                 btrfs_set_lock_blocking(next);
5901         }
5902
5903         level--;
5904         BUG_ON(level != btrfs_header_level(next));
5905         path->nodes[level] = next;
5906         path->slots[level] = 0;
5907         path->locks[level] = 1;
5908         wc->level = level;
5909         if (wc->level == 1)
5910                 wc->reada_slot = 0;
5911         return 0;
5912 skip:
5913         wc->refs[level - 1] = 0;
5914         wc->flags[level - 1] = 0;
5915         if (wc->stage == DROP_REFERENCE) {
5916                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5917                         parent = path->nodes[level]->start;
5918                 } else {
5919                         BUG_ON(root->root_key.objectid !=
5920                                btrfs_header_owner(path->nodes[level]));
5921                         parent = 0;
5922                 }
5923
5924                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5925                                         root->root_key.objectid, level - 1, 0);
5926                 BUG_ON(ret);
5927         }
5928         btrfs_tree_unlock(next);
5929         free_extent_buffer(next);
5930         *lookup_info = 1;
5931         return 1;
5932 }
5933
5934 /*
5935  * helper to process a tree block while walking up the tree.
5936  *
5937  * when wc->stage == DROP_REFERENCE, this function drops
5938  * reference count on the block.
5939  *
5940  * when wc->stage == UPDATE_BACKREF, this function changes
5941  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5942  * to UPDATE_BACKREF previously while processing the block.
5943  *
5944  * NOTE: return value 1 means we should stop walking up.
5945  */
5946 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5947                                  struct btrfs_root *root,
5948                                  struct btrfs_path *path,
5949                                  struct walk_control *wc)
5950 {
5951         int ret;
5952         int level = wc->level;
5953         struct extent_buffer *eb = path->nodes[level];
5954         u64 parent = 0;
5955
5956         if (wc->stage == UPDATE_BACKREF) {
5957                 BUG_ON(wc->shared_level < level);
5958                 if (level < wc->shared_level)
5959                         goto out;
5960
5961                 ret = find_next_key(path, level + 1, &wc->update_progress);
5962                 if (ret > 0)
5963                         wc->update_ref = 0;
5964
5965                 wc->stage = DROP_REFERENCE;
5966                 wc->shared_level = -1;
5967                 path->slots[level] = 0;
5968
5969                 /*
5970                  * check the reference count again if the block isn't locked.
5971                  * we should start walking down the tree again if the
5972                  * reference count is one.
5973                  */
5974                 if (!path->locks[level]) {
5975                         BUG_ON(level == 0);
5976                         btrfs_tree_lock(eb);
5977                         btrfs_set_lock_blocking(eb);
5978                         path->locks[level] = 1;
5979
5980                         ret = btrfs_lookup_extent_info(trans, root,
5981                                                        eb->start, eb->len,
5982                                                        &wc->refs[level],
5983                                                        &wc->flags[level]);
5984                         BUG_ON(ret);
5985                         BUG_ON(wc->refs[level] == 0);
5986                         if (wc->refs[level] == 1) {
5987                                 btrfs_tree_unlock(eb);
5988                                 path->locks[level] = 0;
5989                                 return 1;
5990                         }
5991                 }
5992         }
5993
5994         /* wc->stage == DROP_REFERENCE */
5995         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5996
5997         if (wc->refs[level] == 1) {
5998                 if (level == 0) {
5999                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6000                                 ret = btrfs_dec_ref(trans, root, eb, 1);
6001                         else
6002                                 ret = btrfs_dec_ref(trans, root, eb, 0);
6003                         BUG_ON(ret);
6004                 }
6005                 /* make the locked-block assertion in clean_tree_block happy */
6006                 if (!path->locks[level] &&
6007                     btrfs_header_generation(eb) == trans->transid) {
6008                         btrfs_tree_lock(eb);
6009                         btrfs_set_lock_blocking(eb);
6010                         path->locks[level] = 1;
6011                 }
6012                 clean_tree_block(trans, root, eb);
6013         }
6014
6015         if (eb == root->node) {
6016                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6017                         parent = eb->start;
6018                 else
6019                         BUG_ON(root->root_key.objectid !=
6020                                btrfs_header_owner(eb));
6021         } else {
6022                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6023                         parent = path->nodes[level + 1]->start;
6024                 else
6025                         BUG_ON(root->root_key.objectid !=
6026                                btrfs_header_owner(path->nodes[level + 1]));
6027         }
6028
6029         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6030 out:
6031         wc->refs[level] = 0;
6032         wc->flags[level] = 0;
6033         return 0;
6034 }
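
/*
 * Illustrative note: the 'parent' chosen above decides which backref
 * btrfs_free_tree_block() drops: the containing node's bytenr when
 * the block carries full backrefs, or 0 (plus the owner sanity check)
 * when the block is referenced directly by this root. The last
 * argument hints whether this is the block's last reference
 * (wc->refs[level] == 1) or only our reference is being dropped.
 */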
6035
6036 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6037                                    struct btrfs_root *root,
6038                                    struct btrfs_path *path,
6039                                    struct walk_control *wc)
6040 {
6041         int level = wc->level;
6042         int lookup_info = 1;
6043         int ret;
6044
6045         while (level >= 0) {
6046                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
6047                 if (ret > 0)
6048                         break;
6049
6050                 if (level == 0)
6051                         break;
6052
6053                 if (path->slots[level] >=
6054                     btrfs_header_nritems(path->nodes[level]))
6055                         break;
6056
6057                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
6058                 if (ret > 0) {
6059                         path->slots[level]++;
6060                         continue;
6061                 } else if (ret < 0)
6062                         return ret;
6063                 level = wc->level;
6064         }
6065         return 0;
6066 }
6067
6068 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6069                                  struct btrfs_root *root,
6070                                  struct btrfs_path *path,
6071                                  struct walk_control *wc, int max_level)
6072 {
6073         int level = wc->level;
6074         int ret;
6075
6076         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6077         while (level < max_level && path->nodes[level]) {
6078                 wc->level = level;
6079                 if (path->slots[level] + 1 <
6080                     btrfs_header_nritems(path->nodes[level])) {
6081                         path->slots[level]++;
6082                         return 0;
6083                 } else {
6084                         ret = walk_up_proc(trans, root, path, wc);
6085                         if (ret > 0)
6086                                 return 0;
6087
6088                         if (path->locks[level]) {
6089                                 btrfs_tree_unlock(path->nodes[level]);
6090                                 path->locks[level] = 0;
6091                         }
6092                         free_extent_buffer(path->nodes[level]);
6093                         path->nodes[level] = NULL;
6094                         level++;
6095                 }
6096         }
6097         return 1;
6098 }
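
/*
 * Illustrative sketch of how the two walkers cooperate (it mirrors
 * the loops in btrfs_drop_snapshot() and btrfs_drop_subtree() below):
 *
 *	while (1) {
 *		ret = walk_down_tree(trans, root, path, wc);
 *		if (ret < 0)
 *			break;
 *		ret = walk_up_tree(trans, root, path, wc, max_level);
 *		if (ret != 0)
 *			break;	(1 means the walk has finished)
 *	}
 */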
6099
6100 /*
6101  * drop a subvolume tree.
6102  *
6103  * this function traverses the tree freeing any blocks that are only
6104  * referenced by the tree.
6105  *
6106  * when a shared tree block is found, this function decreases its
6107  * reference count by one. if update_ref is true, this function
6108  * also makes sure backrefs for the shared block and all lower level
6109  * blocks are properly updated.
6110  */
6111 int btrfs_drop_snapshot(struct btrfs_root *root,
6112                         struct btrfs_block_rsv *block_rsv, int update_ref)
6113 {
6114         struct btrfs_path *path;
6115         struct btrfs_trans_handle *trans;
6116         struct btrfs_root *tree_root = root->fs_info->tree_root;
6117         struct btrfs_root_item *root_item = &root->root_item;
6118         struct walk_control *wc;
6119         struct btrfs_key key;
6120         int err = 0;
6121         int ret;
6122         int level;
6123
6124         path = btrfs_alloc_path();
6125         BUG_ON(!path);
6126
6127         wc = kzalloc(sizeof(*wc), GFP_NOFS);
6128         BUG_ON(!wc);
6129
6130         trans = btrfs_start_transaction(tree_root, 0);
6131         if (block_rsv)
6132                 trans->block_rsv = block_rsv;
6133
6134         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6135                 level = btrfs_header_level(root->node);
6136                 path->nodes[level] = btrfs_lock_root_node(root);
6137                 btrfs_set_lock_blocking(path->nodes[level]);
6138                 path->slots[level] = 0;
6139                 path->locks[level] = 1;
6140                 memset(&wc->update_progress, 0,
6141                        sizeof(wc->update_progress));
6142         } else {
6143                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6144                 memcpy(&wc->update_progress, &key,
6145                        sizeof(wc->update_progress));
6146
6147                 level = root_item->drop_level;
6148                 BUG_ON(level == 0);
6149                 path->lowest_level = level;
6150                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6151                 path->lowest_level = 0;
6152                 if (ret < 0) {
6153                         err = ret;
6154                         goto out;
6155                 }
6156                 WARN_ON(ret > 0);
6157
6158                 /*
6159                  * unlock our path; this is safe because only this
6160                  * function is allowed to delete this snapshot
6161                  */
6162                 btrfs_unlock_up_safe(path, 0);
6163
6164                 level = btrfs_header_level(root->node);
6165                 while (1) {
6166                         btrfs_tree_lock(path->nodes[level]);
6167                         btrfs_set_lock_blocking(path->nodes[level]);
6168
6169                         ret = btrfs_lookup_extent_info(trans, root,
6170                                                 path->nodes[level]->start,
6171                                                 path->nodes[level]->len,
6172                                                 &wc->refs[level],
6173                                                 &wc->flags[level]);
6174                         BUG_ON(ret);
6175                         BUG_ON(wc->refs[level] == 0);
6176
6177                         if (level == root_item->drop_level)
6178                                 break;
6179
6180                         btrfs_tree_unlock(path->nodes[level]);
6181                         WARN_ON(wc->refs[level] != 1);
6182                         level--;
6183                 }
6184         }
6185
6186         wc->level = level;
6187         wc->shared_level = -1;
6188         wc->stage = DROP_REFERENCE;
6189         wc->update_ref = update_ref;
6190         wc->keep_locks = 0;
6191         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6192
6193         while (1) {
6194                 ret = walk_down_tree(trans, root, path, wc);
6195                 if (ret < 0) {
6196                         err = ret;
6197                         break;
6198                 }
6199
6200                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6201                 if (ret < 0) {
6202                         err = ret;
6203                         break;
6204                 }
6205
6206                 if (ret > 0) {
6207                         BUG_ON(wc->stage != DROP_REFERENCE);
6208                         break;
6209                 }
6210
6211                 if (wc->stage == DROP_REFERENCE) {
6212                         level = wc->level;
6213                         btrfs_node_key(path->nodes[level],
6214                                        &root_item->drop_progress,
6215                                        path->slots[level]);
6216                         root_item->drop_level = level;
6217                 }
6218
6219                 BUG_ON(wc->level == 0);
6220                 if (btrfs_should_end_transaction(trans, tree_root)) {
6221                         ret = btrfs_update_root(trans, tree_root,
6222                                                 &root->root_key,
6223                                                 root_item);
6224                         BUG_ON(ret);
6225
6226                         btrfs_end_transaction_throttle(trans, tree_root);
6227                         trans = btrfs_start_transaction(tree_root, 0);
6228                         if (block_rsv)
6229                                 trans->block_rsv = block_rsv;
6230                 }
6231         }
6232         btrfs_release_path(root, path);
6233         BUG_ON(err);
6234
6235         ret = btrfs_del_root(trans, tree_root, &root->root_key);
6236         BUG_ON(ret);
6237
6238         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
6239                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
6240                                            NULL, NULL);
6241                 BUG_ON(ret < 0);
6242                 if (ret > 0) {
6243                         ret = btrfs_del_orphan_item(trans, tree_root,
6244                                                     root->root_key.objectid);
6245                         BUG_ON(ret);
6246                 }
6247         }
6248
6249         if (root->in_radix) {
6250                 btrfs_free_fs_root(tree_root->fs_info, root);
6251         } else {
6252                 free_extent_buffer(root->node);
6253                 free_extent_buffer(root->commit_root);
6254                 kfree(root);
6255         }
6256 out:
6257         btrfs_end_transaction_throttle(trans, tree_root);
6258         kfree(wc);
6259         btrfs_free_path(path);
6260         return err;
6261 }
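
/*
 * Illustrative note: the drop is restartable. The position is saved
 * in root_item->drop_progress / drop_level each time the transaction
 * is about to end, so if the mount crashes mid-delete, a later call
 * to btrfs_drop_snapshot() takes the non-zero drop_progress branch
 * above and re-seeds the walk with btrfs_search_slot() from the
 * saved key.
 */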
6262
6263 /*
6264  * drop subtree rooted at tree block 'node'.
6265  *
6266  * NOTE: this function will unlock and release tree block 'node'
6267  */
6268 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6269                         struct btrfs_root *root,
6270                         struct extent_buffer *node,
6271                         struct extent_buffer *parent)
6272 {
6273         struct btrfs_path *path;
6274         struct walk_control *wc;
6275         int level;
6276         int parent_level;
6277         int ret = 0;
6278         int wret;
6279
6280         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6281
6282         path = btrfs_alloc_path();
6283         BUG_ON(!path);
6284
6285         wc = kzalloc(sizeof(*wc), GFP_NOFS);
6286         BUG_ON(!wc);
6287
6288         btrfs_assert_tree_locked(parent);
6289         parent_level = btrfs_header_level(parent);
6290         extent_buffer_get(parent);
6291         path->nodes[parent_level] = parent;
6292         path->slots[parent_level] = btrfs_header_nritems(parent);
6293
6294         btrfs_assert_tree_locked(node);
6295         level = btrfs_header_level(node);
6296         path->nodes[level] = node;
6297         path->slots[level] = 0;
6298         path->locks[level] = 1;
6299
6300         wc->refs[parent_level] = 1;
6301         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6302         wc->level = level;
6303         wc->shared_level = -1;
6304         wc->stage = DROP_REFERENCE;
6305         wc->update_ref = 0;
6306         wc->keep_locks = 1;
6307         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6308
6309         while (1) {
6310                 wret = walk_down_tree(trans, root, path, wc);
6311                 if (wret < 0) {
6312                         ret = wret;
6313                         break;
6314                 }
6315
6316                 wret = walk_up_tree(trans, root, path, wc, parent_level);
6317                 if (wret < 0)
6318                         ret = wret;
6319                 if (wret != 0)
6320                         break;
6321         }
6322
6323         kfree(wc);
6324         btrfs_free_path(path);
6325         return ret;
6326 }
6327
6328 #if 0
6329 static unsigned long calc_ra(unsigned long start, unsigned long last,
6330                              unsigned long nr)
6331 {
6332         return min(last, start + nr - 1);
6333 }
6334
6335 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6336                                          u64 len)
6337 {
6338         u64 page_start;
6339         u64 page_end;
6340         unsigned long first_index;
6341         unsigned long last_index;
6342         unsigned long i;
6343         struct page *page;
6344         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6345         struct file_ra_state *ra;
6346         struct btrfs_ordered_extent *ordered;
6347         unsigned int total_read = 0;
6348         unsigned int total_dirty = 0;
6349         int ret = 0;
6350
6351         ra = kzalloc(sizeof(*ra), GFP_NOFS);
6352
6353         mutex_lock(&inode->i_mutex);
6354         first_index = start >> PAGE_CACHE_SHIFT;
6355         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
6356
6357         /* make sure the dirty trick played by the caller works */
6358         ret = invalidate_inode_pages2_range(inode->i_mapping,
6359                                             first_index, last_index);
6360         if (ret)
6361                 goto out_unlock;
6362
6363         file_ra_state_init(ra, inode->i_mapping);
6364
6365         for (i = first_index ; i <= last_index; i++) {
6366                 if (total_read % ra->ra_pages == 0) {
6367                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
6368                                        calc_ra(i, last_index, ra->ra_pages));
6369                 }
6370                 total_read++;
6371 again:
6372                 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
6373                         BUG_ON(1);
6374                 page = grab_cache_page(inode->i_mapping, i);
6375                 if (!page) {
6376                         ret = -ENOMEM;
6377                         goto out_unlock;
6378                 }
6379                 if (!PageUptodate(page)) {
6380                         btrfs_readpage(NULL, page);
6381                         lock_page(page);
6382                         if (!PageUptodate(page)) {
6383                                 unlock_page(page);
6384                                 page_cache_release(page);
6385                                 ret = -EIO;
6386                                 goto out_unlock;
6387                         }
6388                 }
6389                 wait_on_page_writeback(page);
6390
6391                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
6392                 page_end = page_start + PAGE_CACHE_SIZE - 1;
6393                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
6394
6395                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
6396                 if (ordered) {
6397                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6398                         unlock_page(page);
6399                         page_cache_release(page);
6400                         btrfs_start_ordered_extent(inode, ordered, 1);
6401                         btrfs_put_ordered_extent(ordered);
6402                         goto again;
6403                 }
6404                 set_page_extent_mapped(page);
6405
6406                 if (i == first_index)
6407                         set_extent_bits(io_tree, page_start, page_end,
6408                                         EXTENT_BOUNDARY, GFP_NOFS);
6409                 btrfs_set_extent_delalloc(inode, page_start, page_end);
6410
6411                 set_page_dirty(page);
6412                 total_dirty++;
6413
6414                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6415                 unlock_page(page);
6416                 page_cache_release(page);
6417         }
6418
6419 out_unlock:
6420         kfree(ra);
6421         mutex_unlock(&inode->i_mutex);
6422         balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
6423         return ret;
6424 }
6425
6426 static noinline int relocate_data_extent(struct inode *reloc_inode,
6427                                          struct btrfs_key *extent_key,
6428                                          u64 offset)
6429 {
6430         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6431         struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
6432         struct extent_map *em;
6433         u64 start = extent_key->objectid - offset;
6434         u64 end = start + extent_key->offset - 1;
6435
6436         em = alloc_extent_map(GFP_NOFS);
6437         BUG_ON(!em || IS_ERR(em));
6438
6439         em->start = start;
6440         em->len = extent_key->offset;
6441         em->block_len = extent_key->offset;
6442         em->block_start = extent_key->objectid;
6443         em->bdev = root->fs_info->fs_devices->latest_bdev;
6444         set_bit(EXTENT_FLAG_PINNED, &em->flags);
6445
6446         /* set up the extent map to cheat btrfs_readpage */
6447         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6448         while (1) {
6449                 int ret;
6450                 write_lock(&em_tree->lock);
6451                 ret = add_extent_mapping(em_tree, em);
6452                 write_unlock(&em_tree->lock);
6453                 if (ret != -EEXIST) {
6454                         free_extent_map(em);
6455                         break;
6456                 }
6457                 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
6458         }
6459         unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6460
6461         return relocate_inode_pages(reloc_inode, start, extent_key->offset);
6462 }
6463
6464 struct btrfs_ref_path {
6465         u64 extent_start;
6466         u64 nodes[BTRFS_MAX_LEVEL];
6467         u64 root_objectid;
6468         u64 root_generation;
6469         u64 owner_objectid;
6470         u32 num_refs;
6471         int lowest_level;
6472         int current_level;
6473         int shared_level;
6474
6475         struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
6476         u64 new_nodes[BTRFS_MAX_LEVEL];
6477 };
6478
6479 struct disk_extent {
6480         u64 ram_bytes;
6481         u64 disk_bytenr;
6482         u64 disk_num_bytes;
6483         u64 offset;
6484         u64 num_bytes;
6485         u8 compression;
6486         u8 encryption;
6487         u16 other_encoding;
6488 };
6489
6490 static int is_cowonly_root(u64 root_objectid)
6491 {
6492         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
6493             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
6494             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
6495             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
6496             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
6497             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
6498                 return 1;
6499         return 0;
6500 }
6501
6502 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
6503                                     struct btrfs_root *extent_root,
6504                                     struct btrfs_ref_path *ref_path,
6505                                     int first_time)
6506 {
6507         struct extent_buffer *leaf;
6508         struct btrfs_path *path;
6509         struct btrfs_extent_ref *ref;
6510         struct btrfs_key key;
6511         struct btrfs_key found_key;
6512         u64 bytenr;
6513         u32 nritems;
6514         int level;
6515         int ret = 1;
6516
6517         path = btrfs_alloc_path();
6518         if (!path)
6519                 return -ENOMEM;
6520
6521         if (first_time) {
6522                 ref_path->lowest_level = -1;
6523                 ref_path->current_level = -1;
6524                 ref_path->shared_level = -1;
6525                 goto walk_up;
6526         }
6527 walk_down:
6528         level = ref_path->current_level - 1;
6529         while (level >= -1) {
6530                 u64 parent;
6531                 if (level < ref_path->lowest_level)
6532                         break;
6533
6534                 if (level >= 0)
6535                         bytenr = ref_path->nodes[level];
6536                 else
6537                         bytenr = ref_path->extent_start;
6538                 BUG_ON(bytenr == 0);
6539
6540                 parent = ref_path->nodes[level + 1];
6541                 ref_path->nodes[level + 1] = 0;
6542                 ref_path->current_level = level;
6543                 BUG_ON(parent == 0);
6544
6545                 key.objectid = bytenr;
6546                 key.offset = parent + 1;
6547                 key.type = BTRFS_EXTENT_REF_KEY;
6548
6549                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6550                 if (ret < 0)
6551                         goto out;
6552                 BUG_ON(ret == 0);
6553
6554                 leaf = path->nodes[0];
6555                 nritems = btrfs_header_nritems(leaf);
6556                 if (path->slots[0] >= nritems) {
6557                         ret = btrfs_next_leaf(extent_root, path);
6558                         if (ret < 0)
6559                                 goto out;
6560                         if (ret > 0)
6561                                 goto next;
6562                         leaf = path->nodes[0];
6563                 }
6564
6565                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6566                 if (found_key.objectid == bytenr &&
6567                     found_key.type == BTRFS_EXTENT_REF_KEY) {
6568                         if (level < ref_path->shared_level)
6569                                 ref_path->shared_level = level;
6570                         goto found;
6571                 }
6572 next:
6573                 level--;
6574                 btrfs_release_path(extent_root, path);
6575                 cond_resched();
6576         }
6577         /* reached lowest level */
6578         ret = 1;
6579         goto out;
6580 walk_up:
6581         level = ref_path->current_level;
6582         while (level < BTRFS_MAX_LEVEL - 1) {
6583                 u64 ref_objectid;
6584
6585                 if (level >= 0)
6586                         bytenr = ref_path->nodes[level];
6587                 else
6588                         bytenr = ref_path->extent_start;
6589
6590                 BUG_ON(bytenr == 0);
6591
6592                 key.objectid = bytenr;
6593                 key.offset = 0;
6594                 key.type = BTRFS_EXTENT_REF_KEY;
6595
6596                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6597                 if (ret < 0)
6598                         goto out;
6599
6600                 leaf = path->nodes[0];
6601                 nritems = btrfs_header_nritems(leaf);
6602                 if (path->slots[0] >= nritems) {
6603                         ret = btrfs_next_leaf(extent_root, path);
6604                         if (ret < 0)
6605                                 goto out;
6606                         if (ret > 0) {
6607                                 /* the extent was freed by someone */
6608                                 if (ref_path->lowest_level == level)
6609                                         goto out;
6610                                 btrfs_release_path(extent_root, path);
6611                                 goto walk_down;
6612                         }
6613                         leaf = path->nodes[0];
6614                 }
6615
6616                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6617                 if (found_key.objectid != bytenr ||
6618                                 found_key.type != BTRFS_EXTENT_REF_KEY) {
6619                         /* the extent was freed by someone */
6620                         if (ref_path->lowest_level == level) {
6621                                 ret = 1;
6622                                 goto out;
6623                         }
6624                         btrfs_release_path(extent_root, path);
6625                         goto walk_down;
6626                 }
6627 found:
6628                 ref = btrfs_item_ptr(leaf, path->slots[0],
6629                                 struct btrfs_extent_ref);
6630                 ref_objectid = btrfs_ref_objectid(leaf, ref);
6631                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
6632                         if (first_time) {
6633                                 level = (int)ref_objectid;
6634                                 BUG_ON(level >= BTRFS_MAX_LEVEL);
6635                                 ref_path->lowest_level = level;
6636                                 ref_path->current_level = level;
6637                                 ref_path->nodes[level] = bytenr;
6638                         } else {
6639                                 WARN_ON(ref_objectid != level);
6640                         }
6641                 } else {
6642                         WARN_ON(level != -1);
6643                 }
6644                 first_time = 0;
6645
6646                 if (ref_path->lowest_level == level) {
6647                         ref_path->owner_objectid = ref_objectid;
6648                         ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
6649                 }
6650
6651                 /*
6652                  * the block is a tree root or the block isn't in a
6653                  * reference counted tree.
6654                  */
6655                 if (found_key.objectid == found_key.offset ||
6656                     is_cowonly_root(btrfs_ref_root(leaf, ref))) {
6657                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6658                         ref_path->root_generation =
6659                                 btrfs_ref_generation(leaf, ref);
6660                         if (level < 0) {
6661                                 /* special reference from the tree log */
6662                                 ref_path->nodes[0] = found_key.offset;
6663                                 ref_path->current_level = 0;
6664                         }
6665                         ret = 0;
6666                         goto out;
6667                 }
6668
6669                 level++;
6670                 BUG_ON(ref_path->nodes[level] != 0);
6671                 ref_path->nodes[level] = found_key.offset;
6672                 ref_path->current_level = level;
6673
6674                 /*
6675                  * the reference was created in the running transaction,
6676                  * no need to continue walking up.
6677                  */
6678                 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
6679                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6680                         ref_path->root_generation =
6681                                 btrfs_ref_generation(leaf, ref);
6682                         ret = 0;
6683                         goto out;
6684                 }
6685
6686                 btrfs_release_path(extent_root, path);
6687                 cond_resched();
6688         }
6689         /* reached max tree level, but no tree root found. */
6690         BUG();
6691 out:
6692         btrfs_free_path(path);
6693         return ret;
6694 }
6695
6696 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
6697                                 struct btrfs_root *extent_root,
6698                                 struct btrfs_ref_path *ref_path,
6699                                 u64 extent_start)
6700 {
6701         memset(ref_path, 0, sizeof(*ref_path));
6702         ref_path->extent_start = extent_start;
6703
6704         return __next_ref_path(trans, extent_root, ref_path, 1);
6705 }
6706
6707 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
6708                                struct btrfs_root *extent_root,
6709                                struct btrfs_ref_path *ref_path)
6710 {
6711         return __next_ref_path(trans, extent_root, ref_path, 0);
6712 }
6713
6714 static noinline int get_new_locations(struct inode *reloc_inode,
6715                                       struct btrfs_key *extent_key,
6716                                       u64 offset, int no_fragment,
6717                                       struct disk_extent **extents,
6718                                       int *nr_extents)
6719 {
6720         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6721         struct btrfs_path *path;
6722         struct btrfs_file_extent_item *fi;
6723         struct extent_buffer *leaf;
6724         struct disk_extent *exts = *extents;
6725         struct btrfs_key found_key;
6726         u64 cur_pos;
6727         u64 last_byte;
6728         u32 nritems;
6729         int nr = 0;
6730         int max = *nr_extents;
6731         int ret;
6732
6733         WARN_ON(!no_fragment && *extents);
6734         if (!exts) {
6735                 max = 1;
6736                 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
6737                 if (!exts)
6738                         return -ENOMEM;
6739         }
6740
6741         path = btrfs_alloc_path();
6742         BUG_ON(!path);
6743
6744         cur_pos = extent_key->objectid - offset;
6745         last_byte = extent_key->objectid + extent_key->offset;
6746         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
6747                                        cur_pos, 0);
6748         if (ret < 0)
6749                 goto out;
6750         if (ret > 0) {
6751                 ret = -ENOENT;
6752                 goto out;
6753         }
6754
6755         while (1) {
6756                 leaf = path->nodes[0];
6757                 nritems = btrfs_header_nritems(leaf);
6758                 if (path->slots[0] >= nritems) {
6759                         ret = btrfs_next_leaf(root, path);
6760                         if (ret < 0)
6761                                 goto out;
6762                         if (ret > 0)
6763                                 break;
6764                         leaf = path->nodes[0];
6765                 }
6766
6767                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6768                 if (found_key.offset != cur_pos ||
6769                     found_key.type != BTRFS_EXTENT_DATA_KEY ||
6770                     found_key.objectid != reloc_inode->i_ino)
6771                         break;
6772
6773                 fi = btrfs_item_ptr(leaf, path->slots[0],
6774                                     struct btrfs_file_extent_item);
6775                 if (btrfs_file_extent_type(leaf, fi) !=
6776                     BTRFS_FILE_EXTENT_REG ||
6777                     btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6778                         break;
6779
6780                 if (nr == max) {
6781                         struct disk_extent *old = exts;
6782                         max *= 2;
6783                         exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
                             if (!exts) {
                                     /* keep the old array so the error path frees it */
                                     exts = old;
                                     ret = -ENOMEM;
                                     goto out;
                             }
6784                         memcpy(exts, old, sizeof(*exts) * nr);
6785                         if (old != *extents)
6786                                 kfree(old);
6787                 }
6788
6789                 exts[nr].disk_bytenr =
6790                         btrfs_file_extent_disk_bytenr(leaf, fi);
6791                 exts[nr].disk_num_bytes =
6792                         btrfs_file_extent_disk_num_bytes(leaf, fi);
6793                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
6794                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6795                 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6796                 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
6797                 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
6798                 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
6799                                                                            fi);
6800                 BUG_ON(exts[nr].offset > 0);
6801                 BUG_ON(exts[nr].compression || exts[nr].encryption);
6802                 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
6803
6804                 cur_pos += exts[nr].num_bytes;
6805                 nr++;
6806
6807                 if (cur_pos + offset >= last_byte)
6808                         break;
6809
6810                 if (no_fragment) {
6811                         ret = 1;
6812                         goto out;
6813                 }
6814                 path->slots[0]++;
6815         }
6816
6817         BUG_ON(cur_pos + offset > last_byte);
6818         if (cur_pos + offset < last_byte) {
6819                 ret = -ENOENT;
6820                 goto out;
6821         }
6822         ret = 0;
6823 out:
6824         btrfs_free_path(path);
6825         if (ret) {
6826                 if (exts != *extents)
6827                         kfree(exts);
6828         } else {
6829                 *extents = exts;
6830                 *nr_extents = nr;
6831         }
6832         return ret;
6833 }
6834
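/*
 * Illustrative sketch (editorial, not part of the original file): the
 * loop in get_new_locations() above grows the extent array by doubling
 * it by hand.  The same pattern can be written with krealloc(); this
 * hypothetical helper only shows the shape, assuming the caller tracks
 * @nr used entries and @max capacity the same way the loop does.
 */
static struct disk_extent *grow_extent_array(struct disk_extent *exts,
                                             int nr, int *max)
{
        struct disk_extent *new;

        /* double the capacity; krealloc copies the old contents */
        new = krealloc(exts, sizeof(*new) * *max * 2, GFP_NOFS);
        if (!new)
                return NULL;
        /* zero only the new tail, matching the kzalloc in the loop */
        memset(new + nr, 0, sizeof(*new) * (*max * 2 - nr));
        *max *= 2;
        return new;
}
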
6835 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
6836                                         struct btrfs_root *root,
6837                                         struct btrfs_path *path,
6838                                         struct btrfs_key *extent_key,
6839                                         struct btrfs_key *leaf_key,
6840                                         struct btrfs_ref_path *ref_path,
6841                                         struct disk_extent *new_extents,
6842                                         int nr_extents)
6843 {
6844         struct extent_buffer *leaf;
6845         struct btrfs_file_extent_item *fi;
6846         struct inode *inode = NULL;
6847         struct btrfs_key key;
6848         u64 lock_start = 0;
6849         u64 lock_end = 0;
6850         u64 num_bytes;
6851         u64 ext_offset;
6852         u64 search_end = (u64)-1;
6853         u32 nritems;
6854         int nr_scanned = 0;
6855         int extent_locked = 0;
6856         int extent_type;
6857         int ret;
6858
6859         memcpy(&key, leaf_key, sizeof(key));
6860         if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6861                 if (key.objectid < ref_path->owner_objectid ||
6862                     (key.objectid == ref_path->owner_objectid &&
6863                      key.type < BTRFS_EXTENT_DATA_KEY)) {
6864                         key.objectid = ref_path->owner_objectid;
6865                         key.type = BTRFS_EXTENT_DATA_KEY;
6866                         key.offset = 0;
6867                 }
6868         }
6869
6870         while (1) {
6871                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6872                 if (ret < 0)
6873                         goto out;
6874
6875                 leaf = path->nodes[0];
6876                 nritems = btrfs_header_nritems(leaf);
6877 next:
6878                 if (extent_locked && ret > 0) {
6879                         /*
6880                          * the file extent item was modified by someone
6881                          * before the extent got locked.
6882                          */
6883                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6884                                       lock_end, GFP_NOFS);
6885                         extent_locked = 0;
6886                 }
6887
6888                 if (path->slots[0] >= nritems) {
6889                         if (++nr_scanned > 2)
6890                                 break;
6891
6892                         BUG_ON(extent_locked);
6893                         ret = btrfs_next_leaf(root, path);
6894                         if (ret < 0)
6895                                 goto out;
6896                         if (ret > 0)
6897                                 break;
6898                         leaf = path->nodes[0];
6899                         nritems = btrfs_header_nritems(leaf);
6900                 }
6901
6902                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6903
6904                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6905                         if ((key.objectid > ref_path->owner_objectid) ||
6906                             (key.objectid == ref_path->owner_objectid &&
6907                              key.type > BTRFS_EXTENT_DATA_KEY) ||
6908                             key.offset >= search_end)
6909                                 break;
6910                 }
6911
6912                 if (inode && key.objectid != inode->i_ino) {
6913                         BUG_ON(extent_locked);
6914                         btrfs_release_path(root, path);
6915                         mutex_unlock(&inode->i_mutex);
6916                         iput(inode);
6917                         inode = NULL;
6918                         continue;
6919                 }
6920
6921                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
6922                         path->slots[0]++;
6923                         ret = 1;
6924                         goto next;
6925                 }
6926                 fi = btrfs_item_ptr(leaf, path->slots[0],
6927                                     struct btrfs_file_extent_item);
6928                 extent_type = btrfs_file_extent_type(leaf, fi);
6929                 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
6930                      extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
6931                     (btrfs_file_extent_disk_bytenr(leaf, fi) !=
6932                      extent_key->objectid)) {
6933                         path->slots[0]++;
6934                         ret = 1;
6935                         goto next;
6936                 }
6937
6938                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6939                 ext_offset = btrfs_file_extent_offset(leaf, fi);
6940
6941                 if (search_end == (u64)-1) {
6942                         search_end = key.offset - ext_offset +
6943                                 btrfs_file_extent_ram_bytes(leaf, fi);
6944                 }
6945
6946                 if (!extent_locked) {
6947                         lock_start = key.offset;
6948                         lock_end = lock_start + num_bytes - 1;
6949                 } else {
6950                         if (lock_start > key.offset ||
6951                             lock_end + 1 < key.offset + num_bytes) {
6952                                 unlock_extent(&BTRFS_I(inode)->io_tree,
6953                                               lock_start, lock_end, GFP_NOFS);
6954                                 extent_locked = 0;
6955                         }
6956                 }
6957
6958                 if (!inode) {
6959                         btrfs_release_path(root, path);
6960
6961                         inode = btrfs_iget_locked(root->fs_info->sb,
6962                                                   key.objectid, root);
                             if (!inode) {
                                     /* inode allocation failed */
                                     ret = -ENOMEM;
                                     goto out;
                             }
6963                         if (inode->i_state & I_NEW) {
6964                                 BTRFS_I(inode)->root = root;
6965                                 BTRFS_I(inode)->location.objectid =
6966                                         key.objectid;
6967                                 BTRFS_I(inode)->location.type =
6968                                         BTRFS_INODE_ITEM_KEY;
6969                                 BTRFS_I(inode)->location.offset = 0;
6970                                 btrfs_read_locked_inode(inode);
6971                                 unlock_new_inode(inode);
6972                         }
6973                         /*
6974                          * some code calls btrfs_commit_transaction while
6975                          * holding the i_mutex, so we can't use mutex_lock
6976                          * here.
6977                          */
6978                         if (is_bad_inode(inode) ||
6979                             !mutex_trylock(&inode->i_mutex)) {
6980                                 iput(inode);
6981                                 inode = NULL;
6982                                 key.offset = (u64)-1;
6983                                 goto skip;
6984                         }
6985                 }
6986
6987                 if (!extent_locked) {
6988                         struct btrfs_ordered_extent *ordered;
6989
6990                         btrfs_release_path(root, path);
6991
6992                         lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6993                                     lock_end, GFP_NOFS);
6994                         ordered = btrfs_lookup_first_ordered_extent(inode,
6995                                                                     lock_end);
6996                         if (ordered &&
6997                             ordered->file_offset <= lock_end &&
6998                             ordered->file_offset + ordered->len > lock_start) {
6999                                 unlock_extent(&BTRFS_I(inode)->io_tree,
7000                                               lock_start, lock_end, GFP_NOFS);
7001                                 btrfs_start_ordered_extent(inode, ordered, 1);
7002                                 btrfs_put_ordered_extent(ordered);
7003                                 key.offset += num_bytes;
7004                                 goto skip;
7005                         }
7006                         if (ordered)
7007                                 btrfs_put_ordered_extent(ordered);
7008
7009                         extent_locked = 1;
7010                         continue;
7011                 }
7012
7013                 if (nr_extents == 1) {
7014                         /* update extent pointer in place */
7015                         btrfs_set_file_extent_disk_bytenr(leaf, fi,
7016                                                 new_extents[0].disk_bytenr);
7017                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7018                                                 new_extents[0].disk_num_bytes);
7019                         btrfs_mark_buffer_dirty(leaf);
7020
7021                         btrfs_drop_extent_cache(inode, key.offset,
7022                                                 key.offset + num_bytes - 1, 0);
7023
7024                         ret = btrfs_inc_extent_ref(trans, root,
7025                                                 new_extents[0].disk_bytenr,
7026                                                 new_extents[0].disk_num_bytes,
7027                                                 leaf->start,
7028                                                 root->root_key.objectid,
7029                                                 trans->transid,
7030                                                 key.objectid);
7031                         BUG_ON(ret);
7032
7033                         ret = btrfs_free_extent(trans, root,
7034                                                 extent_key->objectid,
7035                                                 extent_key->offset,
7036                                                 leaf->start,
7037                                                 btrfs_header_owner(leaf),
7038                                                 btrfs_header_generation(leaf),
7039                                                 key.objectid, 0);
7040                         BUG_ON(ret);
7041
7042                         btrfs_release_path(root, path);
7043                         key.offset += num_bytes;
7044                 } else {
7045                         BUG_ON(1);
7046 #if 0
7047                         u64 alloc_hint;
7048                         u64 extent_len;
7049                         int i;
7050                         /*
7051                          * drop the old extent pointer first, then insert
7052                          * the new pointers one by one
7053                          */
7054                         btrfs_release_path(root, path);
7055                         ret = btrfs_drop_extents(trans, root, inode, key.offset,
7056                                                  key.offset + num_bytes,
7057                                                  key.offset, &alloc_hint);
7058                         BUG_ON(ret);
7059
7060                         for (i = 0; i < nr_extents; i++) {
7061                                 if (ext_offset >= new_extents[i].num_bytes) {
7062                                         ext_offset -= new_extents[i].num_bytes;
7063                                         continue;
7064                                 }
7065                                 extent_len = min(new_extents[i].num_bytes -
7066                                                  ext_offset, num_bytes);
7067
7068                                 ret = btrfs_insert_empty_item(trans, root,
7069                                                               path, &key,
7070                                                               sizeof(*fi));
7071                                 BUG_ON(ret);
7072
7073                                 leaf = path->nodes[0];
7074                                 fi = btrfs_item_ptr(leaf, path->slots[0],
7075                                                 struct btrfs_file_extent_item);
7076                                 btrfs_set_file_extent_generation(leaf, fi,
7077                                                         trans->transid);
7078                                 btrfs_set_file_extent_type(leaf, fi,
7079                                                         BTRFS_FILE_EXTENT_REG);
7080                                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
7081                                                 new_extents[i].disk_bytenr);
7082                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7083                                                 new_extents[i].disk_num_bytes);
7084                                 btrfs_set_file_extent_ram_bytes(leaf, fi,
7085                                                 new_extents[i].ram_bytes);
7086
7087                                 btrfs_set_file_extent_compression(leaf, fi,
7088                                                 new_extents[i].compression);
7089                                 btrfs_set_file_extent_encryption(leaf, fi,
7090                                                 new_extents[i].encryption);
7091                                 btrfs_set_file_extent_other_encoding(leaf, fi,
7092                                                 new_extents[i].other_encoding);
7093
7094                                 btrfs_set_file_extent_num_bytes(leaf, fi,
7095                                                         extent_len);
7096                                 ext_offset += new_extents[i].offset;
7097                                 btrfs_set_file_extent_offset(leaf, fi,
7098                                                         ext_offset);
7099                                 btrfs_mark_buffer_dirty(leaf);
7100
7101                                 btrfs_drop_extent_cache(inode, key.offset,
7102                                                 key.offset + extent_len - 1, 0);
7103
7104                                 ret = btrfs_inc_extent_ref(trans, root,
7105                                                 new_extents[i].disk_bytenr,
7106                                                 new_extents[i].disk_num_bytes,
7107                                                 leaf->start,
7108                                                 root->root_key.objectid,
7109                                                 trans->transid, key.objectid);
7110                                 BUG_ON(ret);
7111                                 btrfs_release_path(root, path);
7112
7113                                 inode_add_bytes(inode, extent_len);
7114
7115                                 ext_offset = 0;
7116                                 num_bytes -= extent_len;
7117                                 key.offset += extent_len;
7118
7119                                 if (num_bytes == 0)
7120                                         break;
7121                         }
7122                         BUG_ON(i >= nr_extents);
7123 #endif
7124                 }
7125
7126                 if (extent_locked) {
7127                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
7128                                       lock_end, GFP_NOFS);
7129                         extent_locked = 0;
7130                 }
7131 skip:
7132                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
7133                     key.offset >= search_end)
7134                         break;
7135
7136                 cond_resched();
7137         }
7138         ret = 0;
7139 out:
7140         btrfs_release_path(root, path);
7141         if (inode) {
7142                 mutex_unlock(&inode->i_mutex);
7143                 if (extent_locked) {
7144                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
7145                                       lock_end, GFP_NOFS);
7146                 }
7147                 iput(inode);
7148         }
7149         return ret;
7150 }
7151
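/*
 * Illustrative sketch (editorial, not part of the original file): the
 * lock-and-recheck pattern replace_one_extent() uses above, reduced to
 * a hypothetical helper.  The range is locked first; if an ordered
 * extent overlaps it, the lock is dropped, the IO is waited on, and
 * the whole thing is retried.
 */
static void lock_range_no_ordered(struct inode *inode, u64 start, u64 end)
{
        struct btrfs_ordered_extent *ordered;

        while (1) {
                lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
                ordered = btrfs_lookup_first_ordered_extent(inode, end);
                if (!ordered)
                        break;
                if (ordered->file_offset > end ||
                    ordered->file_offset + ordered->len <= start) {
                        /* the ordered extent is outside our range */
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
                /* overlap: drop the lock, wait for the IO, try again */
                unlock_extent(&BTRFS_I(inode)->io_tree, start, end,
                              GFP_NOFS);
                btrfs_start_ordered_extent(inode, ordered, 1);
                btrfs_put_ordered_extent(ordered);
        }
}
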
7152 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
7153                                struct btrfs_root *root,
7154                                struct extent_buffer *buf, u64 orig_start)
7155 {
7156         int level;
7157         int ret;
7158
7159         BUG_ON(btrfs_header_generation(buf) != trans->transid);
7160         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7161
7162         level = btrfs_header_level(buf);
7163         if (level == 0) {
7164                 struct btrfs_leaf_ref *ref;
7165                 struct btrfs_leaf_ref *orig_ref;
7166
7167                 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
7168                 if (!orig_ref)
7169                         return -ENOENT;
7170
7171                 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
7172                 if (!ref) {
7173                         btrfs_free_leaf_ref(root, orig_ref);
7174                         return -ENOMEM;
7175                 }
7176
7177                 ref->nritems = orig_ref->nritems;
7178                 memcpy(ref->extents, orig_ref->extents,
7179                         sizeof(ref->extents[0]) * ref->nritems);
7180
7181                 btrfs_free_leaf_ref(root, orig_ref);
7182
7183                 ref->root_gen = trans->transid;
7184                 ref->bytenr = buf->start;
7185                 ref->owner = btrfs_header_owner(buf);
7186                 ref->generation = btrfs_header_generation(buf);
7187
7188                 ret = btrfs_add_leaf_ref(root, ref, 0);
7189                 WARN_ON(ret);
7190                 btrfs_free_leaf_ref(root, ref);
7191         }
7192         return 0;
7193 }
7194
7195 static noinline int invalidate_extent_cache(struct btrfs_root *root,
7196                                         struct extent_buffer *leaf,
7197                                         struct btrfs_block_group_cache *group,
7198                                         struct btrfs_root *target_root)
7199 {
7200         struct btrfs_key key;
7201         struct inode *inode = NULL;
7202         struct btrfs_file_extent_item *fi;
7203         struct extent_state *cached_state = NULL;
7204         u64 num_bytes;
7205         u64 skip_objectid = 0;
7206         u32 nritems;
7207         u32 i;
7208
7209         nritems = btrfs_header_nritems(leaf);
7210         for (i = 0; i < nritems; i++) {
7211                 btrfs_item_key_to_cpu(leaf, &key, i);
7212                 if (key.objectid == skip_objectid ||
7213                     key.type != BTRFS_EXTENT_DATA_KEY)
7214                         continue;
7215                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
7216                 if (btrfs_file_extent_type(leaf, fi) ==
7217                     BTRFS_FILE_EXTENT_INLINE)
7218                         continue;
7219                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
7220                         continue;
7221                 if (!inode || inode->i_ino != key.objectid) {
7222                         iput(inode);
7223                         inode = btrfs_ilookup(target_root->fs_info->sb,
7224                                               key.objectid, target_root, 1);
7225                 }
7226                 if (!inode) {
7227                         skip_objectid = key.objectid;
7228                         continue;
7229                 }
7230                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
7231
7232                 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
7233                                  key.offset + num_bytes - 1, 0, &cached_state,
7234                                  GFP_NOFS);
7235                 btrfs_drop_extent_cache(inode, key.offset,
7236                                         key.offset + num_bytes - 1, 1);
7237                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
7238                                      key.offset + num_bytes - 1, &cached_state,
7239                                      GFP_NOFS);
7240                 cond_resched();
7241         }
7242         iput(inode);
7243         return 0;
7244 }
7245
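/*
 * Illustrative sketch (editorial, not part of the original file): the
 * per-range cache drop that invalidate_extent_cache() performs above,
 * as a hypothetical stand-alone helper.
 */
static void drop_range_cache(struct inode *inode, u64 start, u64 end)
{
        struct extent_state *cached_state = NULL;

        /* serialize against anyone touching this range */
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end, 0,
                         &cached_state, GFP_NOFS);
        /* throw away any cached extent mapping for the range */
        btrfs_drop_extent_cache(inode, start, end, 1);
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end,
                             &cached_state, GFP_NOFS);
}
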
7246 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
7247                                         struct btrfs_root *root,
7248                                         struct extent_buffer *leaf,
7249                                         struct btrfs_block_group_cache *group,
7250                                         struct inode *reloc_inode)
7251 {
7252         struct btrfs_key key;
7253         struct btrfs_key extent_key;
7254         struct btrfs_file_extent_item *fi;
7255         struct btrfs_leaf_ref *ref;
7256         struct disk_extent *new_extent;
7257         u64 bytenr;
7258         u64 num_bytes;
7259         u32 nritems;
7260         u32 i;
7261         int ext_index;
7262         int nr_extent;
7263         int ret;
7264
7265         new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
7266         BUG_ON(!new_extent);
7267
7268         ref = btrfs_lookup_leaf_ref(root, leaf->start);
7269         BUG_ON(!ref);
7270
7271         ext_index = -1;
7272         nritems = btrfs_header_nritems(leaf);
7273         for (i = 0; i < nritems; i++) {
7274                 btrfs_item_key_to_cpu(leaf, &key, i);
7275                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
7276                         continue;
7277                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
7278                 if (btrfs_file_extent_type(leaf, fi) ==
7279                     BTRFS_FILE_EXTENT_INLINE)
7280                         continue;
7281                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7282                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
7283                 if (bytenr == 0)
7284                         continue;
7285
7286                 ext_index++;
7287                 if (bytenr >= group->key.objectid + group->key.offset ||
7288                     bytenr + num_bytes <= group->key.objectid)
7289                         continue;
7290
7291                 extent_key.objectid = bytenr;
7292                 extent_key.offset = num_bytes;
7293                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
7294                 nr_extent = 1;
7295                 ret = get_new_locations(reloc_inode, &extent_key,
7296                                         group->key.objectid, 1,
7297                                         &new_extent, &nr_extent);
7298                 if (ret > 0)
7299                         continue;
7300                 BUG_ON(ret < 0);
7301
7302                 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
7303                 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
7304                 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
7305                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
7306
7307                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
7308                                                 new_extent->disk_bytenr);
7309                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7310                                                 new_extent->disk_num_bytes);
7311                 btrfs_mark_buffer_dirty(leaf);
7312
7313                 ret = btrfs_inc_extent_ref(trans, root,
7314                                         new_extent->disk_bytenr,
7315                                         new_extent->disk_num_bytes,
7316                                         leaf->start,
7317                                         root->root_key.objectid,
7318                                         trans->transid, key.objectid);
7319                 BUG_ON(ret);
7320
7321                 ret = btrfs_free_extent(trans, root,
7322                                         bytenr, num_bytes, leaf->start,
7323                                         btrfs_header_owner(leaf),
7324                                         btrfs_header_generation(leaf),
7325                                         key.objectid, 0);
7326                 BUG_ON(ret);
7327                 cond_resched();
7328         }
7329         kfree(new_extent);
7330         BUG_ON(ext_index + 1 != ref->nritems);
7331         btrfs_free_leaf_ref(root, ref);
7332         return 0;
7333 }
7334
7335 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
7336                           struct btrfs_root *root)
7337 {
7338         struct btrfs_root *reloc_root;
7339         int ret;
7340
7341         if (root->reloc_root) {
7342                 reloc_root = root->reloc_root;
7343                 root->reloc_root = NULL;
7344                 list_add(&reloc_root->dead_list,
7345                          &root->fs_info->dead_reloc_roots);
7346
7347                 btrfs_set_root_bytenr(&reloc_root->root_item,
7348                                       reloc_root->node->start);
7349                 btrfs_set_root_level(&reloc_root->root_item,
7350                                      btrfs_header_level(reloc_root->node));
7351                 memset(&reloc_root->root_item.drop_progress, 0,
7352                         sizeof(struct btrfs_disk_key));
7353                 reloc_root->root_item.drop_level = 0;
7354
7355                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
7356                                         &reloc_root->root_key,
7357                                         &reloc_root->root_item);
7358                 BUG_ON(ret);
7359         }
7360         return 0;
7361 }
7362
7363 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7364 {
7365         struct btrfs_trans_handle *trans;
7366         struct btrfs_root *reloc_root;
7367         struct btrfs_root *prev_root = NULL;
7368         struct list_head dead_roots;
7369         int ret;
7370         unsigned long nr;
7371
7372         INIT_LIST_HEAD(&dead_roots);
7373         list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
7374
7375         while (!list_empty(&dead_roots)) {
7376                 reloc_root = list_entry(dead_roots.prev,
7377                                         struct btrfs_root, dead_list);
7378                 list_del_init(&reloc_root->dead_list);
7379
7380                 BUG_ON(reloc_root->commit_root != NULL);
7381                 while (1) {
7382                         trans = btrfs_join_transaction(root, 1);
7383                         BUG_ON(!trans);
7384
7385                         mutex_lock(&root->fs_info->drop_mutex);
7386                         ret = btrfs_drop_snapshot(trans, reloc_root);
7387                         if (ret != -EAGAIN)
7388                                 break;
7389                         mutex_unlock(&root->fs_info->drop_mutex);
7390
7391                         nr = trans->blocks_used;
7392                         ret = btrfs_end_transaction(trans, root);
7393                         BUG_ON(ret);
7394                         btrfs_btree_balance_dirty(root, nr);
7395                 }
7396
7397                 free_extent_buffer(reloc_root->node);
7398
7399                 ret = btrfs_del_root(trans, root->fs_info->tree_root,
7400                                      &reloc_root->root_key);
7401                 BUG_ON(ret);
7402                 mutex_unlock(&root->fs_info->drop_mutex);
7403
7404                 nr = trans->blocks_used;
7405                 ret = btrfs_end_transaction(trans, root);
7406                 BUG_ON(ret);
7407                 btrfs_btree_balance_dirty(root, nr);
7408
7409                 kfree(prev_root);
7410                 prev_root = reloc_root;
7411         }
7412         if (prev_root) {
7413                 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
7414                 kfree(prev_root);
7415         }
7416         return 0;
7417 }
7418
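/*
 * Illustrative sketch (editorial, not part of the original file): the
 * -EAGAIN loop above, reduced to its skeleton.  Each pass drops as much
 * of the dead root as fits in one transaction, commits, and retries
 * until btrfs_drop_snapshot() stops asking for another pass.  Unlike
 * the real function, this hypothetical helper does not keep the
 * drop_mutex and the transaction across the final pass to delete the
 * root item.
 */
static void drop_root_in_steps(struct btrfs_root *root,
                               struct btrfs_root *dead_root)
{
        struct btrfs_trans_handle *trans;
        unsigned long nr;
        int ret;

        do {
                trans = btrfs_join_transaction(root, 1);
                BUG_ON(!trans);

                mutex_lock(&root->fs_info->drop_mutex);
                ret = btrfs_drop_snapshot(trans, dead_root);
                mutex_unlock(&root->fs_info->drop_mutex);

                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(root, nr);
        } while (ret == -EAGAIN);
}
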
7419 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
7420 {
7421         list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
7422         return 0;
7423 }
7424
7425 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7426 {
7427         struct btrfs_root *reloc_root;
7428         struct btrfs_trans_handle *trans;
7429         struct btrfs_key location;
7430         int found;
7431         int ret;
7432
7433         mutex_lock(&root->fs_info->tree_reloc_mutex);
7434         ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
7435         BUG_ON(ret);
7436         found = !list_empty(&root->fs_info->dead_reloc_roots);
7437         mutex_unlock(&root->fs_info->tree_reloc_mutex);
7438
7439         if (found) {
7440                 trans = btrfs_start_transaction(root, 1);
7441                 BUG_ON(!trans);
7442                 ret = btrfs_commit_transaction(trans, root);
7443                 BUG_ON(ret);
7444         }
7445
7446         location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
7447         location.offset = (u64)-1;
7448         location.type = BTRFS_ROOT_ITEM_KEY;
7449
7450         reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7451         BUG_ON(!reloc_root);
7452         btrfs_orphan_cleanup(reloc_root);
7453         return 0;
7454 }
7455
7456 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7457                                     struct btrfs_root *root)
7458 {
7459         struct btrfs_root *reloc_root;
7460         struct extent_buffer *eb;
7461         struct btrfs_root_item *root_item;
7462         struct btrfs_key root_key;
7463         int ret;
7464
7465         BUG_ON(!root->ref_cows);
7466         if (root->reloc_root)
7467                 return 0;
7468
7469         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7470         BUG_ON(!root_item);
7471
7472         ret = btrfs_copy_root(trans, root, root->commit_root,
7473                               &eb, BTRFS_TREE_RELOC_OBJECTID);
7474         BUG_ON(ret);
7475
7476         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
7477         root_key.offset = root->root_key.objectid;
7478         root_key.type = BTRFS_ROOT_ITEM_KEY;
7479
7480         memcpy(root_item, &root->root_item, sizeof(*root_item));
7481         btrfs_set_root_refs(root_item, 0);
7482         btrfs_set_root_bytenr(root_item, eb->start);
7483         btrfs_set_root_level(root_item, btrfs_header_level(eb));
7484         btrfs_set_root_generation(root_item, trans->transid);
7485
7486         btrfs_tree_unlock(eb);
7487         free_extent_buffer(eb);
7488
7489         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
7490                                 &root_key, root_item);
7491         BUG_ON(ret);
7492         kfree(root_item);
7493
7494         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7495                                                  &root_key);
7496         BUG_ON(!reloc_root);
7497         reloc_root->last_trans = trans->transid;
7498         reloc_root->commit_root = NULL;
7499         reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
7500
7501         root->reloc_root = reloc_root;
7502         return 0;
7503 }
7504
7505 /*
7506  * Core function of space balance.
7507  *
7508  * The idea is to use reloc trees to relocate tree blocks in reference
7509  * counted roots. There is one reloc tree for each subvol, and all
7510  * reloc trees share the same root key objectid. Reloc trees are
7511  * snapshots of the latest committed roots of subvols (root->commit_root).
7512  *
7513  * To relocate a tree block referenced by a subvol, there are two steps:
7514  * COW the block through the subvol's reloc tree, then update the block
7515  * pointer in the subvol to point to the new block. Since all reloc
7516  * trees share the same root key objectid, special handling of tree
7517  * blocks owned by them is easy. Once a tree block has been COWed in
7518  * one reloc tree, the resulting new block can be used directly when
7519  * the same block needs to be COWed again through other reloc trees.
7520  * In this way, relocated tree blocks are shared between reloc trees,
7521  * and so they are also shared between subvols.
7522  */
7523 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
7524                                       struct btrfs_root *root,
7525                                       struct btrfs_path *path,
7526                                       struct btrfs_key *first_key,
7527                                       struct btrfs_ref_path *ref_path,
7528                                       struct btrfs_block_group_cache *group,
7529                                       struct inode *reloc_inode)
7530 {
7531         struct btrfs_root *reloc_root;
7532         struct extent_buffer *eb = NULL;
7533         struct btrfs_key *keys;
7534         u64 *nodes;
7535         int level;
7536         int shared_level;
7537         int lowest_level = 0;
7538         int ret;
7539
7540         if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
7541                 lowest_level = ref_path->owner_objectid;
7542
7543         if (!root->ref_cows) {
7544                 path->lowest_level = lowest_level;
7545                 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
7546                 BUG_ON(ret < 0);
7547                 path->lowest_level = 0;
7548                 btrfs_release_path(root, path);
7549                 return 0;
7550         }
7551
7552         mutex_lock(&root->fs_info->tree_reloc_mutex);
7553         ret = init_reloc_tree(trans, root);
7554         BUG_ON(ret);
7555         reloc_root = root->reloc_root;
7556
7557         shared_level = ref_path->shared_level;
7558         ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
7559
7560         keys = ref_path->node_keys;
7561         nodes = ref_path->new_nodes;
7562         memset(&keys[shared_level + 1], 0,
7563                sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
7564         memset(&nodes[shared_level + 1], 0,
7565                sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
7566
7567         if (nodes[lowest_level] == 0) {
7568                 path->lowest_level = lowest_level;
7569                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7570                                         0, 1);
7571                 BUG_ON(ret);
7572                 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
7573                         eb = path->nodes[level];
7574                         if (!eb || eb == reloc_root->node)
7575                                 break;
7576                         nodes[level] = eb->start;
7577                         if (level == 0)
7578                                 btrfs_item_key_to_cpu(eb, &keys[level], 0);
7579                         else
7580                                 btrfs_node_key_to_cpu(eb, &keys[level], 0);
7581                 }
7582                 if (nodes[0] &&
7583                     ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7584                         eb = path->nodes[0];
7585                         ret = replace_extents_in_leaf(trans, reloc_root, eb,
7586                                                       group, reloc_inode);
7587                         BUG_ON(ret);
7588                 }
7589                 btrfs_release_path(reloc_root, path);
7590         } else {
7591                 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
7592                                        lowest_level);
7593                 BUG_ON(ret);
7594         }
7595
7596         /*
7597          * replace tree blocks in the fs tree with tree blocks in
7598          * the reloc tree.
7599          */
7600         ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
7601         BUG_ON(ret < 0);
7602
7603         if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7604                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7605                                         0, 0);
7606                 BUG_ON(ret);
7607                 extent_buffer_get(path->nodes[0]);
7608                 eb = path->nodes[0];
7609                 btrfs_release_path(reloc_root, path);
7610                 ret = invalidate_extent_cache(reloc_root, eb, group, root);
7611                 BUG_ON(ret);
7612                 free_extent_buffer(eb);
7613         }
7614
7615         mutex_unlock(&root->fs_info->tree_reloc_mutex);
7616         path->lowest_level = 0;
7617         return 0;
7618 }
7619
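/*
 * Editorial note (not part of the original file): the two callers of
 * relocate_one_path() below show the two relocation modes.  Metadata
 * relocation passes no block group or inode; data relocation (pass 1
 * of relocate_one_extent()) passes the block group being drained and
 * the reloc inode so that leaf extents can be rewritten:
 *
 *      ret = relocate_one_path(trans, found_root, path, &first_key,
 *                              ref_path, NULL, NULL);
 *      ret = relocate_one_path(trans, found_root, path, &first_key,
 *                              ref_path, group, reloc_inode);
 */
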
7620 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
7621                                         struct btrfs_root *root,
7622                                         struct btrfs_path *path,
7623                                         struct btrfs_key *first_key,
7624                                         struct btrfs_ref_path *ref_path)
7625 {
7626         int ret;
7627
7628         ret = relocate_one_path(trans, root, path, first_key,
7629                                 ref_path, NULL, NULL);
7630         BUG_ON(ret);
7631
7632         return 0;
7633 }
7634
7635 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
7636                                     struct btrfs_root *extent_root,
7637                                     struct btrfs_path *path,
7638                                     struct btrfs_key *extent_key)
7639 {
7640         int ret;
7641
7642         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
7643         if (ret)
7644                 goto out;
7645         ret = btrfs_del_item(trans, extent_root, path);
7646 out:
7647         btrfs_release_path(extent_root, path);
7648         return ret;
7649 }
7650
7651 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
7652                                                 struct btrfs_ref_path *ref_path)
7653 {
7654         struct btrfs_key root_key;
7655
7656         root_key.objectid = ref_path->root_objectid;
7657         root_key.type = BTRFS_ROOT_ITEM_KEY;
7658         if (is_cowonly_root(ref_path->root_objectid))
7659                 root_key.offset = 0;
7660         else
7661                 root_key.offset = (u64)-1;
7662
7663         return btrfs_read_fs_root_no_name(fs_info, &root_key);
7664 }
7665
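/*
 * Editorial note (not part of the original file): read_ref_root()
 * relies on the root-item key convention: COW-only trees (extent tree,
 * chunk tree, ...) are keyed with offset 0, while reference counted
 * subvol roots are looked up with offset (u64)-1 so that
 * btrfs_read_fs_root_no_name() finds the latest root item, e.g.:
 *
 *      root_key.objectid = ref_path->root_objectid;
 *      root_key.type = BTRFS_ROOT_ITEM_KEY;
 *      root_key.offset = 0;            // COW-only tree
 *      root_key.offset = (u64)-1;      // reference counted subvol
 */
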
7666 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7667                                         struct btrfs_path *path,
7668                                         struct btrfs_key *extent_key,
7669                                         struct btrfs_block_group_cache *group,
7670                                         struct inode *reloc_inode, int pass)
7671 {
7672         struct btrfs_trans_handle *trans;
7673         struct btrfs_root *found_root;
7674         struct btrfs_ref_path *ref_path = NULL;
7675         struct disk_extent *new_extents = NULL;
7676         int nr_extents = 0;
7677         int loops;
7678         int ret;
7679         int level;
7680         struct btrfs_key first_key;
7681         u64 prev_block = 0;
7682
7684         trans = btrfs_start_transaction(extent_root, 1);
7685         BUG_ON(!trans);
7686
7687         if (extent_key->objectid == 0) {
7688                 ret = del_extent_zero(trans, extent_root, path, extent_key);
7689                 goto out;
7690         }
7691
7692         ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
7693         if (!ref_path) {
7694                 ret = -ENOMEM;
7695                 goto out;
7696         }
7697
7698         for (loops = 0; ; loops++) {
7699                 if (loops == 0) {
7700                         ret = btrfs_first_ref_path(trans, extent_root, ref_path,
7701                                                    extent_key->objectid);
7702                 } else {
7703                         ret = btrfs_next_ref_path(trans, extent_root, ref_path);
7704                 }
7705                 if (ret < 0)
7706                         goto out;
7707                 if (ret > 0)
7708                         break;
7709
7710                 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
7711                     ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
7712                         continue;
7713
7714                 found_root = read_ref_root(extent_root->fs_info, ref_path);
7715                 BUG_ON(!found_root);
7716                 /*
7717                  * for reference counted trees, only process reference paths
7718                  * rooted at the latest committed root.
7719                  */
7720                 if (found_root->ref_cows &&
7721                     ref_path->root_generation != found_root->root_key.offset)
7722                         continue;
7723
7724                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7725                         if (pass == 0) {
7726                                 /*
7727                                  * copy data extents to new locations
7728                                  */
7729                                 u64 group_start = group->key.objectid;
7730                                 ret = relocate_data_extent(reloc_inode,
7731                                                            extent_key,
7732                                                            group_start);
7733                                 if (ret < 0)
7734                                         goto out;
7735                                 break;
7736                         }
7737                         level = 0;
7738                 } else {
7739                         level = ref_path->owner_objectid;
7740                 }
7741
7742                 if (prev_block != ref_path->nodes[level]) {
7743                         struct extent_buffer *eb;
7744                         u64 block_start = ref_path->nodes[level];
7745                         u64 block_size = btrfs_level_size(found_root, level);
7746
7747                         eb = read_tree_block(found_root, block_start,
7748                                              block_size, 0);
7749                         btrfs_tree_lock(eb);
7750                         BUG_ON(level != btrfs_header_level(eb));
7751
7752                         if (level == 0)
7753                                 btrfs_item_key_to_cpu(eb, &first_key, 0);
7754                         else
7755                                 btrfs_node_key_to_cpu(eb, &first_key, 0);
7756
7757                         btrfs_tree_unlock(eb);
7758                         free_extent_buffer(eb);
7759                         prev_block = block_start;
7760                 }
7761
7762                 mutex_lock(&extent_root->fs_info->trans_mutex);
7763                 btrfs_record_root_in_trans(found_root);
7764                 mutex_unlock(&extent_root->fs_info->trans_mutex);
7765                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7766                         /*
7767                          * try to update data extent references while
7768                          * keeping metadata shared between snapshots.
7769                          */
7770                         if (pass == 1) {
7771                                 ret = relocate_one_path(trans, found_root,
7772                                                 path, &first_key, ref_path,
7773                                                 group, reloc_inode);
7774                                 if (ret < 0)
7775                                         goto out;
7776                                 continue;
7777                         }
7778                         /*
7779                          * use fallback method to process the remaining
7780                          * references.
7781                          */
7782                         if (!new_extents) {
7783                                 u64 group_start = group->key.objectid;
7784                                 new_extents = kmalloc(sizeof(*new_extents),
7785                                                       GFP_NOFS);
7786                                 nr_extents = 1;
7787                                 ret = get_new_locations(reloc_inode,
7788                                                         extent_key,
7789                                                         group_start, 1,
7790                                                         &new_extents,
7791                                                         &nr_extents);
7792                                 if (ret)
7793                                         goto out;
7794                         }
7795                         ret = replace_one_extent(trans, found_root,
7796                                                 path, extent_key,
7797                                                 &first_key, ref_path,
7798                                                 new_extents, nr_extents);
7799                 } else {
7800                         ret = relocate_tree_block(trans, found_root, path,
7801                                                   &first_key, ref_path);
7802                 }
7803                 if (ret < 0)
7804                         goto out;
7805         }
7806         ret = 0;
7807 out:
7808         btrfs_end_transaction(trans, extent_root);
7809         kfree(new_extents);
7810         kfree(ref_path);
7811         return ret;
7812 }
7813 #endif
7814
7815 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7816 {
7817         u64 num_devices;
7818         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7819                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7820
7821         num_devices = root->fs_info->fs_devices->rw_devices;
7822         if (num_devices == 1) {
7823                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7824                 stripped = flags & ~stripped;
7825
7826                 /* turn raid0 into single device chunks */
7827                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7828                         return stripped;
7829
7830                 /* turn mirroring into duplication */
7831                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7832                              BTRFS_BLOCK_GROUP_RAID10))
7833                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7834                 return flags;
7835         } else {
7836                 /* they already had raid on here, just return */
7837                 if (flags & stripped)
7838                         return flags;
7839
7840                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7841                 stripped = flags & ~stripped;
7842
7843                 /* switch duplicated blocks with raid1 */
7844                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7845                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7846
7847                 /* turn single device chunks into raid0 */
7848                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
7849         }
7850         return flags;
7851 }
7852
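/*
 * Illustrative sketch (editorial, not part of the original file): the
 * restripe rules above, spelled out for the single-device case.  This
 * hypothetical self-check only documents the expected conversions.
 */
static void __maybe_unused check_restripe_rules(struct btrfs_root *root)
{
        if (root->fs_info->fs_devices->rw_devices != 1)
                return;

        /* raid0 degrades to single-device chunks (no profile bits) */
        WARN_ON(update_block_group_flags(root, BTRFS_BLOCK_GROUP_RAID0) != 0);
        /* mirroring degrades to duplication on one device */
        WARN_ON(update_block_group_flags(root, BTRFS_BLOCK_GROUP_RAID1) !=
                BTRFS_BLOCK_GROUP_DUP);
}
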
7853 static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7854 {
7855         struct btrfs_space_info *sinfo = cache->space_info;
7856         u64 num_bytes;
7857         int ret = -ENOSPC;
7858
7859         if (cache->ro)
7860                 return 0;
7861
7862         spin_lock(&sinfo->lock);
7863         spin_lock(&cache->lock);
7864         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7865                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7866
7867         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7868             sinfo->bytes_may_use + sinfo->bytes_readonly +
7869             cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7870                 sinfo->bytes_readonly += num_bytes;
7871                 sinfo->bytes_reserved += cache->reserved_pinned;
7872                 cache->reserved_pinned = 0;
7873                 cache->ro = 1;
7874                 ret = 0;
7875         }
7876         spin_unlock(&cache->lock);
7877         spin_unlock(&sinfo->lock);
7878         return ret;
7879 }
7880
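/*
 * Worked example (editorial, not part of the original file) for the
 * check in set_block_group_ro() above: take a 1024MB block group with
 * 100MB used, nothing reserved or pinned and 4MB of super stripes.
 * num_bytes is then 1024 - 4 - 100 = 920MB of free space that becomes
 * unallocatable once the group is read-only.  The group may go
 * read-only only if
 *
 *      used + reserved + pinned + may_use + readonly +
 *      reserved_pinned + num_bytes < total_bytes
 *
 * still holds for the whole space_info, i.e. the remaining block
 * groups can absorb the loss of that 920MB.
 */
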
7881 int btrfs_set_block_group_ro(struct btrfs_root *root,
7882                              struct btrfs_block_group_cache *cache)
7884 {
7885         struct btrfs_trans_handle *trans;
7886         u64 alloc_flags;
7887         int ret;
7888
7889         BUG_ON(cache->ro);
7890
7891         trans = btrfs_join_transaction(root, 1);
7892         BUG_ON(IS_ERR(trans));
7893
7894         alloc_flags = update_block_group_flags(root, cache->flags);
7895         if (alloc_flags != cache->flags)
7896                 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7897
7898         ret = set_block_group_ro(cache);
7899         if (!ret)
7900                 goto out;
7901         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7902         ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7903         if (ret < 0)
7904                 goto out;
7905         ret = set_block_group_ro(cache);
7906 out:
7907         btrfs_end_transaction(trans, root);
7908         return ret;
7909 }
7910
7911 int btrfs_set_block_group_rw(struct btrfs_root *root,
7912                               struct btrfs_block_group_cache *cache)
7913 {
7914         struct btrfs_space_info *sinfo = cache->space_info;
7915         u64 num_bytes;
7916
7917         BUG_ON(!cache->ro);
7918
7919         spin_lock(&sinfo->lock);
7920         spin_lock(&cache->lock);
7921         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7922                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7923         sinfo->bytes_readonly -= num_bytes;
7924         cache->ro = 0;
7925         spin_unlock(&cache->lock);
7926         spin_unlock(&sinfo->lock);
7927         return 0;
7928 }
7929
7930 /*
7931  * checks to see if it's even possible to relocate this block group.
7932  *
7933  * @return - -1 if it's not a good idea to relocate this block group, 0 if
7934  * it's ok to go ahead and try.
7935  */
7936 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7937 {
7938         struct btrfs_block_group_cache *block_group;
7939         struct btrfs_space_info *space_info;
7940         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7941         struct btrfs_device *device;
7942         int full = 0;
7943         int ret = 0;
7944
7945         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7946
7947         /* odd, couldn't find the block group, leave it alone */
7948         if (!block_group)
7949                 return -1;
7950
7951         /* no bytes used, we're good */
7952         if (!btrfs_block_group_used(&block_group->item))
7953                 goto out;
7954
7955         space_info = block_group->space_info;
7956         spin_lock(&space_info->lock);
7957
7958         full = space_info->full;
7959
7960         /*
7961          * if this is the last block group we have in this space, we can't
7962          * relocate it unless we're able to allocate a new chunk below.
7963          *
7964          * Otherwise, we need to make sure we have room in the space to handle
7965          * all of the extents from this block group.  If we can, we're good.
7966          */
7967         if ((space_info->total_bytes != block_group->key.offset) &&
7968            (space_info->bytes_used + space_info->bytes_reserved +
7969             space_info->bytes_pinned + space_info->bytes_readonly +
7970             btrfs_block_group_used(&block_group->item) <
7971             space_info->total_bytes)) {
7972                 spin_unlock(&space_info->lock);
7973                 goto out;
7974         }
7975         spin_unlock(&space_info->lock);
7976
7977         /*
7978          * ok we don't have enough space, but maybe we have free space on our
7979          * devices to allocate new chunks for relocation, so loop through our
7980          * alloc devices and guess if we have enough space.  However, if we
7981          * were marked as full, then we know there aren't enough chunks, and we
7982          * can just return.
7983          */
7984         ret = -1;
7985         if (full)
7986                 goto out;
7987
7988         mutex_lock(&root->fs_info->chunk_mutex);
7989         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7990                 u64 min_free = btrfs_block_group_used(&block_group->item);
7991                 u64 dev_offset, max_avail;
7992
7993                 /*
7994                  * check to make sure we can actually find a chunk with enough
7995                  * space to fit our block group in.
7996                  */
7997                 if (device->total_bytes > device->bytes_used + min_free) {
7998                         ret = find_free_dev_extent(NULL, device, min_free,
7999                                                    &dev_offset, &max_avail);
8000                         if (!ret)
8001                                 break;
8002                         ret = -1;
8003                 }
8004         }
8005         mutex_unlock(&root->fs_info->chunk_mutex);
8006 out:
8007         btrfs_put_block_group(block_group);
8008         return ret;
8009 }
8010
8011 static int find_first_block_group(struct btrfs_root *root,
8012                 struct btrfs_path *path, struct btrfs_key *key)
8013 {
8014         int ret = 0;
8015         struct btrfs_key found_key;
8016         struct extent_buffer *leaf;
8017         int slot;
8018
8019         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
8020         if (ret < 0)
8021                 goto out;
8022
8023         while (1) {
8024                 slot = path->slots[0];
8025                 leaf = path->nodes[0];
8026                 if (slot >= btrfs_header_nritems(leaf)) {
8027                         ret = btrfs_next_leaf(root, path);
8028                         if (ret == 0)
8029                                 continue;
8030                         if (ret < 0)
8031                                 goto out;
8032                         break;
8033                 }
8034                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
8035
8036                 if (found_key.objectid >= key->objectid &&
8037                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
8038                         ret = 0;
8039                         goto out;
8040                 }
8041                 path->slots[0]++;
8042         }
8043 out:
8044         return ret;
8045 }
8046
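     /*
      * Drop the inode reference (iref) each block group may still hold on
      * its free space cache inode; this is meant to run late in unmount so
      * the inodes can be released before the block group cache itself is
      * torn down.
      */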
8047 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8048 {
8049         struct btrfs_block_group_cache *block_group;
8050         u64 last = 0;
8051
8052         while (1) {
8053                 struct inode *inode;
8054
8055                 block_group = btrfs_lookup_first_block_group(info, last);
8056                 while (block_group) {
8057                         spin_lock(&block_group->lock);
8058                         if (block_group->iref)
8059                                 break;
8060                         spin_unlock(&block_group->lock);
8061                         block_group = next_block_group(info->tree_root,
8062                                                        block_group);
8063                 }
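                     /*
                      * if no group at or after 'last' still held an inode
                      * ref, rescan once from the start (when we began
                      * partway in) before giving up
                      */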
8064                 if (!block_group) {
8065                         if (last == 0)
8066                                 break;
8067                         last = 0;
8068                         continue;
8069                 }
8070
8071                 inode = block_group->inode;
8072                 block_group->iref = 0;
8073                 block_group->inode = NULL;
8074                 spin_unlock(&block_group->lock);
8075                 iput(inode);
8076                 last = block_group->key.objectid + block_group->key.offset;
8077                 btrfs_put_block_group(block_group);
8078         }
8079 }
8080
8081 int btrfs_free_block_groups(struct btrfs_fs_info *info)
8082 {
8083         struct btrfs_block_group_cache *block_group;
8084         struct btrfs_space_info *space_info;
8085         struct btrfs_caching_control *caching_ctl;
8086         struct rb_node *n;
8087
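              /* drop any caching controls left over from caching threads */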
8088         down_write(&info->extent_commit_sem);
8089         while (!list_empty(&info->caching_block_groups)) {
8090                 caching_ctl = list_entry(info->caching_block_groups.next,
8091                                          struct btrfs_caching_control, list);
8092                 list_del(&caching_ctl->list);
8093                 put_caching_control(caching_ctl);
8094         }
8095         up_write(&info->extent_commit_sem);
8096
8097         spin_lock(&info->block_group_cache_lock);
8098         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8099                 block_group = rb_entry(n, struct btrfs_block_group_cache,
8100                                        cache_node);
8101                 rb_erase(&block_group->cache_node,
8102                          &info->block_group_cache_tree);
8103                 spin_unlock(&info->block_group_cache_lock);
8104
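                     /*
                      * the cache spinlock was dropped above because the
                      * rwsem and wait_block_group_cache_done() below can
                      * sleep
                      */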
8105                 down_write(&block_group->space_info->groups_sem);
8106                 list_del(&block_group->list);
8107                 up_write(&block_group->space_info->groups_sem);
8108
8109                 if (block_group->cached == BTRFS_CACHE_STARTED)
8110                         wait_block_group_cache_done(block_group);
8111
8112                 btrfs_remove_free_space_cache(block_group);
8113                 btrfs_put_block_group(block_group);
8114
8115                 spin_lock(&info->block_group_cache_lock);
8116         }
8117         spin_unlock(&info->block_group_cache_lock);
8118
8119         /* now that all the block groups are freed, go through and
8120          * free all the space_info structs.  This is only called during
8121          * the final stages of unmount, and so we know nobody is
8122          * using them.  We call synchronize_rcu() once before we start,
8123          * just to be on the safe side.
8124          */
8125         synchronize_rcu();
8126
8127         release_global_block_rsv(info);
8128
8129         while (!list_empty(&info->space_info)) {
8130                 space_info = list_entry(info->space_info.next,
8131                                         struct btrfs_space_info,
8132                                         list);
8133                 if (space_info->bytes_pinned > 0 ||
8134                     space_info->bytes_reserved > 0) {
8135                         WARN_ON(1);
8136                         dump_space_info(space_info, 0, 0);
8137                 }
8138                 list_del(&space_info->list);
8139                 kfree(space_info);
8140         }
8141         return 0;
8142 }
8143
8144 static void __link_block_group(struct btrfs_space_info *space_info,
8145                                struct btrfs_block_group_cache *cache)
8146 {
8147         int index = get_block_group_index(cache);
8148
8149         down_write(&space_info->groups_sem);
8150         list_add_tail(&cache->list, &space_info->block_groups[index]);
8151         up_write(&space_info->groups_sem);
8152 }
8153
8154 int btrfs_read_block_groups(struct btrfs_root *root)
8155 {
8156         struct btrfs_path *path;
8157         int ret;
8158         struct btrfs_block_group_cache *cache;
8159         struct btrfs_fs_info *info = root->fs_info;
8160         struct btrfs_space_info *space_info;
8161         struct btrfs_key key;
8162         struct btrfs_key found_key;
8163         struct extent_buffer *leaf;
8164         int need_clear = 0;
8165         u64 cache_gen;
8166
8167         root = info->extent_root;
8168         key.objectid = 0;
8169         key.offset = 0;
8170         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
8171         path = btrfs_alloc_path();
8172         if (!path)
8173                 return -ENOMEM;
8174
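              /*
               * if the superblock generation no longer matches the free
               * space cache generation, the on-disk cache is stale; flag
               * every block group so its cache is cleared and rewritten
               */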
8175         cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
8176         if (cache_gen != 0 &&
8177             btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
8178                 need_clear = 1;
8179
8180         while (1) {
8181                 ret = find_first_block_group(root, path, &key);
8182                 if (ret > 0)
8183                         break;
8184                 if (ret != 0)
8185                         goto error;
8186
8187                 leaf = path->nodes[0];
8188                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8189                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
8190                 if (!cache) {
8191                         ret = -ENOMEM;
8192                         goto error;
8193                 }
8194
8195                 atomic_set(&cache->count, 1);
8196                 spin_lock_init(&cache->lock);
8197                 spin_lock_init(&cache->tree_lock);
8198                 cache->fs_info = info;
8199                 INIT_LIST_HEAD(&cache->list);
8200                 INIT_LIST_HEAD(&cache->cluster_list);
8201
8202                 if (need_clear)
8203                         cache->disk_cache_state = BTRFS_DC_CLEAR;
8204
8205                 /*
8206                  * we only want to have 32k of ram per block group for keeping
8207                  * track of free space, and if we pass 1/2 of that we want to
8208                  * start converting things over to using bitmaps
8209                  */
8210                 cache->extents_thresh = ((1024 * 32) / 2) /
8211                         sizeof(struct btrfs_free_space);
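                     /*
                      * e.g. (hypothetical sizes): if struct btrfs_free_space
                      * were 64 bytes on a 64-bit build, this works out to
                      * (32768 / 2) / 64 = 256 cached extent entries before
                      * bitmaps take over.
                      */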
8212
8213                 read_extent_buffer(leaf, &cache->item,
8214                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
8215                                    sizeof(cache->item));
8216                 memcpy(&cache->key, &found_key, sizeof(found_key));
8217
8218                 key.objectid = found_key.objectid + found_key.offset;
8219                 btrfs_release_path(root, path);
8220                 cache->flags = btrfs_block_group_flags(&cache->item);
8221                 cache->sectorsize = root->sectorsize;
8222
8223                 /*
8224                  * check for two cases: either we are full, and therefore
8225                  * don't need to bother with the caching work since we won't
8226                  * find any space, or we are empty, and we can just add all
8227                  * the space in and be done with it.  This saves us a lot of
8228                  * time, particularly in the full case.
8229                  */
8230                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8231                         exclude_super_stripes(root, cache);
8232                         cache->last_byte_to_unpin = (u64)-1;
8233                         cache->cached = BTRFS_CACHE_FINISHED;
8234                         free_excluded_extents(root, cache);
8235                 } else if (btrfs_block_group_used(&cache->item) == 0) {
8236                         exclude_super_stripes(root, cache);
8237                         cache->last_byte_to_unpin = (u64)-1;
8238                         cache->cached = BTRFS_CACHE_FINISHED;
8239                         add_new_free_space(cache, root->fs_info,
8240                                            found_key.objectid,
8241                                            found_key.objectid +
8242                                            found_key.offset);
8243                         free_excluded_extents(root, cache);
8244                 }
8245
8246                 ret = update_space_info(info, cache->flags, found_key.offset,
8247                                         btrfs_block_group_used(&cache->item),
8248                                         &space_info);
8249                 BUG_ON(ret);
8250                 cache->space_info = space_info;
8251                 spin_lock(&cache->space_info->lock);
8252                 cache->space_info->bytes_readonly += cache->bytes_super;
8253                 spin_unlock(&cache->space_info->lock);
8254
8255                 __link_block_group(space_info, cache);
8256
8257                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
8258                 BUG_ON(ret);
8259
8260                 set_avail_alloc_bits(root->fs_info, cache->flags);
8261                 if (btrfs_chunk_readonly(root, cache->key.objectid))
8262                         set_block_group_ro(cache);
8263         }
8264
8265         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
8266                 if (!(get_alloc_profile(root, space_info->flags) &
8267                       (BTRFS_BLOCK_GROUP_RAID10 |
8268                        BTRFS_BLOCK_GROUP_RAID1 |
8269                        BTRFS_BLOCK_GROUP_DUP)))
8270                         continue;
8271                 /*
8272                  * avoid allocating from un-mirrored block group if there are
8273                  * mirrored block groups.
8274                  */
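                     /* lists 3 and 4 hold the un-mirrored (RAID0/single)
                      * block groups in this kernel's index scheme */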
8275                 list_for_each_entry(cache, &space_info->block_groups[3], list)
8276                         set_block_group_ro(cache);
8277                 list_for_each_entry(cache, &space_info->block_groups[4], list)
8278                         set_block_group_ro(cache);
8279         }
8280
8281         init_global_block_rsv(info);
8282         ret = 0;
8283 error:
8284         btrfs_free_path(path);
8285         return ret;
8286 }
8287
8288 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8289                            struct btrfs_root *root, u64 bytes_used,
8290                            u64 type, u64 chunk_objectid, u64 chunk_offset,
8291                            u64 size)
8292 {
8293         int ret;
8294         struct btrfs_root *extent_root;
8295         struct btrfs_block_group_cache *cache;
8296
8297         extent_root = root->fs_info->extent_root;
8298
8299         root->fs_info->last_trans_log_full_commit = trans->transid;
8300
8301         cache = kzalloc(sizeof(*cache), GFP_NOFS);
8302         if (!cache)
8303                 return -ENOMEM;
8304
8305         cache->key.objectid = chunk_offset;
8306         cache->key.offset = size;
8307         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8308         cache->sectorsize = root->sectorsize;
8309         cache->fs_info = root->fs_info;
8310
8311         /*
8312          * we only want to have 32k of ram per block group for keeping track
8313          * of free space, and if we pass 1/2 of that we want to start
8314          * converting things over to using bitmaps
8315          */
8316         cache->extents_thresh = ((1024 * 32) / 2) /
8317                 sizeof(struct btrfs_free_space);
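              /* same 32k heuristic as in btrfs_read_block_groups() above */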
8318         atomic_set(&cache->count, 1);
8319         spin_lock_init(&cache->lock);
8320         spin_lock_init(&cache->tree_lock);
8321         INIT_LIST_HEAD(&cache->list);
8322         INIT_LIST_HEAD(&cache->cluster_list);
8323
8324         btrfs_set_block_group_used(&cache->item, bytes_used);
8325         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8326         cache->flags = type;
8327         btrfs_set_block_group_flags(&cache->item, type);
8328
8329         cache->last_byte_to_unpin = (u64)-1;
8330         cache->cached = BTRFS_CACHE_FINISHED;
8331         exclude_super_stripes(root, cache);
8332
8333         add_new_free_space(cache, root->fs_info, chunk_offset,
8334                            chunk_offset + size);
8335
8336         free_excluded_extents(root, cache);
8337
8338         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
8339                                 &cache->space_info);
8340         BUG_ON(ret);
8341
8342         spin_lock(&cache->space_info->lock);
8343         cache->space_info->bytes_readonly += cache->bytes_super;
8344         spin_unlock(&cache->space_info->lock);
8345
8346         __link_block_group(cache->space_info, cache);
8347
8348         ret = btrfs_add_block_group_cache(root->fs_info, cache);
8349         BUG_ON(ret);
8350
8351         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
8352                                 sizeof(cache->item));
8353         BUG_ON(ret);
8354
8355         set_avail_alloc_bits(extent_root->fs_info, type);
8356
8357         return 0;
8358 }
8359
8360 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8361                              struct btrfs_root *root, u64 group_start)
8362 {
8363         struct btrfs_path *path;
8364         struct btrfs_block_group_cache *block_group;
8365         struct btrfs_free_cluster *cluster;
8366         struct btrfs_root *tree_root = root->fs_info->tree_root;
8367         struct btrfs_key key;
8368         struct inode *inode;
8369         int ret;
8370
8371         root = root->fs_info->extent_root;
8372
8373         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8374         BUG_ON(!block_group);
8375         BUG_ON(!block_group->ro);
8376
8377         /* make sure this block group isn't part of an allocation cluster */
8378         cluster = &root->fs_info->data_alloc_cluster;
8379         spin_lock(&cluster->refill_lock);
8380         btrfs_return_cluster_to_free_space(block_group, cluster);
8381         spin_unlock(&cluster->refill_lock);
8382
8383         /*
8384          * make sure this block group isn't part of a metadata
8385          * allocation cluster
8386          */
8387         cluster = &root->fs_info->meta_alloc_cluster;
8388         spin_lock(&cluster->refill_lock);
8389         btrfs_return_cluster_to_free_space(block_group, cluster);
8390         spin_unlock(&cluster->refill_lock);
8391
8392         path = btrfs_alloc_path();
8393         BUG_ON(!path);
8394
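              /*
               * orphan the free space cache inode, if one exists, so its
               * backing extents are reclaimed when the transaction commits,
               * and drop the references that keep it in memory
               */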
8395         inode = lookup_free_space_inode(root, block_group, path);
8396         if (!IS_ERR(inode)) {
8397                 btrfs_orphan_add(trans, inode);
8398                 clear_nlink(inode);
8399                 /* One for the block group's ref */
8400                 spin_lock(&block_group->lock);
8401                 if (block_group->iref) {
8402                         block_group->iref = 0;
8403                         block_group->inode = NULL;
8404                         spin_unlock(&block_group->lock);
8405                         iput(inode);
8406                 } else {
8407                         spin_unlock(&block_group->lock);
8408                 }
8409                 /* One for our lookup ref */
8410                 iput(inode);
8411         }
8412
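              /* delete this group's free space cache item from the tree
               * root, if present */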
8413         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8414         key.offset = block_group->key.objectid;
8415         key.type = 0;
8416
8417         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8418         if (ret < 0)
8419                 goto out;
8420         if (ret > 0)
8421                 btrfs_release_path(tree_root, path);
8422         if (ret == 0) {
8423                 ret = btrfs_del_item(trans, tree_root, path);
8424                 if (ret)
8425                         goto out;
8426                 btrfs_release_path(tree_root, path);
8427         }
8428
8429         spin_lock(&root->fs_info->block_group_cache_lock);
8430         rb_erase(&block_group->cache_node,
8431                  &root->fs_info->block_group_cache_tree);
8432         spin_unlock(&root->fs_info->block_group_cache_lock);
8433
8434         down_write(&block_group->space_info->groups_sem);
8435         /*
8436          * we must use list_del_init so people can check to see if they
8437          * are still on the list after taking the semaphore
8438          */
8439         list_del_init(&block_group->list);
8440         up_write(&block_group->space_info->groups_sem);
8441
8442         if (block_group->cached == BTRFS_CACHE_STARTED)
8443                 wait_block_group_cache_done(block_group);
8444
8445         btrfs_remove_free_space_cache(block_group);
8446
8447         spin_lock(&block_group->space_info->lock);
8448         block_group->space_info->total_bytes -= block_group->key.offset;
8449         block_group->space_info->bytes_readonly -= block_group->key.offset;
8450         spin_unlock(&block_group->space_info->lock);
8451
8452         memcpy(&key, &block_group->key, sizeof(key));
8453
8454         btrfs_clear_space_info_full(root->fs_info);
8455
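              /* once for the lookup reference above, once for the ref the
               * block group cache tree held */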
8456         btrfs_put_block_group(block_group);
8457         btrfs_put_block_group(block_group);
8458
8459         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8460         if (ret > 0)
8461                 ret = -EIO;
8462         if (ret < 0)
8463                 goto out;
8464
8465         ret = btrfs_del_item(trans, root, path);
8466 out:
8467         btrfs_free_path(path);
8468         return ret;
8469 }