btrfs: integrating raid-repair and scrub-fixup-nodatasum
[linux-beck.git] / fs / btrfs / extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/module.h>
8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h>
10 #include <linux/swap.h>
11 #include <linux/writeback.h>
12 #include <linux/pagevec.h>
13 #include <linux/prefetch.h>
14 #include <linux/cleancache.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
18 #include "ctree.h"
19 #include "btrfs_inode.h"
20 #include "volumes.h"
21
22 static struct kmem_cache *extent_state_cache;
23 static struct kmem_cache *extent_buffer_cache;
24
25 static LIST_HEAD(buffers);
26 static LIST_HEAD(states);
27
28 #define LEAK_DEBUG 0
29 #if LEAK_DEBUG
30 static DEFINE_SPINLOCK(leak_lock);
31 #endif
32
33 #define BUFFER_LRU_MAX 64
34
35 struct tree_entry {
36         u64 start;
37         u64 end;
38         struct rb_node rb_node;
39 };
40
41 struct extent_page_data {
42         struct bio *bio;
43         struct extent_io_tree *tree;
44         get_extent_t *get_extent;
45
46         /* tells writepage not to lock the state bits for this range;
47          * it still does the unlocking
48          */
49         unsigned int extent_locked:1;
50
51         /* tells the submit_bio code to use a WRITE_SYNC */
52         unsigned int sync_io:1;
53 };
54
55 int __init extent_io_init(void)
56 {
57         extent_state_cache = kmem_cache_create("extent_state",
58                         sizeof(struct extent_state), 0,
59                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
60         if (!extent_state_cache)
61                 return -ENOMEM;
62
63         extent_buffer_cache = kmem_cache_create("extent_buffers",
64                         sizeof(struct extent_buffer), 0,
65                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
66         if (!extent_buffer_cache)
67                 goto free_state_cache;
68         return 0;
69
70 free_state_cache:
71         kmem_cache_destroy(extent_state_cache);
72         return -ENOMEM;
73 }
74
75 void extent_io_exit(void)
76 {
77         struct extent_state *state;
78         struct extent_buffer *eb;
79
80         while (!list_empty(&states)) {
81                 state = list_entry(states.next, struct extent_state, leak_list);
82                 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
83                        "state %lu in tree %p refs %d\n",
84                        (unsigned long long)state->start,
85                        (unsigned long long)state->end,
86                        state->state, state->tree, atomic_read(&state->refs));
87                 list_del(&state->leak_list);
88                 kmem_cache_free(extent_state_cache, state);
89
90         }
91
92         while (!list_empty(&buffers)) {
93                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
94                 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
95                        "refs %d\n", (unsigned long long)eb->start,
96                        eb->len, atomic_read(&eb->refs));
97                 list_del(&eb->leak_list);
98                 kmem_cache_free(extent_buffer_cache, eb);
99         }
100         if (extent_state_cache)
101                 kmem_cache_destroy(extent_state_cache);
102         if (extent_buffer_cache)
103                 kmem_cache_destroy(extent_buffer_cache);
104 }
105
106 void extent_io_tree_init(struct extent_io_tree *tree,
107                          struct address_space *mapping)
108 {
109         tree->state = RB_ROOT;
110         INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
111         tree->ops = NULL;
112         tree->dirty_bytes = 0;
113         spin_lock_init(&tree->lock);
114         spin_lock_init(&tree->buffer_lock);
115         tree->mapping = mapping;
116 }
117
118 static struct extent_state *alloc_extent_state(gfp_t mask)
119 {
120         struct extent_state *state;
121 #if LEAK_DEBUG
122         unsigned long flags;
123 #endif
124
125         state = kmem_cache_alloc(extent_state_cache, mask);
126         if (!state)
127                 return state;
128         state->state = 0;
129         state->private = 0;
130         state->tree = NULL;
131 #if LEAK_DEBUG
132         spin_lock_irqsave(&leak_lock, flags);
133         list_add(&state->leak_list, &states);
134         spin_unlock_irqrestore(&leak_lock, flags);
135 #endif
136         atomic_set(&state->refs, 1);
137         init_waitqueue_head(&state->wq);
138         return state;
139 }
140
141 void free_extent_state(struct extent_state *state)
142 {
143         if (!state)
144                 return;
145         if (atomic_dec_and_test(&state->refs)) {
146 #if LEAK_DEBUG
147                 unsigned long flags;
148 #endif
149                 WARN_ON(state->tree);
150 #if LEAK_DEBUG
151                 spin_lock_irqsave(&leak_lock, flags);
152                 list_del(&state->leak_list);
153                 spin_unlock_irqrestore(&leak_lock, flags);
154 #endif
155                 kmem_cache_free(extent_state_cache, state);
156         }
157 }
158
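/*
 * insert 'node' into the rb tree, keyed by 'offset' (the end of the range).
 * If an existing entry already covers 'offset', that entry is returned and
 * nothing is inserted; otherwise the node is linked in and NULL is returned.
 */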
159 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
160                                    struct rb_node *node)
161 {
162         struct rb_node **p = &root->rb_node;
163         struct rb_node *parent = NULL;
164         struct tree_entry *entry;
165
166         while (*p) {
167                 parent = *p;
168                 entry = rb_entry(parent, struct tree_entry, rb_node);
169
170                 if (offset < entry->start)
171                         p = &(*p)->rb_left;
172                 else if (offset > entry->end)
173                         p = &(*p)->rb_right;
174                 else
175                         return parent;
176         }
177
178         entry = rb_entry(node, struct tree_entry, rb_node);
179         rb_link_node(node, parent, p);
180         rb_insert_color(node, root);
181         return NULL;
182 }
183
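/*
 * search the tree for the entry covering 'offset'.  On an exact hit the
 * matching rb_node is returned.  On a miss NULL is returned and, when the
 * caller asked for them, *prev_ret is set to the first entry that ends at
 * or after 'offset' and *next_ret to the last entry that starts at or
 * before it.
 */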
184 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
185                                      struct rb_node **prev_ret,
186                                      struct rb_node **next_ret)
187 {
188         struct rb_root *root = &tree->state;
189         struct rb_node *n = root->rb_node;
190         struct rb_node *prev = NULL;
191         struct rb_node *orig_prev = NULL;
192         struct tree_entry *entry;
193         struct tree_entry *prev_entry = NULL;
194
195         while (n) {
196                 entry = rb_entry(n, struct tree_entry, rb_node);
197                 prev = n;
198                 prev_entry = entry;
199
200                 if (offset < entry->start)
201                         n = n->rb_left;
202                 else if (offset > entry->end)
203                         n = n->rb_right;
204                 else
205                         return n;
206         }
207
208         if (prev_ret) {
209                 orig_prev = prev;
210                 while (prev && offset > prev_entry->end) {
211                         prev = rb_next(prev);
212                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
213                 }
214                 *prev_ret = prev;
215                 prev = orig_prev;
216         }
217
218         if (next_ret) {
219                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
220                 while (prev && offset < prev_entry->start) {
221                         prev = rb_prev(prev);
222                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
223                 }
224                 *next_ret = prev;
225         }
226         return NULL;
227 }
228
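/*
 * like __etree_search, but on a miss return the first entry that ends at
 * or after 'offset' instead of NULL (or NULL if there is no such entry).
 */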
229 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
230                                           u64 offset)
231 {
232         struct rb_node *prev = NULL;
233         struct rb_node *ret;
234
235         ret = __etree_search(tree, offset, &prev, NULL);
236         if (!ret)
237                 return prev;
238         return ret;
239 }
240
241 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
242                      struct extent_state *other)
243 {
244         if (tree->ops && tree->ops->merge_extent_hook)
245                 tree->ops->merge_extent_hook(tree->mapping->host, new,
246                                              other);
247 }
248
249 /*
250  * utility function to look for merge candidates inside a given range.
251  * Any extents with matching state are merged together into a single
252  * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
253  * are not merged because the end_io handlers need to be able to do
254  * operations on them without sleeping (or doing allocations/splits).
255  *
256  * This should be called with the tree lock held.
257  */
258 static void merge_state(struct extent_io_tree *tree,
259                         struct extent_state *state)
260 {
261         struct extent_state *other;
262         struct rb_node *other_node;
263
264         if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
265                 return;
266
267         other_node = rb_prev(&state->rb_node);
268         if (other_node) {
269                 other = rb_entry(other_node, struct extent_state, rb_node);
270                 if (other->end == state->start - 1 &&
271                     other->state == state->state) {
272                         merge_cb(tree, state, other);
273                         state->start = other->start;
274                         other->tree = NULL;
275                         rb_erase(&other->rb_node, &tree->state);
276                         free_extent_state(other);
277                 }
278         }
279         other_node = rb_next(&state->rb_node);
280         if (other_node) {
281                 other = rb_entry(other_node, struct extent_state, rb_node);
282                 if (other->start == state->end + 1 &&
283                     other->state == state->state) {
284                         merge_cb(tree, state, other);
285                         state->end = other->end;
286                         other->tree = NULL;
287                         rb_erase(&other->rb_node, &tree->state);
288                         free_extent_state(other);
289                 }
290         }
291 }
292
293 static void set_state_cb(struct extent_io_tree *tree,
294                          struct extent_state *state, int *bits)
295 {
296         if (tree->ops && tree->ops->set_bit_hook)
297                 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
298 }
299
300 static void clear_state_cb(struct extent_io_tree *tree,
301                            struct extent_state *state, int *bits)
302 {
303         if (tree->ops && tree->ops->clear_bit_hook)
304                 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
305 }
306
307 static void set_state_bits(struct extent_io_tree *tree,
308                            struct extent_state *state, int *bits);
309
310 /*
311  * insert an extent_state struct into the tree.  'bits' are set on the
312  * struct before it is inserted.
313  *
314  * This may return -EEXIST if the extent is already there, in which case the
315  * state struct is freed.
316  *
317  * The tree lock is not taken internally.  This is a utility function and
318  * probably isn't what you want to call (see set/clear_extent_bit).
319  */
320 static int insert_state(struct extent_io_tree *tree,
321                         struct extent_state *state, u64 start, u64 end,
322                         int *bits)
323 {
324         struct rb_node *node;
325
326         if (end < start) {
327                 printk(KERN_ERR "btrfs end < start %llu %llu\n",
328                        (unsigned long long)end,
329                        (unsigned long long)start);
330                 WARN_ON(1);
331         }
332         state->start = start;
333         state->end = end;
334
335         set_state_bits(tree, state, bits);
336
337         node = tree_insert(&tree->state, end, &state->rb_node);
338         if (node) {
339                 struct extent_state *found;
340                 found = rb_entry(node, struct extent_state, rb_node);
341                 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
342                        "%llu %llu\n", (unsigned long long)found->start,
343                        (unsigned long long)found->end,
344                        (unsigned long long)start, (unsigned long long)end);
345                 return -EEXIST;
346         }
347         state->tree = tree;
348         merge_state(tree, state);
349         return 0;
350 }
351
352 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
353                      u64 split)
354 {
355         if (tree->ops && tree->ops->split_extent_hook)
356                 tree->ops->split_extent_hook(tree->mapping->host, orig, split);
357 }
358
359 /*
360  * split a given extent state struct in two, inserting the preallocated
361  * struct 'prealloc' as the newly created second half.  'split' indicates an
362  * offset inside 'orig' where it should be split.
363  *
364  * Before calling,
365  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
366  * are two extent state structs in the tree:
367  * prealloc: [orig->start, split - 1]
368  * orig: [ split, orig->end ]
369  *
370  * The tree locks are not taken by this function. They need to be held
371  * by the caller.
372  */
373 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
374                        struct extent_state *prealloc, u64 split)
375 {
376         struct rb_node *node;
377
378         split_cb(tree, orig, split);
379
380         prealloc->start = orig->start;
381         prealloc->end = split - 1;
382         prealloc->state = orig->state;
383         orig->start = split;
384
385         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
386         if (node) {
387                 free_extent_state(prealloc);
388                 return -EEXIST;
389         }
390         prealloc->tree = tree;
391         return 0;
392 }
393
394 /*
395  * utility function to clear some bits in an extent state struct.
396  * it will optionally wake up anyone waiting on this state (wake == 1).
397  * Forcible removal is done by clearing all bits (see 'delete' in clear_extent_bit).
398  *
399  * If no bits are set on the state struct after clearing things, the
400  * struct is freed and removed from the tree
401  */
402 static int clear_state_bit(struct extent_io_tree *tree,
403                             struct extent_state *state,
404                             int *bits, int wake)
405 {
406         int bits_to_clear = *bits & ~EXTENT_CTLBITS;
407         int ret = state->state & bits_to_clear;
408
409         if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
410                 u64 range = state->end - state->start + 1;
411                 WARN_ON(range > tree->dirty_bytes);
412                 tree->dirty_bytes -= range;
413         }
414         clear_state_cb(tree, state, bits);
415         state->state &= ~bits_to_clear;
416         if (wake)
417                 wake_up(&state->wq);
418         if (state->state == 0) {
419                 if (state->tree) {
420                         rb_erase(&state->rb_node, &tree->state);
421                         state->tree = NULL;
422                         free_extent_state(state);
423                 } else {
424                         WARN_ON(1);
425                 }
426         } else {
427                 merge_state(tree, state);
428         }
429         return ret;
430 }
431
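/*
 * refill the preallocated state with a GFP_ATOMIC allocation if the caller
 * does not already hold one; may still return NULL under memory pressure.
 */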
432 static struct extent_state *
433 alloc_extent_state_atomic(struct extent_state *prealloc)
434 {
435         if (!prealloc)
436                 prealloc = alloc_extent_state(GFP_ATOMIC);
437
438         return prealloc;
439 }
440
441 /*
442  * clear some bits on a range in the tree.  This may require splitting
443  * or inserting elements in the tree, so the gfp mask is used to
444  * indicate which allocations or sleeping are allowed.
445  *
446  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
447  * the given range from the tree regardless of state (ie for truncate).
448  *
449  * the range [start, end] is inclusive.
450  *
451  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
452  * bits were already set, or zero if none of the bits were already set.
453  */
454 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
455                      int bits, int wake, int delete,
456                      struct extent_state **cached_state,
457                      gfp_t mask)
458 {
459         struct extent_state *state;
460         struct extent_state *cached;
461         struct extent_state *prealloc = NULL;
462         struct rb_node *next_node;
463         struct rb_node *node;
464         u64 last_end;
465         int err;
466         int set = 0;
467         int clear = 0;
468
469         if (delete)
470                 bits |= ~EXTENT_CTLBITS;
471         bits |= EXTENT_FIRST_DELALLOC;
472
473         if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
474                 clear = 1;
475 again:
476         if (!prealloc && (mask & __GFP_WAIT)) {
477                 prealloc = alloc_extent_state(mask);
478                 if (!prealloc)
479                         return -ENOMEM;
480         }
481
482         spin_lock(&tree->lock);
483         if (cached_state) {
484                 cached = *cached_state;
485
486                 if (clear) {
487                         *cached_state = NULL;
488                         cached_state = NULL;
489                 }
490
491                 if (cached && cached->tree && cached->start <= start &&
492                     cached->end > start) {
493                         if (clear)
494                                 atomic_dec(&cached->refs);
495                         state = cached;
496                         goto hit_next;
497                 }
498                 if (clear)
499                         free_extent_state(cached);
500         }
501         /*
502          * this search will find the extents that end after
503          * our range starts
504          */
505         node = tree_search(tree, start);
506         if (!node)
507                 goto out;
508         state = rb_entry(node, struct extent_state, rb_node);
509 hit_next:
510         if (state->start > end)
511                 goto out;
512         WARN_ON(state->end < start);
513         last_end = state->end;
514
515         /*
516          *     | ---- desired range ---- |
517          *  | state | or
518          *  | ------------- state -------------- |
519          *
520          * We need to split the extent we found, and may flip
521          * bits on second half.
522          *
523          * If the extent we found extends past our range, we
524          * just split and search again.  It'll get split again
525          * the next time though.
526          *
527          * If the extent we found is inside our range, we clear
528          * the desired bit on it.
529          */
530
531         if (state->start < start) {
532                 prealloc = alloc_extent_state_atomic(prealloc);
533                 BUG_ON(!prealloc);
534                 err = split_state(tree, state, prealloc, start);
535                 BUG_ON(err == -EEXIST);
536                 prealloc = NULL;
537                 if (err)
538                         goto out;
539                 if (state->end <= end) {
540                         set |= clear_state_bit(tree, state, &bits, wake);
541                         if (last_end == (u64)-1)
542                                 goto out;
543                         start = last_end + 1;
544                 }
545                 goto search_again;
546         }
547         /*
548          * | ---- desired range ---- |
549          *                        | state |
550          * We need to split the extent, and clear the bit
551          * on the first half
552          */
553         if (state->start <= end && state->end > end) {
554                 prealloc = alloc_extent_state_atomic(prealloc);
555                 BUG_ON(!prealloc);
556                 err = split_state(tree, state, prealloc, end + 1);
557                 BUG_ON(err == -EEXIST);
558                 if (wake)
559                         wake_up(&state->wq);
560
561                 set |= clear_state_bit(tree, prealloc, &bits, wake);
562
563                 prealloc = NULL;
564                 goto out;
565         }
566
567         if (state->end < end && prealloc && !need_resched())
568                 next_node = rb_next(&state->rb_node);
569         else
570                 next_node = NULL;
571
572         set |= clear_state_bit(tree, state, &bits, wake);
573         if (last_end == (u64)-1)
574                 goto out;
575         start = last_end + 1;
576         if (start <= end && next_node) {
577                 state = rb_entry(next_node, struct extent_state,
578                                  rb_node);
579                 if (state->start == start)
580                         goto hit_next;
581         }
582         goto search_again;
583
584 out:
585         spin_unlock(&tree->lock);
586         if (prealloc)
587                 free_extent_state(prealloc);
588
589         return set;
590
591 search_again:
592         if (start > end)
593                 goto out;
594         spin_unlock(&tree->lock);
595         if (mask & __GFP_WAIT)
596                 cond_resched();
597         goto again;
598 }
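/*
 * A minimal sketch of a hypothetical caller clearing EXTENT_DIRTY and
 * EXTENT_DELALLOC on a range; the tree, offsets and gfp mask are made up
 * for illustration and follow the signature above:
 *
 *        ret = clear_extent_bit(tree, start, start + len - 1,
 *                               EXTENT_DIRTY | EXTENT_DELALLOC,
 *                               0, 0, NULL, GFP_NOFS);
 *
 * a return value > 0 means at least one of the bits was set somewhere in
 * the range, 0 means none were, < 0 is an error.
 */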
599
600 static int wait_on_state(struct extent_io_tree *tree,
601                          struct extent_state *state)
602                 __releases(tree->lock)
603                 __acquires(tree->lock)
604 {
605         DEFINE_WAIT(wait);
606         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
607         spin_unlock(&tree->lock);
608         schedule();
609         spin_lock(&tree->lock);
610         finish_wait(&state->wq, &wait);
611         return 0;
612 }
613
614 /*
615  * waits for one or more bits to clear on a range in the state tree.
616  * The range [start, end] is inclusive.
617  * The tree lock is taken by this function
618  */
619 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
620 {
621         struct extent_state *state;
622         struct rb_node *node;
623
624         spin_lock(&tree->lock);
625 again:
626         while (1) {
627                 /*
628                  * this search will find all the extents that end after
629                  * our range starts
630                  */
631                 node = tree_search(tree, start);
632                 if (!node)
633                         break;
634
635                 state = rb_entry(node, struct extent_state, rb_node);
636
637                 if (state->start > end)
638                         goto out;
639
640                 if (state->state & bits) {
641                         start = state->start;
642                         atomic_inc(&state->refs);
643                         wait_on_state(tree, state);
644                         free_extent_state(state);
645                         goto again;
646                 }
647                 start = state->end + 1;
648
649                 if (start > end)
650                         break;
651
652                 cond_resched_lock(&tree->lock);
653         }
654 out:
655         spin_unlock(&tree->lock);
656         return 0;
657 }
658
659 static void set_state_bits(struct extent_io_tree *tree,
660                            struct extent_state *state,
661                            int *bits)
662 {
663         int bits_to_set = *bits & ~EXTENT_CTLBITS;
664
665         set_state_cb(tree, state, bits);
666         if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
667                 u64 range = state->end - state->start + 1;
668                 tree->dirty_bytes += range;
669         }
670         state->state |= bits_to_set;
671 }
672
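/*
 * remember the most recently touched state in *cached_ptr so later calls
 * can skip the rb tree search.  Only states with EXTENT_IOBITS or
 * EXTENT_BOUNDARY set are cached, and the cache holds a reference that
 * uncache_state() drops.
 */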
673 static void cache_state(struct extent_state *state,
674                         struct extent_state **cached_ptr)
675 {
676         if (cached_ptr && !(*cached_ptr)) {
677                 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
678                         *cached_ptr = state;
679                         atomic_inc(&state->refs);
680                 }
681         }
682 }
683
684 static void uncache_state(struct extent_state **cached_ptr)
685 {
686         if (cached_ptr && (*cached_ptr)) {
687                 struct extent_state *state = *cached_ptr;
688                 *cached_ptr = NULL;
689                 free_extent_state(state);
690         }
691 }
692
693 /*
694  * set some bits on a range in the tree.  This may require allocations or
695  * sleeping, so the gfp mask is used to indicate what is allowed.
696  *
697  * If any of the exclusive bits are set, this will fail with -EEXIST if some
698  * part of the range already has the given exclusive bits set.  The start of the
699  * existing range is returned in failed_start in this case.
700  *
701  * [start, end] is inclusive.  This takes the tree lock.
702  */
703
704 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
705                    int bits, int exclusive_bits, u64 *failed_start,
706                    struct extent_state **cached_state, gfp_t mask)
707 {
708         struct extent_state *state;
709         struct extent_state *prealloc = NULL;
710         struct rb_node *node;
711         int err = 0;
712         u64 last_start;
713         u64 last_end;
714
715         bits |= EXTENT_FIRST_DELALLOC;
716 again:
717         if (!prealloc && (mask & __GFP_WAIT)) {
718                 prealloc = alloc_extent_state(mask);
719                 BUG_ON(!prealloc);
720         }
721
722         spin_lock(&tree->lock);
723         if (cached_state && *cached_state) {
724                 state = *cached_state;
725                 if (state->start <= start && state->end > start &&
726                     state->tree) {
727                         node = &state->rb_node;
728                         goto hit_next;
729                 }
730         }
731         /*
732          * this search will find all the extents that end after
733          * our range starts.
734          */
735         node = tree_search(tree, start);
736         if (!node) {
737                 prealloc = alloc_extent_state_atomic(prealloc);
738                 BUG_ON(!prealloc);
739                 err = insert_state(tree, prealloc, start, end, &bits);
740                 prealloc = NULL;
741                 BUG_ON(err == -EEXIST);
742                 goto out;
743         }
744         state = rb_entry(node, struct extent_state, rb_node);
745 hit_next:
746         last_start = state->start;
747         last_end = state->end;
748
749         /*
750          * | ---- desired range ---- |
751          * | state |
752          *
753          * Just lock what we found and keep going
754          */
755         if (state->start == start && state->end <= end) {
756                 struct rb_node *next_node;
757                 if (state->state & exclusive_bits) {
758                         *failed_start = state->start;
759                         err = -EEXIST;
760                         goto out;
761                 }
762
763                 set_state_bits(tree, state, &bits);
764
765                 cache_state(state, cached_state);
766                 merge_state(tree, state);
767                 if (last_end == (u64)-1)
768                         goto out;
769
770                 start = last_end + 1;
771                 next_node = rb_next(&state->rb_node);
772                 if (next_node && start < end && prealloc && !need_resched()) {
773                         state = rb_entry(next_node, struct extent_state,
774                                          rb_node);
775                         if (state->start == start)
776                                 goto hit_next;
777                 }
778                 goto search_again;
779         }
780
781         /*
782          *     | ---- desired range ---- |
783          * | state |
784          *   or
785          * | ------------- state -------------- |
786          *
787          * We need to split the extent we found, and may flip bits on
788          * second half.
789          *
790          * If the extent we found extends past our
791          * range, we just split and search again.  It'll get split
792          * again the next time though.
793          *
794          * If the extent we found is inside our range, we set the
795          * desired bit on it.
796          */
797         if (state->start < start) {
798                 if (state->state & exclusive_bits) {
799                         *failed_start = start;
800                         err = -EEXIST;
801                         goto out;
802                 }
803
804                 prealloc = alloc_extent_state_atomic(prealloc);
805                 BUG_ON(!prealloc);
806                 err = split_state(tree, state, prealloc, start);
807                 BUG_ON(err == -EEXIST);
808                 prealloc = NULL;
809                 if (err)
810                         goto out;
811                 if (state->end <= end) {
812                         set_state_bits(tree, state, &bits);
813                         cache_state(state, cached_state);
814                         merge_state(tree, state);
815                         if (last_end == (u64)-1)
816                                 goto out;
817                         start = last_end + 1;
818                 }
819                 goto search_again;
820         }
821         /*
822          * | ---- desired range ---- |
823          *     | state | or               | state |
824          *
825          * There's a hole, we need to insert something in it and
826          * ignore the extent we found.
827          */
828         if (state->start > start) {
829                 u64 this_end;
830                 if (end < last_start)
831                         this_end = end;
832                 else
833                         this_end = last_start - 1;
834
835                 prealloc = alloc_extent_state_atomic(prealloc);
836                 BUG_ON(!prealloc);
837
838                 /*
839                  * Avoid to free 'prealloc' if it can be merged with
840                  * the later extent.
841                  */
842                 err = insert_state(tree, prealloc, start, this_end,
843                                    &bits);
844                 BUG_ON(err == -EEXIST);
845                 if (err) {
846                         free_extent_state(prealloc);
847                         prealloc = NULL;
848                         goto out;
849                 }
850                 cache_state(prealloc, cached_state);
851                 prealloc = NULL;
852                 start = this_end + 1;
853                 goto search_again;
854         }
855         /*
856          * | ---- desired range ---- |
857          *                        | state |
858          * We need to split the extent, and set the bit
859          * on the first half
860          */
861         if (state->start <= end && state->end > end) {
862                 if (state->state & exclusive_bits) {
863                         *failed_start = start;
864                         err = -EEXIST;
865                         goto out;
866                 }
867
868                 prealloc = alloc_extent_state_atomic(prealloc);
869                 BUG_ON(!prealloc);
870                 err = split_state(tree, state, prealloc, end + 1);
871                 BUG_ON(err == -EEXIST);
872
873                 set_state_bits(tree, prealloc, &bits);
874                 cache_state(prealloc, cached_state);
875                 merge_state(tree, prealloc);
876                 prealloc = NULL;
877                 goto out;
878         }
879
880         goto search_again;
881
882 out:
883         spin_unlock(&tree->lock);
884         if (prealloc)
885                 free_extent_state(prealloc);
886
887         return err;
888
889 search_again:
890         if (start > end)
891                 goto out;
892         spin_unlock(&tree->lock);
893         if (mask & __GFP_WAIT)
894                 cond_resched();
895         goto again;
896 }
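/*
 * A minimal sketch of a hypothetical caller marking a range dirty without
 * exclusivity; the variables are illustrative only and follow the signature
 * above (see also the set_extent_dirty() wrapper below):
 *
 *        err = set_extent_bit(tree, start, end, EXTENT_DIRTY,
 *                             0, NULL, NULL, GFP_NOFS);
 *
 * passing an exclusive bit such as EXTENT_LOCKED instead makes the call
 * fail with -EEXIST and report the clashing offset through *failed_start.
 */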
897
898 /* wrappers around set/clear extent bit */
899 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
900                      gfp_t mask)
901 {
902         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
903                               NULL, mask);
904 }
905
906 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
907                     int bits, gfp_t mask)
908 {
909         return set_extent_bit(tree, start, end, bits, 0, NULL,
910                               NULL, mask);
911 }
912
913 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
914                       int bits, gfp_t mask)
915 {
916         return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
917 }
918
919 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
920                         struct extent_state **cached_state, gfp_t mask)
921 {
922         return set_extent_bit(tree, start, end,
923                               EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
924                               0, NULL, cached_state, mask);
925 }
926
927 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
928                        gfp_t mask)
929 {
930         return clear_extent_bit(tree, start, end,
931                                 EXTENT_DIRTY | EXTENT_DELALLOC |
932                                 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
933 }
934
935 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
936                      gfp_t mask)
937 {
938         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
939                               NULL, mask);
940 }
941
942 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
943                         struct extent_state **cached_state, gfp_t mask)
944 {
945         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
946                               NULL, cached_state, mask);
947 }
948
949 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
950                                  u64 end, struct extent_state **cached_state,
951                                  gfp_t mask)
952 {
953         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
954                                 cached_state, mask);
955 }
956
957 /*
958  * either insert or lock a state struct between start and end.  Use mask to
959  * tell us if waiting is desired.
960  */
961 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
962                      int bits, struct extent_state **cached_state, gfp_t mask)
963 {
964         int err;
965         u64 failed_start;
966         while (1) {
967                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
968                                      EXTENT_LOCKED, &failed_start,
969                                      cached_state, mask);
970                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
971                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
972                         start = failed_start;
973                 } else {
974                         break;
975                 }
976                 WARN_ON(start > end);
977         }
978         return err;
979 }
980
981 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
982 {
983         return lock_extent_bits(tree, start, end, 0, NULL, mask);
984 }
985
986 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
987                     gfp_t mask)
988 {
989         int err;
990         u64 failed_start;
991
992         err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
993                              &failed_start, NULL, mask);
994         if (err == -EEXIST) {
995                 if (failed_start > start)
996                         clear_extent_bit(tree, start, failed_start - 1,
997                                          EXTENT_LOCKED, 1, 0, NULL, mask);
998                 return 0;
999         }
1000         return 1;
1001 }
1002
1003 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1004                          struct extent_state **cached, gfp_t mask)
1005 {
1006         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1007                                 mask);
1008 }
1009
1010 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1011 {
1012         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1013                                 mask);
1014 }
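/*
 * A minimal sketch of the usual lock/unlock pairing, with hypothetical
 * offsets, as used by callers that need the range locked across an
 * operation:
 *
 *        lock_extent(tree, start, end, GFP_NOFS);
 *        ... operate on [start, end] ...
 *        unlock_extent(tree, start, end, GFP_NOFS);
 *
 * lock_extent() keeps retrying until the whole range is locked (when the
 * mask allows waiting), while try_lock_extent() backs out and returns 0
 * if any part of the range is already locked.
 */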
1015
1016 /*
1017  * helper function to set both pages and extents in the tree writeback
1018  */
1019 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1020 {
1021         unsigned long index = start >> PAGE_CACHE_SHIFT;
1022         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1023         struct page *page;
1024
1025         while (index <= end_index) {
1026                 page = find_get_page(tree->mapping, index);
1027                 BUG_ON(!page);
1028                 set_page_writeback(page);
1029                 page_cache_release(page);
1030                 index++;
1031         }
1032         return 0;
1033 }
1034
1035 /* find the first state struct with 'bits' set after 'start', and
1036  * return it.  tree->lock must be held.  NULL will be returned if
1037  * nothing was found after 'start'
1038  */
1039 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1040                                                  u64 start, int bits)
1041 {
1042         struct rb_node *node;
1043         struct extent_state *state;
1044
1045         /*
1046          * this search will find all the extents that end after
1047          * our range starts.
1048          */
1049         node = tree_search(tree, start);
1050         if (!node)
1051                 goto out;
1052
1053         while (1) {
1054                 state = rb_entry(node, struct extent_state, rb_node);
1055                 if (state->end >= start && (state->state & bits))
1056                         return state;
1057
1058                 node = rb_next(node);
1059                 if (!node)
1060                         break;
1061         }
1062 out:
1063         return NULL;
1064 }
1065
1066 /*
1067  * find the first offset in the io tree with 'bits' set. zero is
1068  * returned if we find something, and *start_ret and *end_ret are
1069  * set to reflect the state struct that was found.
1070  *
1071  * If nothing was found, 1 is returned, < 0 on error
1072  */
1073 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1074                           u64 *start_ret, u64 *end_ret, int bits)
1075 {
1076         struct extent_state *state;
1077         int ret = 1;
1078
1079         spin_lock(&tree->lock);
1080         state = find_first_extent_bit_state(tree, start, bits);
1081         if (state) {
1082                 *start_ret = state->start;
1083                 *end_ret = state->end;
1084                 ret = 0;
1085         }
1086         spin_unlock(&tree->lock);
1087         return ret;
1088 }
1089
1090 /*
1091  * find a contiguous range of bytes in the file marked as delalloc, not
1092  * more than 'max_bytes'.  start and end are used to return the range.
1093  *
1094  * 1 is returned if we find something, 0 if nothing was in the tree
1095  */
1096 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1097                                         u64 *start, u64 *end, u64 max_bytes,
1098                                         struct extent_state **cached_state)
1099 {
1100         struct rb_node *node;
1101         struct extent_state *state;
1102         u64 cur_start = *start;
1103         u64 found = 0;
1104         u64 total_bytes = 0;
1105
1106         spin_lock(&tree->lock);
1107
1108         /*
1109          * this search will find all the extents that end after
1110          * our range starts.
1111          */
1112         node = tree_search(tree, cur_start);
1113         if (!node) {
1114                 if (!found)
1115                         *end = (u64)-1;
1116                 goto out;
1117         }
1118
1119         while (1) {
1120                 state = rb_entry(node, struct extent_state, rb_node);
1121                 if (found && (state->start != cur_start ||
1122                               (state->state & EXTENT_BOUNDARY))) {
1123                         goto out;
1124                 }
1125                 if (!(state->state & EXTENT_DELALLOC)) {
1126                         if (!found)
1127                                 *end = state->end;
1128                         goto out;
1129                 }
1130                 if (!found) {
1131                         *start = state->start;
1132                         *cached_state = state;
1133                         atomic_inc(&state->refs);
1134                 }
1135                 found++;
1136                 *end = state->end;
1137                 cur_start = state->end + 1;
1138                 node = rb_next(node);
1139                 if (!node)
1140                         break;
1141                 total_bytes += state->end - state->start + 1;
1142                 if (total_bytes >= max_bytes)
1143                         break;
1144         }
1145 out:
1146         spin_unlock(&tree->lock);
1147         return found;
1148 }
1149
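/*
 * unlock every page in [start, end] except locked_page, which the caller
 * still owns.  Pages are looked up and released in batches of 16.
 */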
1150 static noinline int __unlock_for_delalloc(struct inode *inode,
1151                                           struct page *locked_page,
1152                                           u64 start, u64 end)
1153 {
1154         int ret;
1155         struct page *pages[16];
1156         unsigned long index = start >> PAGE_CACHE_SHIFT;
1157         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1158         unsigned long nr_pages = end_index - index + 1;
1159         int i;
1160
1161         if (index == locked_page->index && end_index == index)
1162                 return 0;
1163
1164         while (nr_pages > 0) {
1165                 ret = find_get_pages_contig(inode->i_mapping, index,
1166                                      min_t(unsigned long, nr_pages,
1167                                      ARRAY_SIZE(pages)), pages);
1168                 for (i = 0; i < ret; i++) {
1169                         if (pages[i] != locked_page)
1170                                 unlock_page(pages[i]);
1171                         page_cache_release(pages[i]);
1172                 }
1173                 nr_pages -= ret;
1174                 index += ret;
1175                 cond_resched();
1176         }
1177         return 0;
1178 }
1179
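/*
 * lock every page in [delalloc_start, delalloc_end] except locked_page.
 * If a page has gone away or is no longer dirty, every page locked so far
 * is unlocked again and -EAGAIN is returned so the caller can retry with
 * a smaller range.
 */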
1180 static noinline int lock_delalloc_pages(struct inode *inode,
1181                                         struct page *locked_page,
1182                                         u64 delalloc_start,
1183                                         u64 delalloc_end)
1184 {
1185         unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1186         unsigned long start_index = index;
1187         unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1188         unsigned long pages_locked = 0;
1189         struct page *pages[16];
1190         unsigned long nrpages;
1191         int ret;
1192         int i;
1193
1194         /* the caller is responsible for locking the start index */
1195         if (index == locked_page->index && index == end_index)
1196                 return 0;
1197
1198         /* skip the page at the start index */
1199         nrpages = end_index - index + 1;
1200         while (nrpages > 0) {
1201                 ret = find_get_pages_contig(inode->i_mapping, index,
1202                                      min_t(unsigned long,
1203                                      nrpages, ARRAY_SIZE(pages)), pages);
1204                 if (ret == 0) {
1205                         ret = -EAGAIN;
1206                         goto done;
1207                 }
1208                 /* now we have an array of pages, lock them all */
1209                 for (i = 0; i < ret; i++) {
1210                         /*
1211                          * the caller is taking responsibility for
1212                          * locked_page
1213                          */
1214                         if (pages[i] != locked_page) {
1215                                 lock_page(pages[i]);
1216                                 if (!PageDirty(pages[i]) ||
1217                                     pages[i]->mapping != inode->i_mapping) {
1218                                         ret = -EAGAIN;
1219                                         unlock_page(pages[i]);
1220                                         page_cache_release(pages[i]);
1221                                         goto done;
1222                                 }
1223                         }
1224                         page_cache_release(pages[i]);
1225                         pages_locked++;
1226                 }
1227                 nrpages -= ret;
1228                 index += ret;
1229                 cond_resched();
1230         }
1231         ret = 0;
1232 done:
1233         if (ret && pages_locked) {
1234                 __unlock_for_delalloc(inode, locked_page,
1235                               delalloc_start,
1236                               ((u64)(start_index + pages_locked - 1)) <<
1237                               PAGE_CACHE_SHIFT);
1238         }
1239         return ret;
1240 }
1241
1242 /*
1243  * find a contiguous range of bytes in the file marked as delalloc, not
1244  * more than 'max_bytes'.  start and end are used to return the range.
1245  *
1246  * 1 is returned if we find something, 0 if nothing was in the tree
1247  */
1248 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1249                                              struct extent_io_tree *tree,
1250                                              struct page *locked_page,
1251                                              u64 *start, u64 *end,
1252                                              u64 max_bytes)
1253 {
1254         u64 delalloc_start;
1255         u64 delalloc_end;
1256         u64 found;
1257         struct extent_state *cached_state = NULL;
1258         int ret;
1259         int loops = 0;
1260
1261 again:
1262         /* step one, find a bunch of delalloc bytes starting at start */
1263         delalloc_start = *start;
1264         delalloc_end = 0;
1265         found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1266                                     max_bytes, &cached_state);
1267         if (!found || delalloc_end <= *start) {
1268                 *start = delalloc_start;
1269                 *end = delalloc_end;
1270                 free_extent_state(cached_state);
1271                 return found;
1272         }
1273
1274         /*
1275          * start comes from the offset of locked_page.  We have to lock
1276          * pages in order, so we can't process delalloc bytes before
1277          * locked_page
1278          */
1279         if (delalloc_start < *start)
1280                 delalloc_start = *start;
1281
1282         /*
1283          * make sure to limit the number of pages we try to lock down
1284          * if we're looping.
1285          */
1286         if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1287                 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1288
1289         /* step two, lock all the pages after the page that has start */
1290         ret = lock_delalloc_pages(inode, locked_page,
1291                                   delalloc_start, delalloc_end);
1292         if (ret == -EAGAIN) {
1293                 /* some of the pages are gone, lets avoid looping by
1294                  * shortening the size of the delalloc range we're searching
1295                  */
1296                 free_extent_state(cached_state);
1297                 if (!loops) {
1298                         unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1299                         max_bytes = PAGE_CACHE_SIZE - offset;
1300                         loops = 1;
1301                         goto again;
1302                 } else {
1303                         found = 0;
1304                         goto out_failed;
1305                 }
1306         }
1307         BUG_ON(ret);
1308
1309         /* step three, lock the state bits for the whole range */
1310         lock_extent_bits(tree, delalloc_start, delalloc_end,
1311                          0, &cached_state, GFP_NOFS);
1312
1313         /* then test to make sure it is all still delalloc */
1314         ret = test_range_bit(tree, delalloc_start, delalloc_end,
1315                              EXTENT_DELALLOC, 1, cached_state);
1316         if (!ret) {
1317                 unlock_extent_cached(tree, delalloc_start, delalloc_end,
1318                                      &cached_state, GFP_NOFS);
1319                 __unlock_for_delalloc(inode, locked_page,
1320                               delalloc_start, delalloc_end);
1321                 cond_resched();
1322                 goto again;
1323         }
1324         free_extent_state(cached_state);
1325         *start = delalloc_start;
1326         *end = delalloc_end;
1327 out_failed:
1328         return found;
1329 }
1330
1331 int extent_clear_unlock_delalloc(struct inode *inode,
1332                                 struct extent_io_tree *tree,
1333                                 u64 start, u64 end, struct page *locked_page,
1334                                 unsigned long op)
1335 {
1336         int ret;
1337         struct page *pages[16];
1338         unsigned long index = start >> PAGE_CACHE_SHIFT;
1339         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1340         unsigned long nr_pages = end_index - index + 1;
1341         int i;
1342         int clear_bits = 0;
1343
1344         if (op & EXTENT_CLEAR_UNLOCK)
1345                 clear_bits |= EXTENT_LOCKED;
1346         if (op & EXTENT_CLEAR_DIRTY)
1347                 clear_bits |= EXTENT_DIRTY;
1348
1349         if (op & EXTENT_CLEAR_DELALLOC)
1350                 clear_bits |= EXTENT_DELALLOC;
1351
1352         clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1353         if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1354                     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1355                     EXTENT_SET_PRIVATE2)))
1356                 return 0;
1357
1358         while (nr_pages > 0) {
1359                 ret = find_get_pages_contig(inode->i_mapping, index,
1360                                      min_t(unsigned long,
1361                                      nr_pages, ARRAY_SIZE(pages)), pages);
1362                 for (i = 0; i < ret; i++) {
1363
1364                         if (op & EXTENT_SET_PRIVATE2)
1365                                 SetPagePrivate2(pages[i]);
1366
1367                         if (pages[i] == locked_page) {
1368                                 page_cache_release(pages[i]);
1369                                 continue;
1370                         }
1371                         if (op & EXTENT_CLEAR_DIRTY)
1372                                 clear_page_dirty_for_io(pages[i]);
1373                         if (op & EXTENT_SET_WRITEBACK)
1374                                 set_page_writeback(pages[i]);
1375                         if (op & EXTENT_END_WRITEBACK)
1376                                 end_page_writeback(pages[i]);
1377                         if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1378                                 unlock_page(pages[i]);
1379                         page_cache_release(pages[i]);
1380                 }
1381                 nr_pages -= ret;
1382                 index += ret;
1383                 cond_resched();
1384         }
1385         return 0;
1386 }
1387
1388 /*
1389  * count the number of bytes in the tree that have a given bit(s)
1390  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1391  * cached.  The total number found is returned.
1392  */
1393 u64 count_range_bits(struct extent_io_tree *tree,
1394                      u64 *start, u64 search_end, u64 max_bytes,
1395                      unsigned long bits, int contig)
1396 {
1397         struct rb_node *node;
1398         struct extent_state *state;
1399         u64 cur_start = *start;
1400         u64 total_bytes = 0;
1401         u64 last = 0;
1402         int found = 0;
1403
1404         if (search_end <= cur_start) {
1405                 WARN_ON(1);
1406                 return 0;
1407         }
1408
1409         spin_lock(&tree->lock);
1410         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1411                 total_bytes = tree->dirty_bytes;
1412                 goto out;
1413         }
1414         /*
1415          * this search will find all the extents that end after
1416          * our range starts.
1417          */
1418         node = tree_search(tree, cur_start);
1419         if (!node)
1420                 goto out;
1421
1422         while (1) {
1423                 state = rb_entry(node, struct extent_state, rb_node);
1424                 if (state->start > search_end)
1425                         break;
1426                 if (contig && found && state->start > last + 1)
1427                         break;
1428                 if (state->end >= cur_start && (state->state & bits) == bits) {
1429                         total_bytes += min(search_end, state->end) + 1 -
1430                                        max(cur_start, state->start);
1431                         if (total_bytes >= max_bytes)
1432                                 break;
1433                         if (!found) {
1434                                 *start = max(cur_start, state->start);
1435                                 found = 1;
1436                         }
1437                         last = state->end;
1438                 } else if (contig && found) {
1439                         break;
1440                 }
1441                 node = rb_next(node);
1442                 if (!node)
1443                         break;
1444         }
1445 out:
1446         spin_unlock(&tree->lock);
1447         return total_bytes;
1448 }
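/*
 * A minimal sketch of a hypothetical caller counting dirty bytes in a
 * range; the offsets are illustrative and follow the signature above:
 *
 *        u64 first = range_start;
 *        u64 bytes = count_range_bits(tree, &first, range_end, (u64)-1,
 *                                     EXTENT_DIRTY, 0);
 *
 * 'bytes' is the number of matching bytes found (counting stops once
 * max_bytes is reached) and 'first' is normally advanced to the start of
 * the first matching extent.
 */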
1449
1450 /*
1451  * set the private field for a given byte offset in the tree.  If there isn't
1452  * an extent_state starting at that offset, -ENOENT is returned.
1453  */
1454 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1455 {
1456         struct rb_node *node;
1457         struct extent_state *state;
1458         int ret = 0;
1459
1460         spin_lock(&tree->lock);
1461         /*
1462          * this search will find all the extents that end after
1463          * our range starts.
1464          */
1465         node = tree_search(tree, start);
1466         if (!node) {
1467                 ret = -ENOENT;
1468                 goto out;
1469         }
1470         state = rb_entry(node, struct extent_state, rb_node);
1471         if (state->start != start) {
1472                 ret = -ENOENT;
1473                 goto out;
1474         }
1475         state->private = private;
1476 out:
1477         spin_unlock(&tree->lock);
1478         return ret;
1479 }
1480
1481 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1482 {
1483         struct rb_node *node;
1484         struct extent_state *state;
1485         int ret = 0;
1486
1487         spin_lock(&tree->lock);
1488         /*
1489          * this search will find all the extents that end after
1490          * our range starts.
1491          */
1492         node = tree_search(tree, start);
1493         if (!node) {
1494                 ret = -ENOENT;
1495                 goto out;
1496         }
1497         state = rb_entry(node, struct extent_state, rb_node);
1498         if (state->start != start) {
1499                 ret = -ENOENT;
1500                 goto out;
1501         }
1502         *private = state->private;
1503 out:
1504         spin_unlock(&tree->lock);
1505         return ret;
1506 }
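/*
 * Minimal round-trip sketch (hypothetical caller; bio_readpage_error()
 * below uses this pattern to stash an io_failure_record pointer):
 *
 *      u64 stashed;
 *
 *      if (!set_state_private(tree, offset, (u64)(unsigned long)ptr) &&
 *          !get_state_private(tree, offset, &stashed))
 *              ptr = (void *)(unsigned long)stashed;
 *
 * Both helpers return -ENOENT unless an extent_state starts at exactly
 * 'offset', so the matching extent bits must have been set first.
 */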
1507
1508 /*
1509  * searches a range in the state tree for a given mask.
1510  * If 'filled' == 1, this returns 1 only if every extent in the range
1511  * has the bits set.  Otherwise, 1 is returned if any bit in the
1512  * range is found set.
1513  */
1514 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1515                    int bits, int filled, struct extent_state *cached)
1516 {
1517         struct extent_state *state = NULL;
1518         struct rb_node *node;
1519         int bitset = 0;
1520
1521         spin_lock(&tree->lock);
1522         if (cached && cached->tree && cached->start <= start &&
1523             cached->end > start)
1524                 node = &cached->rb_node;
1525         else
1526                 node = tree_search(tree, start);
1527         while (node && start <= end) {
1528                 state = rb_entry(node, struct extent_state, rb_node);
1529
1530                 if (filled && state->start > start) {
1531                         bitset = 0;
1532                         break;
1533                 }
1534
1535                 if (state->start > end)
1536                         break;
1537
1538                 if (state->state & bits) {
1539                         bitset = 1;
1540                         if (!filled)
1541                                 break;
1542                 } else if (filled) {
1543                         bitset = 0;
1544                         break;
1545                 }
1546
1547                 if (state->end == (u64)-1)
1548                         break;
1549
1550                 start = state->end + 1;
1551                 if (start > end)
1552                         break;
1553                 node = rb_next(node);
1554                 if (!node) {
1555                         if (filled)
1556                                 bitset = 0;
1557                         break;
1558                 }
1559         }
1560         spin_unlock(&tree->lock);
1561         return bitset;
1562 }
1563
1564 /*
1565  * helper function to set a given page up to date if all the
1566  * extents in the tree for that page are up to date
1567  */
1568 static int check_page_uptodate(struct extent_io_tree *tree,
1569                                struct page *page)
1570 {
1571         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1572         u64 end = start + PAGE_CACHE_SIZE - 1;
1573         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1574                 SetPageUptodate(page);
1575         return 0;
1576 }
1577
1578 /*
1579  * helper function to unlock a page if all the extents in the tree
1580  * for that page are unlocked
1581  */
1582 static int check_page_locked(struct extent_io_tree *tree,
1583                              struct page *page)
1584 {
1585         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1586         u64 end = start + PAGE_CACHE_SIZE - 1;
1587         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1588                 unlock_page(page);
1589         return 0;
1590 }
1591
1592 /*
1593  * helper function to end page writeback; writeback is no longer tracked
1594  * in the extent tree, so this simply ends it for the whole page
1595  */
1596 static int check_page_writeback(struct extent_io_tree *tree,
1597                              struct page *page)
1598 {
1599         end_page_writeback(page);
1600         return 0;
1601 }
1602
1603 /*
1604  * When IO fails, either with EIO or csum verification fails, we
1605  * try other mirrors that might have a good copy of the data.  This
1606  * io_failure_record is used to record state as we go through all the
1607  * mirrors.  If another mirror has good data, the page is set up to date
1608  * and things continue.  If a good mirror can't be found, the original
1609  * bio end_io callback is called to indicate things have failed.
1610  */
1611 struct io_failure_record {
1612         struct page *page;
1613         u64 start;
1614         u64 len;
1615         u64 logical;
1616         unsigned long bio_flags;
1617         int this_mirror;
1618         int failed_mirror;
1619         int in_validation;
1620 };
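/*
 * Lifecycle sketch (as implemented below): bio_readpage_error() allocates
 * the record on the first failed read of a range and stashes its pointer
 * in the inode's io_failure_tree via set_state_private().  When a later
 * read of the same range succeeds, clean_io_failure() looks the record up
 * again and, if more than one copy exists, rewrites the bad mirror through
 * repair_io_failure().  free_io_failure() finally drops the record and
 * clears the EXTENT_LOCKED/EXTENT_DIRTY (and, after a repair,
 * EXTENT_DAMAGED) bits.
 */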
1621
1622 static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1623                                 int did_repair)
1624 {
1625         int ret;
1626         int err = 0;
1627         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1628
1629         set_state_private(failure_tree, rec->start, 0);
1630         ret = clear_extent_bits(failure_tree, rec->start,
1631                                 rec->start + rec->len - 1,
1632                                 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1633         if (ret)
1634                 err = ret;
1635
1636         if (did_repair) {
1637                 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1638                                         rec->start + rec->len - 1,
1639                                         EXTENT_DAMAGED, GFP_NOFS);
1640                 if (ret && !err)
1641                         err = ret;
1642         }
1643
1644         kfree(rec);
1645         return err;
1646 }
1647
1648 static void repair_io_failure_callback(struct bio *bio, int err)
1649 {
1650         complete(bio->bi_private);
1651 }
1652
1653 /*
1654  * this bypasses the standard btrfs submit functions deliberately, as
1655  * the standard behavior is to write all copies in a raid setup. here we only
1656  * want to write the one bad copy. so we do the mapping for ourselves and issue
1657  * submit_bio directly.
1658  * to avoid any synchronization issues, wait for the data after writing, which
1659  * actually prevents the read that triggered the error from finishing.
1660  * currently, there can be no more than two copies of every data bit. thus,
1661  * exactly one rewrite is required.
1662  */
1663 int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1664                         u64 length, u64 logical, struct page *page,
1665                         int mirror_num)
1666 {
1667         struct bio *bio;
1668         struct btrfs_device *dev;
1669         DECLARE_COMPLETION_ONSTACK(compl);
1670         u64 map_length = 0;
1671         u64 sector;
1672         struct btrfs_bio *bbio = NULL;
1673         int ret;
1674
1675         BUG_ON(!mirror_num);
1676
1677         bio = bio_alloc(GFP_NOFS, 1);
1678         if (!bio)
1679                 return -EIO;
1680         bio->bi_private = &compl;
1681         bio->bi_end_io = repair_io_failure_callback;
1682         bio->bi_size = 0;
1683         map_length = length;
1684
1685         ret = btrfs_map_block(map_tree, WRITE, logical,
1686                               &map_length, &bbio, mirror_num);
1687         if (ret) {
1688                 bio_put(bio);
1689                 return -EIO;
1690         }
1691         BUG_ON(mirror_num != bbio->mirror_num);
1692         sector = bbio->stripes[mirror_num-1].physical >> 9;
1693         bio->bi_sector = sector;
1694         dev = bbio->stripes[mirror_num-1].dev;
1695         kfree(bbio);
1696         if (!dev || !dev->bdev || !dev->writeable) {
1697                 bio_put(bio);
1698                 return -EIO;
1699         }
1700         bio->bi_bdev = dev->bdev;
1701         bio_add_page(bio, page, length, start-page_offset(page));
1702         submit_bio(WRITE_SYNC, bio);
1703         wait_for_completion(&compl);
1704
1705         if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1706                 /* try to remap that extent elsewhere? */
1707                 bio_put(bio);
1708                 return -EIO;
1709         }
1710
1711         printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1712                         "sector %llu)\n", page->mapping->host->i_ino, start,
1713                         dev->name, sector);
1714
1715         bio_put(bio);
1716         return 0;
1717 }
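/*
 * Hypothetical caller sketch: to overwrite the bad copy on mirror 2 of a
 * single page whose good contents are already in 'page':
 *
 *      ret = repair_io_failure(&root->fs_info->mapping_tree, start,
 *                              PAGE_CACHE_SIZE, logical, page, 2);
 *
 * 'start' is the file offset (used only for the in-page offset and the log
 * message), 'logical' is the btrfs logical address; mirror_num picks which
 * copy gets rewritten and must not be 0.
 */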
1718
1719 /*
1720  * each time an IO finishes, we do a fast check in the IO failure tree
1721  * to see if we need to process or clean up an io_failure_record
1722  */
1723 static int clean_io_failure(u64 start, struct page *page)
1724 {
1725         u64 private;
1726         u64 private_failure;
1727         struct io_failure_record *failrec;
1728         struct btrfs_mapping_tree *map_tree;
1729         struct extent_state *state;
1730         int num_copies;
1731         int did_repair = 0;
1732         int ret;
1733         struct inode *inode = page->mapping->host;
1734
1735         private = 0;
1736         ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1737                                 (u64)-1, 1, EXTENT_DIRTY, 0);
1738         if (!ret)
1739                 return 0;
1740
1741         ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1742                                 &private_failure);
1743         if (ret)
1744                 return 0;
1745
1746         failrec = (struct io_failure_record *)(unsigned long) private_failure;
1747         BUG_ON(!failrec->this_mirror);
1748
1749         if (failrec->in_validation) {
1750                 /* there was no real error, just free the record */
1751                 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1752                          failrec->start);
1753                 did_repair = 1;
1754                 goto out;
1755         }
1756
1757         spin_lock(&BTRFS_I(inode)->io_tree.lock);
1758         state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1759                                             failrec->start,
1760                                             EXTENT_LOCKED);
1761         spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1762
1763         if (state && state->start == failrec->start) {
1764                 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1765                 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1766                                                 failrec->len);
1767                 if (num_copies > 1)  {
1768                         ret = repair_io_failure(map_tree, start, failrec->len,
1769                                                 failrec->logical, page,
1770                                                 failrec->failed_mirror);
1771                         did_repair = !ret;
1772                 }
1773         }
1774
1775 out:
1776         if (!ret)
1777                 ret = free_io_failure(inode, failrec, did_repair);
1778
1779         return ret;
1780 }
1781
1782 /*
1783  * this is a generic handler for readpage errors (default
1784  * readpage_io_failed_hook). if other copies exist, read those and write back
1785  * good data to the failed position. does not attempt to remap the failed
1786  * extent elsewhere, hoping the device will be smart enough to do this as
1787  * needed
1788  */
1789
1790 static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1791                                 u64 start, u64 end, int failed_mirror,
1792                                 struct extent_state *state)
1793 {
1794         struct io_failure_record *failrec = NULL;
1795         u64 private;
1796         struct extent_map *em;
1797         struct inode *inode = page->mapping->host;
1798         struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1799         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1800         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1801         struct bio *bio;
1802         int num_copies;
1803         int ret;
1804         int read_mode;
1805         u64 logical;
1806
1807         BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1808
1809         ret = get_state_private(failure_tree, start, &private);
1810         if (ret) {
1811                 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
1812                 if (!failrec)
1813                         return -ENOMEM;
1814                 failrec->start = start;
1815                 failrec->len = end - start + 1;
1816                 failrec->this_mirror = 0;
1817                 failrec->bio_flags = 0;
1818                 failrec->in_validation = 0;
1819
1820                 read_lock(&em_tree->lock);
1821                 em = lookup_extent_mapping(em_tree, start, failrec->len);
1822                 if (!em) {
1823                         read_unlock(&em_tree->lock);
1824                         kfree(failrec);
1825                         return -EIO;
1826                 }
1827
1828                 if (em->start > start || em->start + em->len < start) {
1829                         free_extent_map(em);
1830                         em = NULL;
1831                 }
1832                 read_unlock(&em_tree->lock);
1833
1834                 if (!em || IS_ERR(em)) {
1835                         kfree(failrec);
1836                         return -EIO;
1837                 }
1838                 logical = start - em->start;
1839                 logical = em->block_start + logical;
1840                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1841                         logical = em->block_start;
1842                         failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1843                         extent_set_compress_type(&failrec->bio_flags,
1844                                                  em->compress_type);
1845                 }
1846                 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
1847                          "len=%llu\n", logical, start, failrec->len);
1848                 failrec->logical = logical;
1849                 free_extent_map(em);
1850
1851                 /* set the bits in the private failure tree */
1852                 ret = set_extent_bits(failure_tree, start, end,
1853                                         EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1854                 if (ret >= 0)
1855                         ret = set_state_private(failure_tree, start,
1856                                                 (u64)(unsigned long)failrec);
1857                 /* set the bits in the inode's tree */
1858                 if (ret >= 0)
1859                         ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
1860                                                 GFP_NOFS);
1861                 if (ret < 0) {
1862                         kfree(failrec);
1863                         return ret;
1864                 }
1865         } else {
1866                 failrec = (struct io_failure_record *)(unsigned long)private;
1867                 pr_debug("bio_readpage_error: (found) logical=%llu, "
1868                          "start=%llu, len=%llu, validation=%d\n",
1869                          failrec->logical, failrec->start, failrec->len,
1870                          failrec->in_validation);
1871                 /*
1872                  * when data can be on disk more than twice, add to failrec here
1873                  * (e.g. with a list for failed_mirror) to make
1874                  * clean_io_failure() clean all those errors at once.
1875                  */
1876         }
1877         num_copies = btrfs_num_copies(
1878                               &BTRFS_I(inode)->root->fs_info->mapping_tree,
1879                               failrec->logical, failrec->len);
1880         if (num_copies == 1) {
1881                 /*
1882                  * we only have a single copy of the data, so don't bother with
1883                  * all the retry and error correction code that follows. no
1884                  * matter what the error is, it is very likely to persist.
1885                  */
1886                 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
1887                          "state=%p, num_copies=%d, next_mirror %d, "
1888                          "failed_mirror %d\n", state, num_copies,
1889                          failrec->this_mirror, failed_mirror);
1890                 free_io_failure(inode, failrec, 0);
1891                 return -EIO;
1892         }
1893
1894         if (!state) {
1895                 spin_lock(&tree->lock);
1896                 state = find_first_extent_bit_state(tree, failrec->start,
1897                                                     EXTENT_LOCKED);
1898                 if (state && state->start != failrec->start)
1899                         state = NULL;
1900                 spin_unlock(&tree->lock);
1901         }
1902
1903         /*
1904          * there are two premises:
1905          *      a) deliver good data to the caller
1906          *      b) correct the bad sectors on disk
1907          */
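        /*
         * Sketch of the decision below: a bio with more than one page only
         * tells us that *some* sector in it failed, so the first retry is a
         * single-page validation read from the same mirror to pin down the
         * bad sector.  A single-page bio already identifies the failure, so
         * we go straight to the next mirror.
         */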
1908         if (failed_bio->bi_vcnt > 1) {
1909                 /*
1910                  * to fulfill b), we need to know the exact failing sectors, as
1911                  * we don't want to rewrite any more than the failed ones. thus,
1912                  * we need separate read requests for the failed bio
1913                  *
1914                  * if the following BUG_ON triggers, our validation request got
1915                  * merged. we need separate requests for our algorithm to work.
1916                  */
1917                 BUG_ON(failrec->in_validation);
1918                 failrec->in_validation = 1;
1919                 failrec->this_mirror = failed_mirror;
1920                 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
1921         } else {
1922                 /*
1923                  * we're ready to fulfill a) and b) at the same time. get a good
1924                  * copy of the failed sector and if we succeed, we have set up
1925                  * everything for repair_io_failure to do the rest for us.
1926                  */
1927                 if (failrec->in_validation) {
1928                         BUG_ON(failrec->this_mirror != failed_mirror);
1929                         failrec->in_validation = 0;
1930                         failrec->this_mirror = 0;
1931                 }
1932                 failrec->failed_mirror = failed_mirror;
1933                 failrec->this_mirror++;
1934                 if (failrec->this_mirror == failed_mirror)
1935                         failrec->this_mirror++;
1936                 read_mode = READ_SYNC;
1937         }
1938
1939         if (!state || failrec->this_mirror > num_copies) {
1940                 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
1941                          "next_mirror %d, failed_mirror %d\n", state,
1942                          num_copies, failrec->this_mirror, failed_mirror);
1943                 free_io_failure(inode, failrec, 0);
1944                 return -EIO;
1945         }
1946
1947         bio = bio_alloc(GFP_NOFS, 1);
1948         bio->bi_private = state;
1949         bio->bi_end_io = failed_bio->bi_end_io;
1950         bio->bi_sector = failrec->logical >> 9;
1951         bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
1952         bio->bi_size = 0;
1953
1954         bio_add_page(bio, page, failrec->len, start - page_offset(page));
1955
1956         pr_debug("bio_readpage_error: submitting new read[%#x] to "
1957                  "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
1958                  failrec->this_mirror, num_copies, failrec->in_validation);
1959
1960         tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
1961                                         failrec->bio_flags, 0);
1962         return 0;
1963 }
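/*
 * Note (sketch): the retry bio built above reuses the original bi_end_io,
 * so a successful re-read completes through end_bio_extent_readpage()
 * below, whose readpage_end_io_hook path calls clean_io_failure() to write
 * the good data over the bad mirror and drop the failure record.
 */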
1964
1965 /* lots and lots of room for performance fixes in the end_bio funcs */
1966
1967 /*
1968  * after a writepage IO is done, we need to:
1969  * clear the uptodate bits on error
1970  * clear the writeback bits in the extent tree for this IO
1971  * end_page_writeback if the page has no more pending IO
1972  *
1973  * Scheduling is not allowed, so the extent state tree is expected
1974  * to have one and only one object corresponding to this IO.
1975  */
1976 static void end_bio_extent_writepage(struct bio *bio, int err)
1977 {
1978         int uptodate = err == 0;
1979         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1980         struct extent_io_tree *tree;
1981         u64 start;
1982         u64 end;
1983         int whole_page;
1984         int ret;
1985
1986         do {
1987                 struct page *page = bvec->bv_page;
1988                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1989
1990                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1991                          bvec->bv_offset;
1992                 end = start + bvec->bv_len - 1;
1993
1994                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1995                         whole_page = 1;
1996                 else
1997                         whole_page = 0;
1998
1999                 if (--bvec >= bio->bi_io_vec)
2000                         prefetchw(&bvec->bv_page->flags);
2001                 if (tree->ops && tree->ops->writepage_end_io_hook) {
2002                         ret = tree->ops->writepage_end_io_hook(page, start,
2003                                                        end, NULL, uptodate);
2004                         if (ret)
2005                                 uptodate = 0;
2006                 }
2007
2008                 if (!uptodate && tree->ops &&
2009                     tree->ops->writepage_io_failed_hook) {
2010                         ret = tree->ops->writepage_io_failed_hook(bio, page,
2011                                                          start, end, NULL);
2012                         if (ret == 0) {
2013                                 uptodate = (err == 0);
2014                                 continue;
2015                         }
2016                 }
2017
2018                 if (!uptodate) {
2019                         clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
2020                         ClearPageUptodate(page);
2021                         SetPageError(page);
2022                 }
2023
2024                 if (whole_page)
2025                         end_page_writeback(page);
2026                 else
2027                         check_page_writeback(tree, page);
2028         } while (bvec >= bio->bi_io_vec);
2029
2030         bio_put(bio);
2031 }
2032
2033 /*
2034  * after a readpage IO is done, we need to:
2035  * clear the uptodate bits on error
2036  * set the uptodate bits if things worked
2037  * set the page up to date if all extents in the tree are uptodate
2038  * clear the lock bit in the extent tree
2039  * unlock the page if there are no other extents locked for it
2040  *
2041  * Scheduling is not allowed, so the extent state tree is expected
2042  * to have one and only one object corresponding to this IO.
2043  */
2044 static void end_bio_extent_readpage(struct bio *bio, int err)
2045 {
2046         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2047         struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2048         struct bio_vec *bvec = bio->bi_io_vec;
2049         struct extent_io_tree *tree;
2050         u64 start;
2051         u64 end;
2052         int whole_page;
2053         int ret;
2054
2055         if (err)
2056                 uptodate = 0;
2057
2058         do {
2059                 struct page *page = bvec->bv_page;
2060                 struct extent_state *cached = NULL;
2061                 struct extent_state *state;
2062
2063                 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2064                          "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2065                          (long int)bio->bi_bdev);
2066                 tree = &BTRFS_I(page->mapping->host)->io_tree;
2067
2068                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2069                         bvec->bv_offset;
2070                 end = start + bvec->bv_len - 1;
2071
2072                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2073                         whole_page = 1;
2074                 else
2075                         whole_page = 0;
2076
2077                 if (++bvec <= bvec_end)
2078                         prefetchw(&bvec->bv_page->flags);
2079
2080                 spin_lock(&tree->lock);
2081                 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2082                 if (state && state->start == start) {
2083                         /*
2084                          * take a reference on the state, unlock will drop
2085                          * the ref
2086                          */
2087                         cache_state(state, &cached);
2088                 }
2089                 spin_unlock(&tree->lock);
2090
2091                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2092                         ret = tree->ops->readpage_end_io_hook(page, start, end,
2093                                                               state);
2094                         if (ret)
2095                                 uptodate = 0;
2096                         else
2097                                 clean_io_failure(start, page);
2098                 }
2099                 if (!uptodate) {
2100                         u64 failed_mirror;
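                        /*
                         * bi_bdev is (ab)used here to carry the mirror
                         * number back from the lower layers (see the
                         * pr_debug above); it is not a block_device
                         * pointer at this point
                         */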
2101                         failed_mirror = (u64)bio->bi_bdev;
2102                         if (tree->ops && tree->ops->readpage_io_failed_hook)
2103                                 ret = tree->ops->readpage_io_failed_hook(
2104                                                 bio, page, start, end,
2105                                                 failed_mirror, NULL);
2106                         else
2107                                 ret = bio_readpage_error(bio, page, start, end,
2108                                                          failed_mirror, NULL);
2109                         if (ret == 0) {
2110                                 uptodate =
2111                                         test_bit(BIO_UPTODATE, &bio->bi_flags);
2112                                 if (err)
2113                                         uptodate = 0;
2114                                 uncache_state(&cached);
2115                                 continue;
2116                         }
2117                 }
2118
2119                 if (uptodate) {
2120                         set_extent_uptodate(tree, start, end, &cached,
2121                                             GFP_ATOMIC);
2122                 }
2123                 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2124
2125                 if (whole_page) {
2126                         if (uptodate) {
2127                                 SetPageUptodate(page);
2128                         } else {
2129                                 ClearPageUptodate(page);
2130                                 SetPageError(page);
2131                         }
2132                         unlock_page(page);
2133                 } else {
2134                         if (uptodate) {
2135                                 check_page_uptodate(tree, page);
2136                         } else {
2137                                 ClearPageUptodate(page);
2138                                 SetPageError(page);
2139                         }
2140                         check_page_locked(tree, page);
2141                 }
2142         } while (bvec <= bvec_end);
2143
2144         bio_put(bio);
2145 }
2146
2147 struct bio *
2148 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2149                 gfp_t gfp_flags)
2150 {
2151         struct bio *bio;
2152
2153         bio = bio_alloc(gfp_flags, nr_vecs);
2154
2155         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2156                 while (!bio && (nr_vecs /= 2))
2157                         bio = bio_alloc(gfp_flags, nr_vecs);
2158         }
2159
2160         if (bio) {
2161                 bio->bi_size = 0;
2162                 bio->bi_bdev = bdev;
2163                 bio->bi_sector = first_sector;
2164         }
2165         return bio;
2166 }
2167
2168 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2169                           unsigned long bio_flags)
2170 {
2171         int ret = 0;
2172         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2173         struct page *page = bvec->bv_page;
2174         struct extent_io_tree *tree = bio->bi_private;
2175         u64 start;
2176
2177         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2178
2179         bio->bi_private = NULL;
2180
2181         bio_get(bio);
2182
2183         if (tree->ops && tree->ops->submit_bio_hook)
2184                 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2185                                            mirror_num, bio_flags, start);
2186         else
2187                 submit_bio(rw, bio);
2188
2189         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2190                 ret = -EOPNOTSUPP;
2191         bio_put(bio);
2192         return ret;
2193 }
2194
2195 static int submit_extent_page(int rw, struct extent_io_tree *tree,
2196                               struct page *page, sector_t sector,
2197                               size_t size, unsigned long offset,
2198                               struct block_device *bdev,
2199                               struct bio **bio_ret,
2200                               unsigned long max_pages,
2201                               bio_end_io_t end_io_func,
2202                               int mirror_num,
2203                               unsigned long prev_bio_flags,
2204                               unsigned long bio_flags)
2205 {
2206         int ret = 0;
2207         struct bio *bio;
2208         int nr;
2209         int contig = 0;
2210         int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2211         int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2212         size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2213
2214         if (bio_ret && *bio_ret) {
2215                 bio = *bio_ret;
2216                 if (old_compressed)
2217                         contig = bio->bi_sector == sector;
2218                 else
2219                         contig = bio->bi_sector + (bio->bi_size >> 9) ==
2220                                 sector;
2221
2222                 if (prev_bio_flags != bio_flags || !contig ||
2223                     (tree->ops && tree->ops->merge_bio_hook &&
2224                      tree->ops->merge_bio_hook(page, offset, page_size, bio,
2225                                                bio_flags)) ||
2226                     bio_add_page(bio, page, page_size, offset) < page_size) {
2227                         ret = submit_one_bio(rw, bio, mirror_num,
2228                                              prev_bio_flags);
2229                         bio = NULL;
2230                 } else {
2231                         return 0;
2232                 }
2233         }
2234         if (this_compressed)
2235                 nr = BIO_MAX_PAGES;
2236         else
2237                 nr = bio_get_nr_vecs(bdev);
2238
2239         bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2240         if (!bio)
2241                 return -ENOMEM;
2242
2243         bio_add_page(bio, page, page_size, offset);
2244         bio->bi_end_io = end_io_func;
2245         bio->bi_private = tree;
2246
2247         if (bio_ret)
2248                 *bio_ret = bio;
2249         else
2250                 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2251
2252         return ret;
2253 }
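/*
 * Batching sketch: callers hand in a bio ** (bio_ret) that survives across
 * calls.  submit_extent_page() keeps appending pages to the cached bio as
 * long as the new page is physically contiguous and carries the same
 * bio_flags, and only submits it (via submit_one_bio) when contiguity
 * breaks, the flags change, the merge_bio_hook refuses the merge, or
 * bio_add_page() cannot take another page.  __extent_read_full_page() and
 * __extent_writepage() below rely on this to build up large bios one page
 * at a time.
 */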
2254
2255 void set_page_extent_mapped(struct page *page)
2256 {
2257         if (!PagePrivate(page)) {
2258                 SetPagePrivate(page);
2259                 page_cache_get(page);
2260                 set_page_private(page, EXTENT_PAGE_PRIVATE);
2261         }
2262 }
2263
2264 static void set_page_extent_head(struct page *page, unsigned long len)
2265 {
2266         WARN_ON(!PagePrivate(page));
2267         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
2268 }
2269
2270 /*
2271  * basic readpage implementation.  Locked extent state structs are inserted
2272  * into the tree and removed again when the IO is done (by the end_io
2273  * handlers)
2274  */
2275 static int __extent_read_full_page(struct extent_io_tree *tree,
2276                                    struct page *page,
2277                                    get_extent_t *get_extent,
2278                                    struct bio **bio, int mirror_num,
2279                                    unsigned long *bio_flags)
2280 {
2281         struct inode *inode = page->mapping->host;
2282         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2283         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2284         u64 end;
2285         u64 cur = start;
2286         u64 extent_offset;
2287         u64 last_byte = i_size_read(inode);
2288         u64 block_start;
2289         u64 cur_end;
2290         sector_t sector;
2291         struct extent_map *em;
2292         struct block_device *bdev;
2293         struct btrfs_ordered_extent *ordered;
2294         int ret;
2295         int nr = 0;
2296         size_t pg_offset = 0;
2297         size_t iosize;
2298         size_t disk_io_size;
2299         size_t blocksize = inode->i_sb->s_blocksize;
2300         unsigned long this_bio_flag = 0;
2301
2302         set_page_extent_mapped(page);
2303
2304         if (!PageUptodate(page)) {
2305                 if (cleancache_get_page(page) == 0) {
2306                         BUG_ON(blocksize != PAGE_SIZE);
2307                         goto out;
2308                 }
2309         }
2310
2311         end = page_end;
2312         while (1) {
2313                 lock_extent(tree, start, end, GFP_NOFS);
2314                 ordered = btrfs_lookup_ordered_extent(inode, start);
2315                 if (!ordered)
2316                         break;
2317                 unlock_extent(tree, start, end, GFP_NOFS);
2318                 btrfs_start_ordered_extent(inode, ordered, 1);
2319                 btrfs_put_ordered_extent(ordered);
2320         }
2321
2322         if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2323                 char *userpage;
2324                 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2325
2326                 if (zero_offset) {
2327                         iosize = PAGE_CACHE_SIZE - zero_offset;
2328                         userpage = kmap_atomic(page, KM_USER0);
2329                         memset(userpage + zero_offset, 0, iosize);
2330                         flush_dcache_page(page);
2331                         kunmap_atomic(userpage, KM_USER0);
2332                 }
2333         }
2334         while (cur <= end) {
2335                 if (cur >= last_byte) {
2336                         char *userpage;
2337                         struct extent_state *cached = NULL;
2338
2339                         iosize = PAGE_CACHE_SIZE - pg_offset;
2340                         userpage = kmap_atomic(page, KM_USER0);
2341                         memset(userpage + pg_offset, 0, iosize);
2342                         flush_dcache_page(page);
2343                         kunmap_atomic(userpage, KM_USER0);
2344                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2345                                             &cached, GFP_NOFS);
2346                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2347                                              &cached, GFP_NOFS);
2348                         break;
2349                 }
2350                 em = get_extent(inode, page, pg_offset, cur,
2351                                 end - cur + 1, 0);
2352                 if (IS_ERR_OR_NULL(em)) {
2353                         SetPageError(page);
2354                         unlock_extent(tree, cur, end, GFP_NOFS);
2355                         break;
2356                 }
2357                 extent_offset = cur - em->start;
2358                 BUG_ON(extent_map_end(em) <= cur);
2359                 BUG_ON(end < cur);
2360
2361                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2362                         this_bio_flag = EXTENT_BIO_COMPRESSED;
2363                         extent_set_compress_type(&this_bio_flag,
2364                                                  em->compress_type);
2365                 }
2366
2367                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2368                 cur_end = min(extent_map_end(em) - 1, end);
2369                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2370                 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2371                         disk_io_size = em->block_len;
2372                         sector = em->block_start >> 9;
2373                 } else {
2374                         sector = (em->block_start + extent_offset) >> 9;
2375                         disk_io_size = iosize;
2376                 }
2377                 bdev = em->bdev;
2378                 block_start = em->block_start;
2379                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2380                         block_start = EXTENT_MAP_HOLE;
2381                 free_extent_map(em);
2382                 em = NULL;
2383
2384                 /* we've found a hole, just zero and go on */
2385                 if (block_start == EXTENT_MAP_HOLE) {
2386                         char *userpage;
2387                         struct extent_state *cached = NULL;
2388
2389                         userpage = kmap_atomic(page, KM_USER0);
2390                         memset(userpage + pg_offset, 0, iosize);
2391                         flush_dcache_page(page);
2392                         kunmap_atomic(userpage, KM_USER0);
2393
2394                         set_extent_uptodate(tree, cur, cur + iosize - 1,
2395                                             &cached, GFP_NOFS);
2396                         unlock_extent_cached(tree, cur, cur + iosize - 1,
2397                                              &cached, GFP_NOFS);
2398                         cur = cur + iosize;
2399                         pg_offset += iosize;
2400                         continue;
2401                 }
2402                 /* the get_extent function already copied into the page */
2403                 if (test_range_bit(tree, cur, cur_end,
2404                                    EXTENT_UPTODATE, 1, NULL)) {
2405                         check_page_uptodate(tree, page);
2406                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2407                         cur = cur + iosize;
2408                         pg_offset += iosize;
2409                         continue;
2410                 }
2411                 /* we have an inline extent but it didn't get marked up
2412                  * to date.  Error out
2413                  */
2414                 if (block_start == EXTENT_MAP_INLINE) {
2415                         SetPageError(page);
2416                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2417                         cur = cur + iosize;
2418                         pg_offset += iosize;
2419                         continue;
2420                 }
2421
2422                 ret = 0;
2423                 if (tree->ops && tree->ops->readpage_io_hook) {
2424                         ret = tree->ops->readpage_io_hook(page, cur,
2425                                                           cur + iosize - 1);
2426                 }
2427                 if (!ret) {
2428                         unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2429                         pnr -= page->index;
2430                         ret = submit_extent_page(READ, tree, page,
2431                                          sector, disk_io_size, pg_offset,
2432                                          bdev, bio, pnr,
2433                                          end_bio_extent_readpage, mirror_num,
2434                                          *bio_flags,
2435                                          this_bio_flag);
2436                         nr++;
2437                         *bio_flags = this_bio_flag;
2438                 }
2439                 if (ret)
2440                         SetPageError(page);
2441                 cur = cur + iosize;
2442                 pg_offset += iosize;
2443         }
2444 out:
2445         if (!nr) {
2446                 if (!PageError(page))
2447                         SetPageUptodate(page);
2448                 unlock_page(page);
2449         }
2450         return 0;
2451 }
2452
2453 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2454                             get_extent_t *get_extent, int mirror_num)
2455 {
2456         struct bio *bio = NULL;
2457         unsigned long bio_flags = 0;
2458         int ret;
2459
2460         ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2461                                       &bio_flags);
2462         if (bio)
2463                 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2464         return ret;
2465 }
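/*
 * Typical use, as a hedged sketch of a readpage address_space op (the real
 * btrfs hook lives in inode.c):
 *
 *      static int my_readpage(struct file *file, struct page *page)
 *      {
 *              struct extent_io_tree *tree =
 *                      &BTRFS_I(page->mapping->host)->io_tree;
 *              return extent_read_full_page(tree, page, my_get_extent, 0);
 *      }
 *
 * mirror_num == 0 lets the lower layers choose a copy; a non-zero value
 * forces the read from that specific copy (the retry code above passes
 * failrec->this_mirror down for the same reason).
 */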
2466
2467 static noinline void update_nr_written(struct page *page,
2468                                       struct writeback_control *wbc,
2469                                       unsigned long nr_written)
2470 {
2471         wbc->nr_to_write -= nr_written;
2472         if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2473             wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2474                 page->mapping->writeback_index = page->index + nr_written;
2475 }
2476
2477 /*
2478  * the writepage semantics are similar to regular writepage.  extent
2479  * records are inserted to lock ranges in the tree, and as dirty areas
2480  * are found, they are marked writeback.  Then the lock bits are removed
2481  * and the end_io handler clears the writeback ranges
2482  */
2483 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2484                               void *data)
2485 {
2486         struct inode *inode = page->mapping->host;
2487         struct extent_page_data *epd = data;
2488         struct extent_io_tree *tree = epd->tree;
2489         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2490         u64 delalloc_start;
2491         u64 page_end = start + PAGE_CACHE_SIZE - 1;
2492         u64 end;
2493         u64 cur = start;
2494         u64 extent_offset;
2495         u64 last_byte = i_size_read(inode);
2496         u64 block_start;
2497         u64 iosize;
2498         sector_t sector;
2499         struct extent_state *cached_state = NULL;
2500         struct extent_map *em;
2501         struct block_device *bdev;
2502         int ret;
2503         int nr = 0;
2504         size_t pg_offset = 0;
2505         size_t blocksize;
2506         loff_t i_size = i_size_read(inode);
2507         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2508         u64 nr_delalloc;
2509         u64 delalloc_end;
2510         int page_started;
2511         int compressed;
2512         int write_flags;
2513         unsigned long nr_written = 0;
2514
2515         if (wbc->sync_mode == WB_SYNC_ALL)
2516                 write_flags = WRITE_SYNC;
2517         else
2518                 write_flags = WRITE;
2519
2520         trace___extent_writepage(page, inode, wbc);
2521
2522         WARN_ON(!PageLocked(page));
2523         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2524         if (page->index > end_index ||
2525            (page->index == end_index && !pg_offset)) {
2526                 page->mapping->a_ops->invalidatepage(page, 0);
2527                 unlock_page(page);
2528                 return 0;
2529         }
2530
2531         if (page->index == end_index) {
2532                 char *userpage;
2533
2534                 userpage = kmap_atomic(page, KM_USER0);
2535                 memset(userpage + pg_offset, 0,
2536                        PAGE_CACHE_SIZE - pg_offset);
2537                 kunmap_atomic(userpage, KM_USER0);
2538                 flush_dcache_page(page);
2539         }
2540         pg_offset = 0;
2541
2542         set_page_extent_mapped(page);
2543
2544         delalloc_start = start;
2545         delalloc_end = 0;
2546         page_started = 0;
2547         if (!epd->extent_locked) {
2548                 u64 delalloc_to_write = 0;
2549                 /*
2550                  * make sure the wbc mapping index is at least updated
2551                  * to this page.
2552                  */
2553                 update_nr_written(page, wbc, 0);
2554
2555                 while (delalloc_end < page_end) {
2556                         nr_delalloc = find_lock_delalloc_range(inode, tree,
2557                                                        page,
2558                                                        &delalloc_start,
2559                                                        &delalloc_end,
2560                                                        128 * 1024 * 1024);
2561                         if (nr_delalloc == 0) {
2562                                 delalloc_start = delalloc_end + 1;
2563                                 continue;
2564                         }
2565                         tree->ops->fill_delalloc(inode, page, delalloc_start,
2566                                                  delalloc_end, &page_started,
2567                                                  &nr_written);
2568                         /*
2569                          * delalloc_end is already one less than the total
2570                          * length, so we don't subtract one from
2571                          * PAGE_CACHE_SIZE
2572                          */
2573                         delalloc_to_write += (delalloc_end - delalloc_start +
2574                                               PAGE_CACHE_SIZE) >>
2575                                               PAGE_CACHE_SHIFT;
2576                         delalloc_start = delalloc_end + 1;
2577                 }
2578                 if (wbc->nr_to_write < delalloc_to_write) {
2579                         int thresh = 8192;
2580
2581                         if (delalloc_to_write < thresh * 2)
2582                                 thresh = delalloc_to_write;
2583                         wbc->nr_to_write = min_t(u64, delalloc_to_write,
2584                                                  thresh);
2585                 }
2586
2587                 /* did the fill delalloc function already unlock and start
2588                  * the IO?
2589                  */
2590                 if (page_started) {
2591                         ret = 0;
2592                         /*
2593                          * we've unlocked the page, so we can't update
2594                          * the mapping's writeback index, just update
2595                          * nr_to_write.
2596                          */
2597                         wbc->nr_to_write -= nr_written;
2598                         goto done_unlocked;
2599                 }
2600         }
2601         if (tree->ops && tree->ops->writepage_start_hook) {
2602                 ret = tree->ops->writepage_start_hook(page, start,
2603                                                       page_end);
2604                 if (ret == -EAGAIN) {
2605                         redirty_page_for_writepage(wbc, page);
2606                         update_nr_written(page, wbc, nr_written);
2607                         unlock_page(page);
2608                         ret = 0;
2609                         goto done_unlocked;
2610                 }
2611         }
2612
2613         /*
2614          * we don't want to touch the inode after unlocking the page,
2615          * so we update the mapping writeback index now
2616          */
2617         update_nr_written(page, wbc, nr_written + 1);
2618
2619         end = page_end;
2620         if (last_byte <= start) {
2621                 if (tree->ops && tree->ops->writepage_end_io_hook)
2622                         tree->ops->writepage_end_io_hook(page, start,
2623                                                          page_end, NULL, 1);
2624                 goto done;
2625         }
2626
2627         blocksize = inode->i_sb->s_blocksize;
2628
2629         while (cur <= end) {
2630                 if (cur >= last_byte) {
2631                         if (tree->ops && tree->ops->writepage_end_io_hook)
2632                                 tree->ops->writepage_end_io_hook(page, cur,
2633                                                          page_end, NULL, 1);
2634                         break;
2635                 }
2636                 em = epd->get_extent(inode, page, pg_offset, cur,
2637                                      end - cur + 1, 1);
2638                 if (IS_ERR_OR_NULL(em)) {
2639                         SetPageError(page);
2640                         break;
2641                 }
2642
2643                 extent_offset = cur - em->start;
2644                 BUG_ON(extent_map_end(em) <= cur);
2645                 BUG_ON(end < cur);
2646                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2647                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2648                 sector = (em->block_start + extent_offset) >> 9;
2649                 bdev = em->bdev;
2650                 block_start = em->block_start;
2651                 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2652                 free_extent_map(em);
2653                 em = NULL;
2654
2655                 /*
2656                  * compressed and inline extents are written through other
2657                  * paths in the FS
2658                  */
2659                 if (compressed || block_start == EXTENT_MAP_HOLE ||
2660                     block_start == EXTENT_MAP_INLINE) {
2661                         /*
2662                          * end_io notification does not happen here for
2663                          * compressed extents
2664                          */
2665                         if (!compressed && tree->ops &&
2666                             tree->ops->writepage_end_io_hook)
2667                                 tree->ops->writepage_end_io_hook(page, cur,
2668                                                          cur + iosize - 1,
2669                                                          NULL, 1);
2670                         else if (compressed) {
2671                                 /* we don't want to end_page_writeback on
2672                                  * a compressed extent.  this happens
2673                                  * elsewhere
2674                                  */
2675                                 nr++;
2676                         }
2677
2678                         cur += iosize;
2679                         pg_offset += iosize;
2680                         continue;
2681                 }
2682                 /* leave this out until we have a page_mkwrite call */
2683                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2684                                    EXTENT_DIRTY, 0, NULL)) {
2685                         cur = cur + iosize;
2686                         pg_offset += iosize;
2687                         continue;
2688                 }
2689
2690                 if (tree->ops && tree->ops->writepage_io_hook) {
2691                         ret = tree->ops->writepage_io_hook(page, cur,
2692                                                 cur + iosize - 1);
2693                 } else {
2694                         ret = 0;
2695                 }
2696                 if (ret) {
2697                         SetPageError(page);
2698                 } else {
2699                         unsigned long max_nr = end_index + 1;
2700
2701                         set_range_writeback(tree, cur, cur + iosize - 1);
2702                         if (!PageWriteback(page)) {
2703                                 printk(KERN_ERR "btrfs warning page %lu not "
2704                                        "writeback, cur %llu end %llu\n",
2705                                        page->index, (unsigned long long)cur,
2706                                        (unsigned long long)end);
2707                         }
2708
2709                         ret = submit_extent_page(write_flags, tree, page,
2710                                                  sector, iosize, pg_offset,
2711                                                  bdev, &epd->bio, max_nr,
2712                                                  end_bio_extent_writepage,
2713                                                  0, 0, 0);
2714                         if (ret)
2715                                 SetPageError(page);
2716                 }
2717                 cur = cur + iosize;
2718                 pg_offset += iosize;
2719                 nr++;
2720         }
2721 done:
2722         if (nr == 0) {
2723                 /* make sure the mapping tag for page dirty gets cleared */
2724                 set_page_writeback(page);
2725                 end_page_writeback(page);
2726         }
2727         unlock_page(page);
2728
2729 done_unlocked:
2730
2731         /* drop our reference on any cached states */
2732         free_extent_state(cached_state);
2733         return 0;
2734 }
2735
2736 /**
2737  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2738  * @mapping: address space structure to write
2739  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2740  * @writepage: function called for each page
2741  * @data: data passed to writepage function
2742  *
2743  * If a page is already under I/O, write_cache_pages() skips it, even
2744  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2745  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2746  * and msync() need to guarantee that all the data which was dirty at the time
2747  * the call was made get new I/O started against them.  If wbc->sync_mode is
2748  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2749  * existing IO to complete.
2750  */
2751 static int extent_write_cache_pages(struct extent_io_tree *tree,
2752                              struct address_space *mapping,
2753                              struct writeback_control *wbc,
2754                              writepage_t writepage, void *data,
2755                              void (*flush_fn)(void *))
2756 {
2757         int ret = 0;
2758         int done = 0;
2759         int nr_to_write_done = 0;
2760         struct pagevec pvec;
2761         int nr_pages;
2762         pgoff_t index;
2763         pgoff_t end;            /* Inclusive */
2764         int scanned = 0;
2765         int tag;
2766
2767         pagevec_init(&pvec, 0);
2768         if (wbc->range_cyclic) {
2769                 index = mapping->writeback_index; /* Start from prev offset */
2770                 end = -1;
2771         } else {
2772                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2773                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2774                 scanned = 1;
2775         }
2776         if (wbc->sync_mode == WB_SYNC_ALL)
2777                 tag = PAGECACHE_TAG_TOWRITE;
2778         else
2779                 tag = PAGECACHE_TAG_DIRTY;
2780 retry:
2781         if (wbc->sync_mode == WB_SYNC_ALL)
2782                 tag_pages_for_writeback(mapping, index, end);
2783         while (!done && !nr_to_write_done && (index <= end) &&
2784                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2785                         min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2786                 unsigned i;
2787
2788                 scanned = 1;
2789                 for (i = 0; i < nr_pages; i++) {
2790                         struct page *page = pvec.pages[i];
2791
2792                         /*
2793                          * At this point we hold neither mapping->tree_lock nor
2794                          * lock on the page itself: the page may be truncated or
2795                          * invalidated (changing page->mapping to NULL), or even
2796                          * swizzled back from swapper_space to tmpfs file
2797                          * mapping
2798                          */
2799                         if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2800                                 tree->ops->write_cache_pages_lock_hook(page);
2801                         else
2802                                 lock_page(page);
2803
2804                         if (unlikely(page->mapping != mapping)) {
2805                                 unlock_page(page);
2806                                 continue;
2807                         }
2808
2809                         if (!wbc->range_cyclic && page->index > end) {
2810                                 done = 1;
2811                                 unlock_page(page);
2812                                 continue;
2813                         }
2814
2815                         if (wbc->sync_mode != WB_SYNC_NONE) {
2816                                 if (PageWriteback(page))
2817                                         flush_fn(data);
2818                                 wait_on_page_writeback(page);
2819                         }
2820
2821                         if (PageWriteback(page) ||
2822                             !clear_page_dirty_for_io(page)) {
2823                                 unlock_page(page);
2824                                 continue;
2825                         }
2826
2827                         ret = (*writepage)(page, wbc, data);
2828
2829                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2830                                 unlock_page(page);
2831                                 ret = 0;
2832                         }
2833                         if (ret)
2834                                 done = 1;
2835
2836                         /*
2837                          * the filesystem may choose to bump up nr_to_write.
2838                          * We have to make sure we honor the new nr_to_write
2839                          * at all times
2840                          */
2841                         nr_to_write_done = wbc->nr_to_write <= 0;
2842                 }
2843                 pagevec_release(&pvec);
2844                 cond_resched();
2845         }
2846         if (!scanned && !done) {
2847                 /*
2848                  * We hit the last page and there is more work to be done: wrap
2849                  * back to the start of the file
2850                  */
2851                 scanned = 1;
2852                 index = 0;
2853                 goto retry;
2854         }
2855         return ret;
2856 }
2857
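/*
 * submit any bio that has been collected in the extent_page_data,
 * using WRITE_SYNC when the caller asked for synchronous writeback
 */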
2858 static void flush_epd_write_bio(struct extent_page_data *epd)
2859 {
2860         if (epd->bio) {
2861                 if (epd->sync_io)
2862                         submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2863                 else
2864                         submit_one_bio(WRITE, epd->bio, 0, 0);
2865                 epd->bio = NULL;
2866         }
2867 }
2868
2869 static noinline void flush_write_bio(void *data)
2870 {
2871         struct extent_page_data *epd = data;
2872         flush_epd_write_bio(epd);
2873 }
2874
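/*
 * write a single page through __extent_writepage() and flush whatever
 * bio was built up along the way
 */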
2875 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2876                           get_extent_t *get_extent,
2877                           struct writeback_control *wbc)
2878 {
2879         int ret;
2880         struct extent_page_data epd = {
2881                 .bio = NULL,
2882                 .tree = tree,
2883                 .get_extent = get_extent,
2884                 .extent_locked = 0,
2885                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2886         };
2887
2888         ret = __extent_writepage(page, wbc, &epd);
2889
2890         flush_epd_write_bio(&epd);
2891         return ret;
2892 }
2893
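/*
 * write a range of pages whose extent state is already locked by the
 * caller.  Pages that are no longer dirty just have the
 * writepage_end_io_hook called for them and are unlocked.
 */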
2894 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2895                               u64 start, u64 end, get_extent_t *get_extent,
2896                               int mode)
2897 {
2898         int ret = 0;
2899         struct address_space *mapping = inode->i_mapping;
2900         struct page *page;
2901         unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2902                 PAGE_CACHE_SHIFT;
2903
2904         struct extent_page_data epd = {
2905                 .bio = NULL,
2906                 .tree = tree,
2907                 .get_extent = get_extent,
2908                 .extent_locked = 1,
2909                 .sync_io = mode == WB_SYNC_ALL,
2910         };
2911         struct writeback_control wbc_writepages = {
2912                 .sync_mode      = mode,
2913                 .nr_to_write    = nr_pages * 2,
2914                 .range_start    = start,
2915                 .range_end      = end + 1,
2916         };
2917
2918         while (start <= end) {
2919                 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2920                 if (clear_page_dirty_for_io(page))
2921                         ret = __extent_writepage(page, &wbc_writepages, &epd);
2922                 else {
2923                         if (tree->ops && tree->ops->writepage_end_io_hook)
2924                                 tree->ops->writepage_end_io_hook(page, start,
2925                                                  start + PAGE_CACHE_SIZE - 1,
2926                                                  NULL, 1);
2927                         unlock_page(page);
2928                 }
2929                 page_cache_release(page);
2930                 start += PAGE_CACHE_SIZE;
2931         }
2932
2933         flush_epd_write_bio(&epd);
2934         return ret;
2935 }
2936
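/*
 * writepages helper: walk the dirty pages with extent_write_cache_pages()
 * and submit whatever bio is left over at the end
 */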
2937 int extent_writepages(struct extent_io_tree *tree,
2938                       struct address_space *mapping,
2939                       get_extent_t *get_extent,
2940                       struct writeback_control *wbc)
2941 {
2942         int ret = 0;
2943         struct extent_page_data epd = {
2944                 .bio = NULL,
2945                 .tree = tree,
2946                 .get_extent = get_extent,
2947                 .extent_locked = 0,
2948                 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2949         };
2950
2951         ret = extent_write_cache_pages(tree, mapping, wbc,
2952                                        __extent_writepage, &epd,
2953                                        flush_write_bio);
2954         flush_epd_write_bio(&epd);
2955         return ret;
2956 }
2957
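/*
 * readpages helper: add each page to the page cache and read it in,
 * batching as much as possible into a single bio
 */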
2958 int extent_readpages(struct extent_io_tree *tree,
2959                      struct address_space *mapping,
2960                      struct list_head *pages, unsigned nr_pages,
2961                      get_extent_t get_extent)
2962 {
2963         struct bio *bio = NULL;
2964         unsigned page_idx;
2965         unsigned long bio_flags = 0;
2966
2967         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2968                 struct page *page = list_entry(pages->prev, struct page, lru);
2969
2970                 prefetchw(&page->flags);
2971                 list_del(&page->lru);
2972                 if (!add_to_page_cache_lru(page, mapping,
2973                                         page->index, GFP_NOFS)) {
2974                         __extent_read_full_page(tree, page, get_extent,
2975                                                 &bio, 0, &bio_flags);
2976                 }
2977                 page_cache_release(page);
2978         }
2979         BUG_ON(!list_empty(pages));
2980         if (bio)
2981                 submit_one_bio(READ, bio, 0, bio_flags);
2982         return 0;
2983 }
2984
2985 /*
2986  * basic invalidatepage code; this waits on any locked or writeback
2987  * ranges corresponding to the page, and then deletes any extent state
2988  * records from the tree
2989  */
2990 int extent_invalidatepage(struct extent_io_tree *tree,
2991                           struct page *page, unsigned long offset)
2992 {
2993         struct extent_state *cached_state = NULL;
2994         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2995         u64 end = start + PAGE_CACHE_SIZE - 1;
2996         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2997
2998         start += (offset + blocksize - 1) & ~(blocksize - 1);
2999         if (start > end)
3000                 return 0;
3001
3002         lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
3003         wait_on_page_writeback(page);
3004         clear_extent_bit(tree, start, end,
3005                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3006                          EXTENT_DO_ACCOUNTING,
3007                          1, 1, &cached_state, GFP_NOFS);
3008         return 0;
3009 }
3010
3011 /*
3012  * a helper for releasepage; this tests for areas of the page that
3013  * are locked or under IO and drops the related state bits if it is safe
3014  * to drop the page.
3015  */
3016 int try_release_extent_state(struct extent_map_tree *map,
3017                              struct extent_io_tree *tree, struct page *page,
3018                              gfp_t mask)
3019 {
3020         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3021         u64 end = start + PAGE_CACHE_SIZE - 1;
3022         int ret = 1;
3023
3024         if (test_range_bit(tree, start, end,
3025                            EXTENT_IOBITS, 0, NULL))
3026                 ret = 0;
3027         else {
3028                 if ((mask & GFP_NOFS) == GFP_NOFS)
3029                         mask = GFP_NOFS;
3030                 /*
3031                  * at this point we can safely clear everything except the
3032                  * locked bit and the nodatasum bit
3033                  */
3034                 ret = clear_extent_bit(tree, start, end,
3035                                  ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3036                                  0, 0, NULL, mask);
3037
3038                 /* if clear_extent_bit failed because of -ENOMEM,
3039                  * we can't allow the release to continue.
3040                  */
3041                 if (ret < 0)
3042                         ret = 0;
3043                 else
3044                         ret = 1;
3045         }
3046         return ret;
3047 }
3048
3049 /*
3050  * a helper for releasepage.  As long as there are no locked extents
3051  * in the range corresponding to the page, both state records and extent
3052  * map records are removed
3053  */
3054 int try_release_extent_mapping(struct extent_map_tree *map,
3055                                struct extent_io_tree *tree, struct page *page,
3056                                gfp_t mask)
3057 {
3058         struct extent_map *em;
3059         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3060         u64 end = start + PAGE_CACHE_SIZE - 1;
3061
3062         if ((mask & __GFP_WAIT) &&
3063             page->mapping->host->i_size > 16 * 1024 * 1024) {
3064                 u64 len;
3065                 while (start <= end) {
3066                         len = end - start + 1;
3067                         write_lock(&map->lock);
3068                         em = lookup_extent_mapping(map, start, len);
3069                         if (IS_ERR_OR_NULL(em)) {
3070                                 write_unlock(&map->lock);
3071                                 break;
3072                         }
3073                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3074                             em->start != start) {
3075                                 write_unlock(&map->lock);
3076                                 free_extent_map(em);
3077                                 break;
3078                         }
3079                         if (!test_range_bit(tree, em->start,
3080                                             extent_map_end(em) - 1,
3081                                             EXTENT_LOCKED | EXTENT_WRITEBACK,
3082                                             0, NULL)) {
3083                                 remove_extent_mapping(map, em);
3084                                 /* once for the rb tree */
3085                                 free_extent_map(em);
3086                         }
3087                         start = extent_map_end(em);
3088                         write_unlock(&map->lock);
3089
3090                         /* once for us */
3091                         free_extent_map(em);
3092                 }
3093         }
3094         return try_release_extent_state(map, tree, page, mask);
3095 }
3096
3097 /*
3098  * helper function for fiemap, which doesn't want to see any holes.
3099  * This maps until we find something past 'last'
3100  */
3101 static struct extent_map *get_extent_skip_holes(struct inode *inode,
3102                                                 u64 offset,
3103                                                 u64 last,
3104                                                 get_extent_t *get_extent)
3105 {
3106         u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3107         struct extent_map *em;
3108         u64 len;
3109
3110         if (offset >= last)
3111                 return NULL;
3112
3113         while (1) {
3114                 len = last - offset;
3115                 if (len == 0)
3116                         break;
3117                 len = (len + sectorsize - 1) & ~(sectorsize - 1);
3118                 em = get_extent(inode, NULL, 0, offset, len, 0);
3119                 if (IS_ERR_OR_NULL(em))
3120                         return em;
3121
3122                 /* if this isn't a hole return it */
3123                 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3124                     em->block_start != EXTENT_MAP_HOLE) {
3125                         return em;
3126                 }
3127
3128                 /* this is a hole, advance to the next extent */
3129                 offset = extent_map_end(em);
3130                 free_extent_map(em);
3131                 if (offset >= last)
3132                         break;
3133         }
3134         return NULL;
3135 }
3136
3137 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3138                 __u64 start, __u64 len, get_extent_t *get_extent)
3139 {
3140         int ret = 0;
3141         u64 off = start;
3142         u64 max = start + len;
3143         u32 flags = 0;
3144         u32 found_type;
3145         u64 last;
3146         u64 last_for_get_extent = 0;
3147         u64 disko = 0;
3148         u64 isize = i_size_read(inode);
3149         struct btrfs_key found_key;
3150         struct extent_map *em = NULL;
3151         struct extent_state *cached_state = NULL;
3152         struct btrfs_path *path;
3153         struct btrfs_file_extent_item *item;
3154         int end = 0;
3155         u64 em_start = 0;
3156         u64 em_len = 0;
3157         u64 em_end = 0;
3158         unsigned long emflags;
3159
3160         if (len == 0)
3161                 return -EINVAL;
3162
3163         path = btrfs_alloc_path();
3164         if (!path)
3165                 return -ENOMEM;
3166         path->leave_spinning = 1;
3167
3168         /*
3169          * lookup the last file extent.  We're not using i_size here
3170          * because there might be preallocation past i_size
3171          */
3172         ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3173                                        path, btrfs_ino(inode), -1, 0);
3174         if (ret < 0) {
3175                 btrfs_free_path(path);
3176                 return ret;
3177         }
3178         WARN_ON(!ret);
3179         path->slots[0]--;
3180         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3181                               struct btrfs_file_extent_item);
3182         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3183         found_type = btrfs_key_type(&found_key);
3184
3185         /* No extents, but there might be delalloc bits */
3186         if (found_key.objectid != btrfs_ino(inode) ||
3187             found_type != BTRFS_EXTENT_DATA_KEY) {
3188                 /* have to trust i_size as the end */
3189                 last = (u64)-1;
3190                 last_for_get_extent = isize;
3191         } else {
3192                 /*
3193                  * remember the start of the last extent.  There are a
3194                  * bunch of different factors that go into the length of the
3195                  * extent, so it's much less complex to remember where it started
3196                  */
3197                 last = found_key.offset;
3198                 last_for_get_extent = last + 1;
3199         }
3200         btrfs_free_path(path);
3201
3202         /*
3203          * we might have some extents allocated but more delalloc past those
3204          * extents.  So, we trust isize unless the start of the last extent is
3205          * beyond isize
3206          */
3207         if (last < isize) {
3208                 last = (u64)-1;
3209                 last_for_get_extent = isize;
3210         }
3211
3212         lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3213                          &cached_state, GFP_NOFS);
3214
3215         em = get_extent_skip_holes(inode, off, last_for_get_extent,
3216                                    get_extent);
3217         if (!em)
3218                 goto out;
3219         if (IS_ERR(em)) {
3220                 ret = PTR_ERR(em);
3221                 goto out;
3222         }
3223
3224         while (!end) {
3225                 u64 offset_in_extent;
3226
3227                 /* break if the extent we found is outside the range */
3228                 if (em->start >= max || extent_map_end(em) < off)
3229                         break;
3230
3231                 /*
3232                  * get_extent may return an extent that starts before our
3233                  * requested range.  We have to make sure the ranges
3234                  * we return to fiemap always move forward and don't
3235                  * overlap, so adjust the offsets here
3236                  */
3237                 em_start = max(em->start, off);
3238
3239                 /*
3240                  * record the offset from the start of the extent
3241                  * for adjusting the disk offset below
3242                  */
3243                 offset_in_extent = em_start - em->start;
3244                 em_end = extent_map_end(em);
3245                 em_len = em_end - em_start;
3246                 emflags = em->flags;
3247                 disko = 0;
3248                 flags = 0;
3249
3250                 /*
3251                  * bump off for our next call to get_extent
3252                  */
3253                 off = extent_map_end(em);
3254                 if (off >= max)
3255                         end = 1;
3256
3257                 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3258                         end = 1;
3259                         flags |= FIEMAP_EXTENT_LAST;
3260                 } else if (em->block_start == EXTENT_MAP_INLINE) {
3261                         flags |= (FIEMAP_EXTENT_DATA_INLINE |
3262                                   FIEMAP_EXTENT_NOT_ALIGNED);
3263                 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
3264                         flags |= (FIEMAP_EXTENT_DELALLOC |
3265                                   FIEMAP_EXTENT_UNKNOWN);
3266                 } else {
3267                         disko = em->block_start + offset_in_extent;
3268                 }
3269                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3270                         flags |= FIEMAP_EXTENT_ENCODED;
3271
3272                 free_extent_map(em);
3273                 em = NULL;
3274                 if ((em_start >= last) || em_len == (u64)-1 ||
3275                    (last == (u64)-1 && isize <= em_end)) {
3276                         flags |= FIEMAP_EXTENT_LAST;
3277                         end = 1;
3278                 }
3279
3280                 /* now scan forward to see if this is really the last extent. */
3281                 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3282                                            get_extent);
3283                 if (IS_ERR(em)) {
3284                         ret = PTR_ERR(em);
3285                         goto out;
3286                 }
3287                 if (!em) {
3288                         flags |= FIEMAP_EXTENT_LAST;
3289                         end = 1;
3290                 }
3291                 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3292                                               em_len, flags);
3293                 if (ret)
3294                         goto out_free;
3295         }
3296 out_free:
3297         free_extent_map(em);
3298 out:
3299         unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3300                              &cached_state, GFP_NOFS);
3301         return ret;
3302 }
3303
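/*
 * return the page that backs page index 'i' of the extent buffer.
 * Page 0 is cached in eb->first_page, the rest are looked up in the
 * mapping's radix tree.
 */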
3304 inline struct page *extent_buffer_page(struct extent_buffer *eb,
3305                                               unsigned long i)
3306 {
3307         struct page *p;
3308         struct address_space *mapping;
3309
3310         if (i == 0)
3311                 return eb->first_page;
3312         i += eb->start >> PAGE_CACHE_SHIFT;
3313         mapping = eb->first_page->mapping;
3314         if (!mapping)
3315                 return NULL;
3316
3317         /*
3318          * extent_buffer_page is only called after pinning the page
3319          * by increasing the reference count.  So we know the page must
3320          * be in the radix tree.
3321          */
3322         rcu_read_lock();
3323         p = radix_tree_lookup(&mapping->page_tree, i);
3324         rcu_read_unlock();
3325
3326         return p;
3327 }
3328
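/* how many pages are needed to hold the byte range [start, start + len) */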
3329 inline unsigned long num_extent_pages(u64 start, u64 len)
3330 {
3331         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3332                 (start >> PAGE_CACHE_SHIFT);
3333 }
3334
3335 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3336                                                    u64 start,
3337                                                    unsigned long len,
3338                                                    gfp_t mask)
3339 {
3340         struct extent_buffer *eb = NULL;
3341 #if LEAK_DEBUG
3342         unsigned long flags;
3343 #endif
3344
3345         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3346         if (eb == NULL)
3347                 return NULL;
3348         eb->start = start;
3349         eb->len = len;
3350         rwlock_init(&eb->lock);
3351         atomic_set(&eb->write_locks, 0);
3352         atomic_set(&eb->read_locks, 0);
3353         atomic_set(&eb->blocking_readers, 0);
3354         atomic_set(&eb->blocking_writers, 0);
3355         atomic_set(&eb->spinning_readers, 0);
3356         atomic_set(&eb->spinning_writers, 0);
3357         init_waitqueue_head(&eb->write_lock_wq);
3358         init_waitqueue_head(&eb->read_lock_wq);
3359
3360 #if LEAK_DEBUG
3361         spin_lock_irqsave(&leak_lock, flags);
3362         list_add(&eb->leak_list, &buffers);
3363         spin_unlock_irqrestore(&leak_lock, flags);
3364 #endif
3365         atomic_set(&eb->refs, 1);
3366
3367         return eb;
3368 }
3369
3370 static void __free_extent_buffer(struct extent_buffer *eb)
3371 {
3372 #if LEAK_DEBUG
3373         unsigned long flags;
3374         spin_lock_irqsave(&leak_lock, flags);
3375         list_del(&eb->leak_list);
3376         spin_unlock_irqrestore(&leak_lock, flags);
3377 #endif
3378         kmem_cache_free(extent_buffer_cache, eb);
3379 }
3380
3381 /*
3382  * Helper for releasing the extent buffer's pages, from start_idx to the end.
3383  */
3384 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3385                                                 unsigned long start_idx)
3386 {
3387         unsigned long index;
3388         struct page *page;
3389
3390         if (!eb->first_page)
3391                 return;
3392
3393         index = num_extent_pages(eb->start, eb->len);
3394         if (start_idx >= index)
3395                 return;
3396
3397         do {
3398                 index--;
3399                 page = extent_buffer_page(eb, index);
3400                 if (page)
3401                         page_cache_release(page);
3402         } while (index != start_idx);
3403 }
3404
3405 /*
3406  * Helper for releasing the extent buffer.
3407  */
3408 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3409 {
3410         btrfs_release_extent_buffer_page(eb, 0);
3411         __free_extent_buffer(eb);
3412 }
3413
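/*
 * find or create the extent buffer covering [start, start + len).  An
 * existing buffer is returned from the radix tree with an extra
 * reference; otherwise the backing pages are pinned and a new buffer is
 * inserted, holding one reference for the tree and one for the caller.
 *
 * Hypothetical usage sketch (not a caller in this file):
 *
 *	eb = alloc_extent_buffer(tree, start, len, NULL);
 *	if (!eb)
 *		return NULL;
 *	...
 *	free_extent_buffer(eb);
 */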
3414 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3415                                           u64 start, unsigned long len,
3416                                           struct page *page0)
3417 {
3418         unsigned long num_pages = num_extent_pages(start, len);
3419         unsigned long i;
3420         unsigned long index = start >> PAGE_CACHE_SHIFT;
3421         struct extent_buffer *eb;
3422         struct extent_buffer *exists = NULL;
3423         struct page *p;
3424         struct address_space *mapping = tree->mapping;
3425         int uptodate = 1;
3426         int ret;
3427
3428         rcu_read_lock();
3429         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3430         if (eb && atomic_inc_not_zero(&eb->refs)) {
3431                 rcu_read_unlock();
3432                 mark_page_accessed(eb->first_page);
3433                 return eb;
3434         }
3435         rcu_read_unlock();
3436
3437         eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3438         if (!eb)
3439                 return NULL;
3440
3441         if (page0) {
3442                 eb->first_page = page0;
3443                 i = 1;
3444                 index++;
3445                 page_cache_get(page0);
3446                 mark_page_accessed(page0);
3447                 set_page_extent_mapped(page0);
3448                 set_page_extent_head(page0, len);
3449                 uptodate = PageUptodate(page0);
3450         } else {
3451                 i = 0;
3452         }
3453         for (; i < num_pages; i++, index++) {
3454                 p = find_or_create_page(mapping, index, GFP_NOFS);
3455                 if (!p) {
3456                         WARN_ON(1);
3457                         goto free_eb;
3458                 }
3459                 set_page_extent_mapped(p);
3460                 mark_page_accessed(p);
3461                 if (i == 0) {
3462                         eb->first_page = p;
3463                         set_page_extent_head(p, len);
3464                 } else {
3465                         set_page_private(p, EXTENT_PAGE_PRIVATE);
3466                 }
3467                 if (!PageUptodate(p))
3468                         uptodate = 0;
3469
3470                 /*
3471                  * see below about how we avoid a nasty race with release page
3472                  * and why we unlock later
3473                  */
3474                 if (i != 0)
3475                         unlock_page(p);
3476         }
3477         if (uptodate)
3478                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3479
3480         ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3481         if (ret)
3482                 goto free_eb;
3483
3484         spin_lock(&tree->buffer_lock);
3485         ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3486         if (ret == -EEXIST) {
3487                 exists = radix_tree_lookup(&tree->buffer,
3488                                                 start >> PAGE_CACHE_SHIFT);
3489                 /* add one reference for the caller */
3490                 atomic_inc(&exists->refs);
3491                 spin_unlock(&tree->buffer_lock);
3492                 radix_tree_preload_end();
3493                 goto free_eb;
3494         }
3495         /* add one reference for the tree */
3496         atomic_inc(&eb->refs);
3497         spin_unlock(&tree->buffer_lock);
3498         radix_tree_preload_end();
3499
3500         /*
3501          * there is a race where release page may have
3502          * tried to find this extent buffer in the radix
3503          * tree but failed.  It will tell the VM it is safe to
3504          * reclaim the page, and it will clear the page private bit.
3505          * We must make sure to set the page private bit properly
3506          * after the extent buffer is in the radix tree so
3507          * it doesn't get lost
3508          */
3509         set_page_extent_mapped(eb->first_page);
3510         set_page_extent_head(eb->first_page, eb->len);
3511         if (!page0)
3512                 unlock_page(eb->first_page);
3513         return eb;
3514
3515 free_eb:
3516         if (eb->first_page && !page0)
3517                 unlock_page(eb->first_page);
3518
3519         if (!atomic_dec_and_test(&eb->refs))
3520                 return exists;
3521         btrfs_release_extent_buffer(eb);
3522         return exists;
3523 }
3524
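/*
 * look up an extent buffer in the radix tree and take a reference on it,
 * or return NULL if it isn't there
 */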
3525 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3526                                          u64 start, unsigned long len)
3527 {
3528         struct extent_buffer *eb;
3529
3530         rcu_read_lock();
3531         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3532         if (eb && atomic_inc_not_zero(&eb->refs)) {
3533                 rcu_read_unlock();
3534                 mark_page_accessed(eb->first_page);
3535                 return eb;
3536         }
3537         rcu_read_unlock();
3538
3539         return NULL;
3540 }
3541
3542 void free_extent_buffer(struct extent_buffer *eb)
3543 {
3544         if (!eb)
3545                 return;
3546
3547         if (!atomic_dec_and_test(&eb->refs))
3548                 return;
3549
3550         WARN_ON(1);
3551 }
3552
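/*
 * clear the dirty bit on every page backing the extent buffer and drop
 * the matching PAGECACHE_TAG_DIRTY tags so writeback won't pick the
 * pages up again
 */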
3553 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3554                               struct extent_buffer *eb)
3555 {
3556         unsigned long i;
3557         unsigned long num_pages;
3558         struct page *page;
3559
3560         num_pages = num_extent_pages(eb->start, eb->len);
3561
3562         for (i = 0; i < num_pages; i++) {
3563                 page = extent_buffer_page(eb, i);
3564                 if (!PageDirty(page))
3565                         continue;
3566
3567                 lock_page(page);
3568                 WARN_ON(!PagePrivate(page));
3569
3570                 set_page_extent_mapped(page);
3571                 if (i == 0)
3572                         set_page_extent_head(page, eb->len);
3573
3574                 clear_page_dirty_for_io(page);
3575                 spin_lock_irq(&page->mapping->tree_lock);
3576                 if (!PageDirty(page)) {
3577                         radix_tree_tag_clear(&page->mapping->page_tree,
3578                                                 page_index(page),
3579                                                 PAGECACHE_TAG_DIRTY);
3580                 }
3581                 spin_unlock_irq(&page->mapping->tree_lock);
3582                 unlock_page(page);
3583         }
3584         return 0;
3585 }
3586
3587 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3588                              struct extent_buffer *eb)
3589 {
3590         unsigned long i;
3591         unsigned long num_pages;
3592         int was_dirty = 0;
3593
3594         was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3595         num_pages = num_extent_pages(eb->start, eb->len);
3596         for (i = 0; i < num_pages; i++)
3597                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3598         return was_dirty;
3599 }
3600
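/*
 * true if the buffer does not line up exactly with page boundaries, in
 * which case the EXTENT_UPTODATE bits in the io tree are maintained in
 * addition to the per-page uptodate flags
 */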
3601 static int __eb_straddles_pages(u64 start, u64 len)
3602 {
3603         if (len < PAGE_CACHE_SIZE)
3604                 return 1;
3605         if (start & (PAGE_CACHE_SIZE - 1))
3606                 return 1;
3607         if ((start + len) & (PAGE_CACHE_SIZE - 1))
3608                 return 1;
3609         return 0;
3610 }
3611
3612 static int eb_straddles_pages(struct extent_buffer *eb)
3613 {
3614         return __eb_straddles_pages(eb->start, eb->len);
3615 }
3616
3617 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3618                                 struct extent_buffer *eb,
3619                                 struct extent_state **cached_state)
3620 {
3621         unsigned long i;
3622         struct page *page;
3623         unsigned long num_pages;
3624
3625         num_pages = num_extent_pages(eb->start, eb->len);
3626         clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3627
3628         if (eb_straddles_pages(eb)) {
3629                 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3630                                       cached_state, GFP_NOFS);
3631         }
3632         for (i = 0; i < num_pages; i++) {
3633                 page = extent_buffer_page(eb, i);
3634                 if (page)
3635                         ClearPageUptodate(page);
3636         }
3637         return 0;
3638 }
3639
3640 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3641                                 struct extent_buffer *eb)
3642 {
3643         unsigned long i;
3644         struct page *page;
3645         unsigned long num_pages;
3646
3647         num_pages = num_extent_pages(eb->start, eb->len);
3648
3649         if (eb_straddles_pages(eb)) {
3650                 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3651                                     NULL, GFP_NOFS);
3652         }
3653         for (i = 0; i < num_pages; i++) {
3654                 page = extent_buffer_page(eb, i);
3655                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3656                     ((i == num_pages - 1) &&
3657                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3658                         check_page_uptodate(tree, page);
3659                         continue;
3660                 }
3661                 SetPageUptodate(page);
3662         }
3663         return 0;
3664 }
3665
3666 int extent_range_uptodate(struct extent_io_tree *tree,
3667                           u64 start, u64 end)
3668 {
3669         struct page *page;
3670         int ret;
3671         int pg_uptodate = 1;
3672         int uptodate;
3673         unsigned long index;
3674
3675         if (__eb_straddles_pages(start, end - start + 1)) {
3676                 ret = test_range_bit(tree, start, end,
3677                                      EXTENT_UPTODATE, 1, NULL);
3678                 if (ret)
3679                         return 1;
3680         }
3681         while (start <= end) {
3682                 index = start >> PAGE_CACHE_SHIFT;
3683                 page = find_get_page(tree->mapping, index);
3684                 uptodate = PageUptodate(page);
3685                 page_cache_release(page);
3686                 if (!uptodate) {
3687                         pg_uptodate = 0;
3688                         break;
3689                 }
3690                 start += PAGE_CACHE_SIZE;
3691         }
3692         return pg_uptodate;
3693 }
3694
3695 int extent_buffer_uptodate(struct extent_io_tree *tree,
3696                            struct extent_buffer *eb,
3697                            struct extent_state *cached_state)
3698 {
3699         int ret = 0;
3700         unsigned long num_pages;
3701         unsigned long i;
3702         struct page *page;
3703         int pg_uptodate = 1;
3704
3705         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3706                 return 1;
3707
3708         if (eb_straddles_pages(eb)) {
3709                 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3710                                    EXTENT_UPTODATE, 1, cached_state);
3711                 if (ret)
3712                         return ret;
3713         }
3714
3715         num_pages = num_extent_pages(eb->start, eb->len);
3716         for (i = 0; i < num_pages; i++) {
3717                 page = extent_buffer_page(eb, i);
3718                 if (!PageUptodate(page)) {
3719                         pg_uptodate = 0;
3720                         break;
3721                 }
3722         }
3723         return pg_uptodate;
3724 }
3725
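/*
 * read the pages backing the extent buffer from disk, starting with the
 * page that contains 'start'.  Pages that are already uptodate are
 * skipped; with 'wait' set we wait for the reads and return -EIO if any
 * page came back not uptodate.
 */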
3726 int read_extent_buffer_pages(struct extent_io_tree *tree,
3727                              struct extent_buffer *eb,
3728                              u64 start, int wait,
3729                              get_extent_t *get_extent, int mirror_num)
3730 {
3731         unsigned long i;
3732         unsigned long start_i;
3733         struct page *page;
3734         int err;
3735         int ret = 0;
3736         int locked_pages = 0;
3737         int all_uptodate = 1;
3738         int inc_all_pages = 0;
3739         unsigned long num_pages;
3740         struct bio *bio = NULL;
3741         unsigned long bio_flags = 0;
3742
3743         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3744                 return 0;
3745
3746         if (eb_straddles_pages(eb)) {
3747                 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3748                                    EXTENT_UPTODATE, 1, NULL)) {
3749                         return 0;
3750                 }
3751         }
3752
3753         if (start) {
3754                 WARN_ON(start < eb->start);
3755                 start_i = (start >> PAGE_CACHE_SHIFT) -
3756                         (eb->start >> PAGE_CACHE_SHIFT);
3757         } else {
3758                 start_i = 0;
3759         }
3760
3761         num_pages = num_extent_pages(eb->start, eb->len);
3762         for (i = start_i; i < num_pages; i++) {
3763                 page = extent_buffer_page(eb, i);
3764                 if (!wait) {
3765                         if (!trylock_page(page))
3766                                 goto unlock_exit;
3767                 } else {
3768                         lock_page(page);
3769                 }
3770                 locked_pages++;
3771                 if (!PageUptodate(page))
3772                         all_uptodate = 0;
3773         }
3774         if (all_uptodate) {
3775                 if (start_i == 0)
3776                         set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3777                 goto unlock_exit;
3778         }
3779
3780         for (i = start_i; i < num_pages; i++) {
3781                 page = extent_buffer_page(eb, i);
3782
3783                 WARN_ON(!PagePrivate(page));
3784
3785                 set_page_extent_mapped(page);
3786                 if (i == 0)
3787                         set_page_extent_head(page, eb->len);
3788
3789                 if (inc_all_pages)
3790                         page_cache_get(page);
3791                 if (!PageUptodate(page)) {
3792                         if (start_i == 0)
3793                                 inc_all_pages = 1;
3794                         ClearPageError(page);
3795                         err = __extent_read_full_page(tree, page,
3796                                                       get_extent, &bio,
3797                                                       mirror_num, &bio_flags);
3798                         if (err)
3799                                 ret = err;
3800                 } else {
3801                         unlock_page(page);
3802                 }
3803         }
3804
3805         if (bio)
3806                 submit_one_bio(READ, bio, mirror_num, bio_flags);
3807
3808         if (ret || !wait)
3809                 return ret;
3810
3811         for (i = start_i; i < num_pages; i++) {
3812                 page = extent_buffer_page(eb, i);
3813                 wait_on_page_locked(page);
3814                 if (!PageUptodate(page))
3815                         ret = -EIO;
3816         }
3817
3818         if (!ret)
3819                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3820         return ret;
3821
3822 unlock_exit:
3823         i = start_i;
3824         while (locked_pages > 0) {
3825                 page = extent_buffer_page(eb, i);
3826                 i++;
3827                 unlock_page(page);
3828                 locked_pages--;
3829         }
3830         return ret;
3831 }
3832
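/*
 * copy 'len' bytes starting at byte offset 'start' of the extent buffer
 * into 'dstv', walking the backing pages one by one.
 *
 * Hypothetical sketch of a caller pulling a structure out of the buffer:
 *
 *	struct btrfs_disk_key key;
 *	read_extent_buffer(eb, &key, offset, sizeof(key));
 */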
3833 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3834                         unsigned long start,
3835                         unsigned long len)
3836 {
3837         size_t cur;
3838         size_t offset;
3839         struct page *page;
3840         char *kaddr;
3841         char *dst = (char *)dstv;
3842         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3843         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3844
3845         WARN_ON(start > eb->len);
3846         WARN_ON(start + len > eb->start + eb->len);
3847
3848         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3849
3850         while (len > 0) {
3851                 page = extent_buffer_page(eb, i);
3852
3853                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3854                 kaddr = page_address(page);
3855                 memcpy(dst, kaddr + offset, cur);
3856
3857                 dst += cur;
3858                 len -= cur;
3859                 offset = 0;
3860                 i++;
3861         }
3862 }
3863
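/*
 * map a chunk of the extent buffer that fits inside a single page and
 * return its kernel address; -EINVAL if [start, start + min_len) crosses
 * a page boundary or runs past the end of the buffer
 */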
3864 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3865                                unsigned long min_len, char **map,
3866                                unsigned long *map_start,
3867                                unsigned long *map_len)
3868 {
3869         size_t offset = start & (PAGE_CACHE_SIZE - 1);
3870         char *kaddr;
3871         struct page *p;
3872         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3873         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3874         unsigned long end_i = (start_offset + start + min_len - 1) >>
3875                 PAGE_CACHE_SHIFT;
3876
3877         if (i != end_i)
3878                 return -EINVAL;
3879
3880         if (i == 0) {
3881                 offset = start_offset;
3882                 *map_start = 0;
3883         } else {
3884                 offset = 0;
3885                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3886         }
3887
3888         if (start + min_len > eb->len) {
3889                 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3890                        "wanted %lu %lu\n", (unsigned long long)eb->start,
3891                        eb->len, start, min_len);
3892                 WARN_ON(1);
3893                 return -EINVAL;
3894         }
3895
3896         p = extent_buffer_page(eb, i);
3897         kaddr = page_address(p);
3898         *map = kaddr + offset;
3899         *map_len = PAGE_CACHE_SIZE - offset;
3900         return 0;
3901 }
3902
3903 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3904                           unsigned long start,
3905                           unsigned long len)
3906 {
3907         size_t cur;
3908         size_t offset;
3909         struct page *page;
3910         char *kaddr;
3911         char *ptr = (char *)ptrv;
3912         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3913         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3914         int ret = 0;
3915
3916         WARN_ON(start > eb->len);
3917         WARN_ON(start + len > eb->start + eb->len);
3918
3919         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3920
3921         while (len > 0) {
3922                 page = extent_buffer_page(eb, i);
3923
3924                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3925
3926                 kaddr = page_address(page);
3927                 ret = memcmp(ptr, kaddr + offset, cur);
3928                 if (ret)
3929                         break;
3930
3931                 ptr += cur;
3932                 len -= cur;
3933                 offset = 0;
3934                 i++;
3935         }
3936         return ret;
3937 }
3938
3939 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3940                          unsigned long start, unsigned long len)
3941 {
3942         size_t cur;
3943         size_t offset;
3944         struct page *page;
3945         char *kaddr;
3946         char *src = (char *)srcv;
3947         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3948         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3949
3950         WARN_ON(start > eb->len);
3951         WARN_ON(start + len > eb->start + eb->len);
3952
3953         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3954
3955         while (len > 0) {
3956                 page = extent_buffer_page(eb, i);
3957                 WARN_ON(!PageUptodate(page));
3958
3959                 cur = min(len, PAGE_CACHE_SIZE - offset);
3960                 kaddr = page_address(page);
3961                 memcpy(kaddr + offset, src, cur);
3962
3963                 src += cur;
3964                 len -= cur;
3965                 offset = 0;
3966                 i++;
3967         }
3968 }
3969
3970 void memset_extent_buffer(struct extent_buffer *eb, char c,
3971                           unsigned long start, unsigned long len)
3972 {
3973         size_t cur;
3974         size_t offset;
3975         struct page *page;
3976         char *kaddr;
3977         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3978         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3979
3980         WARN_ON(start > eb->len);
3981         WARN_ON(start + len > eb->start + eb->len);
3982
3983         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3984
3985         while (len > 0) {
3986                 page = extent_buffer_page(eb, i);
3987                 WARN_ON(!PageUptodate(page));
3988
3989                 cur = min(len, PAGE_CACHE_SIZE - offset);
3990                 kaddr = page_address(page);
3991                 memset(kaddr + offset, c, cur);
3992
3993                 len -= cur;
3994                 offset = 0;
3995                 i++;
3996         }
3997 }
3998
3999 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4000                         unsigned long dst_offset, unsigned long src_offset,
4001                         unsigned long len)
4002 {
4003         u64 dst_len = dst->len;
4004         size_t cur;
4005         size_t offset;
4006         struct page *page;
4007         char *kaddr;
4008         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4009         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4010
4011         WARN_ON(src->len != dst_len);
4012
4013         offset = (start_offset + dst_offset) &
4014                 ((unsigned long)PAGE_CACHE_SIZE - 1);
4015
4016         while (len > 0) {
4017                 page = extent_buffer_page(dst, i);
4018                 WARN_ON(!PageUptodate(page));
4019
4020                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4021
4022                 kaddr = page_address(page);
4023                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
4024
4025                 src_offset += cur;
4026                 len -= cur;
4027                 offset = 0;
4028                 i++;
4029         }
4030 }
4031
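/*
 * move bytes between (possibly identical) extent buffer pages for
 * memmove_extent_buffer().  The same-page case uses memmove() so the
 * ranges may overlap; the cross-page case is a simple backwards byte
 * copy.
 */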
4032 static void move_pages(struct page *dst_page, struct page *src_page,
4033                        unsigned long dst_off, unsigned long src_off,
4034                        unsigned long len)
4035 {
4036         char *dst_kaddr = page_address(dst_page);
4037         if (dst_page == src_page) {
4038                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4039         } else {
4040                 char *src_kaddr = page_address(src_page);
4041                 char *p = dst_kaddr + dst_off + len;
4042                 char *s = src_kaddr + src_off + len;
4043
4044                 while (len--)
4045                         *--p = *--s;
4046         }
4047 }
4048
4049 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4050 {
4051         unsigned long distance = (src > dst) ? src - dst : dst - src;
4052         return distance < len;
4053 }
4054
4055 static void copy_pages(struct page *dst_page, struct page *src_page,
4056                        unsigned long dst_off, unsigned long src_off,
4057                        unsigned long len)
4058 {
4059         char *dst_kaddr = page_address(dst_page);
4060         char *src_kaddr;
4061
4062         if (dst_page != src_page) {
4063                 src_kaddr = page_address(src_page);
4064         } else {
4065                 src_kaddr = dst_kaddr;
4066                 BUG_ON(areas_overlap(src_off, dst_off, len));
4067         }
4068
4069         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4070 }
4071
4072 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4073                            unsigned long src_offset, unsigned long len)
4074 {
4075         size_t cur;
4076         size_t dst_off_in_page;
4077         size_t src_off_in_page;
4078         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4079         unsigned long dst_i;
4080         unsigned long src_i;
4081
4082         if (src_offset + len > dst->len) {
4083                 printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move "
4084                        "len %lu dst len %lu\n", src_offset, len, dst->len);
4085                 BUG_ON(1);
4086         }
4087         if (dst_offset + len > dst->len) {
4088                 printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move "
4089                        "len %lu dst len %lu\n", dst_offset, len, dst->len);
4090                 BUG_ON(1);
4091         }
4092
4093         while (len > 0) {
4094                 dst_off_in_page = (start_offset + dst_offset) &
4095                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4096                 src_off_in_page = (start_offset + src_offset) &
4097                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4098
4099                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4100                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4101
4102                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4103                                                src_off_in_page));
4104                 cur = min_t(unsigned long, cur,
4105                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4106
4107                 copy_pages(extent_buffer_page(dst, dst_i),
4108                            extent_buffer_page(dst, src_i),
4109                            dst_off_in_page, src_off_in_page, cur);
4110
4111                 src_offset += cur;
4112                 dst_offset += cur;
4113                 len -= cur;
4114         }
4115 }
4116
4117 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4118                            unsigned long src_offset, unsigned long len)
4119 {
4120         size_t cur;
4121         size_t dst_off_in_page;
4122         size_t src_off_in_page;
4123         unsigned long dst_end = dst_offset + len - 1;
4124         unsigned long src_end = src_offset + len - 1;
4125         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4126         unsigned long dst_i;
4127         unsigned long src_i;
4128
4129         if (src_offset + len > dst->len) {
4130                 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4131                        "len %lu len %lu\n", src_offset, len, dst->len);
4132                 BUG_ON(1);
4133         }
4134         if (dst_offset + len > dst->len) {
4135                 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4136                        "len %lu len %lu\n", dst_offset, len, dst->len);
4137                 BUG_ON(1);
4138         }
4139         if (!areas_overlap(src_offset, dst_offset, len)) {
4140                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4141                 return;
4142         }
4143         while (len > 0) {
4144                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4145                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4146
4147                 dst_off_in_page = (start_offset + dst_end) &
4148                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4149                 src_off_in_page = (start_offset + src_end) &
4150                         ((unsigned long)PAGE_CACHE_SIZE - 1);
4151
4152                 cur = min_t(unsigned long, len, src_off_in_page + 1);
4153                 cur = min(cur, dst_off_in_page + 1);
4154                 move_pages(extent_buffer_page(dst, dst_i),
4155                            extent_buffer_page(dst, src_i),
4156                            dst_off_in_page - cur + 1,
4157                            src_off_in_page - cur + 1, cur);
4158
4159                 dst_end -= cur;
4160                 src_end -= cur;
4161                 len -= cur;
4162         }
4163 }
4164
4165 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4166 {
4167         struct extent_buffer *eb =
4168                         container_of(head, struct extent_buffer, rcu_head);
4169
4170         btrfs_release_extent_buffer(eb);
4171 }
4172
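/*
 * try to drop the extent buffer attached to this page so the page can
 * be released.  This only succeeds if the buffer is not dirty and the
 * tree holds the only reference; the actual free happens via RCU.
 */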
4173 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
4174 {
4175         u64 start = page_offset(page);
4176         struct extent_buffer *eb;
4177         int ret = 1;
4178
4179         spin_lock(&tree->buffer_lock);
4180         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4181         if (!eb) {
4182                 spin_unlock(&tree->buffer_lock);
4183                 return ret;
4184         }
4185
4186         if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4187                 ret = 0;
4188                 goto out;
4189         }
4190
4191         /*
4192          * set @eb->refs to 0 if it is already 1, and then release the buffer.
4193          * Otherwise someone else still holds a reference and we bail out.
4194          */
4195         if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
4196                 ret = 0;
4197                 goto out;
4198         }
4199
4200         radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4201 out:
4202         spin_unlock(&tree->buffer_lock);
4203
4204         /* at this point we can safely release the extent buffer */
4205         if (atomic_read(&eb->refs) == 0)
4206                 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4207         return ret;
4208 }