fs/btrfs/scrub.c
1 /*
2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <linux/blkdev.h>
20 #include <linux/ratelimit.h>
21 #include "ctree.h"
22 #include "volumes.h"
23 #include "disk-io.h"
24 #include "ordered-data.h"
25 #include "transaction.h"
26 #include "backref.h"
27 #include "extent_io.h"
28 #include "dev-replace.h"
29 #include "check-integrity.h"
30 #include "rcu-string.h"
31 #include "raid56.h"
32
33 /*
34  * This is only the first step towards a full-featured scrub. It reads all
35  * extents and super blocks and verifies the checksums. In case a bad checksum
36  * is found or the extent cannot be read, good data will be written back if
37  * any can be found.
38  *
39  * Future enhancements:
40  *  - In case an unrepairable extent is encountered, track which files are
41  *    affected and report them
42  *  - track and record media errors, throw out bad devices
43  *  - add a mode to also read unallocated space
44  */
45
46 struct scrub_block;
47 struct scrub_ctx;
48
49 /*
50  * the following three values only influence the performance.
51  * The last one configures the number of parallel and outstanding I/O
52  * operations. The first two values configure an upper limit for the number
53  * of (dynamically allocated) pages that are added to a bio.
54  */
55 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
56 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
57 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
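/*
 * A rough sketch of the arithmetic behind the limits above, assuming 4KiB
 * pages (the same assumption the "128k"/"8MB" notes make):
 *
 *   SCRUB_PAGES_PER_RD_BIO * PAGE_SIZE = 32 * 4KiB   = 128KiB per read bio
 *   SCRUB_PAGES_PER_WR_BIO * PAGE_SIZE = 32 * 4KiB   = 128KiB per write bio
 *   SCRUB_BIOS_PER_SCTX * 128KiB       = 64 * 128KiB = 8MiB of outstanding
 *                                        I/O per scrubbed device
 */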
58
59 /*
60  * the following value times PAGE_SIZE needs to be large enough to match the
61  * largest node/leaf/sector size that shall be supported.
62  * Values larger than BTRFS_STRIPE_LEN are not supported.
63  */
64 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
65
66 struct scrub_recover {
67         refcount_t              refs;
68         struct btrfs_bio        *bbio;
69         u64                     map_length;
70 };
71
72 struct scrub_page {
73         struct scrub_block      *sblock;
74         struct page             *page;
75         struct btrfs_device     *dev;
76         struct list_head        list;
77         u64                     flags;  /* extent flags */
78         u64                     generation;
79         u64                     logical;
80         u64                     physical;
81         u64                     physical_for_dev_replace;
82         atomic_t                refs;
83         struct {
84                 unsigned int    mirror_num:8;
85                 unsigned int    have_csum:1;
86                 unsigned int    io_error:1;
87         };
88         u8                      csum[BTRFS_CSUM_SIZE];
89
90         struct scrub_recover    *recover;
91 };
92
93 struct scrub_bio {
94         int                     index;
95         struct scrub_ctx        *sctx;
96         struct btrfs_device     *dev;
97         struct bio              *bio;
98         int                     err;
99         u64                     logical;
100         u64                     physical;
101 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
102         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
103 #else
104         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
105 #endif
106         int                     page_count;
107         int                     next_free;
108         struct btrfs_work       work;
109 };
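/*
 * Note on the #if above: pagev[] is sized for whichever of the read and
 * write limits is larger, so the same scrub_bio type can back both regular
 * read bios and dev-replace write bios.
 */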
110
111 struct scrub_block {
112         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
113         int                     page_count;
114         atomic_t                outstanding_pages;
115         refcount_t              refs; /* free mem on transition to zero */
116         struct scrub_ctx        *sctx;
117         struct scrub_parity     *sparity;
118         struct {
119                 unsigned int    header_error:1;
120                 unsigned int    checksum_error:1;
121                 unsigned int    no_io_error_seen:1;
122                 unsigned int    generation_error:1; /* also sets header_error */
123
124                 /* The following is for the data used to check parity, */
125                 /* i.e. data that is protected by a checksum */
126                 unsigned int    data_corrected:1;
127         };
128         struct btrfs_work       work;
129 };
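/*
 * A scrub_block groups the scrub_pages that make up one checksummed unit
 * (a data sector or a metadata node/leaf): pagev[] holds up to
 * SCRUB_MAX_PAGES_PER_BLOCK entries and page_count says how many are used.
 */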
130
131 /* Used for the chunks with parity stripes, such as RAID5/6 */
132 struct scrub_parity {
133         struct scrub_ctx        *sctx;
134
135         struct btrfs_device     *scrub_dev;
136
137         u64                     logic_start;
138
139         u64                     logic_end;
140
141         int                     nsectors;
142
143         u64                     stripe_len;
144
145         refcount_t              refs;
146
147         struct list_head        spages;
148
149         /* Work of parity check and repair */
150         struct btrfs_work       work;
151
152         /* Mark the parity blocks which have data */
153         unsigned long           *dbitmap;
154
155         /*
156          * Mark the parity blocks which have data, but where errors occurred
157          * when reading or checking that data
158          */
159         unsigned long           *ebitmap;
160
161         unsigned long           bitmap[0];
162 };
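/*
 * Layout note (an assumption based on the zero-length bitmap[] member; the
 * allocation site is not part of this excerpt): dbitmap and ebitmap are
 * expected to point into the trailing bitmap[] storage, which is allocated
 * together with the struct and holds two bitmaps of nsectors bits each.
 */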
163
164 struct scrub_wr_ctx {
165         struct scrub_bio *wr_curr_bio;
166         struct btrfs_device *tgtdev;
167         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
168         atomic_t flush_all_writes;
169         struct mutex wr_lock;
170 };
171
172 struct scrub_ctx {
173         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
174         struct btrfs_fs_info    *fs_info;
175         int                     first_free;
176         int                     curr;
177         atomic_t                bios_in_flight;
178         atomic_t                workers_pending;
179         spinlock_t              list_lock;
180         wait_queue_head_t       list_wait;
181         u16                     csum_size;
182         struct list_head        csum_list;
183         atomic_t                cancel_req;
184         int                     readonly;
185         int                     pages_per_rd_bio;
186         u32                     sectorsize;
187         u32                     nodesize;
188
189         int                     is_dev_replace;
190         struct scrub_wr_ctx     wr_ctx;
191
192         /*
193          * statistics
194          */
195         struct btrfs_scrub_progress stat;
196         spinlock_t              stat_lock;
197
198         /*
199          * Use a ref counter to avoid use-after-free issues. Scrub workers
200          * decrement bios_in_flight and workers_pending and then do a wakeup
201          * on the list_wait wait queue. We must ensure the main scrub task
202          * doesn't free the scrub context before or while the workers are
203          * doing the wakeup() call.
204          */
205         refcount_t              refs;
206 };
207
208 struct scrub_fixup_nodatasum {
209         struct scrub_ctx        *sctx;
210         struct btrfs_device     *dev;
211         u64                     logical;
212         struct btrfs_root       *root;
213         struct btrfs_work       work;
214         int                     mirror_num;
215 };
216
217 struct scrub_nocow_inode {
218         u64                     inum;
219         u64                     offset;
220         u64                     root;
221         struct list_head        list;
222 };
223
224 struct scrub_copy_nocow_ctx {
225         struct scrub_ctx        *sctx;
226         u64                     logical;
227         u64                     len;
228         int                     mirror_num;
229         u64                     physical_for_dev_replace;
230         struct list_head        inodes;
231         struct btrfs_work       work;
232 };
233
234 struct scrub_warning {
235         struct btrfs_path       *path;
236         u64                     extent_item_size;
237         const char              *errstr;
238         sector_t                sector;
239         u64                     logical;
240         struct btrfs_device     *dev;
241 };
242
243 struct full_stripe_lock {
244         struct rb_node node;
245         u64 logical;
246         u64 refs;
247         struct mutex mutex;
248 };
249
250 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
251 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
252 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
253 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
254 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
255 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
256                                      struct scrub_block *sblocks_for_recheck);
257 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
258                                 struct scrub_block *sblock,
259                                 int retry_failed_mirror);
260 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
261 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
262                                              struct scrub_block *sblock_good);
263 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
264                                             struct scrub_block *sblock_good,
265                                             int page_num, int force_write);
266 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
267 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
268                                            int page_num);
269 static int scrub_checksum_data(struct scrub_block *sblock);
270 static int scrub_checksum_tree_block(struct scrub_block *sblock);
271 static int scrub_checksum_super(struct scrub_block *sblock);
272 static void scrub_block_get(struct scrub_block *sblock);
273 static void scrub_block_put(struct scrub_block *sblock);
274 static void scrub_page_get(struct scrub_page *spage);
275 static void scrub_page_put(struct scrub_page *spage);
276 static void scrub_parity_get(struct scrub_parity *sparity);
277 static void scrub_parity_put(struct scrub_parity *sparity);
278 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
279                                     struct scrub_page *spage);
280 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
281                        u64 physical, struct btrfs_device *dev, u64 flags,
282                        u64 gen, int mirror_num, u8 *csum, int force,
283                        u64 physical_for_dev_replace);
284 static void scrub_bio_end_io(struct bio *bio);
285 static void scrub_bio_end_io_worker(struct btrfs_work *work);
286 static void scrub_block_complete(struct scrub_block *sblock);
287 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
288                                u64 extent_logical, u64 extent_len,
289                                u64 *extent_physical,
290                                struct btrfs_device **extent_dev,
291                                int *extent_mirror_num);
292 static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx,
293                               struct btrfs_device *dev,
294                               int is_dev_replace);
295 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
296 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
297                                     struct scrub_page *spage);
298 static void scrub_wr_submit(struct scrub_ctx *sctx);
299 static void scrub_wr_bio_end_io(struct bio *bio);
300 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
301 static int write_page_nocow(struct scrub_ctx *sctx,
302                             u64 physical_for_dev_replace, struct page *page);
303 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
304                                       struct scrub_copy_nocow_ctx *ctx);
305 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
306                             int mirror_num, u64 physical_for_dev_replace);
307 static void copy_nocow_pages_worker(struct btrfs_work *work);
308 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
309 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
310 static void scrub_put_ctx(struct scrub_ctx *sctx);
311
312
313 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
314 {
315         refcount_inc(&sctx->refs);
316         atomic_inc(&sctx->bios_in_flight);
317 }
318
319 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
320 {
321         atomic_dec(&sctx->bios_in_flight);
322         wake_up(&sctx->list_wait);
323         scrub_put_ctx(sctx);
324 }
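/*
 * The inc/dec pair above implements the lifetime rule documented in
 * struct scrub_ctx: a context reference is taken before a bio is counted
 * as in flight and only dropped after waiters have been woken, so the
 * main scrub task cannot free the context underneath a completing bio.
 */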
325
326 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
327 {
328         while (atomic_read(&fs_info->scrub_pause_req)) {
329                 mutex_unlock(&fs_info->scrub_lock);
330                 wait_event(fs_info->scrub_pause_wait,
331                    atomic_read(&fs_info->scrub_pause_req) == 0);
332                 mutex_lock(&fs_info->scrub_lock);
333         }
334 }
335
336 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
337 {
338         atomic_inc(&fs_info->scrubs_paused);
339         wake_up(&fs_info->scrub_pause_wait);
340 }
341
342 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
343 {
344         mutex_lock(&fs_info->scrub_lock);
345         __scrub_blocked_if_needed(fs_info);
346         atomic_dec(&fs_info->scrubs_paused);
347         mutex_unlock(&fs_info->scrub_lock);
348
349         wake_up(&fs_info->scrub_pause_wait);
350 }
351
352 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
353 {
354         scrub_pause_on(fs_info);
355         scrub_pause_off(fs_info);
356 }
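/*
 * Pause protocol in short: a pause requester (outside this excerpt) is
 * expected to raise fs_info->scrub_pause_req and wait on scrub_pause_wait;
 * scrub_pause_on() announces this scrub as paused, and scrub_pause_off()
 * blocks in __scrub_blocked_if_needed() until scrub_pause_req drops back
 * to zero before taking the scrub out of the paused state again.
 * scrub_blocked_if_needed() is the convenience wrapper doing both.
 */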
357
358 /*
359  * Insert new full stripe lock into full stripe locks tree
360  *
361  * Return pointer to existing or newly inserted full_stripe_lock structure if
362  * everything works well.
363  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
364  *
365  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
366  * function
367  */
368 static struct full_stripe_lock *insert_full_stripe_lock(
369                 struct btrfs_full_stripe_locks_tree *locks_root,
370                 u64 fstripe_logical)
371 {
372         struct rb_node **p;
373         struct rb_node *parent = NULL;
374         struct full_stripe_lock *entry;
375         struct full_stripe_lock *ret;
376
377         WARN_ON(!mutex_is_locked(&locks_root->lock));
378
379         p = &locks_root->root.rb_node;
380         while (*p) {
381                 parent = *p;
382                 entry = rb_entry(parent, struct full_stripe_lock, node);
383                 if (fstripe_logical < entry->logical) {
384                         p = &(*p)->rb_left;
385                 } else if (fstripe_logical > entry->logical) {
386                         p = &(*p)->rb_right;
387                 } else {
388                         entry->refs++;
389                         return entry;
390                 }
391         }
392
393         /* Insert new lock */
394         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
395         if (!ret)
396                 return ERR_PTR(-ENOMEM);
397         ret->logical = fstripe_logical;
398         ret->refs = 1;
399         mutex_init(&ret->mutex);
400
401         rb_link_node(&ret->node, parent, p);
402         rb_insert_color(&ret->node, &locks_root->root);
403         return ret;
404 }
405
406 /*
407  * Search for a full stripe lock of a block group
408  *
409  * Return pointer to existing full stripe lock if found
410  * Return NULL if not found
411  */
412 static struct full_stripe_lock *search_full_stripe_lock(
413                 struct btrfs_full_stripe_locks_tree *locks_root,
414                 u64 fstripe_logical)
415 {
416         struct rb_node *node;
417         struct full_stripe_lock *entry;
418
419         WARN_ON(!mutex_is_locked(&locks_root->lock));
420
421         node = locks_root->root.rb_node;
422         while (node) {
423                 entry = rb_entry(node, struct full_stripe_lock, node);
424                 if (fstripe_logical < entry->logical)
425                         node = node->rb_left;
426                 else if (fstripe_logical > entry->logical)
427                         node = node->rb_right;
428                 else
429                         return entry;
430         }
431         return NULL;
432 }
433
434 /*
435  * Helper to get full stripe logical from a normal bytenr.
436  *
437  * Caller must ensure @cache is a RAID56 block group.
438  */
439 static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
440                                    u64 bytenr)
441 {
442         u64 ret;
443
444         /*
445          * Due to the chunk item size limit, the full stripe length should
446          * not be larger than U32_MAX. Just a sanity check here.
447          */
448         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
449
450         /*
451          * round_down() can only handle powers of 2, while the RAID56 full
452          * stripe length can be 64KiB * n, so we need to manually round down.
453          */
454         ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
455                 cache->full_stripe_len + cache->key.objectid;
456         return ret;
457 }
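/*
 * Purely illustrative example of the manual round-down above (numbers are
 * made up): with cache->key.objectid = 1GiB and full_stripe_len = 128KiB
 * (e.g. two 64KiB data stripes), bytenr = 1GiB + 300KiB yields
 *   div64_u64(300KiB, 128KiB) = 2, hence 2 * 128KiB + 1GiB = 1GiB + 256KiB,
 * i.e. the logical start of the full stripe that contains bytenr.
 */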
458
459 /*
460  * Lock a full stripe to avoid concurrent recovery and read
461  *
462  * It's only used for profiles with parity (RAID5/6); for other profiles it
463  * does nothing.
464  *
465  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
466  * The caller must then call unlock_full_stripe() in the same context.
467  *
468  * Return <0 if an error is encountered.
469  */
470 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
471                             bool *locked_ret)
472 {
473         struct btrfs_block_group_cache *bg_cache;
474         struct btrfs_full_stripe_locks_tree *locks_root;
475         struct full_stripe_lock *existing;
476         u64 fstripe_start;
477         int ret = 0;
478
479         *locked_ret = false;
480         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
481         if (!bg_cache) {
482                 ASSERT(0);
483                 return -ENOENT;
484         }
485
486         /* Profiles not based on parity don't need full stripe lock */
487         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
488                 goto out;
489         locks_root = &bg_cache->full_stripe_locks_root;
490
491         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
492
493         /* Now insert the full stripe lock */
494         mutex_lock(&locks_root->lock);
495         existing = insert_full_stripe_lock(locks_root, fstripe_start);
496         mutex_unlock(&locks_root->lock);
497         if (IS_ERR(existing)) {
498                 ret = PTR_ERR(existing);
499                 goto out;
500         }
501         mutex_lock(&existing->mutex);
502         *locked_ret = true;
503 out:
504         btrfs_put_block_group(bg_cache);
505         return ret;
506 }
507
508 /*
509  * Unlock a full stripe.
510  *
511  * NOTE: The caller must ensure this is called in the same context as the
512  * corresponding lock_full_stripe().
513  *
514  * Return 0 if we unlock the full stripe without problems.
515  * Return <0 on error.
516  */
517 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
518                               bool locked)
519 {
520         struct btrfs_block_group_cache *bg_cache;
521         struct btrfs_full_stripe_locks_tree *locks_root;
522         struct full_stripe_lock *fstripe_lock;
523         u64 fstripe_start;
524         bool freeit = false;
525         int ret = 0;
526
527         /* If we didn't acquire full stripe lock, no need to continue */
528         if (!locked)
529                 return 0;
530
531         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
532         if (!bg_cache) {
533                 ASSERT(0);
534                 return -ENOENT;
535         }
536         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
537                 goto out;
538
539         locks_root = &bg_cache->full_stripe_locks_root;
540         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
541
542         mutex_lock(&locks_root->lock);
543         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
544         /* Unpaired unlock_full_stripe() detected */
545         if (!fstripe_lock) {
546                 WARN_ON(1);
547                 ret = -ENOENT;
548                 mutex_unlock(&locks_root->lock);
549                 goto out;
550         }
551
552         if (fstripe_lock->refs == 0) {
553                 WARN_ON(1);
554                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
555                         fstripe_lock->logical);
556         } else {
557                 fstripe_lock->refs--;
558         }
559
560         if (fstripe_lock->refs == 0) {
561                 rb_erase(&fstripe_lock->node, &locks_root->root);
562                 freeit = true;
563         }
564         mutex_unlock(&locks_root->lock);
565
566         mutex_unlock(&fstripe_lock->mutex);
567         if (freeit)
568                 kfree(fstripe_lock);
569 out:
570         btrfs_put_block_group(bg_cache);
571         return ret;
572 }
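/*
 * Minimal usage sketch for the pair above (scrub_handle_errored_block()
 * below takes the lock this way):
 *
 *   bool locked;
 *
 *   ret = lock_full_stripe(fs_info, logical, &locked);
 *   if (ret < 0)
 *           return ret;
 *   ... recheck/repair the block ...
 *   unlock_full_stripe(fs_info, logical, locked);
 */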
573
574 /*
575  * used for workers that require transaction commits (i.e., for the
576  * NOCOW case)
577  */
578 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
579 {
580         struct btrfs_fs_info *fs_info = sctx->fs_info;
581
582         refcount_inc(&sctx->refs);
583         /*
584          * increment scrubs_running to prevent cancel requests from
585          * completing as long as a worker is running. we must also
586          * increment scrubs_paused to prevent deadlocking on pause
587          * requests used for transaction commits (as the worker uses a
588          * transaction context). it is safe to regard the worker
589          * as paused for all practical matters. effectively, we only
590          * prevent cancellation requests from completing.
591          */
592         mutex_lock(&fs_info->scrub_lock);
593         atomic_inc(&fs_info->scrubs_running);
594         atomic_inc(&fs_info->scrubs_paused);
595         mutex_unlock(&fs_info->scrub_lock);
596
597          * the check of the @scrubs_running == @scrubs_paused condition
598          * inside wait_event() is not an atomic operation,
599          * which means we may inc/dec @scrubs_running/@scrubs_paused
600          * at any time. Wake up @scrub_pause_wait as often as we
601          * can so that a blocked transaction commit is delayed less.
602          * much as we can to let commit transaction blocked less.
603          */
604         wake_up(&fs_info->scrub_pause_wait);
605
606         atomic_inc(&sctx->workers_pending);
607 }
608
609 /* used for workers that require transaction commits */
610 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
611 {
612         struct btrfs_fs_info *fs_info = sctx->fs_info;
613
614         /*
615          * see scrub_pending_trans_workers_inc() for why we're pretending
616          * to be paused in the scrub counters
617          */
618         mutex_lock(&fs_info->scrub_lock);
619         atomic_dec(&fs_info->scrubs_running);
620         atomic_dec(&fs_info->scrubs_paused);
621         mutex_unlock(&fs_info->scrub_lock);
622         atomic_dec(&sctx->workers_pending);
623         wake_up(&fs_info->scrub_pause_wait);
624         wake_up(&sctx->list_wait);
625         scrub_put_ctx(sctx);
626 }
627
628 static void scrub_free_csums(struct scrub_ctx *sctx)
629 {
630         while (!list_empty(&sctx->csum_list)) {
631                 struct btrfs_ordered_sum *sum;
632                 sum = list_first_entry(&sctx->csum_list,
633                                        struct btrfs_ordered_sum, list);
634                 list_del(&sum->list);
635                 kfree(sum);
636         }
637 }
638
639 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
640 {
641         int i;
642
643         if (!sctx)
644                 return;
645
646         scrub_free_wr_ctx(&sctx->wr_ctx);
647
648         /* this can happen when scrub is cancelled */
649         if (sctx->curr != -1) {
650                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
651
652                 for (i = 0; i < sbio->page_count; i++) {
653                         WARN_ON(!sbio->pagev[i]->page);
654                         scrub_block_put(sbio->pagev[i]->sblock);
655                 }
656                 bio_put(sbio->bio);
657         }
658
659         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
660                 struct scrub_bio *sbio = sctx->bios[i];
661
662                 if (!sbio)
663                         break;
664                 kfree(sbio);
665         }
666
667         scrub_free_csums(sctx);
668         kfree(sctx);
669 }
670
671 static void scrub_put_ctx(struct scrub_ctx *sctx)
672 {
673         if (refcount_dec_and_test(&sctx->refs))
674                 scrub_free_ctx(sctx);
675 }
676
677 static noinline_for_stack
678 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
679 {
680         struct scrub_ctx *sctx;
681         int             i;
682         struct btrfs_fs_info *fs_info = dev->fs_info;
683         int ret;
684
685         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
686         if (!sctx)
687                 goto nomem;
688         refcount_set(&sctx->refs, 1);
689         sctx->is_dev_replace = is_dev_replace;
690         sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
691         sctx->curr = -1;
692         sctx->fs_info = dev->fs_info;
693         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
694                 struct scrub_bio *sbio;
695
696                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
697                 if (!sbio)
698                         goto nomem;
699                 sctx->bios[i] = sbio;
700
701                 sbio->index = i;
702                 sbio->sctx = sctx;
703                 sbio->page_count = 0;
704                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
705                                 scrub_bio_end_io_worker, NULL, NULL);
706
707                 if (i != SCRUB_BIOS_PER_SCTX - 1)
708                         sctx->bios[i]->next_free = i + 1;
709                 else
710                         sctx->bios[i]->next_free = -1;
711         }
712         sctx->first_free = 0;
713         sctx->nodesize = fs_info->nodesize;
714         sctx->sectorsize = fs_info->sectorsize;
715         atomic_set(&sctx->bios_in_flight, 0);
716         atomic_set(&sctx->workers_pending, 0);
717         atomic_set(&sctx->cancel_req, 0);
718         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
719         INIT_LIST_HEAD(&sctx->csum_list);
720
721         spin_lock_init(&sctx->list_lock);
722         spin_lock_init(&sctx->stat_lock);
723         init_waitqueue_head(&sctx->list_wait);
724
725         ret = scrub_setup_wr_ctx(&sctx->wr_ctx,
726                                  fs_info->dev_replace.tgtdev, is_dev_replace);
727         if (ret) {
728                 scrub_free_ctx(sctx);
729                 return ERR_PTR(ret);
730         }
731         return sctx;
732
733 nomem:
734         scrub_free_ctx(sctx);
735         return ERR_PTR(-ENOMEM);
736 }
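/*
 * Note on the setup loop above: the bios are threaded into a simple free
 * list through their next_free indices (0 -> 1 -> ... ->
 * SCRUB_BIOS_PER_SCTX - 1 -> -1), with sctx->first_free naming the head.
 */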
737
738 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
739                                      void *warn_ctx)
740 {
741         u64 isize;
742         u32 nlink;
743         int ret;
744         int i;
745         struct extent_buffer *eb;
746         struct btrfs_inode_item *inode_item;
747         struct scrub_warning *swarn = warn_ctx;
748         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
749         struct inode_fs_paths *ipath = NULL;
750         struct btrfs_root *local_root;
751         struct btrfs_key root_key;
752         struct btrfs_key key;
753
754         root_key.objectid = root;
755         root_key.type = BTRFS_ROOT_ITEM_KEY;
756         root_key.offset = (u64)-1;
757         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
758         if (IS_ERR(local_root)) {
759                 ret = PTR_ERR(local_root);
760                 goto err;
761         }
762
763         /*
764          * this makes the path point to (inum INODE_ITEM ioff)
765          */
766         key.objectid = inum;
767         key.type = BTRFS_INODE_ITEM_KEY;
768         key.offset = 0;
769
770         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
771         if (ret) {
772                 btrfs_release_path(swarn->path);
773                 goto err;
774         }
775
776         eb = swarn->path->nodes[0];
777         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
778                                         struct btrfs_inode_item);
779         isize = btrfs_inode_size(eb, inode_item);
780         nlink = btrfs_inode_nlink(eb, inode_item);
781         btrfs_release_path(swarn->path);
782
783         ipath = init_ipath(4096, local_root, swarn->path);
784         if (IS_ERR(ipath)) {
785                 ret = PTR_ERR(ipath);
786                 ipath = NULL;
787                 goto err;
788         }
789         ret = paths_from_inode(inum, ipath);
790
791         if (ret < 0)
792                 goto err;
793
794         /*
795          * we deliberately ignore the fact that ipath might have been too
796          * small to hold all of the paths here
797          */
798         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
799                 btrfs_warn_in_rcu(fs_info,
800                                   "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
801                                   swarn->errstr, swarn->logical,
802                                   rcu_str_deref(swarn->dev->name),
803                                   (unsigned long long)swarn->sector,
804                                   root, inum, offset,
805                                   min(isize - offset, (u64)PAGE_SIZE), nlink,
806                                   (char *)(unsigned long)ipath->fspath->val[i]);
807
808         free_ipath(ipath);
809         return 0;
810
811 err:
812         btrfs_warn_in_rcu(fs_info,
813                           "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
814                           swarn->errstr, swarn->logical,
815                           rcu_str_deref(swarn->dev->name),
816                           (unsigned long long)swarn->sector,
817                           root, inum, offset, ret);
818
819         free_ipath(ipath);
820         return 0;
821 }
822
823 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
824 {
825         struct btrfs_device *dev;
826         struct btrfs_fs_info *fs_info;
827         struct btrfs_path *path;
828         struct btrfs_key found_key;
829         struct extent_buffer *eb;
830         struct btrfs_extent_item *ei;
831         struct scrub_warning swarn;
832         unsigned long ptr = 0;
833         u64 extent_item_pos;
834         u64 flags = 0;
835         u64 ref_root;
836         u32 item_size;
837         u8 ref_level = 0;
838         int ret;
839
840         WARN_ON(sblock->page_count < 1);
841         dev = sblock->pagev[0]->dev;
842         fs_info = sblock->sctx->fs_info;
843
844         path = btrfs_alloc_path();
845         if (!path)
846                 return;
847
848         swarn.sector = (sblock->pagev[0]->physical) >> 9;
849         swarn.logical = sblock->pagev[0]->logical;
850         swarn.errstr = errstr;
851         swarn.dev = NULL;
852
853         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
854                                   &flags);
855         if (ret < 0)
856                 goto out;
857
858         extent_item_pos = swarn.logical - found_key.objectid;
859         swarn.extent_item_size = found_key.offset;
860
861         eb = path->nodes[0];
862         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
863         item_size = btrfs_item_size_nr(eb, path->slots[0]);
864
865         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
866                 do {
867                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
868                                                       item_size, &ref_root,
869                                                       &ref_level);
870                         btrfs_warn_in_rcu(fs_info,
871                                 "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu",
872                                 errstr, swarn.logical,
873                                 rcu_str_deref(dev->name),
874                                 (unsigned long long)swarn.sector,
875                                 ref_level ? "node" : "leaf",
876                                 ret < 0 ? -1 : ref_level,
877                                 ret < 0 ? -1 : ref_root);
878                 } while (ret != 1);
879                 btrfs_release_path(path);
880         } else {
881                 btrfs_release_path(path);
882                 swarn.path = path;
883                 swarn.dev = dev;
884                 iterate_extent_inodes(fs_info, found_key.objectid,
885                                         extent_item_pos, 1,
886                                         scrub_print_warning_inode, &swarn);
887         }
888
889 out:
890         btrfs_free_path(path);
891 }
892
893 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
894 {
895         struct page *page = NULL;
896         unsigned long index;
897         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
898         int ret;
899         int corrected = 0;
900         struct btrfs_key key;
901         struct inode *inode = NULL;
902         struct btrfs_fs_info *fs_info;
903         u64 end = offset + PAGE_SIZE - 1;
904         struct btrfs_root *local_root;
905         int srcu_index;
906
907         key.objectid = root;
908         key.type = BTRFS_ROOT_ITEM_KEY;
909         key.offset = (u64)-1;
910
911         fs_info = fixup->root->fs_info;
912         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
913
914         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
915         if (IS_ERR(local_root)) {
916                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
917                 return PTR_ERR(local_root);
918         }
919
920         key.type = BTRFS_INODE_ITEM_KEY;
921         key.objectid = inum;
922         key.offset = 0;
923         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
924         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
925         if (IS_ERR(inode))
926                 return PTR_ERR(inode);
927
928         index = offset >> PAGE_SHIFT;
929
930         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
931         if (!page) {
932                 ret = -ENOMEM;
933                 goto out;
934         }
935
936         if (PageUptodate(page)) {
937                 if (PageDirty(page)) {
938                         /*
939                          * we need to write the data to the defective sector. the
940                          * data that was in that sector is not in memory,
941                          * because the page was modified. we must not write the
942                          * modified page to that sector.
943                          *
944                          * TODO: what could be done here: wait for the delalloc
945                          *       runner to write out that page (might involve
946                          *       COW) and see whether the sector is still
947                          *       referenced afterwards.
948                          *
949                          * For the meantime, we'll treat this error as
950                          * uncorrectable, although there is a chance that a
951                          * later scrub will find the bad sector again and that
952                          * there's no dirty page in memory by then.
953                          */
954                         ret = -EIO;
955                         goto out;
956                 }
957                 ret = repair_io_failure(BTRFS_I(inode), offset, PAGE_SIZE,
958                                         fixup->logical, page,
959                                         offset - page_offset(page),
960                                         fixup->mirror_num);
961                 unlock_page(page);
962                 corrected = !ret;
963         } else {
964                 /*
965                  * we need to get good data first. the general readpage path
966                  * will call repair_io_failure for us, we just have to make
967                  * sure we read the bad mirror.
968                  */
969                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
970                                         EXTENT_DAMAGED);
971                 if (ret) {
972                         /* set_extent_bits should give proper error */
973                         WARN_ON(ret > 0);
974                         if (ret > 0)
975                                 ret = -EFAULT;
976                         goto out;
977                 }
978
979                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
980                                                 btrfs_get_extent,
981                                                 fixup->mirror_num);
982                 wait_on_page_locked(page);
983
984                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
985                                                 end, EXTENT_DAMAGED, 0, NULL);
986                 if (!corrected)
987                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
988                                                 EXTENT_DAMAGED);
989         }
990
991 out:
992         if (page)
993                 put_page(page);
994
995         iput(inode);
996
997         if (ret < 0)
998                 return ret;
999
1000         if (ret == 0 && corrected) {
1001                 /*
1002                  * we only need to call readpage for one of the inodes belonging
1003                  * to this extent. so make iterate_extent_inodes stop
1004                  */
1005                 return 1;
1006         }
1007
1008         return -EIO;
1009 }
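/*
 * Return convention of scrub_fixup_readpage(): 1 means the sector was
 * corrected and the inode iteration can stop, a negative value is a hard
 * error, and -EIO is returned when the read path did not end up repairing
 * the sector (or a dirty, uptodate page prevented the attempt).
 */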
1010
1011 static void scrub_fixup_nodatasum(struct btrfs_work *work)
1012 {
1013         struct btrfs_fs_info *fs_info;
1014         int ret;
1015         struct scrub_fixup_nodatasum *fixup;
1016         struct scrub_ctx *sctx;
1017         struct btrfs_trans_handle *trans = NULL;
1018         struct btrfs_path *path;
1019         int uncorrectable = 0;
1020
1021         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
1022         sctx = fixup->sctx;
1023         fs_info = fixup->root->fs_info;
1024
1025         path = btrfs_alloc_path();
1026         if (!path) {
1027                 spin_lock(&sctx->stat_lock);
1028                 ++sctx->stat.malloc_errors;
1029                 spin_unlock(&sctx->stat_lock);
1030                 uncorrectable = 1;
1031                 goto out;
1032         }
1033
1034         trans = btrfs_join_transaction(fixup->root);
1035         if (IS_ERR(trans)) {
1036                 uncorrectable = 1;
1037                 goto out;
1038         }
1039
1040         /*
1041          * the idea is to trigger a regular read through the standard path. we
1042          * read a page from the (failed) logical address by specifying the
1043          * corresponding copynum of the failed sector. thus, that readpage is
1044          * expected to fail.
1045          * that is the point where on-the-fly error correction will kick in
1046          * (once it's finished) and rewrite the failed sector if a good copy
1047          * can be found.
1048          */
1049         ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
1050                                           scrub_fixup_readpage, fixup);
1051         if (ret < 0) {
1052                 uncorrectable = 1;
1053                 goto out;
1054         }
1055         WARN_ON(ret != 1);
1056
1057         spin_lock(&sctx->stat_lock);
1058         ++sctx->stat.corrected_errors;
1059         spin_unlock(&sctx->stat_lock);
1060
1061 out:
1062         if (trans && !IS_ERR(trans))
1063                 btrfs_end_transaction(trans);
1064         if (uncorrectable) {
1065                 spin_lock(&sctx->stat_lock);
1066                 ++sctx->stat.uncorrectable_errors;
1067                 spin_unlock(&sctx->stat_lock);
1068                 btrfs_dev_replace_stats_inc(
1069                         &fs_info->dev_replace.num_uncorrectable_read_errors);
1070                 btrfs_err_rl_in_rcu(fs_info,
1071                     "unable to fixup (nodatasum) error at logical %llu on dev %s",
1072                         fixup->logical, rcu_str_deref(fixup->dev->name));
1073         }
1074
1075         btrfs_free_path(path);
1076         kfree(fixup);
1077
1078         scrub_pending_trans_workers_dec(sctx);
1079 }
1080
1081 static inline void scrub_get_recover(struct scrub_recover *recover)
1082 {
1083         refcount_inc(&recover->refs);
1084 }
1085
1086 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
1087                                      struct scrub_recover *recover)
1088 {
1089         if (refcount_dec_and_test(&recover->refs)) {
1090                 btrfs_bio_counter_dec(fs_info);
1091                 btrfs_put_bbio(recover->bbio);
1092                 kfree(recover);
1093         }
1094 }
1095
1096 /*
1097  * scrub_handle_errored_block gets called when either verification of the
1098  * pages failed or the bio failed to read, e.g. with EIO. In the latter
1099  * case, this function handles all pages in the bio, even though only one
1100  * may be bad.
1101  * The goal of this function is to repair the errored block by using the
1102  * contents of one of the mirrors.
1103  */
1104 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1105 {
1106         struct scrub_ctx *sctx = sblock_to_check->sctx;
1107         struct btrfs_device *dev;
1108         struct btrfs_fs_info *fs_info;
1109         u64 length;
1110         u64 logical;
1111         unsigned int failed_mirror_index;
1112         unsigned int is_metadata;
1113         unsigned int have_csum;
1114         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
1115         struct scrub_block *sblock_bad;
1116         int ret;
1117         int mirror_index;
1118         int page_num;
1119         int success;
1120         bool full_stripe_locked;
1121         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1122                                       DEFAULT_RATELIMIT_BURST);
1123
1124         BUG_ON(sblock_to_check->page_count < 1);
1125         fs_info = sctx->fs_info;
1126         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1127                 /*
1128                  * if we find an error in a super block, we just report it;
1129                  * super blocks will get written with the next transaction
1130                  * commit anyway
1131                  */
1132                 spin_lock(&sctx->stat_lock);
1133                 ++sctx->stat.super_errors;
1134                 spin_unlock(&sctx->stat_lock);
1135                 return 0;
1136         }
1137         length = sblock_to_check->page_count * PAGE_SIZE;
1138         logical = sblock_to_check->pagev[0]->logical;
1139         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
1140         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
1141         is_metadata = !(sblock_to_check->pagev[0]->flags &
1142                         BTRFS_EXTENT_FLAG_DATA);
1143         have_csum = sblock_to_check->pagev[0]->have_csum;
1144         dev = sblock_to_check->pagev[0]->dev;
1145
1146         /*
1147          * For RAID5/6, a race can happen with the scrub thread of a different
1148          * device. On data corruption, the parity and data scrub threads will
1149          * both try to recover the data.
1150          * Such a race can lead to a doubly counted csum error, or even an
1151          * unrecoverable error.
1152          */
1153         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1154         if (ret < 0) {
1155                 spin_lock(&sctx->stat_lock);
1156                 if (ret == -ENOMEM)
1157                         sctx->stat.malloc_errors++;
1158                 sctx->stat.read_errors++;
1159                 sctx->stat.uncorrectable_errors++;
1160                 spin_unlock(&sctx->stat_lock);
1161                 return ret;
1162         }
1163
1164         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
1165                 sblocks_for_recheck = NULL;
1166                 goto nodatasum_case;
1167         }
1168
1169         /*
1170          * read all mirrors one after the other. This includes re-reading
1171          * the extent or metadata block that failed (that was
1172          * the cause that this fixup code is called) another time,
1173          * page by page this time in order to know which pages
1174          * caused I/O errors and which ones are good (for all mirrors).
1175          * It is the goal to handle the situation when more than one
1176          * mirror contains I/O errors, but the errors do not
1177          * overlap, i.e. the data can be repaired by selecting the
1178          * pages from those mirrors without I/O error on the
1179          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
1180          * would be that mirror #1 has an I/O error on the first page,
1181          * the second page is good, and mirror #2 has an I/O error on
1182          * the second page, but the first page is good.
1183          * Then the first page of the first mirror can be repaired by
1184          * taking the first page of the second mirror, and the
1185          * second page of the second mirror can be repaired by
1186          * copying the contents of the 2nd page of the 1st mirror.
1187          * One more note: if the pages of one mirror contain I/O
1188          * errors, the checksum cannot be verified. In order to get
1189          * the best data for repairing, the first attempt is to find
1190          * a mirror without I/O errors and with a validated checksum.
1191          * Only if this is not possible, the pages are picked from
1192          * mirrors with I/O errors without considering the checksum.
1193          * If the latter is the case, at the end, the checksum of the
1194          * repaired area is verified in order to correctly maintain
1195          * the statistics.
1196          */
1197
1198         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
1199                                       sizeof(*sblocks_for_recheck), GFP_NOFS);
1200         if (!sblocks_for_recheck) {
1201                 spin_lock(&sctx->stat_lock);
1202                 sctx->stat.malloc_errors++;
1203                 sctx->stat.read_errors++;
1204                 sctx->stat.uncorrectable_errors++;
1205                 spin_unlock(&sctx->stat_lock);
1206                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1207                 goto out;
1208         }
1209
1210         /* setup the context, map the logical blocks and alloc the pages */
1211         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
1212         if (ret) {
1213                 spin_lock(&sctx->stat_lock);
1214                 sctx->stat.read_errors++;
1215                 sctx->stat.uncorrectable_errors++;
1216                 spin_unlock(&sctx->stat_lock);
1217                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1218                 goto out;
1219         }
1220         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1221         sblock_bad = sblocks_for_recheck + failed_mirror_index;
1222
1223         /* build and submit the bios for the failed mirror, check checksums */
1224         scrub_recheck_block(fs_info, sblock_bad, 1);
1225
1226         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1227             sblock_bad->no_io_error_seen) {
1228                 /*
1229                  * the error disappeared after reading page by page, or
1230                  * the area was part of a huge bio and other parts of the
1231                  * bio caused I/O errors, or the block layer merged several
1232                  * read requests into one and the error is caused by a
1233                  * different bio (usually one of the two latter cases is
1234                  * the cause)
1235                  */
1236                 spin_lock(&sctx->stat_lock);
1237                 sctx->stat.unverified_errors++;
1238                 sblock_to_check->data_corrected = 1;
1239                 spin_unlock(&sctx->stat_lock);
1240
1241                 if (sctx->is_dev_replace)
1242                         scrub_write_block_to_dev_replace(sblock_bad);
1243                 goto out;
1244         }
1245
1246         if (!sblock_bad->no_io_error_seen) {
1247                 spin_lock(&sctx->stat_lock);
1248                 sctx->stat.read_errors++;
1249                 spin_unlock(&sctx->stat_lock);
1250                 if (__ratelimit(&_rs))
1251                         scrub_print_warning("i/o error", sblock_to_check);
1252                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1253         } else if (sblock_bad->checksum_error) {
1254                 spin_lock(&sctx->stat_lock);
1255                 sctx->stat.csum_errors++;
1256                 spin_unlock(&sctx->stat_lock);
1257                 if (__ratelimit(&_rs))
1258                         scrub_print_warning("checksum error", sblock_to_check);
1259                 btrfs_dev_stat_inc_and_print(dev,
1260                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1261         } else if (sblock_bad->header_error) {
1262                 spin_lock(&sctx->stat_lock);
1263                 sctx->stat.verify_errors++;
1264                 spin_unlock(&sctx->stat_lock);
1265                 if (__ratelimit(&_rs))
1266                         scrub_print_warning("checksum/header error",
1267                                             sblock_to_check);
1268                 if (sblock_bad->generation_error)
1269                         btrfs_dev_stat_inc_and_print(dev,
1270                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1271                 else
1272                         btrfs_dev_stat_inc_and_print(dev,
1273                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1274         }
1275
1276         if (sctx->readonly) {
1277                 ASSERT(!sctx->is_dev_replace);
1278                 goto out;
1279         }
1280
1281         if (!is_metadata && !have_csum) {
1282                 struct scrub_fixup_nodatasum *fixup_nodatasum;
1283
1284                 WARN_ON(sctx->is_dev_replace);
1285
1286 nodatasum_case:
1287
1288                 /*
1289                  * !is_metadata and !have_csum means that the data
1290                  * might not be COWed and might be modified
1291                  * concurrently. The general strategy of working on the
1292                  * commit root does not help in the case when COW is not
1293                  * used.
1294                  */
1295                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1296                 if (!fixup_nodatasum)
1297                         goto did_not_correct_error;
1298                 fixup_nodatasum->sctx = sctx;
1299                 fixup_nodatasum->dev = dev;
1300                 fixup_nodatasum->logical = logical;
1301                 fixup_nodatasum->root = fs_info->extent_root;
1302                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1303                 scrub_pending_trans_workers_inc(sctx);
1304                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1305                                 scrub_fixup_nodatasum, NULL, NULL);
1306                 btrfs_queue_work(fs_info->scrub_workers,
1307                                  &fixup_nodatasum->work);
1308                 goto out;
1309         }
1310
1311         /*
1312          * now build and submit the bios for the other mirrors, check
1313          * checksums.
1314          * First try to pick the mirror which is completely without I/O
1315          * errors and also does not have a checksum error.
1316          * If one is found, and if a checksum is present, the full block
1317          * that is known to contain an error is rewritten. Afterwards
1318          * the block is known to be corrected.
1319          * If a mirror is found which is completely correct, and no
1320          * checksum is present, only those pages are rewritten that had
1321          * an I/O error in the block to be repaired, since it cannot be
1322                  * determined which copy of the other pages is better (and it
1323          * could happen otherwise that a correct page would be
1324          * overwritten by a bad one).
1325          */
1326         for (mirror_index = 0;
1327              mirror_index < BTRFS_MAX_MIRRORS &&
1328              sblocks_for_recheck[mirror_index].page_count > 0;
1329              mirror_index++) {
1330                 struct scrub_block *sblock_other;
1331
1332                 if (mirror_index == failed_mirror_index)
1333                         continue;
1334                 sblock_other = sblocks_for_recheck + mirror_index;
1335
1336                 /* build and submit the bios, check checksums */
1337                 scrub_recheck_block(fs_info, sblock_other, 0);
1338
1339                 if (!sblock_other->header_error &&
1340                     !sblock_other->checksum_error &&
1341                     sblock_other->no_io_error_seen) {
1342                         if (sctx->is_dev_replace) {
1343                                 scrub_write_block_to_dev_replace(sblock_other);
1344                                 goto corrected_error;
1345                         } else {
1346                                 ret = scrub_repair_block_from_good_copy(
1347                                                 sblock_bad, sblock_other);
1348                                 if (!ret)
1349                                         goto corrected_error;
1350                         }
1351                 }
1352         }
1353
1354         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1355                 goto did_not_correct_error;
1356
1357         /*
1358          * In case of I/O errors in the area that is supposed to be
1359          * repaired, continue by picking good copies of those pages.
1360          * Select the good pages from mirrors to rewrite bad pages from
1361          * the area to fix. Afterwards verify the checksum of the block
1362          * that is supposed to be repaired. This verification step is
1363          * only done for the purpose of statistics counting and for the
1364          * final scrub report, which states whether errors remain.
1365          * A perfect algorithm could make use of the checksum and try
1366          * all possible combinations of pages from the different mirrors
1367          * until the checksum verification succeeds. For example, when
1368          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1369          * of mirror #2 is readable but the final checksum test fails,
1370          * then the 2nd page of mirror #3 could be tried, whether now
1371          * the final checksum succeeds. But this would be a rare
1372          * exception and is therefore not implemented. At least it is
1373          * avoided that the good copy is overwritten.
1374          * A more useful improvement would be to pick the sectors
1375          * without I/O error based on sector sizes (512 bytes on legacy
1376          * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1377          * mirror could be repaired by taking 512 byte of a different
1378          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1379          * area are unreadable.
1380          */
1381         success = 1;
1382         for (page_num = 0; page_num < sblock_bad->page_count;
1383              page_num++) {
1384                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1385                 struct scrub_block *sblock_other = NULL;
1386
1387                 /* skip pages without I/O errors unless this is dev replace */
1388                 if (!page_bad->io_error && !sctx->is_dev_replace)
1389                         continue;
1390
1391                 /* try to find a page without I/O errors among the mirrors */
1392                 if (page_bad->io_error) {
1393                         for (mirror_index = 0;
1394                              mirror_index < BTRFS_MAX_MIRRORS &&
1395                              sblocks_for_recheck[mirror_index].page_count > 0;
1396                              mirror_index++) {
1397                                 if (!sblocks_for_recheck[mirror_index].
1398                                     pagev[page_num]->io_error) {
1399                                         sblock_other = sblocks_for_recheck +
1400                                                        mirror_index;
1401                                         break;
1402                                 }
1403                         }
1404                         if (!sblock_other)
1405                                 success = 0;
1406                 }
1407
1408                 if (sctx->is_dev_replace) {
1409                         /*
1410                          * If no mirror could supply the page,
1411                          * scrub_write_page_to_dev_replace() handles the
1412                          * case (page->io_error is set) by filling the
1413                          * block with zeros before submitting the write
1414                          * request.
1415                          */
1416                         if (!sblock_other)
1417                                 sblock_other = sblock_bad;
1418
1419                         if (scrub_write_page_to_dev_replace(sblock_other,
1420                                                             page_num) != 0) {
1421                                 btrfs_dev_replace_stats_inc(
1422                                         &fs_info->dev_replace.num_write_errors);
1423                                 success = 0;
1424                         }
1425                 } else if (sblock_other) {
1426                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1427                                                                sblock_other,
1428                                                                page_num, 0);
1429                         if (!ret)
1430                                 page_bad->io_error = 0;
1431                         else
1432                                 success = 0;
1433                 }
1434         }
1435
1436         if (success && !sctx->is_dev_replace) {
1437                 if (is_metadata || have_csum) {
1438                         /*
1439                          * need to verify the checksum now that all
1440                          * sectors on disk are repaired (the write
1441                          * request for data to be repaired is on its way).
1442                          * Just be lazy and use scrub_recheck_block()
1443                          * which re-reads the data before the checksum
1444                          * is verified, but most likely the data comes out
1445                          * of the page cache.
1446                          */
1447                         scrub_recheck_block(fs_info, sblock_bad, 1);
1448                         if (!sblock_bad->header_error &&
1449                             !sblock_bad->checksum_error &&
1450                             sblock_bad->no_io_error_seen)
1451                                 goto corrected_error;
1452                         else
1453                                 goto did_not_correct_error;
1454                 } else {
1455 corrected_error:
1456                         spin_lock(&sctx->stat_lock);
1457                         sctx->stat.corrected_errors++;
1458                         sblock_to_check->data_corrected = 1;
1459                         spin_unlock(&sctx->stat_lock);
1460                         btrfs_err_rl_in_rcu(fs_info,
1461                                 "fixed up error at logical %llu on dev %s",
1462                                 logical, rcu_str_deref(dev->name));
1463                 }
1464         } else {
1465 did_not_correct_error:
1466                 spin_lock(&sctx->stat_lock);
1467                 sctx->stat.uncorrectable_errors++;
1468                 spin_unlock(&sctx->stat_lock);
1469                 btrfs_err_rl_in_rcu(fs_info,
1470                         "unable to fixup (regular) error at logical %llu on dev %s",
1471                         logical, rcu_str_deref(dev->name));
1472         }
1473
1474 out:
1475         if (sblocks_for_recheck) {
1476                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1477                      mirror_index++) {
1478                         struct scrub_block *sblock = sblocks_for_recheck +
1479                                                      mirror_index;
1480                         struct scrub_recover *recover;
1481                         int page_index;
1482
1483                         for (page_index = 0; page_index < sblock->page_count;
1484                              page_index++) {
1485                                 sblock->pagev[page_index]->sblock = NULL;
1486                                 recover = sblock->pagev[page_index]->recover;
1487                                 if (recover) {
1488                                         scrub_put_recover(fs_info, recover);
1489                                         sblock->pagev[page_index]->recover =
1490                                                                         NULL;
1491                                 }
1492                                 scrub_page_put(sblock->pagev[page_index]);
1493                         }
1494                 }
1495                 kfree(sblocks_for_recheck);
1496         }
1497
1498         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1499         if (ret < 0)
1500                 return ret;
1501         return 0;
1502 }
1503
1504 static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1505 {
1506         if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1507                 return 2;
1508         else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1509                 return 3;
1510         else
1511                 return (int)bbio->num_stripes;
1512 }
1513
1514 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1515                                                  u64 *raid_map,
1516                                                  u64 mapped_length,
1517                                                  int nstripes, int mirror,
1518                                                  int *stripe_index,
1519                                                  u64 *stripe_offset)
1520 {
1521         int i;
1522
1523         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1524                 /* RAID5/6 */
1525                 for (i = 0; i < nstripes; i++) {
1526                         if (raid_map[i] == RAID6_Q_STRIPE ||
1527                             raid_map[i] == RAID5_P_STRIPE)
1528                                 continue;
1529
1530                         if (logical >= raid_map[i] &&
1531                             logical < raid_map[i] + mapped_length)
1532                                 break;
1533                 }
1534
1535                 *stripe_index = i;
1536                 *stripe_offset = logical - raid_map[i];
1537         } else {
1538                 /* The other RAID type */
1539                 *stripe_index = mirror;
1540                 *stripe_offset = 0;
1541         }
1542 }
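
/*
 * A minimal usage sketch of scrub_stripe_index_and_offset(), kept out of the
 * build on purpose and using purely hypothetical values: a RAID5 layout with
 * two data stripes whose raid_map holds the logical start of each stripe
 * followed by the P stripe marker.
 */
#if 0
static void scrub_stripe_index_and_offset_sketch(void)
{
        u64 raid_map[3] = { 0x10000, 0x20000, RAID5_P_STRIPE };
        int stripe_index;
        u64 stripe_offset;

        /* logical 0x21000 falls into the second data stripe */
        scrub_stripe_index_and_offset(0x21000, BTRFS_BLOCK_GROUP_RAID5,
                                      raid_map, 0x10000 /* mapped_length */,
                                      3 /* nstripes */, 0 /* mirror */,
                                      &stripe_index, &stripe_offset);
        /* now stripe_index == 1 and stripe_offset == 0x1000 */
}
#endif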
1543
1544 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1545                                      struct scrub_block *sblocks_for_recheck)
1546 {
1547         struct scrub_ctx *sctx = original_sblock->sctx;
1548         struct btrfs_fs_info *fs_info = sctx->fs_info;
1549         u64 length = original_sblock->page_count * PAGE_SIZE;
1550         u64 logical = original_sblock->pagev[0]->logical;
1551         u64 generation = original_sblock->pagev[0]->generation;
1552         u64 flags = original_sblock->pagev[0]->flags;
1553         u64 have_csum = original_sblock->pagev[0]->have_csum;
1554         struct scrub_recover *recover;
1555         struct btrfs_bio *bbio;
1556         u64 sublen;
1557         u64 mapped_length;
1558         u64 stripe_offset;
1559         int stripe_index;
1560         int page_index = 0;
1561         int mirror_index;
1562         int nmirrors;
1563         int ret;
1564
1565         /*
1566          * note: the two members refs and outstanding_pages
1567          * are not used (and not set) in the blocks that are used for
1568          * the recheck procedure
1569          */
1570
1571         while (length > 0) {
1572                 sublen = min_t(u64, length, PAGE_SIZE);
1573                 mapped_length = sublen;
1574                 bbio = NULL;
1575
1576                 /*
1577                  * with a length of PAGE_SIZE, each returned stripe
1578                  * represents one mirror
1579                  */
1580                 btrfs_bio_counter_inc_blocked(fs_info);
1581                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1582                                 logical, &mapped_length, &bbio);
1583                 if (ret || !bbio || mapped_length < sublen) {
1584                         btrfs_put_bbio(bbio);
1585                         btrfs_bio_counter_dec(fs_info);
1586                         return -EIO;
1587                 }
1588
1589                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1590                 if (!recover) {
1591                         btrfs_put_bbio(bbio);
1592                         btrfs_bio_counter_dec(fs_info);
1593                         return -ENOMEM;
1594                 }
1595
1596                 refcount_set(&recover->refs, 1);
1597                 recover->bbio = bbio;
1598                 recover->map_length = mapped_length;
1599
1600                 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1601
1602                 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1603
1604                 for (mirror_index = 0; mirror_index < nmirrors;
1605                      mirror_index++) {
1606                         struct scrub_block *sblock;
1607                         struct scrub_page *page;
1608
1609                         sblock = sblocks_for_recheck + mirror_index;
1610                         sblock->sctx = sctx;
1611
1612                         page = kzalloc(sizeof(*page), GFP_NOFS);
1613                         if (!page) {
1614 leave_nomem:
1615                                 spin_lock(&sctx->stat_lock);
1616                                 sctx->stat.malloc_errors++;
1617                                 spin_unlock(&sctx->stat_lock);
1618                                 scrub_put_recover(fs_info, recover);
1619                                 return -ENOMEM;
1620                         }
1621                         scrub_page_get(page);
1622                         sblock->pagev[page_index] = page;
1623                         page->sblock = sblock;
1624                         page->flags = flags;
1625                         page->generation = generation;
1626                         page->logical = logical;
1627                         page->have_csum = have_csum;
1628                         if (have_csum)
1629                                 memcpy(page->csum,
1630                                        original_sblock->pagev[0]->csum,
1631                                        sctx->csum_size);
1632
1633                         scrub_stripe_index_and_offset(logical,
1634                                                       bbio->map_type,
1635                                                       bbio->raid_map,
1636                                                       mapped_length,
1637                                                       bbio->num_stripes -
1638                                                       bbio->num_tgtdevs,
1639                                                       mirror_index,
1640                                                       &stripe_index,
1641                                                       &stripe_offset);
1642                         page->physical = bbio->stripes[stripe_index].physical +
1643                                          stripe_offset;
1644                         page->dev = bbio->stripes[stripe_index].dev;
1645
1646                         BUG_ON(page_index >= original_sblock->page_count);
1647                         page->physical_for_dev_replace =
1648                                 original_sblock->pagev[page_index]->
1649                                 physical_for_dev_replace;
1650                         /* for missing devices, dev->bdev is NULL */
1651                         page->mirror_num = mirror_index + 1;
1652                         sblock->page_count++;
1653                         page->page = alloc_page(GFP_NOFS);
1654                         if (!page->page)
1655                                 goto leave_nomem;
1656
1657                         scrub_get_recover(recover);
1658                         page->recover = recover;
1659                 }
1660                 scrub_put_recover(fs_info, recover);
1661                 length -= sublen;
1662                 logical += sublen;
1663                 page_index++;
1664         }
1665
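        /*
         * At this point sblocks_for_recheck[m] is populated for every
         * available mirror m: its pagev[i] describes the i-th PAGE_SIZE
         * chunk of the block as stored on mirror m + 1, including the
         * physical location on that mirror and a reference to the
         * scrub_recover that holds the corresponding btrfs_bio mapping.
         */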
1666         return 0;
1667 }
1668
1669 struct scrub_bio_ret {
1670         struct completion event;
1671         int error;
1672 };
1673
1674 static void scrub_bio_wait_endio(struct bio *bio)
1675 {
1676         struct scrub_bio_ret *ret = bio->bi_private;
1677
1678         ret->error = bio->bi_error;
1679         complete(&ret->event);
1680 }
1681
1682 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1683 {
1684         return page->recover &&
1685                (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
1686 }
1687
1688 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1689                                         struct bio *bio,
1690                                         struct scrub_page *page)
1691 {
1692         struct scrub_bio_ret done;
1693         int ret;
1694
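        /*
         * Drive the RAID5/6 rebuild synchronously: scrub_bio_wait_endio()
         * records any error in done.error and signals the on-stack
         * completion once the recovery attempt for this bio has finished.
         */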
1695         init_completion(&done.event);
1696         done.error = 0;
1697         bio->bi_iter.bi_sector = page->logical >> 9;
1698         bio->bi_private = &done;
1699         bio->bi_end_io = scrub_bio_wait_endio;
1700
1701         ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1702                                     page->recover->map_length,
1703                                     page->mirror_num, 0);
1704         if (ret)
1705                 return ret;
1706
1707         wait_for_completion(&done.event);
1708         if (done.error)
1709                 return -EIO;
1710
1711         return 0;
1712 }
1713
1714 /*
1715  * this function will check the on disk data for checksum errors, header
1716  * errors and read I/O errors. If any I/O errors happen, the exact pages
1717  * which are errored are marked as being bad. The goal is to enable scrub
1718  * to take those pages that are not errored from all the mirrors so that
1719  * the pages that are errored in the just handled mirror can be repaired.
1720  */
1721 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1722                                 struct scrub_block *sblock,
1723                                 int retry_failed_mirror)
1724 {
1725         int page_num;
1726
1727         sblock->no_io_error_seen = 1;
1728
1729         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1730                 struct bio *bio;
1731                 struct scrub_page *page = sblock->pagev[page_num];
1732
1733                 if (page->dev->bdev == NULL) {
1734                         page->io_error = 1;
1735                         sblock->no_io_error_seen = 0;
1736                         continue;
1737                 }
1738
1739                 WARN_ON(!page->page);
1740                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1741                 if (!bio) {
1742                         page->io_error = 1;
1743                         sblock->no_io_error_seen = 0;
1744                         continue;
1745                 }
1746                 bio->bi_bdev = page->dev->bdev;
1747
1748                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1749                 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1750                         if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
1751                                 page->io_error = 1;
1752                                 sblock->no_io_error_seen = 0;
1753                         }
1754                 } else {
1755                         bio->bi_iter.bi_sector = page->physical >> 9;
1756                         bio_set_op_attrs(bio, REQ_OP_READ, 0);
1757
1758                         if (btrfsic_submit_bio_wait(bio)) {
1759                                 page->io_error = 1;
1760                                 sblock->no_io_error_seen = 0;
1761                         }
1762                 }
1763
1764                 bio_put(bio);
1765         }
1766
1767         if (sblock->no_io_error_seen)
1768                 scrub_recheck_block_checksum(sblock);
1769 }
1770
1771 static inline int scrub_check_fsid(u8 fsid[],
1772                                    struct scrub_page *spage)
1773 {
1774         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1775         int ret;
1776
1777         ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1778         return !ret;
1779 }
1780
1781 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1782 {
1783         sblock->header_error = 0;
1784         sblock->checksum_error = 0;
1785         sblock->generation_error = 0;
1786
1787         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1788                 scrub_checksum_data(sblock);
1789         else
1790                 scrub_checksum_tree_block(sblock);
1791 }
1792
1793 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1794                                              struct scrub_block *sblock_good)
1795 {
1796         int page_num;
1797         int ret = 0;
1798
1799         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1800                 int ret_sub;
1801
1802                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1803                                                            sblock_good,
1804                                                            page_num, 1);
1805                 if (ret_sub)
1806                         ret = ret_sub;
1807         }
1808
1809         return ret;
1810 }
1811
1812 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1813                                             struct scrub_block *sblock_good,
1814                                             int page_num, int force_write)
1815 {
1816         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1817         struct scrub_page *page_good = sblock_good->pagev[page_num];
1818         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1819
1820         BUG_ON(page_bad->page == NULL);
1821         BUG_ON(page_good->page == NULL);
1822         if (force_write || sblock_bad->header_error ||
1823             sblock_bad->checksum_error || page_bad->io_error) {
1824                 struct bio *bio;
1825                 int ret;
1826
1827                 if (!page_bad->dev->bdev) {
1828                         btrfs_warn_rl(fs_info,
1829                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1830                         return -EIO;
1831                 }
1832
1833                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1834                 if (!bio)
1835                         return -EIO;
1836                 bio->bi_bdev = page_bad->dev->bdev;
1837                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1838                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1839
1840                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1841                 if (PAGE_SIZE != ret) {
1842                         bio_put(bio);
1843                         return -EIO;
1844                 }
1845
1846                 if (btrfsic_submit_bio_wait(bio)) {
1847                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1848                                 BTRFS_DEV_STAT_WRITE_ERRS);
1849                         btrfs_dev_replace_stats_inc(
1850                                 &fs_info->dev_replace.num_write_errors);
1851                         bio_put(bio);
1852                         return -EIO;
1853                 }
1854                 bio_put(bio);
1855         }
1856
1857         return 0;
1858 }
1859
1860 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1861 {
1862         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1863         int page_num;
1864
1865         /*
1866          * This block is used for checking the parity on the source device,
1867          * so the data need not be written to the destination device.
1868          */
1869         if (sblock->sparity)
1870                 return;
1871
1872         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1873                 int ret;
1874
1875                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1876                 if (ret)
1877                         btrfs_dev_replace_stats_inc(
1878                                 &fs_info->dev_replace.num_write_errors);
1879         }
1880 }
1881
1882 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1883                                            int page_num)
1884 {
1885         struct scrub_page *spage = sblock->pagev[page_num];
1886
1887         BUG_ON(spage->page == NULL);
1888         if (spage->io_error) {
1889                 void *mapped_buffer = kmap_atomic(spage->page);
1890
1891                 clear_page(mapped_buffer);
1892                 flush_dcache_page(spage->page);
1893                 kunmap_atomic(mapped_buffer);
1894         }
1895         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1896 }
1897
1898 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1899                                     struct scrub_page *spage)
1900 {
1901         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1902         struct scrub_bio *sbio;
1903         int ret;
1904
1905         mutex_lock(&wr_ctx->wr_lock);
1906 again:
1907         if (!wr_ctx->wr_curr_bio) {
1908                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1909                                               GFP_KERNEL);
1910                 if (!wr_ctx->wr_curr_bio) {
1911                         mutex_unlock(&wr_ctx->wr_lock);
1912                         return -ENOMEM;
1913                 }
1914                 wr_ctx->wr_curr_bio->sctx = sctx;
1915                 wr_ctx->wr_curr_bio->page_count = 0;
1916         }
1917         sbio = wr_ctx->wr_curr_bio;
1918         if (sbio->page_count == 0) {
1919                 struct bio *bio;
1920
1921                 sbio->physical = spage->physical_for_dev_replace;
1922                 sbio->logical = spage->logical;
1923                 sbio->dev = wr_ctx->tgtdev;
1924                 bio = sbio->bio;
1925                 if (!bio) {
1926                         bio = btrfs_io_bio_alloc(GFP_KERNEL,
1927                                         wr_ctx->pages_per_wr_bio);
1928                         if (!bio) {
1929                                 mutex_unlock(&wr_ctx->wr_lock);
1930                                 return -ENOMEM;
1931                         }
1932                         sbio->bio = bio;
1933                 }
1934
1935                 bio->bi_private = sbio;
1936                 bio->bi_end_io = scrub_wr_bio_end_io;
1937                 bio->bi_bdev = sbio->dev->bdev;
1938                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1939                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1940                 sbio->err = 0;
1941         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1942                    spage->physical_for_dev_replace ||
1943                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1944                    spage->logical) {
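                /*
                 * The current write bio only covers pages that are contiguous
                 * both in their dev-replace physical location and in their
                 * logical address; this page does not continue that run, so
                 * submit what has been collected and start a fresh bio.
                 */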
1945                 scrub_wr_submit(sctx);
1946                 goto again;
1947         }
1948
1949         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1950         if (ret != PAGE_SIZE) {
1951                 if (sbio->page_count < 1) {
1952                         bio_put(sbio->bio);
1953                         sbio->bio = NULL;
1954                         mutex_unlock(&wr_ctx->wr_lock);
1955                         return -EIO;
1956                 }
1957                 scrub_wr_submit(sctx);
1958                 goto again;
1959         }
1960
1961         sbio->pagev[sbio->page_count] = spage;
1962         scrub_page_get(spage);
1963         sbio->page_count++;
1964         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1965                 scrub_wr_submit(sctx);
1966         mutex_unlock(&wr_ctx->wr_lock);
1967
1968         return 0;
1969 }
1970
1971 static void scrub_wr_submit(struct scrub_ctx *sctx)
1972 {
1973         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1974         struct scrub_bio *sbio;
1975
1976         if (!wr_ctx->wr_curr_bio)
1977                 return;
1978
1979         sbio = wr_ctx->wr_curr_bio;
1980         wr_ctx->wr_curr_bio = NULL;
1981         WARN_ON(!sbio->bio->bi_bdev);
1982         scrub_pending_bio_inc(sctx);
1983         /* process all writes in a single worker thread. Then the block layer
1984          * orders the requests before sending them to the driver, which
1985          * doubled the write performance on spinning disks when measured
1986          * with Linux 3.5 */
1987         btrfsic_submit_bio(sbio->bio);
1988 }
1989
1990 static void scrub_wr_bio_end_io(struct bio *bio)
1991 {
1992         struct scrub_bio *sbio = bio->bi_private;
1993         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1994
1995         sbio->err = bio->bi_error;
1996         sbio->bio = bio;
1997
1998         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1999                          scrub_wr_bio_end_io_worker, NULL, NULL);
2000         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
2001 }
2002
2003 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
2004 {
2005         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2006         struct scrub_ctx *sctx = sbio->sctx;
2007         int i;
2008
2009         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
2010         if (sbio->err) {
2011                 struct btrfs_dev_replace *dev_replace =
2012                         &sbio->sctx->fs_info->dev_replace;
2013
2014                 for (i = 0; i < sbio->page_count; i++) {
2015                         struct scrub_page *spage = sbio->pagev[i];
2016
2017                         spage->io_error = 1;
2018                         btrfs_dev_replace_stats_inc(&dev_replace->
2019                                                     num_write_errors);
2020                 }
2021         }
2022
2023         for (i = 0; i < sbio->page_count; i++)
2024                 scrub_page_put(sbio->pagev[i]);
2025
2026         bio_put(sbio->bio);
2027         kfree(sbio);
2028         scrub_pending_bio_dec(sctx);
2029 }
2030
2031 static int scrub_checksum(struct scrub_block *sblock)
2032 {
2033         u64 flags;
2034         int ret;
2035
2036         /*
2037          * No need to initialize these stats currently,
2038          * because this function only uses the return value
2039          * instead of these stats values.
2040          *
2041          * Todo:
2042          * always use stats
2043          */
2044         sblock->header_error = 0;
2045         sblock->generation_error = 0;
2046         sblock->checksum_error = 0;
2047
2048         WARN_ON(sblock->page_count < 1);
2049         flags = sblock->pagev[0]->flags;
2050         ret = 0;
2051         if (flags & BTRFS_EXTENT_FLAG_DATA)
2052                 ret = scrub_checksum_data(sblock);
2053         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
2054                 ret = scrub_checksum_tree_block(sblock);
2055         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
2056                 (void)scrub_checksum_super(sblock);
2057         else
2058                 WARN_ON(1);
2059         if (ret)
2060                 scrub_handle_errored_block(sblock);
2061
2062         return ret;
2063 }
2064
2065 static int scrub_checksum_data(struct scrub_block *sblock)
2066 {
2067         struct scrub_ctx *sctx = sblock->sctx;
2068         u8 csum[BTRFS_CSUM_SIZE];
2069         u8 *on_disk_csum;
2070         struct page *page;
2071         void *buffer;
2072         u32 crc = ~(u32)0;
2073         u64 len;
2074         int index;
2075
2076         BUG_ON(sblock->page_count < 1);
2077         if (!sblock->pagev[0]->have_csum)
2078                 return 0;
2079
2080         on_disk_csum = sblock->pagev[0]->csum;
2081         page = sblock->pagev[0]->page;
2082         buffer = kmap_atomic(page);
2083
2084         len = sctx->sectorsize;
2085         index = 0;
2086         for (;;) {
2087                 u64 l = min_t(u64, len, PAGE_SIZE);
2088
2089                 crc = btrfs_csum_data(buffer, crc, l);
2090                 kunmap_atomic(buffer);
2091                 len -= l;
2092                 if (len == 0)
2093                         break;
2094                 index++;
2095                 BUG_ON(index >= sblock->page_count);
2096                 BUG_ON(!sblock->pagev[index]->page);
2097                 page = sblock->pagev[index]->page;
2098                 buffer = kmap_atomic(page);
2099         }
2100
2101         btrfs_csum_final(crc, csum);
2102         if (memcmp(csum, on_disk_csum, sctx->csum_size))
2103                 sblock->checksum_error = 1;
2104
2105         return sblock->checksum_error;
2106 }
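
/*
 * A minimal sketch, not taken from the scrub code itself, of the chunked
 * checksum pattern used above: the crc32c state is seeded with ~0, carried
 * across successive buffers via btrfs_csum_data() and written out with
 * btrfs_csum_final(). The two-buffer helper below is hypothetical and kept
 * out of the build; it only illustrates how a block spanning two mapped
 * pages would be summed.
 */
#if 0
static void scrub_csum_two_buffers_sketch(const char *buf_a, size_t len_a,
                                          const char *buf_b, size_t len_b,
                                          u8 *csum_out)
{
        u32 crc = ~(u32)0;

        crc = btrfs_csum_data(buf_a, crc, len_a);  /* first chunk */
        crc = btrfs_csum_data(buf_b, crc, len_b);  /* continue with the running crc */
        btrfs_csum_final(crc, csum_out);           /* finalize into the csum buffer */
}
#endif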
2107
2108 static int scrub_checksum_tree_block(struct scrub_block *sblock)
2109 {
2110         struct scrub_ctx *sctx = sblock->sctx;
2111         struct btrfs_header *h;
2112         struct btrfs_fs_info *fs_info = sctx->fs_info;
2113         u8 calculated_csum[BTRFS_CSUM_SIZE];
2114         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2115         struct page *page;
2116         void *mapped_buffer;
2117         u64 mapped_size;
2118         void *p;
2119         u32 crc = ~(u32)0;
2120         u64 len;
2121         int index;
2122
2123         BUG_ON(sblock->page_count < 1);
2124         page = sblock->pagev[0]->page;
2125         mapped_buffer = kmap_atomic(page);
2126         h = (struct btrfs_header *)mapped_buffer;
2127         memcpy(on_disk_csum, h->csum, sctx->csum_size);
2128
2129         /*
2130          * we don't use the getter functions here, as we
2131          * a) don't have an extent buffer and
2132          * b) the page is already kmapped
2133          */
2134         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
2135                 sblock->header_error = 1;
2136
2137         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
2138                 sblock->header_error = 1;
2139                 sblock->generation_error = 1;
2140         }
2141
2142         if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
2143                 sblock->header_error = 1;
2144
2145         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2146                    BTRFS_UUID_SIZE))
2147                 sblock->header_error = 1;
2148
2149         len = sctx->nodesize - BTRFS_CSUM_SIZE;
2150         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2151         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2152         index = 0;
2153         for (;;) {
2154                 u64 l = min_t(u64, len, mapped_size);
2155
2156                 crc = btrfs_csum_data(p, crc, l);
2157                 kunmap_atomic(mapped_buffer);
2158                 len -= l;
2159                 if (len == 0)
2160                         break;
2161                 index++;
2162                 BUG_ON(index >= sblock->page_count);
2163                 BUG_ON(!sblock->pagev[index]->page);
2164                 page = sblock->pagev[index]->page;
2165                 mapped_buffer = kmap_atomic(page);
2166                 mapped_size = PAGE_SIZE;
2167                 p = mapped_buffer;
2168         }
2169
2170         btrfs_csum_final(crc, calculated_csum);
2171         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2172                 sblock->checksum_error = 1;
2173
2174         return sblock->header_error || sblock->checksum_error;
2175 }
2176
2177 static int scrub_checksum_super(struct scrub_block *sblock)
2178 {
2179         struct btrfs_super_block *s;
2180         struct scrub_ctx *sctx = sblock->sctx;
2181         u8 calculated_csum[BTRFS_CSUM_SIZE];
2182         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2183         struct page *page;
2184         void *mapped_buffer;
2185         u64 mapped_size;
2186         void *p;
2187         u32 crc = ~(u32)0;
2188         int fail_gen = 0;
2189         int fail_cor = 0;
2190         u64 len;
2191         int index;
2192
2193         BUG_ON(sblock->page_count < 1);
2194         page = sblock->pagev[0]->page;
2195         mapped_buffer = kmap_atomic(page);
2196         s = (struct btrfs_super_block *)mapped_buffer;
2197         memcpy(on_disk_csum, s->csum, sctx->csum_size);
2198
2199         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
2200                 ++fail_cor;
2201
2202         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
2203                 ++fail_gen;
2204
2205         if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2206                 ++fail_cor;
2207
2208         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2209         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2210         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2211         index = 0;
2212         for (;;) {
2213                 u64 l = min_t(u64, len, mapped_size);
2214
2215                 crc = btrfs_csum_data(p, crc, l);
2216                 kunmap_atomic(mapped_buffer);
2217                 len -= l;
2218                 if (len == 0)
2219                         break;
2220                 index++;
2221                 BUG_ON(index >= sblock->page_count);
2222                 BUG_ON(!sblock->pagev[index]->page);
2223                 page = sblock->pagev[index]->page;
2224                 mapped_buffer = kmap_atomic(page);
2225                 mapped_size = PAGE_SIZE;
2226                 p = mapped_buffer;
2227         }
2228
2229         btrfs_csum_final(crc, calculated_csum);
2230         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2231                 ++fail_cor;
2232
2233         if (fail_cor + fail_gen) {
2234                 /*
2235                  * if we find an error in a super block, we just report it.
2236                  * Super blocks get rewritten with the next transaction
2237                  * commit anyway.
2238                  */
2239                 spin_lock(&sctx->stat_lock);
2240                 ++sctx->stat.super_errors;
2241                 spin_unlock(&sctx->stat_lock);
2242                 if (fail_cor)
2243                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2244                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2245                 else
2246                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2247                                 BTRFS_DEV_STAT_GENERATION_ERRS);
2248         }
2249
2250         return fail_cor + fail_gen;
2251 }
2252
2253 static void scrub_block_get(struct scrub_block *sblock)
2254 {
2255         refcount_inc(&sblock->refs);
2256 }
2257
2258 static void scrub_block_put(struct scrub_block *sblock)
2259 {
2260         if (refcount_dec_and_test(&sblock->refs)) {
2261                 int i;
2262
2263                 if (sblock->sparity)
2264                         scrub_parity_put(sblock->sparity);
2265
2266                 for (i = 0; i < sblock->page_count; i++)
2267                         scrub_page_put(sblock->pagev[i]);
2268                 kfree(sblock);
2269         }
2270 }
2271
2272 static void scrub_page_get(struct scrub_page *spage)
2273 {
2274         atomic_inc(&spage->refs);
2275 }
2276
2277 static void scrub_page_put(struct scrub_page *spage)
2278 {
2279         if (atomic_dec_and_test(&spage->refs)) {
2280                 if (spage->page)
2281                         __free_page(spage->page);
2282                 kfree(spage);
2283         }
2284 }
2285
2286 static void scrub_submit(struct scrub_ctx *sctx)
2287 {
2288         struct scrub_bio *sbio;
2289
2290         if (sctx->curr == -1)
2291                 return;
2292
2293         sbio = sctx->bios[sctx->curr];
2294         sctx->curr = -1;
2295         scrub_pending_bio_inc(sctx);
2296         btrfsic_submit_bio(sbio->bio);
2297 }
2298
2299 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2300                                     struct scrub_page *spage)
2301 {
2302         struct scrub_block *sblock = spage->sblock;
2303         struct scrub_bio *sbio;
2304         int ret;
2305
2306 again:
2307         /*
2308          * grab a fresh bio or wait for one to become available
2309          */
2310         while (sctx->curr == -1) {
2311                 spin_lock(&sctx->list_lock);
2312                 sctx->curr = sctx->first_free;
2313                 if (sctx->curr != -1) {
2314                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2315                         sctx->bios[sctx->curr]->next_free = -1;
2316                         sctx->bios[sctx->curr]->page_count = 0;
2317                         spin_unlock(&sctx->list_lock);
2318                 } else {
2319                         spin_unlock(&sctx->list_lock);
2320                         wait_event(sctx->list_wait, sctx->first_free != -1);
2321                 }
2322         }
2323         sbio = sctx->bios[sctx->curr];
2324         if (sbio->page_count == 0) {
2325                 struct bio *bio;
2326
2327                 sbio->physical = spage->physical;
2328                 sbio->logical = spage->logical;
2329                 sbio->dev = spage->dev;
2330                 bio = sbio->bio;
2331                 if (!bio) {
2332                         bio = btrfs_io_bio_alloc(GFP_KERNEL,
2333                                         sctx->pages_per_rd_bio);
2334                         if (!bio)
2335                                 return -ENOMEM;
2336                         sbio->bio = bio;
2337                 }
2338
2339                 bio->bi_private = sbio;
2340                 bio->bi_end_io = scrub_bio_end_io;
2341                 bio->bi_bdev = sbio->dev->bdev;
2342                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2343                 bio_set_op_attrs(bio, REQ_OP_READ, 0);
2344                 sbio->err = 0;
2345         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2346                    spage->physical ||
2347                    sbio->logical + sbio->page_count * PAGE_SIZE !=
2348                    spage->logical ||
2349                    sbio->dev != spage->dev) {
2350                 scrub_submit(sctx);
2351                 goto again;
2352         }
2353
2354         sbio->pagev[sbio->page_count] = spage;
2355         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2356         if (ret != PAGE_SIZE) {
2357                 if (sbio->page_count < 1) {
2358                         bio_put(sbio->bio);
2359                         sbio->bio = NULL;
2360                         return -EIO;
2361                 }
2362                 scrub_submit(sctx);
2363                 goto again;
2364         }
2365
2366         scrub_block_get(sblock); /* one for the page added to the bio */
2367         atomic_inc(&sblock->outstanding_pages);
2368         sbio->page_count++;
2369         if (sbio->page_count == sctx->pages_per_rd_bio)
2370                 scrub_submit(sctx);
2371
2372         return 0;
2373 }
2374
2375 static void scrub_missing_raid56_end_io(struct bio *bio)
2376 {
2377         struct scrub_block *sblock = bio->bi_private;
2378         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2379
2380         if (bio->bi_error)
2381                 sblock->no_io_error_seen = 0;
2382
2383         bio_put(bio);
2384
2385         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2386 }
2387
2388 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2389 {
2390         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2391         struct scrub_ctx *sctx = sblock->sctx;
2392         struct btrfs_fs_info *fs_info = sctx->fs_info;
2393         u64 logical;
2394         struct btrfs_device *dev;
2395
2396         logical = sblock->pagev[0]->logical;
2397         dev = sblock->pagev[0]->dev;
2398
2399         if (sblock->no_io_error_seen)
2400                 scrub_recheck_block_checksum(sblock);
2401
2402         if (!sblock->no_io_error_seen) {
2403                 spin_lock(&sctx->stat_lock);
2404                 sctx->stat.read_errors++;
2405                 spin_unlock(&sctx->stat_lock);
2406                 btrfs_err_rl_in_rcu(fs_info,
2407                         "IO error rebuilding logical %llu for dev %s",
2408                         logical, rcu_str_deref(dev->name));
2409         } else if (sblock->header_error || sblock->checksum_error) {
2410                 spin_lock(&sctx->stat_lock);
2411                 sctx->stat.uncorrectable_errors++;
2412                 spin_unlock(&sctx->stat_lock);
2413                 btrfs_err_rl_in_rcu(fs_info,
2414                         "failed to rebuild valid logical %llu for dev %s",
2415                         logical, rcu_str_deref(dev->name));
2416         } else {
2417                 scrub_write_block_to_dev_replace(sblock);
2418         }
2419
2420         scrub_block_put(sblock);
2421
2422         if (sctx->is_dev_replace &&
2423             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2424                 mutex_lock(&sctx->wr_ctx.wr_lock);
2425                 scrub_wr_submit(sctx);
2426                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2427         }
2428
2429         scrub_pending_bio_dec(sctx);
2430 }
2431
2432 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2433 {
2434         struct scrub_ctx *sctx = sblock->sctx;
2435         struct btrfs_fs_info *fs_info = sctx->fs_info;
2436         u64 length = sblock->page_count * PAGE_SIZE;
2437         u64 logical = sblock->pagev[0]->logical;
2438         struct btrfs_bio *bbio = NULL;
2439         struct bio *bio;
2440         struct btrfs_raid_bio *rbio;
2441         int ret;
2442         int i;
2443
2444         btrfs_bio_counter_inc_blocked(fs_info);
2445         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2446                         &length, &bbio);
2447         if (ret || !bbio || !bbio->raid_map)
2448                 goto bbio_out;
2449
2450         if (WARN_ON(!sctx->is_dev_replace ||
2451                     !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2452                 /*
2453                  * We shouldn't be scrubbing a missing device. Even for dev
2454                  * replace, we should only get here for RAID 5/6. We either
2455                  * managed to mount something with no mirrors remaining or
2456                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2457                  */
2458                 goto bbio_out;
2459         }
2460
2461         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2462         if (!bio)
2463                 goto bbio_out;
2464
2465         bio->bi_iter.bi_sector = logical >> 9;
2466         bio->bi_private = sblock;
2467         bio->bi_end_io = scrub_missing_raid56_end_io;
2468
2469         rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2470         if (!rbio)
2471                 goto rbio_out;
2472
2473         for (i = 0; i < sblock->page_count; i++) {
2474                 struct scrub_page *spage = sblock->pagev[i];
2475
2476                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2477         }
2478
2479         btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2480                         scrub_missing_raid56_worker, NULL, NULL);
2481         scrub_block_get(sblock);
2482         scrub_pending_bio_inc(sctx);
2483         raid56_submit_missing_rbio(rbio);
2484         return;
2485
2486 rbio_out:
2487         bio_put(bio);
2488 bbio_out:
2489         btrfs_bio_counter_dec(fs_info);
2490         btrfs_put_bbio(bbio);
2491         spin_lock(&sctx->stat_lock);
2492         sctx->stat.malloc_errors++;
2493         spin_unlock(&sctx->stat_lock);
2494 }
2495
2496 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2497                        u64 physical, struct btrfs_device *dev, u64 flags,
2498                        u64 gen, int mirror_num, u8 *csum, int force,
2499                        u64 physical_for_dev_replace)
2500 {
2501         struct scrub_block *sblock;
2502         int index;
2503
2504         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2505         if (!sblock) {
2506                 spin_lock(&sctx->stat_lock);
2507                 sctx->stat.malloc_errors++;
2508                 spin_unlock(&sctx->stat_lock);
2509                 return -ENOMEM;
2510         }
2511
2512         /* one ref inside this function, plus one for each page added to
2513          * a bio later on */
2514         refcount_set(&sblock->refs, 1);
2515         sblock->sctx = sctx;
2516         sblock->no_io_error_seen = 1;
2517
2518         for (index = 0; len > 0; index++) {
2519                 struct scrub_page *spage;
2520                 u64 l = min_t(u64, len, PAGE_SIZE);
2521
2522                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2523                 if (!spage) {
2524 leave_nomem:
2525                         spin_lock(&sctx->stat_lock);
2526                         sctx->stat.malloc_errors++;
2527                         spin_unlock(&sctx->stat_lock);
2528                         scrub_block_put(sblock);
2529                         return -ENOMEM;
2530                 }
2531                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2532                 scrub_page_get(spage);
2533                 sblock->pagev[index] = spage;
2534                 spage->sblock = sblock;
2535                 spage->dev = dev;
2536                 spage->flags = flags;
2537                 spage->generation = gen;
2538                 spage->logical = logical;
2539                 spage->physical = physical;
2540                 spage->physical_for_dev_replace = physical_for_dev_replace;
2541                 spage->mirror_num = mirror_num;
2542                 if (csum) {
2543                         spage->have_csum = 1;
2544                         memcpy(spage->csum, csum, sctx->csum_size);
2545                 } else {
2546                         spage->have_csum = 0;
2547                 }
2548                 sblock->page_count++;
2549                 spage->page = alloc_page(GFP_KERNEL);
2550                 if (!spage->page)
2551                         goto leave_nomem;
2552                 len -= l;
2553                 logical += l;
2554                 physical += l;
2555                 physical_for_dev_replace += l;
2556         }
2557
2558         WARN_ON(sblock->page_count == 0);
2559         if (dev->missing) {
2560                 /*
2561                  * This case should only be hit for RAID 5/6 device replace. See
2562                  * the comment in scrub_missing_raid56_pages() for details.
2563                  */
2564                 scrub_missing_raid56_pages(sblock);
2565         } else {
2566                 for (index = 0; index < sblock->page_count; index++) {
2567                         struct scrub_page *spage = sblock->pagev[index];
2568                         int ret;
2569
2570                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2571                         if (ret) {
2572                                 scrub_block_put(sblock);
2573                                 return ret;
2574                         }
2575                 }
2576
2577                 if (force)
2578                         scrub_submit(sctx);
2579         }
2580
2581         /* last one frees, either here or in bio completion for last page */
2582         scrub_block_put(sblock);
2583         return 0;
2584 }
2585
2586 static void scrub_bio_end_io(struct bio *bio)
2587 {
2588         struct scrub_bio *sbio = bio->bi_private;
2589         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2590
2591         sbio->err = bio->bi_error;
2592         sbio->bio = bio;
2593
2594         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2595 }
2596
2597 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2598 {
2599         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2600         struct scrub_ctx *sctx = sbio->sctx;
2601         int i;
2602
2603         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2604         if (sbio->err) {
2605                 for (i = 0; i < sbio->page_count; i++) {
2606                         struct scrub_page *spage = sbio->pagev[i];
2607
2608                         spage->io_error = 1;
2609                         spage->sblock->no_io_error_seen = 0;
2610                 }
2611         }
2612
2613         /* now complete the scrub_block items that have all pages completed */
2614         for (i = 0; i < sbio->page_count; i++) {
2615                 struct scrub_page *spage = sbio->pagev[i];
2616                 struct scrub_block *sblock = spage->sblock;
2617
2618                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2619                         scrub_block_complete(sblock);
2620                 scrub_block_put(sblock);
2621         }
2622
2623         bio_put(sbio->bio);
2624         sbio->bio = NULL;
2625         spin_lock(&sctx->list_lock);
2626         sbio->next_free = sctx->first_free;
2627         sctx->first_free = sbio->index;
2628         spin_unlock(&sctx->list_lock);
2629
2630         if (sctx->is_dev_replace &&
2631             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2632                 mutex_lock(&sctx->wr_ctx.wr_lock);
2633                 scrub_wr_submit(sctx);
2634                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2635         }
2636
2637         scrub_pending_bio_dec(sctx);
2638 }
2639
2640 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2641                                        unsigned long *bitmap,
2642                                        u64 start, u64 len)
2643 {
2644         u64 offset;
2645         int nsectors;
2646         int sectorsize = sparity->sctx->fs_info->sectorsize;
2647
2648         if (len >= sparity->stripe_len) {
2649                 bitmap_set(bitmap, 0, sparity->nsectors);
2650                 return;
2651         }
2652
2653         start -= sparity->logic_start;
2654         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2655         offset = div_u64(offset, sectorsize);
2656         nsectors = (int)len / sectorsize;
2657
2658         if (offset + nsectors <= sparity->nsectors) {
2659                 bitmap_set(bitmap, offset, nsectors);
2660                 return;
2661         }
2662
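        /*
         * The range wraps past the end of the stripe. For example, with a
         * hypothetical 64K stripe_len, 4K sectors (sparity->nsectors == 16)
         * and logic_start == 0, marking start == 60K and len == 8K yields
         * offset == 15 and a two-sector run: the first bitmap_set() below
         * covers bit 15 and the second wraps the remaining sector around to
         * bit 0.
         */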
2663         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2664         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2665 }
2666
2667 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2668                                                    u64 start, u64 len)
2669 {
2670         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2671 }
2672
2673 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2674                                                   u64 start, u64 len)
2675 {
2676         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2677 }
2678
2679 static void scrub_block_complete(struct scrub_block *sblock)
2680 {
2681         int corrupted = 0;
2682
2683         if (!sblock->no_io_error_seen) {
2684                 corrupted = 1;
2685                 scrub_handle_errored_block(sblock);
2686         } else {
2687                 /*
2688                  * In the dev replace case, a block with a checksum error
2689                  * is repaired and written to the target via the repair
2690                  * mechanism; otherwise it is written here.
2691                  */
2692                 corrupted = scrub_checksum(sblock);
2693                 if (!corrupted && sblock->sctx->is_dev_replace)
2694                         scrub_write_block_to_dev_replace(sblock);
2695         }
2696
2697         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2698                 u64 start = sblock->pagev[0]->logical;
2699                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2700                           PAGE_SIZE;
2701
2702                 scrub_parity_mark_sectors_error(sblock->sparity,
2703                                                 start, end - start);
2704         }
2705 }
2706
2707 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2708 {
2709         struct btrfs_ordered_sum *sum = NULL;
2710         unsigned long index;
2711         unsigned long num_sectors;
2712
2713         while (!list_empty(&sctx->csum_list)) {
2714                 sum = list_first_entry(&sctx->csum_list,
2715                                        struct btrfs_ordered_sum, list);
2716                 if (sum->bytenr > logical)
2717                         return 0;
2718                 if (sum->bytenr + sum->len > logical)
2719                         break;
2720
2721                 ++sctx->stat.csum_discards;
2722                 list_del(&sum->list);
2723                 kfree(sum);
2724                 sum = NULL;
2725         }
2726         if (!sum)
2727                 return 0;
2728
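        /*
         * The remaining sum entry covers 'logical'. For example, with a 4K
         * sectorsize and logical == sum->bytenr + 12K, index is 3 and the
         * fourth checksum of the ordered sum is copied out; once the last
         * sector of the entry has been consumed, the whole entry is freed.
         */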
2729         index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2730         num_sectors = sum->len / sctx->sectorsize;
2731         memcpy(csum, sum->sums + index, sctx->csum_size);
2732         if (index == num_sectors - 1) {
2733                 list_del(&sum->list);
2734                 kfree(sum);
2735         }
2736         return 1;
2737 }
2738
2739 /* scrub extent tries to collect up to 64 kB for each bio */
2740 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2741                         u64 physical, struct btrfs_device *dev, u64 flags,
2742                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2743 {
2744         int ret;
2745         u8 csum[BTRFS_CSUM_SIZE];
2746         u32 blocksize;
2747
2748         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2749                 blocksize = sctx->sectorsize;
2750                 spin_lock(&sctx->stat_lock);
2751                 sctx->stat.data_extents_scrubbed++;
2752                 sctx->stat.data_bytes_scrubbed += len;
2753                 spin_unlock(&sctx->stat_lock);
2754         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2755                 blocksize = sctx->nodesize;
2756                 spin_lock(&sctx->stat_lock);
2757                 sctx->stat.tree_extents_scrubbed++;
2758                 sctx->stat.tree_bytes_scrubbed += len;
2759                 spin_unlock(&sctx->stat_lock);
2760         } else {
2761                 blocksize = sctx->sectorsize;
2762                 WARN_ON(1);
2763         }
2764
2765         while (len) {
2766                 u64 l = min_t(u64, len, blocksize);
2767                 int have_csum = 0;
2768
2769                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2770                         /* push csums to sbio */
2771                         have_csum = scrub_find_csum(sctx, logical, csum);
2772                         if (have_csum == 0)
2773                                 ++sctx->stat.no_csum;
2774                         if (sctx->is_dev_replace && !have_csum) {
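                                     /*
                                      * Data without a csum is nodatasum/nocow
                                      * data; in the dev-replace case it is
                                      * copied to the target device through the
                                      * nocow path instead of being scrubbed
                                      * here.
                                      */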
2775                                 ret = copy_nocow_pages(sctx, logical, l,
2776                                                        mirror_num,
2777                                                       physical_for_dev_replace);
2778                                 goto behind_scrub_pages;
2779                         }
2780                 }
2781                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2782                                   mirror_num, have_csum ? csum : NULL, 0,
2783                                   physical_for_dev_replace);
2784 behind_scrub_pages:
2785                 if (ret)
2786                         return ret;
2787                 len -= l;
2788                 logical += l;
2789                 physical += l;
2790                 physical_for_dev_replace += l;
2791         }
2792         return 0;
2793 }
2794
2795 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2796                                   u64 logical, u64 len,
2797                                   u64 physical, struct btrfs_device *dev,
2798                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2799 {
2800         struct scrub_ctx *sctx = sparity->sctx;
2801         struct scrub_block *sblock;
2802         int index;
2803
2804         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2805         if (!sblock) {
2806                 spin_lock(&sctx->stat_lock);
2807                 sctx->stat.malloc_errors++;
2808                 spin_unlock(&sctx->stat_lock);
2809                 return -ENOMEM;
2810         }
2811
2812         /* one ref inside this function, plus one for each page added to
2813          * a bio later on */
2814         refcount_set(&sblock->refs, 1);
2815         sblock->sctx = sctx;
2816         sblock->no_io_error_seen = 1;
2817         sblock->sparity = sparity;
2818         scrub_parity_get(sparity);
2819
2820         for (index = 0; len > 0; index++) {
2821                 struct scrub_page *spage;
2822                 u64 l = min_t(u64, len, PAGE_SIZE);
2823
2824                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2825                 if (!spage) {
2826 leave_nomem:
2827                         spin_lock(&sctx->stat_lock);
2828                         sctx->stat.malloc_errors++;
2829                         spin_unlock(&sctx->stat_lock);
2830                         scrub_block_put(sblock);
2831                         return -ENOMEM;
2832                 }
2833                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2834                 /* For scrub block */
2835                 scrub_page_get(spage);
2836                 sblock->pagev[index] = spage;
2837                 /* For scrub parity */
2838                 scrub_page_get(spage);
2839                 list_add_tail(&spage->list, &sparity->spages);
2840                 spage->sblock = sblock;
2841                 spage->dev = dev;
2842                 spage->flags = flags;
2843                 spage->generation = gen;
2844                 spage->logical = logical;
2845                 spage->physical = physical;
2846                 spage->mirror_num = mirror_num;
2847                 if (csum) {
2848                         spage->have_csum = 1;
2849                         memcpy(spage->csum, csum, sctx->csum_size);
2850                 } else {
2851                         spage->have_csum = 0;
2852                 }
2853                 sblock->page_count++;
2854                 spage->page = alloc_page(GFP_KERNEL);
2855                 if (!spage->page)
2856                         goto leave_nomem;
2857                 len -= l;
2858                 logical += l;
2859                 physical += l;
2860         }
2861
2862         WARN_ON(sblock->page_count == 0);
2863         for (index = 0; index < sblock->page_count; index++) {
2864                 struct scrub_page *spage = sblock->pagev[index];
2865                 int ret;
2866
2867                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2868                 if (ret) {
2869                         scrub_block_put(sblock);
2870                         return ret;
2871                 }
2872         }
2873
2874         /* last one frees, either here or in bio completion for last page */
2875         scrub_block_put(sblock);
2876         return 0;
2877 }
2878
2879 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2880                                    u64 logical, u64 len,
2881                                    u64 physical, struct btrfs_device *dev,
2882                                    u64 flags, u64 gen, int mirror_num)
2883 {
2884         struct scrub_ctx *sctx = sparity->sctx;
2885         int ret;
2886         u8 csum[BTRFS_CSUM_SIZE];
2887         u32 blocksize;
2888
2889         if (dev->missing) {
2890                 scrub_parity_mark_sectors_error(sparity, logical, len);
2891                 return 0;
2892         }
2893
2894         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2895                 blocksize = sctx->sectorsize;
2896         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2897                 blocksize = sctx->nodesize;
2898         } else {
2899                 blocksize = sctx->sectorsize;
2900                 WARN_ON(1);
2901         }
2902
2903         while (len) {
2904                 u64 l = min_t(u64, len, blocksize);
2905                 int have_csum = 0;
2906
2907                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2908                         /* push csums to sbio */
2909                         have_csum = scrub_find_csum(sctx, logical, csum);
2910                         if (have_csum == 0)
2911                                 goto skip;
2912                 }
2913                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2914                                              flags, gen, mirror_num,
2915                                              have_csum ? csum : NULL);
2916                 if (ret)
2917                         return ret;
2918 skip:
2919                 len -= l;
2920                 logical += l;
2921                 physical += l;
2922         }
2923         return 0;
2924 }
2925
2926 /*
2927  * Given a physical address, this will calculate its
2928  * logical offset. If this is a parity stripe, it will return
2929  * the leftmost data stripe's logical offset.
2930  *
2931  * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
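      *
      * An illustrative example (layout assumed purely for the math, not
      * taken from this file): RAID5 over 3 devices (2 data stripes),
      * stripe_len = 64K, num = 0.  For a physical offset of 64K inside the
      * device extent, last_offset = 64K * 2 = 128K, i.e. full stripe 1.
      * That full stripe's data lands on devices 1 and 2 (rot = 1), so
      * device 0 holds its parity stripe: the function returns 1 and sets
      * *offset to 128K (relative to the chunk start), the logical start of
      * the leftmost data stripe of that full stripe.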
2932  */
2933 static int get_raid56_logic_offset(u64 physical, int num,
2934                                    struct map_lookup *map, u64 *offset,
2935                                    u64 *stripe_start)
2936 {
2937         int i;
2938         int j = 0;
2939         u64 stripe_nr;
2940         u64 last_offset;
2941         u32 stripe_index;
2942         u32 rot;
2943
2944         last_offset = (physical - map->stripes[num].physical) *
2945                       nr_data_stripes(map);
2946         if (stripe_start)
2947                 *stripe_start = last_offset;
2948
2949         *offset = last_offset;
2950         for (i = 0; i < nr_data_stripes(map); i++) {
2951                 *offset = last_offset + i * map->stripe_len;
2952
2953                 stripe_nr = div64_u64(*offset, map->stripe_len);
2954                 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2955
2956                 /* Work out the disk rotation on this stripe-set */
2957                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2958                 /* calculate which stripe this data is located on */
2959                 rot += i;
2960                 stripe_index = rot % map->num_stripes;
2961                 if (stripe_index == num)
2962                         return 0;
2963                 if (stripe_index < num)
2964                         j++;
2965         }
2966         *offset = last_offset + j * map->stripe_len;
2967         return 1;
2968 }
2969
2970 static void scrub_free_parity(struct scrub_parity *sparity)
2971 {
2972         struct scrub_ctx *sctx = sparity->sctx;
2973         struct scrub_page *curr, *next;
2974         int nbits;
2975
2976         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2977         if (nbits) {
2978                 spin_lock(&sctx->stat_lock);
2979                 sctx->stat.read_errors += nbits;
2980                 sctx->stat.uncorrectable_errors += nbits;
2981                 spin_unlock(&sctx->stat_lock);
2982         }
2983
2984         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2985                 list_del_init(&curr->list);
2986                 scrub_page_put(curr);
2987         }
2988
2989         kfree(sparity);
2990 }
2991
2992 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2993 {
2994         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2995                                                     work);
2996         struct scrub_ctx *sctx = sparity->sctx;
2997
2998         scrub_free_parity(sparity);
2999         scrub_pending_bio_dec(sctx);
3000 }
3001
3002 static void scrub_parity_bio_endio(struct bio *bio)
3003 {
3004         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
3005         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
3006
3007         if (bio->bi_error)
3008                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3009                           sparity->nsectors);
3010
3011         bio_put(bio);
3012
3013         btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
3014                         scrub_parity_bio_endio_worker, NULL, NULL);
3015         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
3016 }
3017
3018 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3019 {
3020         struct scrub_ctx *sctx = sparity->sctx;
3021         struct btrfs_fs_info *fs_info = sctx->fs_info;
3022         struct bio *bio;
3023         struct btrfs_raid_bio *rbio;
3024         struct btrfs_bio *bbio = NULL;
3025         u64 length;
3026         int ret;
3027
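             /*
              * Drop the sectors that already hit errors from the set of data
              * sectors to verify; if nothing is left, there is nothing to
              * check against the parity.
              */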
3028         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
3029                            sparity->nsectors))
3030                 goto out;
3031
3032         length = sparity->logic_end - sparity->logic_start;
3033
3034         btrfs_bio_counter_inc_blocked(fs_info);
3035         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
3036                                &length, &bbio);
3037         if (ret || !bbio || !bbio->raid_map)
3038                 goto bbio_out;
3039
3040         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
3041         if (!bio)
3042                 goto bbio_out;
3043
3044         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3045         bio->bi_private = sparity;
3046         bio->bi_end_io = scrub_parity_bio_endio;
3047
3048         rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
3049                                               length, sparity->scrub_dev,
3050                                               sparity->dbitmap,
3051                                               sparity->nsectors);
3052         if (!rbio)
3053                 goto rbio_out;
3054
3055         scrub_pending_bio_inc(sctx);
3056         raid56_parity_submit_scrub_rbio(rbio);
3057         return;
3058
3059 rbio_out:
3060         bio_put(bio);
3061 bbio_out:
3062         btrfs_bio_counter_dec(fs_info);
3063         btrfs_put_bbio(bbio);
3064         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3065                   sparity->nsectors);
3066         spin_lock(&sctx->stat_lock);
3067         sctx->stat.malloc_errors++;
3068         spin_unlock(&sctx->stat_lock);
3069 out:
3070         scrub_free_parity(sparity);
3071 }
3072
3073 static inline int scrub_calc_parity_bitmap_len(int nsectors)
3074 {
3075         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
3076 }
3077
3078 static void scrub_parity_get(struct scrub_parity *sparity)
3079 {
3080         refcount_inc(&sparity->refs);
3081 }
3082
3083 static void scrub_parity_put(struct scrub_parity *sparity)
3084 {
3085         if (!refcount_dec_and_test(&sparity->refs))
3086                 return;
3087
3088         scrub_parity_check_and_repair(sparity);
3089 }
3090
3091 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3092                                                   struct map_lookup *map,
3093                                                   struct btrfs_device *sdev,
3094                                                   struct btrfs_path *path,
3095                                                   u64 logic_start,
3096                                                   u64 logic_end)
3097 {
3098         struct btrfs_fs_info *fs_info = sctx->fs_info;
3099         struct btrfs_root *root = fs_info->extent_root;
3100         struct btrfs_root *csum_root = fs_info->csum_root;
3101         struct btrfs_extent_item *extent;
3102         struct btrfs_bio *bbio = NULL;
3103         u64 flags;
3104         int ret;
3105         int slot;
3106         struct extent_buffer *l;
3107         struct btrfs_key key;
3108         u64 generation;
3109         u64 extent_logical;
3110         u64 extent_physical;
3111         u64 extent_len;
3112         u64 mapped_length;
3113         struct btrfs_device *extent_dev;
3114         struct scrub_parity *sparity;
3115         int nsectors;
3116         int bitmap_len;
3117         int extent_mirror_num;
3118         int stop_loop = 0;
3119
3120         nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
3121         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3122         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3123                           GFP_NOFS);
3124         if (!sparity) {
3125                 spin_lock(&sctx->stat_lock);
3126                 sctx->stat.malloc_errors++;
3127                 spin_unlock(&sctx->stat_lock);
3128                 return -ENOMEM;
3129         }
3130
3131         sparity->stripe_len = map->stripe_len;
3132         sparity->nsectors = nsectors;
3133         sparity->sctx = sctx;
3134         sparity->scrub_dev = sdev;
3135         sparity->logic_start = logic_start;
3136         sparity->logic_end = logic_end;
3137         refcount_set(&sparity->refs, 1);
3138         INIT_LIST_HEAD(&sparity->spages);
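             /*
              * The trailing part of the allocation above holds both bitmaps
              * back to back: dbitmap marks the sectors carrying data to be
              * checked against the parity, ebitmap marks the sectors that
              * hit errors.
              */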
3139         sparity->dbitmap = sparity->bitmap;
3140         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3141
3142         ret = 0;
3143         while (logic_start < logic_end) {
3144                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3145                         key.type = BTRFS_METADATA_ITEM_KEY;
3146                 else
3147                         key.type = BTRFS_EXTENT_ITEM_KEY;
3148                 key.objectid = logic_start;
3149                 key.offset = (u64)-1;
3150
3151                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3152                 if (ret < 0)
3153                         goto out;
3154
3155                 if (ret > 0) {
3156                         ret = btrfs_previous_extent_item(root, path, 0);
3157                         if (ret < 0)
3158                                 goto out;
3159                         if (ret > 0) {
3160                                 btrfs_release_path(path);
3161                                 ret = btrfs_search_slot(NULL, root, &key,
3162                                                         path, 0, 0);
3163                                 if (ret < 0)
3164                                         goto out;
3165                         }
3166                 }
3167
3168                 stop_loop = 0;
3169                 while (1) {
3170                         u64 bytes;
3171
3172                         l = path->nodes[0];
3173                         slot = path->slots[0];
3174                         if (slot >= btrfs_header_nritems(l)) {
3175                                 ret = btrfs_next_leaf(root, path);
3176                                 if (ret == 0)
3177                                         continue;
3178                                 if (ret < 0)
3179                                         goto out;
3180
3181                                 stop_loop = 1;
3182                                 break;
3183                         }
3184                         btrfs_item_key_to_cpu(l, &key, slot);
3185
3186                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3187                             key.type != BTRFS_METADATA_ITEM_KEY)
3188                                 goto next;
3189
3190                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3191                                 bytes = fs_info->nodesize;
3192                         else
3193                                 bytes = key.offset;
3194
3195                         if (key.objectid + bytes <= logic_start)
3196                                 goto next;
3197
3198                         if (key.objectid >= logic_end) {
3199                                 stop_loop = 1;
3200                                 break;
3201                         }
3202
3203                         while (key.objectid >= logic_start + map->stripe_len)
3204                                 logic_start += map->stripe_len;
3205
3206                         extent = btrfs_item_ptr(l, slot,
3207                                                 struct btrfs_extent_item);
3208                         flags = btrfs_extent_flags(l, extent);
3209                         generation = btrfs_extent_generation(l, extent);
3210
3211                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3212                             (key.objectid < logic_start ||
3213                              key.objectid + bytes >
3214                              logic_start + map->stripe_len)) {
3215                                 btrfs_err(fs_info,
3216                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3217                                           key.objectid, logic_start);
3218                                 spin_lock(&sctx->stat_lock);
3219                                 sctx->stat.uncorrectable_errors++;
3220                                 spin_unlock(&sctx->stat_lock);
3221                                 goto next;
3222                         }
3223 again:
3224                         extent_logical = key.objectid;
3225                         extent_len = bytes;
3226
3227                         if (extent_logical < logic_start) {
3228                                 extent_len -= logic_start - extent_logical;
3229                                 extent_logical = logic_start;
3230                         }
3231
3232                         if (extent_logical + extent_len >
3233                             logic_start + map->stripe_len)
3234                                 extent_len = logic_start + map->stripe_len -
3235                                              extent_logical;
3236
3237                         scrub_parity_mark_sectors_data(sparity, extent_logical,
3238                                                        extent_len);
3239
3240                         mapped_length = extent_len;
3241                         bbio = NULL;
3242                         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3243                                         extent_logical, &mapped_length, &bbio,
3244                                         0);
3245                         if (!ret) {
3246                                 if (!bbio || mapped_length < extent_len)
3247                                         ret = -EIO;
3248                         }
3249                         if (ret) {
3250                                 btrfs_put_bbio(bbio);
3251                                 goto out;
3252                         }
3253                         extent_physical = bbio->stripes[0].physical;
3254                         extent_mirror_num = bbio->mirror_num;
3255                         extent_dev = bbio->stripes[0].dev;
3256                         btrfs_put_bbio(bbio);
3257
3258                         ret = btrfs_lookup_csums_range(csum_root,
3259                                                 extent_logical,
3260                                                 extent_logical + extent_len - 1,
3261                                                 &sctx->csum_list, 1);
3262                         if (ret)
3263                                 goto out;
3264
3265                         ret = scrub_extent_for_parity(sparity, extent_logical,
3266                                                       extent_len,
3267                                                       extent_physical,
3268                                                       extent_dev, flags,
3269                                                       generation,
3270                                                       extent_mirror_num);
3271
3272                         scrub_free_csums(sctx);
3273
3274                         if (ret)
3275                                 goto out;
3276
3277                         if (extent_logical + extent_len <
3278                             key.objectid + bytes) {
3279                                 logic_start += map->stripe_len;
3280
3281                                 if (logic_start >= logic_end) {
3282                                         stop_loop = 1;
3283                                         break;
3284                                 }
3285
3286                                 if (logic_start < key.objectid + bytes) {
3287                                         cond_resched();
3288                                         goto again;
3289                                 }
3290                         }
3291 next:
3292                         path->slots[0]++;
3293                 }
3294
3295                 btrfs_release_path(path);
3296
3297                 if (stop_loop)
3298                         break;
3299
3300                 logic_start += map->stripe_len;
3301         }
3302 out:
3303         if (ret < 0)
3304                 scrub_parity_mark_sectors_error(sparity, logic_start,
3305                                                 logic_end - logic_start);
3306         scrub_parity_put(sparity);
3307         scrub_submit(sctx);
3308         mutex_lock(&sctx->wr_ctx.wr_lock);
3309         scrub_wr_submit(sctx);
3310         mutex_unlock(&sctx->wr_ctx.wr_lock);
3311
3312         btrfs_release_path(path);
3313         return ret < 0 ? ret : 0;
3314 }
3315
3316 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3317                                            struct map_lookup *map,
3318                                            struct btrfs_device *scrub_dev,
3319                                            int num, u64 base, u64 length,
3320                                            int is_dev_replace)
3321 {
3322         struct btrfs_path *path, *ppath;
3323         struct btrfs_fs_info *fs_info = sctx->fs_info;
3324         struct btrfs_root *root = fs_info->extent_root;
3325         struct btrfs_root *csum_root = fs_info->csum_root;
3326         struct btrfs_extent_item *extent;
3327         struct blk_plug plug;
3328         u64 flags;
3329         int ret;
3330         int slot;
3331         u64 nstripes;
3332         struct extent_buffer *l;
3333         u64 physical;
3334         u64 logical;
3335         u64 logic_end;
3336         u64 physical_end;
3337         u64 generation;
3338         int mirror_num;
3339         struct reada_control *reada1;
3340         struct reada_control *reada2;
3341         struct btrfs_key key;
3342         struct btrfs_key key_end;
3343         u64 increment = map->stripe_len;
3344         u64 offset;
3345         u64 extent_logical;
3346         u64 extent_physical;
3347         u64 extent_len;
3348         u64 stripe_logical;
3349         u64 stripe_end;
3350         struct btrfs_device *extent_dev;
3351         int extent_mirror_num;
3352         int stop_loop = 0;
3353
3354         physical = map->stripes[num].physical;
3355         offset = 0;
3356         nstripes = div64_u64(length, map->stripe_len);
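             /*
              * Work out, per RAID profile, where this device's first stripe
              * sits inside the chunk (offset), how far the logical address
              * advances between two consecutive stripes of this device
              * (increment) and which copy this device holds (mirror_num).
              */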
3357         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3358                 offset = map->stripe_len * num;
3359                 increment = map->stripe_len * map->num_stripes;
3360                 mirror_num = 1;
3361         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3362                 int factor = map->num_stripes / map->sub_stripes;
3363                 offset = map->stripe_len * (num / map->sub_stripes);
3364                 increment = map->stripe_len * factor;
3365                 mirror_num = num % map->sub_stripes + 1;
3366         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3367                 increment = map->stripe_len;
3368                 mirror_num = num % map->num_stripes + 1;
3369         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3370                 increment = map->stripe_len;
3371                 mirror_num = num % map->num_stripes + 1;
3372         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3373                 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3374                 increment = map->stripe_len * nr_data_stripes(map);
3375                 mirror_num = 1;
3376         } else {
3377                 increment = map->stripe_len;
3378                 mirror_num = 1;
3379         }
3380
3381         path = btrfs_alloc_path();
3382         if (!path)
3383                 return -ENOMEM;
3384
3385         ppath = btrfs_alloc_path();
3386         if (!ppath) {
3387                 btrfs_free_path(path);
3388                 return -ENOMEM;
3389         }
3390
3391         /*
3392          * work on the commit root. The related disk blocks are static as
3393          * long as COW is applied. This means it is safe to rewrite
3394          * them to repair disk errors without any race conditions
3395          */
3396         path->search_commit_root = 1;
3397         path->skip_locking = 1;
3398
3399         ppath->search_commit_root = 1;
3400         ppath->skip_locking = 1;
3401         /*
3402          * trigger the readahead for the extent tree and the csum tree and
3403          * wait for completion. During readahead, the scrub is officially
3404          * paused so as not to hold off transaction commits
3405          */
3406         logical = base + offset;
3407         physical_end = physical + nstripes * map->stripe_len;
3408         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3409                 get_raid56_logic_offset(physical_end, num,
3410                                         map, &logic_end, NULL);
3411                 logic_end += base;
3412         } else {
3413                 logic_end = logical + increment * nstripes;
3414         }
3415         wait_event(sctx->list_wait,
3416                    atomic_read(&sctx->bios_in_flight) == 0);
3417         scrub_blocked_if_needed(fs_info);
3418
3419         /* FIXME it might be better to start readahead at commit root */
3420         key.objectid = logical;
3421         key.type = BTRFS_EXTENT_ITEM_KEY;
3422         key.offset = (u64)0;
3423         key_end.objectid = logic_end;
3424         key_end.type = BTRFS_METADATA_ITEM_KEY;
3425         key_end.offset = (u64)-1;
3426         reada1 = btrfs_reada_add(root, &key, &key_end);
3427
3428         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3429         key.type = BTRFS_EXTENT_CSUM_KEY;
3430         key.offset = logical;
3431         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3432         key_end.type = BTRFS_EXTENT_CSUM_KEY;
3433         key_end.offset = logic_end;
3434         reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3435
3436         if (!IS_ERR(reada1))
3437                 btrfs_reada_wait(reada1);
3438         if (!IS_ERR(reada2))
3439                 btrfs_reada_wait(reada2);
3440
3441
3442         /*
3443          * collect all data csums for the stripe to avoid seeking during
3444          * the scrub. This might currently (crc32) end up being about 1MB
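              * (roughly: with 4 KiB sectors and 4-byte crc32 csums, 1 MiB of
              * csums covers about 1 GiB of data, i.e. one full data chunk)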
3445          */
3446         blk_start_plug(&plug);
3447
3448         /*
3449          * now find all extents for each stripe and scrub them
3450          */
3451         ret = 0;
3452         while (physical < physical_end) {
3453                 /*
3454                  * canceled?
3455                  */
3456                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3457                     atomic_read(&sctx->cancel_req)) {
3458                         ret = -ECANCELED;
3459                         goto out;
3460                 }
3461                 /*
3462                  * check to see if we have to pause
3463                  */
3464                 if (atomic_read(&fs_info->scrub_pause_req)) {
3465                         /* push queued extents */
3466                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3467                         scrub_submit(sctx);
3468                         mutex_lock(&sctx->wr_ctx.wr_lock);
3469                         scrub_wr_submit(sctx);
3470                         mutex_unlock(&sctx->wr_ctx.wr_lock);
3471                         wait_event(sctx->list_wait,
3472                                    atomic_read(&sctx->bios_in_flight) == 0);
3473                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3474                         scrub_blocked_if_needed(fs_info);
3475                 }
3476
3477                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3478                         ret = get_raid56_logic_offset(physical, num, map,
3479                                                       &logical,
3480                                                       &stripe_logical);
3481                         logical += base;
3482                         if (ret) {
3483                                 /* it is a parity stripe */
3484                                 stripe_logical += base;
3485                                 stripe_end = stripe_logical + increment;
3486                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3487                                                           ppath, stripe_logical,
3488                                                           stripe_end);
3489                                 if (ret)
3490                                         goto out;
3491                                 goto skip;
3492                         }
3493                 }
3494
3495                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3496                         key.type = BTRFS_METADATA_ITEM_KEY;
3497                 else
3498                         key.type = BTRFS_EXTENT_ITEM_KEY;
3499                 key.objectid = logical;
3500                 key.offset = (u64)-1;
3501
3502                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3503                 if (ret < 0)
3504                         goto out;
3505
3506                 if (ret > 0) {
3507                         ret = btrfs_previous_extent_item(root, path, 0);
3508                         if (ret < 0)
3509                                 goto out;
3510                         if (ret > 0) {
3511                                 /* there's no smaller item, so stick with the
3512                                  * larger one */
3513                                 btrfs_release_path(path);
3514                                 ret = btrfs_search_slot(NULL, root, &key,
3515                                                         path, 0, 0);
3516                                 if (ret < 0)
3517                                         goto out;
3518                         }
3519                 }
3520
3521                 stop_loop = 0;
3522                 while (1) {
3523                         u64 bytes;
3524
3525                         l = path->nodes[0];
3526                         slot = path->slots[0];
3527                         if (slot >= btrfs_header_nritems(l)) {
3528                                 ret = btrfs_next_leaf(root, path);
3529                                 if (ret == 0)
3530                                         continue;
3531                                 if (ret < 0)
3532                                         goto out;
3533
3534                                 stop_loop = 1;
3535                                 break;
3536                         }
3537                         btrfs_item_key_to_cpu(l, &key, slot);
3538
3539                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3540                             key.type != BTRFS_METADATA_ITEM_KEY)
3541                                 goto next;
3542
3543                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3544                                 bytes = fs_info->nodesize;
3545                         else
3546                                 bytes = key.offset;
3547
3548                         if (key.objectid + bytes <= logical)
3549                                 goto next;
3550
3551                         if (key.objectid >= logical + map->stripe_len) {
3552                                 /* out of this device extent */
3553                                 if (key.objectid >= logic_end)
3554                                         stop_loop = 1;
3555                                 break;
3556                         }
3557
3558                         extent = btrfs_item_ptr(l, slot,
3559                                                 struct btrfs_extent_item);
3560                         flags = btrfs_extent_flags(l, extent);
3561                         generation = btrfs_extent_generation(l, extent);
3562
3563                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3564                             (key.objectid < logical ||
3565                              key.objectid + bytes >
3566                              logical + map->stripe_len)) {
3567                                 btrfs_err(fs_info,
3568                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3569                                        key.objectid, logical);
3570                                 spin_lock(&sctx->stat_lock);
3571                                 sctx->stat.uncorrectable_errors++;
3572                                 spin_unlock(&sctx->stat_lock);
3573                                 goto next;
3574                         }
3575
3576 again:
3577                         extent_logical = key.objectid;
3578                         extent_len = bytes;
3579
3580                         /*
3581                          * trim extent to this stripe
3582                          */
3583                         if (extent_logical < logical) {
3584                                 extent_len -= logical - extent_logical;
3585                                 extent_logical = logical;
3586                         }
3587                         if (extent_logical + extent_len >
3588                             logical + map->stripe_len) {
3589                                 extent_len = logical + map->stripe_len -
3590                                              extent_logical;
3591                         }
3592
3593                         extent_physical = extent_logical - logical + physical;
3594                         extent_dev = scrub_dev;
3595                         extent_mirror_num = mirror_num;
3596                         if (is_dev_replace)
3597                                 scrub_remap_extent(fs_info, extent_logical,
3598                                                    extent_len, &extent_physical,
3599                                                    &extent_dev,
3600                                                    &extent_mirror_num);
3601
3602                         ret = btrfs_lookup_csums_range(csum_root,
3603                                                        extent_logical,
3604                                                        extent_logical +
3605                                                        extent_len - 1,
3606                                                        &sctx->csum_list, 1);
3607                         if (ret)
3608                                 goto out;
3609
3610                         ret = scrub_extent(sctx, extent_logical, extent_len,
3611                                            extent_physical, extent_dev, flags,
3612                                            generation, extent_mirror_num,
3613                                            extent_logical - logical + physical);
3614
3615                         scrub_free_csums(sctx);
3616
3617                         if (ret)
3618                                 goto out;
3619
3620                         if (extent_logical + extent_len <
3621                             key.objectid + bytes) {
3622                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3623                                         /*
3624                                          * loop until we find next data stripe
3625                                          * or we have finished all stripes.
3626                                          */
3627 loop:
3628                                         physical += map->stripe_len;
3629                                         ret = get_raid56_logic_offset(physical,
3630                                                         num, map, &logical,
3631                                                         &stripe_logical);
3632                                         logical += base;
3633
3634                                         if (ret && physical < physical_end) {
3635                                                 stripe_logical += base;
3636                                                 stripe_end = stripe_logical +
3637                                                                 increment;
3638                                                 ret = scrub_raid56_parity(sctx,
3639                                                         map, scrub_dev, ppath,
3640                                                         stripe_logical,
3641                                                         stripe_end);
3642                                                 if (ret)
3643                                                         goto out;
3644                                                 goto loop;
3645                                         }
3646                                 } else {
3647                                         physical += map->stripe_len;
3648                                         logical += increment;
3649                                 }
3650                                 if (logical < key.objectid + bytes) {
3651                                         cond_resched();
3652                                         goto again;
3653                                 }
3654
3655                                 if (physical >= physical_end) {
3656                                         stop_loop = 1;
3657                                         break;
3658                                 }
3659                         }
3660 next:
3661                         path->slots[0]++;
3662                 }
3663                 btrfs_release_path(path);
3664 skip:
3665                 logical += increment;
3666                 physical += map->stripe_len;
3667                 spin_lock(&sctx->stat_lock);
3668                 if (stop_loop)
3669                         sctx->stat.last_physical = map->stripes[num].physical +
3670                                                    length;
3671                 else
3672                         sctx->stat.last_physical = physical;
3673                 spin_unlock(&sctx->stat_lock);
3674                 if (stop_loop)
3675                         break;
3676         }
3677 out:
3678         /* push queued extents */
3679         scrub_submit(sctx);
3680         mutex_lock(&sctx->wr_ctx.wr_lock);
3681         scrub_wr_submit(sctx);
3682         mutex_unlock(&sctx->wr_ctx.wr_lock);
3683
3684         blk_finish_plug(&plug);
3685         btrfs_free_path(path);
3686         btrfs_free_path(ppath);
3687         return ret < 0 ? ret : 0;
3688 }
3689
3690 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3691                                           struct btrfs_device *scrub_dev,
3692                                           u64 chunk_offset, u64 length,
3693                                           u64 dev_offset,
3694                                           struct btrfs_block_group_cache *cache,
3695                                           int is_dev_replace)
3696 {
3697         struct btrfs_fs_info *fs_info = sctx->fs_info;
3698         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3699         struct map_lookup *map;
3700         struct extent_map *em;
3701         int i;
3702         int ret = 0;
3703
3704         read_lock(&map_tree->map_tree.lock);
3705         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3706         read_unlock(&map_tree->map_tree.lock);
3707
3708         if (!em) {
3709                 /*
3710                  * Might have been an unused block group deleted by the cleaner
3711                  * kthread or relocation.
3712                  */
3713                 spin_lock(&cache->lock);
3714                 if (!cache->removed)
3715                         ret = -EINVAL;
3716                 spin_unlock(&cache->lock);
3717
3718                 return ret;
3719         }
3720
3721         map = em->map_lookup;
3722         if (em->start != chunk_offset)
3723                 goto out;
3724
3725         if (em->len < length)
3726                 goto out;
3727
3728         for (i = 0; i < map->num_stripes; ++i) {
3729                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3730                     map->stripes[i].physical == dev_offset) {
3731                         ret = scrub_stripe(sctx, map, scrub_dev, i,
3732                                            chunk_offset, length,
3733                                            is_dev_replace);
3734                         if (ret)
3735                                 goto out;
3736                 }
3737         }
3738 out:
3739         free_extent_map(em);
3740
3741         return ret;
3742 }
3743
3744 static noinline_for_stack
3745 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3746                            struct btrfs_device *scrub_dev, u64 start, u64 end,
3747                            int is_dev_replace)
3748 {
3749         struct btrfs_dev_extent *dev_extent = NULL;
3750         struct btrfs_path *path;
3751         struct btrfs_fs_info *fs_info = sctx->fs_info;
3752         struct btrfs_root *root = fs_info->dev_root;
3753         u64 length;
3754         u64 chunk_offset;
3755         int ret = 0;
3756         int ro_set;
3757         int slot;
3758         struct extent_buffer *l;
3759         struct btrfs_key key;
3760         struct btrfs_key found_key;
3761         struct btrfs_block_group_cache *cache;
3762         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3763
3764         path = btrfs_alloc_path();
3765         if (!path)
3766                 return -ENOMEM;
3767
3768         path->reada = READA_FORWARD;
3769         path->search_commit_root = 1;
3770         path->skip_locking = 1;
3771
3772         key.objectid = scrub_dev->devid;
3773         key.offset = 0ull;
3774         key.type = BTRFS_DEV_EXTENT_KEY;
3775
3776         while (1) {
3777                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3778                 if (ret < 0)
3779                         break;
3780                 if (ret > 0) {
3781                         if (path->slots[0] >=
3782                             btrfs_header_nritems(path->nodes[0])) {
3783                                 ret = btrfs_next_leaf(root, path);
3784                                 if (ret < 0)
3785                                         break;
3786                                 if (ret > 0) {
3787                                         ret = 0;
3788                                         break;
3789                                 }
3790                         } else {
3791                                 ret = 0;
3792                         }
3793                 }
3794
3795                 l = path->nodes[0];
3796                 slot = path->slots[0];
3797
3798                 btrfs_item_key_to_cpu(l, &found_key, slot);
3799
3800                 if (found_key.objectid != scrub_dev->devid)
3801                         break;
3802
3803                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3804                         break;
3805
3806                 if (found_key.offset >= end)
3807                         break;
3808
3809                 if (found_key.offset < key.offset)
3810                         break;
3811
3812                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3813                 length = btrfs_dev_extent_length(l, dev_extent);
3814
3815                 if (found_key.offset + length <= start)
3816                         goto skip;
3817
3818                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3819
3820                 /*
3821                  * get a reference on the corresponding block group to prevent
3822                  * the chunk from going away while we scrub it
3823                  */
3824                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3825
3826                 /* some chunks have been removed but not yet committed to
3827                  * disk, continue scrubbing */
3828                 if (!cache)
3829                         goto skip;
3830
3831                 /*
3832          * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3833                  * to avoid deadlock caused by:
3834                  * btrfs_inc_block_group_ro()
3835                  * -> btrfs_wait_for_commit()
3836                  * -> btrfs_commit_transaction()
3837                  * -> btrfs_scrub_pause()
3838                  */
3839                 scrub_pause_on(fs_info);
3840                 ret = btrfs_inc_block_group_ro(fs_info, cache);
3841                 if (!ret && is_dev_replace) {
3842                         /*
3843                          * If we are doing a device replace wait for any tasks
3844                          * that started delalloc right before we set the block
3845                          * group to RO mode, as they might have just allocated
3846                          * an extent from it or decided they could do a nocow
3847                          * write. And if any such tasks did that, wait for their
3848                          * ordered extents to complete and then commit the
3849                          * current transaction, so that we can later see the new
3850                          * extent items in the extent tree - the ordered extents
3851                          * create delayed data references (for cow writes) when
3852                          * they complete, which will be run and insert the
3853                          * corresponding extent items into the extent tree when
3854                          * we commit the transaction they used when running
3855                          * inode.c:btrfs_finish_ordered_io(). We later use
3856                          * the commit root of the extent tree to find extents
3857                          * to copy from the srcdev into the tgtdev, and we don't
3858                          * want to miss any new extents.
3859                          */
3860                         btrfs_wait_block_group_reservations(cache);
3861                         btrfs_wait_nocow_writers(cache);
3862                         ret = btrfs_wait_ordered_roots(fs_info, -1,
3863                                                        cache->key.objectid,
3864                                                        cache->key.offset);
3865                         if (ret > 0) {
3866                                 struct btrfs_trans_handle *trans;
3867
3868                                 trans = btrfs_join_transaction(root);
3869                                 if (IS_ERR(trans))
3870                                         ret = PTR_ERR(trans);
3871                                 else
3872                                         ret = btrfs_commit_transaction(trans);
3873                                 if (ret) {
3874                                         scrub_pause_off(fs_info);
3875                                         btrfs_put_block_group(cache);
3876                                         break;
3877                                 }
3878                         }
3879                 }
3880                 scrub_pause_off(fs_info);
3881
3882                 if (ret == 0) {
3883                         ro_set = 1;
3884                 } else if (ret == -ENOSPC) {
3885                         /*
3886                          * btrfs_inc_block_group_ro returns -ENOSPC when it
3887                          * fails to create a new chunk for metadata.
3888                          * That is not a problem for scrub/replace, because
3889                          * metadata is always cowed, and our scrub pauses
3890                          * transaction commits.
3891                          */
3892                         ro_set = 0;
3893                 } else {
3894                         btrfs_warn(fs_info,
3895                                    "failed setting block group ro, ret=%d",
3896                                    ret);
3897                         btrfs_put_block_group(cache);
3898                         break;
3899                 }
3900
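                     /*
                      * Note down the device extent about to be scrubbed or
                      * copied in the dev-replace item, so that the progress
                      * of a replace operation is tracked.
                      */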
3901                 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
3902                 dev_replace->cursor_right = found_key.offset + length;
3903                 dev_replace->cursor_left = found_key.offset;
3904                 dev_replace->item_needs_writeback = 1;
3905                 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
3906                 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3907                                   found_key.offset, cache, is_dev_replace);
3908
3909                 /*
3910                  * flush, submit all pending read and write bios, afterwards
3911                  * wait for them.
3912                  * Note that in the dev replace case, a read request causes
3913                  * write requests that are submitted in the read completion
3914                  * worker. Therefore in the current situation, it is required
3915                  * that all write requests are flushed, so that all read and
3916                  * write requests are really completed when bios_in_flight
3917                  * changes to 0.
3918                  */
3919                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3920                 scrub_submit(sctx);
3921                 mutex_lock(&sctx->wr_ctx.wr_lock);
3922                 scrub_wr_submit(sctx);
3923                 mutex_unlock(&sctx->wr_ctx.wr_lock);
3924
3925                 wait_event(sctx->list_wait,
3926                            atomic_read(&sctx->bios_in_flight) == 0);
3927
3928                 scrub_pause_on(fs_info);
3929
3930                 /*
3931                  * must be called before we decrease @scrub_paused.
3932                  * make sure we don't block transaction commit while
3933                  * we are waiting for pending workers to finish.
3934                  */
3935                 wait_event(sctx->list_wait,
3936                            atomic_read(&sctx->workers_pending) == 0);
3937                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3938
3939                 scrub_pause_off(fs_info);
3940
3941                 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
3942                 dev_replace->cursor_left = dev_replace->cursor_right;
3943                 dev_replace->item_needs_writeback = 1;
3944                 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
3945
3946                 if (ro_set)
3947                         btrfs_dec_block_group_ro(cache);
3948
3949                 /*
3950                  * We might have prevented the cleaner kthread from deleting
3951                  * this block group if it was already unused because we raced
3952                  * and set it to RO mode first. So add it back to the unused
3953                  * list, otherwise it might not ever be deleted unless a manual
3954                  * balance is triggered or it becomes used and unused again.
3955                  */
3956                 spin_lock(&cache->lock);
3957                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3958                     btrfs_block_group_used(&cache->item) == 0) {
3959                         spin_unlock(&cache->lock);
3960                         spin_lock(&fs_info->unused_bgs_lock);
3961                         if (list_empty(&cache->bg_list)) {
3962                                 btrfs_get_block_group(cache);
3963                                 list_add_tail(&cache->bg_list,
3964                                               &fs_info->unused_bgs);
3965                         }
3966                         spin_unlock(&fs_info->unused_bgs_lock);
3967                 } else {
3968                         spin_unlock(&cache->lock);
3969                 }
3970
3971                 btrfs_put_block_group(cache);
3972                 if (ret)
3973                         break;
3974                 if (is_dev_replace &&
3975                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3976                         ret = -EIO;
3977                         break;
3978                 }
3979                 if (sctx->stat.malloc_errors > 0) {
3980                         ret = -ENOMEM;
3981                         break;
3982                 }
3983 skip:
3984                 key.offset = found_key.offset + length;
3985                 btrfs_release_path(path);
3986         }
3987
3988         btrfs_free_path(path);
3989
3990         return ret;
3991 }
3992
3993 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3994                                            struct btrfs_device *scrub_dev)
3995 {
3996         int     i;
3997         u64     bytenr;
3998         u64     gen;
3999         int     ret;
4000         struct btrfs_fs_info *fs_info = sctx->fs_info;
4001
4002         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
4003                 return -EIO;
4004
4005         /* Seed devices of a new filesystem have their own generation. */
4006         if (scrub_dev->fs_devices != fs_info->fs_devices)
4007                 gen = scrub_dev->generation;
4008         else
4009                 gen = fs_info->last_trans_committed;
4010
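             /*
              * The superblock copies live at fixed offsets (64 KiB, 64 MiB
              * and 256 GiB); the check below skips any copy that would lie
              * beyond the end of the device.
              */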
4011         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4012                 bytenr = btrfs_sb_offset(i);
4013                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4014                     scrub_dev->commit_total_bytes)
4015                         break;
4016
4017                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4018                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4019                                   NULL, 1, bytenr);
4020                 if (ret)
4021                         return ret;
4022         }
4023         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4024
4025         return 0;
4026 }
4027
4028 /*
4029  * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
4030  */
4031 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4032                                                 int is_dev_replace)
4033 {
4034         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4035         int max_active = fs_info->thread_pool_size;
4036
4037         if (fs_info->scrub_workers_refcnt == 0) {
4038                 if (is_dev_replace)
4039                         fs_info->scrub_workers =
4040                                 btrfs_alloc_workqueue(fs_info, "scrub", flags,
4041                                                       1, 4);
4042                 else
4043                         fs_info->scrub_workers =
4044                                 btrfs_alloc_workqueue(fs_info, "scrub", flags,
4045                                                       max_active, 4);
4046                 if (!fs_info->scrub_workers)
4047                         goto fail_scrub_workers;
4048
4049                 fs_info->scrub_wr_completion_workers =
4050                         btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
4051                                               max_active, 2);
4052                 if (!fs_info->scrub_wr_completion_workers)
4053                         goto fail_scrub_wr_completion_workers;
4054
4055                 fs_info->scrub_nocow_workers =
4056                         btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
4057                 if (!fs_info->scrub_nocow_workers)
4058                         goto fail_scrub_nocow_workers;
4059                 fs_info->scrub_parity_workers =
4060                         btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
4061                                               max_active, 2);
4062                 if (!fs_info->scrub_parity_workers)
4063                         goto fail_scrub_parity_workers;
4064         }
4065         ++fs_info->scrub_workers_refcnt;
4066         return 0;
4067
4068 fail_scrub_parity_workers:
4069         btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4070 fail_scrub_nocow_workers:
4071         btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4072 fail_scrub_wr_completion_workers:
4073         btrfs_destroy_workqueue(fs_info->scrub_workers);
4074 fail_scrub_workers:
4075         return -ENOMEM;
4076 }
4077
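/*
 * Drop a reference on the scrub workqueues; the workqueues are destroyed
 * when the last reference is dropped.
 */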
4078 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
4079 {
4080         if (--fs_info->scrub_workers_refcnt == 0) {
4081                 btrfs_destroy_workqueue(fs_info->scrub_workers);
4082                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4083                 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4084                 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
4085         }
4086         WARN_ON(fs_info->scrub_workers_refcnt < 0);
4087 }
4088
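/*
 * Scrub the device identified by @devid in the byte range [@start, @end].
 * The current statistics are copied to @progress if it is non-NULL.  With
 * @readonly set, errors are only reported, not repaired.  With
 * @is_dev_replace set, the scrubbed data is additionally written to the
 * replace target device.  Returns -EINPROGRESS if the device is already
 * being scrubbed or a device replace is ongoing, 0 or a negative errno
 * otherwise.
 */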
4089 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4090                     u64 end, struct btrfs_scrub_progress *progress,
4091                     int readonly, int is_dev_replace)
4092 {
4093         struct scrub_ctx *sctx;
4094         int ret;
4095         struct btrfs_device *dev;
4096         struct rcu_string *name;
4097
4098         if (btrfs_fs_closing(fs_info))
4099                 return -EINVAL;
4100
4101         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4102                 /*
4103                  * The way scrub is implemented, it cannot calculate the
4104                  * checksum for a node size larger than BTRFS_STRIPE_LEN.
4105                  * Do not handle this situation at all; it should never happen.
4106                  */
4107                 btrfs_err(fs_info,
4108                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4109                        fs_info->nodesize,
4110                        BTRFS_STRIPE_LEN);
4111                 return -EINVAL;
4112         }
4113
4114         if (fs_info->sectorsize != PAGE_SIZE) {
4115                 /* not supported for data w/o checksums */
4116                 btrfs_err_rl(fs_info,
4117                            "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
4118                        fs_info->sectorsize, PAGE_SIZE);
4119                 return -EINVAL;
4120         }
4121
4122         if (fs_info->nodesize >
4123             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
4124             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
4125                 /*
4126                  * This would exhaust the array bounds of the pagev member
4127                  * in struct scrub_block.
4128                  */
4129                 btrfs_err(fs_info,
4130                           "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
4131                        fs_info->nodesize,
4132                        SCRUB_MAX_PAGES_PER_BLOCK,
4133                        fs_info->sectorsize,
4134                        SCRUB_MAX_PAGES_PER_BLOCK);
4135                 return -EINVAL;
4136         }
4137
4138
4139         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4140         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4141         if (!dev || (dev->missing && !is_dev_replace)) {
4142                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4143                 return -ENODEV;
4144         }
4145
4146         if (!is_dev_replace && !readonly && !dev->writeable) {
4147                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4148                 rcu_read_lock();
4149                 name = rcu_dereference(dev->name);
4150                 btrfs_err(fs_info, "scrub: device %s is not writable",
4151                           name->str);
4152                 rcu_read_unlock();
4153                 return -EROFS;
4154         }
4155
4156         mutex_lock(&fs_info->scrub_lock);
4157         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
4158                 mutex_unlock(&fs_info->scrub_lock);
4159                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4160                 return -EIO;
4161         }
4162
4163         btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
4164         if (dev->scrub_device ||
4165             (!is_dev_replace &&
4166              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4167                 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
4168                 mutex_unlock(&fs_info->scrub_lock);
4169                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4170                 return -EINPROGRESS;
4171         }
4172         btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
4173
4174         ret = scrub_workers_get(fs_info, is_dev_replace);
4175         if (ret) {
4176                 mutex_unlock(&fs_info->scrub_lock);
4177                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4178                 return ret;
4179         }
4180
4181         sctx = scrub_setup_ctx(dev, is_dev_replace);
4182         if (IS_ERR(sctx)) {
4183                 mutex_unlock(&fs_info->scrub_lock);
4184                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4185                 scrub_workers_put(fs_info);
4186                 return PTR_ERR(sctx);
4187         }
4188         sctx->readonly = readonly;
4189         dev->scrub_device = sctx;
4190         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4191
4192         /*
4193          * By checking @scrub_pause_req here, we avoid a race between a
4194          * committing transaction and the scrub.
4195          */
4196         __scrub_blocked_if_needed(fs_info);
4197         atomic_inc(&fs_info->scrubs_running);
4198         mutex_unlock(&fs_info->scrub_lock);
4199
4200         if (!is_dev_replace) {
4201                 /*
4202                  * By holding the device list mutex, we serialize against
4203                  * super block writes kicked off by a log tree sync.
4204                  */
4205                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4206                 ret = scrub_supers(sctx, dev);
4207                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4208         }
4209
4210         if (!ret)
4211                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
4212                                              is_dev_replace);
4213
4214         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4215         atomic_dec(&fs_info->scrubs_running);
4216         wake_up(&fs_info->scrub_pause_wait);
4217
4218         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4219
4220         if (progress)
4221                 memcpy(progress, &sctx->stat, sizeof(*progress));
4222
4223         mutex_lock(&fs_info->scrub_lock);
4224         dev->scrub_device = NULL;
4225         scrub_workers_put(fs_info);
4226         mutex_unlock(&fs_info->scrub_lock);
4227
4228         scrub_put_ctx(sctx);
4229
4230         return ret;
4231 }
4232
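/*
 * Request all running scrubs to pause and wait until every one of them
 * has reached its pause point.  Used to keep scrub quiescent, e.g. while
 * a transaction commits.
 */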
4233 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4234 {
4235         mutex_lock(&fs_info->scrub_lock);
4236         atomic_inc(&fs_info->scrub_pause_req);
4237         while (atomic_read(&fs_info->scrubs_paused) !=
4238                atomic_read(&fs_info->scrubs_running)) {
4239                 mutex_unlock(&fs_info->scrub_lock);
4240                 wait_event(fs_info->scrub_pause_wait,
4241                            atomic_read(&fs_info->scrubs_paused) ==
4242                            atomic_read(&fs_info->scrubs_running));
4243                 mutex_lock(&fs_info->scrub_lock);
4244         }
4245         mutex_unlock(&fs_info->scrub_lock);
4246 }
4247
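/*
 * Counterpart to btrfs_scrub_pause(): drop the pause request and wake up
 * paused scrubs.
 */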
4248 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4249 {
4250         atomic_dec(&fs_info->scrub_pause_req);
4251         wake_up(&fs_info->scrub_pause_wait);
4252 }
4253
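/*
 * Cancel all scrubs running on this filesystem and wait until they have
 * stopped.  Returns -ENOTCONN if no scrub was running.
 */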
4254 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4255 {
4256         mutex_lock(&fs_info->scrub_lock);
4257         if (!atomic_read(&fs_info->scrubs_running)) {
4258                 mutex_unlock(&fs_info->scrub_lock);
4259                 return -ENOTCONN;
4260         }
4261
4262         atomic_inc(&fs_info->scrub_cancel_req);
4263         while (atomic_read(&fs_info->scrubs_running)) {
4264                 mutex_unlock(&fs_info->scrub_lock);
4265                 wait_event(fs_info->scrub_pause_wait,
4266                            atomic_read(&fs_info->scrubs_running) == 0);
4267                 mutex_lock(&fs_info->scrub_lock);
4268         }
4269         atomic_dec(&fs_info->scrub_cancel_req);
4270         mutex_unlock(&fs_info->scrub_lock);
4271
4272         return 0;
4273 }
4274
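/*
 * Cancel the scrub running on @dev and wait until it has stopped.
 * Returns -ENOTCONN if no scrub was running on that device.
 */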
4275 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
4276                            struct btrfs_device *dev)
4277 {
4278         struct scrub_ctx *sctx;
4279
4280         mutex_lock(&fs_info->scrub_lock);
4281         sctx = dev->scrub_device;
4282         if (!sctx) {
4283                 mutex_unlock(&fs_info->scrub_lock);
4284                 return -ENOTCONN;
4285         }
4286         atomic_inc(&sctx->cancel_req);
4287         while (dev->scrub_device) {
4288                 mutex_unlock(&fs_info->scrub_lock);
4289                 wait_event(fs_info->scrub_pause_wait,
4290                            dev->scrub_device == NULL);
4291                 mutex_lock(&fs_info->scrub_lock);
4292         }
4293         mutex_unlock(&fs_info->scrub_lock);
4294
4295         return 0;
4296 }
4297
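/*
 * Copy the current scrub statistics of the device identified by @devid
 * into @progress.  Returns -ENODEV if the device is unknown and
 * -ENOTCONN if no scrub is running on it.
 */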
4298 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4299                          struct btrfs_scrub_progress *progress)
4300 {
4301         struct btrfs_device *dev;
4302         struct scrub_ctx *sctx = NULL;
4303
4304         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4305         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4306         if (dev)
4307                 sctx = dev->scrub_device;
4308         if (sctx)
4309                 memcpy(progress, &sctx->stat, sizeof(*progress));
4310         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4311
4312         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4313 }
4314
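/*
 * Map @extent_logical to the physical offset, device and mirror number of
 * the first stripe returned by btrfs_map_block().  If the mapping fails
 * or is incomplete, the output parameters are left untouched.
 */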
4315 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4316                                u64 extent_logical, u64 extent_len,
4317                                u64 *extent_physical,
4318                                struct btrfs_device **extent_dev,
4319                                int *extent_mirror_num)
4320 {
4321         u64 mapped_length;
4322         struct btrfs_bio *bbio = NULL;
4323         int ret;
4324
4325         mapped_length = extent_len;
4326         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4327                               &mapped_length, &bbio, 0);
4328         if (ret || !bbio || mapped_length < extent_len ||
4329             !bbio->stripes[0].dev->bdev) {
4330                 btrfs_put_bbio(bbio);
4331                 return;
4332         }
4333
4334         *extent_physical = bbio->stripes[0].physical;
4335         *extent_mirror_num = bbio->mirror_num;
4336         *extent_dev = bbio->stripes[0].dev;
4337         btrfs_put_bbio(bbio);
4338 }
4339
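/*
 * Initialize the write context used to send data to the dev-replace
 * target device.  For a plain scrub (not a replace), only the lock and
 * the current-bio pointer are initialized; the target device and the bio
 * page limit are only needed when @is_dev_replace is set.
 */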
4340 static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx,
4341                               struct btrfs_device *dev,
4342                               int is_dev_replace)
4343 {
4344         WARN_ON(wr_ctx->wr_curr_bio != NULL);
4345
4346         mutex_init(&wr_ctx->wr_lock);
4347         wr_ctx->wr_curr_bio = NULL;
4348         if (!is_dev_replace)
4349                 return 0;
4350
4351         WARN_ON(!dev->bdev);
4352         wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
4353         wr_ctx->tgtdev = dev;
4354         atomic_set(&wr_ctx->flush_all_writes, 0);
4355         return 0;
4356 }
4357
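/* Free a pending write bio of the write context, if one is left over. */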
4358 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
4359 {
4360         mutex_lock(&wr_ctx->wr_lock);
4361         kfree(wr_ctx->wr_curr_bio);
4362         wr_ctx->wr_curr_bio = NULL;
4363         mutex_unlock(&wr_ctx->wr_lock);
4364 }
4365
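/*
 * During dev-replace, data extents without checksums (nocow data) are not
 * copied through the regular scrub bios.  Instead, queue a worker that
 * copies such an extent to the target device via the page cache of the
 * inodes that reference it.
 */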
4366 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4367                             int mirror_num, u64 physical_for_dev_replace)
4368 {
4369         struct scrub_copy_nocow_ctx *nocow_ctx;
4370         struct btrfs_fs_info *fs_info = sctx->fs_info;
4371
4372         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
4373         if (!nocow_ctx) {
4374                 spin_lock(&sctx->stat_lock);
4375                 sctx->stat.malloc_errors++;
4376                 spin_unlock(&sctx->stat_lock);
4377                 return -ENOMEM;
4378         }
4379
4380         scrub_pending_trans_workers_inc(sctx);
4381
4382         nocow_ctx->sctx = sctx;
4383         nocow_ctx->logical = logical;
4384         nocow_ctx->len = len;
4385         nocow_ctx->mirror_num = mirror_num;
4386         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
4387         btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
4388                         copy_nocow_pages_worker, NULL, NULL);
4389         INIT_LIST_HEAD(&nocow_ctx->inodes);
4390         btrfs_queue_work(fs_info->scrub_nocow_workers,
4391                          &nocow_ctx->work);
4392
4393         return 0;
4394 }
4395
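/*
 * Callback for iterate_inodes_from_logical(): record each (inode, offset,
 * root) that references the extent so the worker can copy it afterwards.
 */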
4396 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
4397 {
4398         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
4399         struct scrub_nocow_inode *nocow_inode;
4400
4401         nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
4402         if (!nocow_inode)
4403                 return -ENOMEM;
4404         nocow_inode->inum = inum;
4405         nocow_inode->offset = offset;
4406         nocow_inode->root = root;
4407         list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
4408         return 0;
4409 }
4410
4411 #define COPY_COMPLETE 1
4412
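/*
 * Worker for copy_nocow_pages(): look up all inodes referencing the
 * extent and copy its pages to the dev-replace target device.  If the
 * preparation fails, the failure is accounted as an uncorrectable read
 * error of the replace operation.
 */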
4413 static void copy_nocow_pages_worker(struct btrfs_work *work)
4414 {
4415         struct scrub_copy_nocow_ctx *nocow_ctx =
4416                 container_of(work, struct scrub_copy_nocow_ctx, work);
4417         struct scrub_ctx *sctx = nocow_ctx->sctx;
4418         struct btrfs_fs_info *fs_info = sctx->fs_info;
4419         struct btrfs_root *root = fs_info->extent_root;
4420         u64 logical = nocow_ctx->logical;
4421         u64 len = nocow_ctx->len;
4422         int mirror_num = nocow_ctx->mirror_num;
4423         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4424         int ret;
4425         struct btrfs_trans_handle *trans = NULL;
4426         struct btrfs_path *path;
4427         int not_written = 0;
4428
4429         path = btrfs_alloc_path();
4430         if (!path) {
4431                 spin_lock(&sctx->stat_lock);
4432                 sctx->stat.malloc_errors++;
4433                 spin_unlock(&sctx->stat_lock);
4434                 not_written = 1;
4435                 goto out;
4436         }
4437
4438         trans = btrfs_join_transaction(root);
4439         if (IS_ERR(trans)) {
4440                 not_written = 1;
4441                 goto out;
4442         }
4443
4444         ret = iterate_inodes_from_logical(logical, fs_info, path,
4445                                           record_inode_for_nocow, nocow_ctx);
4446         if (ret != 0 && ret != -ENOENT) {
4447                 btrfs_warn(fs_info,
4448                            "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
4449                            logical, physical_for_dev_replace, len, mirror_num,
4450                            ret);
4451                 not_written = 1;
4452                 goto out;
4453         }
4454
4455         btrfs_end_transaction(trans);
4456         trans = NULL;
4457         while (!list_empty(&nocow_ctx->inodes)) {
4458                 struct scrub_nocow_inode *entry;
4459                 entry = list_first_entry(&nocow_ctx->inodes,
4460                                          struct scrub_nocow_inode,
4461                                          list);
4462                 list_del_init(&entry->list);
4463                 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4464                                                  entry->root, nocow_ctx);
4465                 kfree(entry);
4466                 if (ret == COPY_COMPLETE) {
4467                         ret = 0;
4468                         break;
4469                 } else if (ret) {
4470                         break;
4471                 }
4472         }
4473 out:
4474         while (!list_empty(&nocow_ctx->inodes)) {
4475                 struct scrub_nocow_inode *entry;
4476                 entry = list_first_entry(&nocow_ctx->inodes,
4477                                          struct scrub_nocow_inode,
4478                                          list);
4479                 list_del_init(&entry->list);
4480                 kfree(entry);
4481         }
4482         if (trans && !IS_ERR(trans))
4483                 btrfs_end_transaction(trans);
4484         if (not_written)
4485                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4486                                             num_uncorrectable_read_errors);
4487
4488         btrfs_free_path(path);
4489         kfree(nocow_ctx);
4490
4491         scrub_pending_trans_workers_dec(sctx);
4492 }
4493
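/*
 * Check whether the file range [@start, @start + @len) still maps to the
 * on-disk extent at @logical.  Returns 0 if it does, 1 if ordered I/O is
 * pending or the mapping has changed, or a negative errno.
 */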
4494 static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
4495                                  u64 logical)
4496 {
4497         struct extent_state *cached_state = NULL;
4498         struct btrfs_ordered_extent *ordered;
4499         struct extent_io_tree *io_tree;
4500         struct extent_map *em;
4501         u64 lockstart = start, lockend = start + len - 1;
4502         int ret = 0;
4503
4504         io_tree = &inode->io_tree;
4505
4506         lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
4507         ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4508         if (ordered) {
4509                 btrfs_put_ordered_extent(ordered);
4510                 ret = 1;
4511                 goto out_unlock;
4512         }
4513
4514         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4515         if (IS_ERR(em)) {
4516                 ret = PTR_ERR(em);
4517                 goto out_unlock;
4518         }
4519
4520         /*
4521          * This extent does not actually cover the logical extent anymore;
4522          * move on to the next inode.
4523          */
4524         if (em->block_start > logical ||
4525             em->block_start + em->block_len < logical + len) {
4526                 free_extent_map(em);
4527                 ret = 1;
4528                 goto out_unlock;
4529         }
4530         free_extent_map(em);
4531
4532 out_unlock:
4533         unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4534                              GFP_NOFS);
4535         return ret;
4536 }
4537
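/*
 * Copy one inode's pages covering the nocow extent to the dev-replace
 * target device.  Pages not present in the page cache are read from disk
 * first.  Returns COPY_COMPLETE when the range has been written, 0 when
 * the inode no longer maps the extent, or a negative errno.
 */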
4538 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4539                                       struct scrub_copy_nocow_ctx *nocow_ctx)
4540 {
4541         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
4542         struct btrfs_key key;
4543         struct inode *inode;
4544         struct page *page;
4545         struct btrfs_root *local_root;
4546         struct extent_io_tree *io_tree;
4547         u64 physical_for_dev_replace;
4548         u64 nocow_ctx_logical;
4549         u64 len = nocow_ctx->len;
4550         unsigned long index;
4551         int srcu_index;
4552         int ret = 0;
4553         int err = 0;
4554
4555         key.objectid = root;
4556         key.type = BTRFS_ROOT_ITEM_KEY;
4557         key.offset = (u64)-1;
4558
4559         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4560
4561         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4562         if (IS_ERR(local_root)) {
4563                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4564                 return PTR_ERR(local_root);
4565         }
4566
4567         key.type = BTRFS_INODE_ITEM_KEY;
4568         key.objectid = inum;
4569         key.offset = 0;
4570         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4571         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4572         if (IS_ERR(inode))
4573                 return PTR_ERR(inode);
4574
4575         /* Avoid racing with truncate/dio/punch hole. */
4576         inode_lock(inode);
4577         inode_dio_wait(inode);
4578
4579         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4580         io_tree = &BTRFS_I(inode)->io_tree;
4581         nocow_ctx_logical = nocow_ctx->logical;
4582
4583         ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4584                         nocow_ctx_logical);
4585         if (ret) {
4586                 ret = ret > 0 ? 0 : ret;
4587                 goto out;
4588         }
4589
4590         while (len >= PAGE_SIZE) {
4591                 index = offset >> PAGE_SHIFT;
4592 again:
4593                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4594                 if (!page) {
4595                         btrfs_err(fs_info, "find_or_create_page() failed");
4596                         ret = -ENOMEM;
4597                         goto out;
4598                 }
4599
4600                 if (PageUptodate(page)) {
4601                         if (PageDirty(page))
4602                                 goto next_page;
4603                 } else {
4604                         ClearPageError(page);
4605                         err = extent_read_full_page(io_tree, page,
4606                                                            btrfs_get_extent,
4607                                                            nocow_ctx->mirror_num);
4608                         if (err) {
4609                                 ret = err;
4610                                 goto next_page;
4611                         }
4612
4613                         lock_page(page);
4614                         /*
4615                          * If the page has been removed from the page cache,
4616                          * the data on it is meaningless: it may be stale,
4617                          * and the new data may have been written to a new
4618                          * page in the page cache.
4619                          */
4620                         if (page->mapping != inode->i_mapping) {
4621                                 unlock_page(page);
4622                                 put_page(page);
4623                                 goto again;
4624                         }
4625                         if (!PageUptodate(page)) {
4626                                 ret = -EIO;
4627                                 goto next_page;
4628                         }
4629                 }
4630
4631                 ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4632                                             nocow_ctx_logical);
4633                 if (ret) {
4634                         ret = ret > 0 ? 0 : ret;
4635                         goto next_page;
4636                 }
4637
4638                 err = write_page_nocow(nocow_ctx->sctx,
4639                                        physical_for_dev_replace, page);
4640                 if (err)
4641                         ret = err;
4642 next_page:
4643                 unlock_page(page);
4644                 put_page(page);
4645
4646                 if (ret)
4647                         break;
4648
4649                 offset += PAGE_SIZE;
4650                 physical_for_dev_replace += PAGE_SIZE;
4651                 nocow_ctx_logical += PAGE_SIZE;
4652                 len -= PAGE_SIZE;
4653         }
4654         ret = COPY_COMPLETE;
4655 out:
4656         inode_unlock(inode);
4657         iput(inode);
4658         return ret;
4659 }
4660
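/*
 * Synchronously write a single page to the dev-replace target device at
 * @physical_for_dev_replace.  Write failures are added to the device
 * statistics and reported as -EIO.
 */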
4661 static int write_page_nocow(struct scrub_ctx *sctx,
4662                             u64 physical_for_dev_replace, struct page *page)
4663 {
4664         struct bio *bio;
4665         struct btrfs_device *dev;
4666         int ret;
4667
4668         dev = sctx->wr_ctx.tgtdev;
4669         if (!dev)
4670                 return -EIO;
4671         if (!dev->bdev) {
4672                 btrfs_warn_rl(dev->fs_info,
4673                         "scrub write_page_nocow(bdev == NULL) is unexpected");
4674                 return -EIO;
4675         }
4676         bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
4677         if (!bio) {
4678                 spin_lock(&sctx->stat_lock);
4679                 sctx->stat.malloc_errors++;
4680                 spin_unlock(&sctx->stat_lock);
4681                 return -ENOMEM;
4682         }
4683         bio->bi_iter.bi_size = 0;
4684         bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4685         bio->bi_bdev = dev->bdev;
4686         bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
4687         ret = bio_add_page(bio, page, PAGE_SIZE, 0);
4688         if (ret != PAGE_SIZE) {
4689 leave_with_eio:
4690                 bio_put(bio);
4691                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4692                 return -EIO;
4693         }
4694
4695         if (btrfsic_submit_bio_wait(bio))
4696                 goto leave_with_eio;
4697
4698         bio_put(bio);
4699         return 0;
4700 }