diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b0251eb1239fce83226650be88c31122a9f108af..c7b45eb2403d09e94b2538dabcb5a1f0116c55dd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -64,7 +64,7 @@ struct scrub_ctx;
 #define SCRUB_MAX_PAGES_PER_BLOCK      16      /* 64k per node/leaf/sector */
 
 struct scrub_recover {
-       atomic_t                refs;
+       refcount_t              refs;
        struct btrfs_bio        *bbio;
        u64                     map_length;
 };
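
This and the following hunks convert the scrub reference counters from atomic_t to refcount_t. Unlike a bare atomic_t, refcount_t saturates instead of wrapping and warns on suspicious transitions (increment from zero, overflow, underflow), which is why atomic_set/atomic_inc/atomic_dec_and_test become refcount_set/refcount_inc/refcount_dec_and_test below. A minimal sketch of the get/put pattern being converted, assuming a generic refcounted object (demo_obj and its helpers are illustrative, not part of scrub.c):

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {
        refcount_t refs;
        /* payload ... */
};

static struct demo_obj *demo_alloc(void)
{
        struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

        if (obj)
                refcount_set(&obj->refs, 1);    /* initial reference */
        return obj;
}

static void demo_get(struct demo_obj *obj)
{
        refcount_inc(&obj->refs);               /* WARNs and saturates instead of wrapping */
}

static void demo_put(struct demo_obj *obj)
{
        if (refcount_dec_and_test(&obj->refs))  /* free on transition to zero */
                kfree(obj);
}
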
@@ -112,7 +112,7 @@ struct scrub_block {
        struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
        int                     page_count;
        atomic_t                outstanding_pages;
-       atomic_t                refs; /* free mem on transition to zero */
+       refcount_t              refs; /* free mem on transition to zero */
        struct scrub_ctx        *sctx;
        struct scrub_parity     *sparity;
        struct {
@@ -140,9 +140,9 @@ struct scrub_parity {
 
        int                     nsectors;
 
-       int                     stripe_len;
+       u64                     stripe_len;
 
-       atomic_t                refs;
+       refcount_t              refs;
 
        struct list_head        spages;
 
@@ -202,7 +202,7 @@ struct scrub_ctx {
         * doesn't free the scrub context before or while the workers are
         * doing the wakeup() call.
         */
-       atomic_t                refs;
+       refcount_t              refs;
 };
 
 struct scrub_fixup_nodatasum {
@@ -240,6 +240,13 @@ struct scrub_warning {
        struct btrfs_device     *dev;
 };
 
+struct full_stripe_lock {
+       struct rb_node node;
+       u64 logical;
+       u64 refs;
+       struct mutex mutex;
+};
+
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -305,7 +312,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 {
-       atomic_inc(&sctx->refs);
+       refcount_inc(&sctx->refs);
        atomic_inc(&sctx->bios_in_flight);
 }
 
@@ -348,6 +355,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
        scrub_pause_off(fs_info);
 }
 
+/*
+ * Insert new full stripe lock into full stripe locks tree
+ *
+ * Return pointer to existing or newly inserted full_stripe_lock structure if
+ * everything works well.
+ * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
+ *
+ * NOTE: caller must hold full_stripe_locks_root->lock before calling this
+ * function
+ */
+static struct full_stripe_lock *insert_full_stripe_lock(
+               struct btrfs_full_stripe_locks_tree *locks_root,
+               u64 fstripe_logical)
+{
+       struct rb_node **p;
+       struct rb_node *parent = NULL;
+       struct full_stripe_lock *entry;
+       struct full_stripe_lock *ret;
+
+       WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+       p = &locks_root->root.rb_node;
+       while (*p) {
+               parent = *p;
+               entry = rb_entry(parent, struct full_stripe_lock, node);
+               if (fstripe_logical < entry->logical) {
+                       p = &(*p)->rb_left;
+               } else if (fstripe_logical > entry->logical) {
+                       p = &(*p)->rb_right;
+               } else {
+                       entry->refs++;
+                       return entry;
+               }
+       }
+
+       /* Insert new lock */
+       ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+       if (!ret)
+               return ERR_PTR(-ENOMEM);
+       ret->logical = fstripe_logical;
+       ret->refs = 1;
+       mutex_init(&ret->mutex);
+
+       rb_link_node(&ret->node, parent, p);
+       rb_insert_color(&ret->node, &locks_root->root);
+       return ret;
+}
+
+/*
+ * Search for a full stripe lock of a block group
+ *
+ * Return pointer to existing full stripe lock if found
+ * Return NULL if not found
+ */
+static struct full_stripe_lock *search_full_stripe_lock(
+               struct btrfs_full_stripe_locks_tree *locks_root,
+               u64 fstripe_logical)
+{
+       struct rb_node *node;
+       struct full_stripe_lock *entry;
+
+       WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+       node = locks_root->root.rb_node;
+       while (node) {
+               entry = rb_entry(node, struct full_stripe_lock, node);
+               if (fstripe_logical < entry->logical)
+                       node = node->rb_left;
+               else if (fstripe_logical > entry->logical)
+                       node = node->rb_right;
+               else
+                       return entry;
+       }
+       return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ *
+ * Caller must ensure @cache is a RAID56 block group.
+ */
+static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
+                                  u64 bytenr)
+{
+       u64 ret;
+
+       /*
+        * Due to chunk item size limit, full stripe length should not be
+        * larger than U32_MAX. Just a sanity check here.
+        */
+       WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
+
+       /*
+        * round_down() can only handle power of 2, while RAID56 full
+        * stripe length can be 64KiB * n, so we need to manually round down.
+        */
+       ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
+               cache->full_stripe_len + cache->key.objectid;
+       return ret;
+}
+
+/*
+ * Lock a full stripe to avoid concurrency between recovery and read
+ *
+ * It's only used for profiles with parity (RAID5/6); for other profiles it
+ * does nothing.
+ *
+ * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
+ * The caller must then call unlock_full_stripe() in the same context.
+ *
+ * Return <0 on error.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+                           bool *locked_ret)
+{
+       struct btrfs_block_group_cache *bg_cache;
+       struct btrfs_full_stripe_locks_tree *locks_root;
+       struct full_stripe_lock *existing;
+       u64 fstripe_start;
+       int ret = 0;
+
+       *locked_ret = false;
+       bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+       if (!bg_cache) {
+               ASSERT(0);
+               return -ENOENT;
+       }
+
+       /* Profiles not based on parity don't need full stripe lock */
+       if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+               goto out;
+       locks_root = &bg_cache->full_stripe_locks_root;
+
+       fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+       /* Now insert the full stripe lock */
+       mutex_lock(&locks_root->lock);
+       existing = insert_full_stripe_lock(locks_root, fstripe_start);
+       mutex_unlock(&locks_root->lock);
+       if (IS_ERR(existing)) {
+               ret = PTR_ERR(existing);
+               goto out;
+       }
+       mutex_lock(&existing->mutex);
+       *locked_ret = true;
+out:
+       btrfs_put_block_group(bg_cache);
+       return ret;
+}
+
+/*
+ * Unlock a full stripe.
+ *
+ * NOTE: Caller must ensure it's the same context that called the
+ * corresponding lock_full_stripe().
+ *
+ * Return 0 if we unlocked the full stripe without problem.
+ * Return <0 on error.
+ */
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+                             bool locked)
+{
+       struct btrfs_block_group_cache *bg_cache;
+       struct btrfs_full_stripe_locks_tree *locks_root;
+       struct full_stripe_lock *fstripe_lock;
+       u64 fstripe_start;
+       bool freeit = false;
+       int ret = 0;
+
+       /* If we didn't acquire full stripe lock, no need to continue */
+       if (!locked)
+               return 0;
+
+       bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+       if (!bg_cache) {
+               ASSERT(0);
+               return -ENOENT;
+       }
+       if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+               goto out;
+
+       locks_root = &bg_cache->full_stripe_locks_root;
+       fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+       mutex_lock(&locks_root->lock);
+       fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
+       /* Unpaired unlock_full_stripe() detected */
+       if (!fstripe_lock) {
+               WARN_ON(1);
+               ret = -ENOENT;
+               mutex_unlock(&locks_root->lock);
+               goto out;
+       }
+
+       if (fstripe_lock->refs == 0) {
+               WARN_ON(1);
+               btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
+                       fstripe_lock->logical);
+       } else {
+               fstripe_lock->refs--;
+       }
+
+       if (fstripe_lock->refs == 0) {
+               rb_erase(&fstripe_lock->node, &locks_root->root);
+               freeit = true;
+       }
+       mutex_unlock(&locks_root->lock);
+
+       mutex_unlock(&fstripe_lock->mutex);
+       if (freeit)
+               kfree(fstripe_lock);
+out:
+       btrfs_put_block_group(bg_cache);
+       return ret;
+}
+
 /*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
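
get_full_stripe_logical() in the hunk above rounds a logical address down to the start of its RAID5/6 full stripe by hand, because the full stripe length is stripe_len * nr_data_stripes and is usually not a power of two, so round_down() cannot be used. A small userspace-style check of the same arithmetic, with made-up numbers (a block group with three 64KiB data stripes, i.e. a 192KiB full stripe):

/* Illustrative only: userspace check of the rounding in get_full_stripe_logical() */
#include <stdio.h>

int main(void)
{
        unsigned long long bg_start = 1024ULL * 1024 * 1024;   /* example block group start */
        unsigned long long full_stripe_len = 3 * 64 * 1024;    /* 3 data stripes * 64KiB = 192KiB */
        unsigned long long bytenr = bg_start + 500 * 1024;     /* some logical address in the bg */
        unsigned long long fstripe_start;

        /* same arithmetic as div64_u64(...) * full_stripe_len + key.objectid in the patch */
        fstripe_start = (bytenr - bg_start) / full_stripe_len * full_stripe_len + bg_start;

        /* 500KiB into the bg lies in the third full stripe, which starts at 384KiB */
        printf("full stripe start offset: %lluKiB\n", (fstripe_start - bg_start) / 1024);
        return 0;
}
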
@@ -356,7 +579,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 {
        struct btrfs_fs_info *fs_info = sctx->fs_info;
 
-       atomic_inc(&sctx->refs);
+       refcount_inc(&sctx->refs);
        /*
         * increment scrubs_running to prevent cancel requests from
         * completing as long as a worker is running. we must also
@@ -447,7 +670,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 
 static void scrub_put_ctx(struct scrub_ctx *sctx)
 {
-       if (atomic_dec_and_test(&sctx->refs))
+       if (refcount_dec_and_test(&sctx->refs))
                scrub_free_ctx(sctx);
 }
 
@@ -462,7 +685,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
        sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
        if (!sctx)
                goto nomem;
-       atomic_set(&sctx->refs, 1);
+       refcount_set(&sctx->refs, 1);
        sctx->is_dev_replace = is_dev_replace;
        sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
        sctx->curr = -1;
@@ -857,12 +1080,14 @@ out:
 
 static inline void scrub_get_recover(struct scrub_recover *recover)
 {
-       atomic_inc(&recover->refs);
+       refcount_inc(&recover->refs);
 }
 
-static inline void scrub_put_recover(struct scrub_recover *recover)
+static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
+                                    struct scrub_recover *recover)
 {
-       if (atomic_dec_and_test(&recover->refs)) {
+       if (refcount_dec_and_test(&recover->refs)) {
+               btrfs_bio_counter_dec(fs_info);
                btrfs_put_bbio(recover->bbio);
                kfree(recover);
        }
@@ -892,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
        int mirror_index;
        int page_num;
        int success;
+       bool full_stripe_locked;
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
 
@@ -917,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
        have_csum = sblock_to_check->pagev[0]->have_csum;
        dev = sblock_to_check->pagev[0]->dev;
 
+       /*
+        * For RAID5/6, a race can happen between scrub threads on different
+        * devices: on data corruption, the parity and the data scrub threads
+        * will both try to recover the data.
+        * The race can lead to a csum error being counted twice, or even to
+        * an unrecoverable error.
+        */
+       ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
+       if (ret < 0) {
+               spin_lock(&sctx->stat_lock);
+               if (ret == -ENOMEM)
+                       sctx->stat.malloc_errors++;
+               sctx->stat.read_errors++;
+               sctx->stat.uncorrectable_errors++;
+               spin_unlock(&sctx->stat_lock);
+               return ret;
+       }
+
        if (sctx->is_dev_replace && !is_metadata && !have_csum) {
                sblocks_for_recheck = NULL;
                goto nodatasum_case;
@@ -1241,7 +1485,7 @@ out:
                                sblock->pagev[page_index]->sblock = NULL;
                                recover = sblock->pagev[page_index]->recover;
                                if (recover) {
-                                       scrub_put_recover(recover);
+                                       scrub_put_recover(fs_info, recover);
                                        sblock->pagev[page_index]->recover =
                                                                        NULL;
                                }
@@ -1251,6 +1495,9 @@ out:
                kfree(sblocks_for_recheck);
        }
 
+       ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
+       if (ret < 0)
+               return ret;
        return 0;
 }
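
With the two hunks above, scrub_handle_errored_block() takes the full stripe lock before attempting recovery and drops it once recovery has finished, so RAID5/6 repair of the same full stripe is serialized across scrub threads. A minimal sketch of the pairing contract, using the lock_full_stripe()/unlock_full_stripe() helpers introduced earlier in this patch (demo_recover_one_block is illustrative, not a function in this file):

static int demo_recover_one_block(struct btrfs_fs_info *fs_info, u64 logical)
{
        bool full_stripe_locked;
        int ret;

        /* No-op for non-RAID56 block groups; otherwise takes the per-stripe mutex */
        ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
        if (ret < 0)
                return ret;

        /* ... do the actual recovery work here ... */

        /* Must run in the same context; a no-op if the lock wasn't taken */
        return unlock_full_stripe(fs_info, logical, full_stripe_locked);
}
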
 
@@ -1330,20 +1577,23 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                 * with a length of PAGE_SIZE, each returned stripe
                 * represents one mirror
                 */
+               btrfs_bio_counter_inc_blocked(fs_info);
                ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-                               logical, &mapped_length, &bbio, 0, 1);
+                               logical, &mapped_length, &bbio);
                if (ret || !bbio || mapped_length < sublen) {
                        btrfs_put_bbio(bbio);
+                       btrfs_bio_counter_dec(fs_info);
                        return -EIO;
                }
 
                recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
                if (!recover) {
                        btrfs_put_bbio(bbio);
+                       btrfs_bio_counter_dec(fs_info);
                        return -ENOMEM;
                }
 
-               atomic_set(&recover->refs, 1);
+               refcount_set(&recover->refs, 1);
                recover->bbio = bbio;
                recover->map_length = mapped_length;
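
This hunk, together with the matching ones in scrub_missing_raid56_pages() and scrub_parity_check_and_repair() further down, brackets btrfs_map_sblock() with btrfs_bio_counter_inc_blocked()/btrfs_bio_counter_dec(): error paths drop the counter immediately, while the success path keeps it elevated for as long as the returned bbio is referenced and drops it in scrub_put_recover(), which now takes fs_info for that purpose. A hedged sketch of the shape of that pairing (demo_map_and_hold is illustrative, not a function in this file):

static int demo_map_and_hold(struct btrfs_fs_info *fs_info, u64 logical,
                             u64 *mapped_length, struct btrfs_bio **bbio_ret)
{
        int ret;

        btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
                               mapped_length, bbio_ret);
        if (ret || !*bbio_ret) {
                /* error: nothing will use the bbio, drop both right away */
                btrfs_put_bbio(*bbio_ret);
                btrfs_bio_counter_dec(fs_info);
                return -EIO;
        }
        /*
         * Success: the elevated bio counter and the bbio reference stay held
         * until the last scrub_put_recover(), which drops both.
         */
        return 0;
}
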
 
@@ -1365,7 +1615,7 @@ leave_nomem:
                                spin_lock(&sctx->stat_lock);
                                sctx->stat.malloc_errors++;
                                spin_unlock(&sctx->stat_lock);
-                               scrub_put_recover(recover);
+                               scrub_put_recover(fs_info, recover);
                                return -ENOMEM;
                        }
                        scrub_page_get(page);
@@ -1407,7 +1657,7 @@ leave_nomem:
                        scrub_get_recover(recover);
                        page->recover = recover;
                }
-               scrub_put_recover(recover);
+               scrub_put_recover(fs_info, recover);
                length -= sublen;
                logical += sublen;
                page_index++;
@@ -1497,14 +1747,18 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
                bio_add_page(bio, page->page, PAGE_SIZE, 0);
                if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
-                       if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
+                       if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
+                               page->io_error = 1;
                                sblock->no_io_error_seen = 0;
+                       }
                } else {
                        bio->bi_iter.bi_sector = page->physical >> 9;
                        bio_set_op_attrs(bio, REQ_OP_READ, 0);
 
-                       if (btrfsic_submit_bio_wait(bio))
+                       if (btrfsic_submit_bio_wait(bio)) {
+                               page->io_error = 1;
                                sblock->no_io_error_seen = 0;
+                       }
                }
 
                bio_put(bio);
@@ -1634,7 +1888,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
        if (spage->io_error) {
                void *mapped_buffer = kmap_atomic(spage->page);
 
-               memset(mapped_buffer, 0, PAGE_SIZE);
+               clear_page(mapped_buffer);
                flush_dcache_page(spage->page);
                kunmap_atomic(mapped_buffer);
        }
@@ -1998,12 +2252,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 
 static void scrub_block_get(struct scrub_block *sblock)
 {
-       atomic_inc(&sblock->refs);
+       refcount_inc(&sblock->refs);
 }
 
 static void scrub_block_put(struct scrub_block *sblock)
 {
-       if (atomic_dec_and_test(&sblock->refs)) {
+       if (refcount_dec_and_test(&sblock->refs)) {
                int i;
 
                if (sblock->sparity)
@@ -2187,8 +2441,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
        int ret;
        int i;
 
+       btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-                       &length, &bbio, 0, 1);
+                       &length, &bbio);
        if (ret || !bbio || !bbio->raid_map)
                goto bbio_out;
 
@@ -2231,6 +2486,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 rbio_out:
        bio_put(bio);
 bbio_out:
+       btrfs_bio_counter_dec(fs_info);
        btrfs_put_bbio(bbio);
        spin_lock(&sctx->stat_lock);
        sctx->stat.malloc_errors++;
@@ -2255,7 +2511,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 
        /* one ref inside this function, plus one for each page added to
         * a bio later on */
-       atomic_set(&sblock->refs, 1);
+       refcount_set(&sblock->refs, 1);
        sblock->sctx = sctx;
        sblock->no_io_error_seen = 1;
 
@@ -2385,7 +2641,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
                                       unsigned long *bitmap,
                                       u64 start, u64 len)
 {
-       u32 offset;
+       u64 offset;
        int nsectors;
        int sectorsize = sparity->sctx->fs_info->sectorsize;
 
@@ -2395,8 +2651,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
        }
 
        start -= sparity->logic_start;
-       start = div_u64_rem(start, sparity->stripe_len, &offset);
-       offset /= sectorsize;
+       start = div64_u64_rem(start, sparity->stripe_len, &offset);
+       offset = div_u64(offset, sectorsize);
        nsectors = (int)len / sectorsize;
 
        if (offset + nsectors <= sparity->nsectors) {
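
Since stripe_len is now u64, the division helpers above are switched to their 64-bit variants: div_u64_rem() takes a u32 divisor and returns a u32 remainder, whereas div64_u64_rem() accepts a u64 divisor and yields a u64 remainder, which is then reduced to a sector count with div_u64(). A small sketch of the same computation using the linux/math64.h helpers (demo_sector_offset is illustrative):

#include <linux/math64.h>
#include <linux/types.h>

/* Mirror of the offset math in __scrub_mark_bitmap(), for illustration only */
static u32 demo_sector_offset(u64 start, u64 logic_start, u64 stripe_len,
                              u32 sectorsize)
{
        u64 offset;

        start -= logic_start;
        /* u64 divisor and remainder now that stripe_len is u64 */
        start = div64_u64_rem(start, stripe_len, &offset);
        /* offset is the byte offset inside one stripe; convert it to sectors */
        offset = div_u64(offset, sectorsize);
        return (u32)offset;
}
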
@@ -2555,7 +2811,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
 
        /* one ref inside this function, plus one for each page added to
         * a bio later on */
-       atomic_set(&sblock->refs, 1);
+       refcount_set(&sblock->refs, 1);
        sblock->sctx = sctx;
        sblock->no_io_error_seen = 1;
        sblock->sparity = sparity;
@@ -2694,7 +2950,7 @@ static int get_raid56_logic_offset(u64 physical, int num,
        for (i = 0; i < nr_data_stripes(map); i++) {
                *offset = last_offset + i * map->stripe_len;
 
-               stripe_nr = div_u64(*offset, map->stripe_len);
+               stripe_nr = div64_u64(*offset, map->stripe_len);
                stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
 
                /* Work out the disk rotation on this stripe-set */
@@ -2765,7 +3021,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct bio *bio;
        struct btrfs_raid_bio *rbio;
-       struct scrub_page *spage;
        struct btrfs_bio *bbio = NULL;
        u64 length;
        int ret;
@@ -2775,8 +3030,10 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
                goto out;
 
        length = sparity->logic_end - sparity->logic_start;
+
+       btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
-                              &length, &bbio, 0, 1);
+                              &length, &bbio);
        if (ret || !bbio || !bbio->raid_map)
                goto bbio_out;
 
@@ -2795,9 +3052,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
        if (!rbio)
                goto rbio_out;
 
-       list_for_each_entry(spage, &sparity->spages, list)
-               raid56_add_scrub_pages(rbio, spage->page, spage->logical);
-
        scrub_pending_bio_inc(sctx);
        raid56_parity_submit_scrub_rbio(rbio);
        return;
@@ -2805,6 +3059,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 rbio_out:
        bio_put(bio);
 bbio_out:
+       btrfs_bio_counter_dec(fs_info);
        btrfs_put_bbio(bbio);
        bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
                  sparity->nsectors);
@@ -2822,12 +3077,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
 
 static void scrub_parity_get(struct scrub_parity *sparity)
 {
-       atomic_inc(&sparity->refs);
+       refcount_inc(&sparity->refs);
 }
 
 static void scrub_parity_put(struct scrub_parity *sparity)
 {
-       if (!atomic_dec_and_test(&sparity->refs))
+       if (!refcount_dec_and_test(&sparity->refs))
                return;
 
        scrub_parity_check_and_repair(sparity);
@@ -2879,7 +3134,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
        sparity->scrub_dev = sdev;
        sparity->logic_start = logic_start;
        sparity->logic_end = logic_end;
-       atomic_set(&sparity->refs, 1);
+       refcount_set(&sparity->refs, 1);
        INIT_LIST_HEAD(&sparity->spages);
        sparity->dbitmap = sparity->bitmap;
        sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3098,7 +3353,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 
        physical = map->stripes[num].physical;
        offset = 0;
-       nstripes = div_u64(length, map->stripe_len);
+       nstripes = div64_u64(length, map->stripe_len);
        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
                offset = map->stripe_len * num;
                increment = map->stripe_len * map->num_stripes;