MD: raid5 trim support
author		Shaohua Li <shli@kernel.org>
		Mon, 24 Sep 2012 07:08:43 +0000 (17:08 +1000)
committer	NeilBrown <neilb@suse.de>
		Mon, 24 Sep 2012 07:08:43 +0000 (17:08 +1000)
Discard for raid4/5/6 has limitations. If the discard request is small,
we discard on one disk but still have to calculate and write the parity
disk. To calculate the parity correctly, zero_after_discard must be
guaranteed. Even when it is true, we would discard one disk but write
the other disks, which makes the parity disks wear out fast. This
doesn't make sense. So an efficient discard for raid4/5/6 should
discard all data disks and parity disks together, which requires the
write pattern to be (A, A+chunk_size, A+chunk_size*2, ...). If A's size
is smaller than chunk_size, such a pattern is almost impossible in
practice. So in this patch I only handle the case where A's size equals
chunk_size, i.e. the discard request must be aligned to the stripe size
and its size must be a multiple of the stripe size.
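
To make the alignment requirement concrete, here is a minimal userspace
sketch (not the kernel code; the chunk size, disk count and request
range are made-up values) of rounding a discard range inward to
full-stripe boundaries, the same idea make_discard_request() below
implements with DIV_ROUND_UP_SECTOR_T() and sector_div():

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t chunk_sectors  = 1024;   /* 512KiB chunk, in 512-byte sectors */
            uint64_t data_disks     = 3;      /* raid_disks - max_degraded */
            uint64_t stripe_sectors = chunk_sectors * data_disks;

            uint64_t req_start = 5000, req_len = 20000;  /* an arbitrary discard */
            uint64_t req_end   = req_start + req_len;

            /* round the start up and the end down to full-stripe boundaries */
            uint64_t first = (req_start + stripe_sectors - 1) / stripe_sectors
                             * stripe_sectors;
            uint64_t last  = req_end / stripe_sectors * stripe_sectors;

            if (first >= last)
                    printf("request too small or unaligned, nothing discarded\n");
            else
                    printf("discard full stripes covering sectors [%llu, %llu)\n",
                           (unsigned long long)first, (unsigned long long)last);
            return 0;
    }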

Since we can only handle requests with this specific alignment and size
(or the part of a request that covers whole stripes), we can't
guarantee zero_after_discard for the array even when zero_after_discard
is true in the low-level drives.

The block layer doesn't send down correctly aligned requests even when
the correct discard alignment is set, so I must filter out the
unaligned parts.

For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So
if zero_after_discard is true for all disks, the data is consistent
after a discard; otherwise data might be lost. Consider this scenario:
discard a stripe, then write data to one disk and write the parity
disk. Until then the stripe could still be inconsistent, depending on
whether data from the other data disks or from the parity disk is used
to calculate the new parity. If a disk is then broken, we can't restore
its data. So in this patch we only enable discard support if all disks
have zero_after_discard.
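
As a toy illustration of "if data is 0, parity is 0" (a hedged
userspace example, not the raid5 syndrome code): when every data block
is zeroed, as zero_after_discard promises, the XOR parity is all zeros
too, so the parity block can simply be zeroed/discarded instead of
being recomputed:

    #include <stdio.h>
    #include <string.h>

    #define BLK 16

    int main(void)
    {
            unsigned char d0[BLK], d1[BLK], d2[BLK], p[BLK];
            int i, nonzero = 0;

            memset(d0, 0, BLK);     /* data blocks after a trusted discard */
            memset(d1, 0, BLK);
            memset(d2, 0, BLK);

            for (i = 0; i < BLK; i++) {
                    p[i] = d0[i] ^ d1[i] ^ d2[i];   /* raid5 XOR parity */
                    nonzero |= p[i];
            }
            printf("parity block is %s\n", nonzero ? "non-zero" : "all zeros");
            return 0;
    }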

If a discard fails on one disk, we face a similar inconsistency issue
to the one above. The patch makes discard follow the same path as a
normal write request: if the discard fails, a resync is scheduled to
make the data consistent. Having extra writes isn't good, but data
consistency is more important.

If a subsequent read/write request hits the raid5 cache of a discarded
stripe, the discarded dev page must be zero-filled so the data stays
consistent. This patch always zeroes the dev page for a discarded
stripe. This isn't optimal, because a discard request doesn't need such
a payload; the next patch will avoid it.
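
A toy sketch of this cache behaviour (assumed names, not the kernel
structures): once the discarded stripe's cached dev page is
zero-filled, a later read served from the stripe cache returns the same
zeros the discarded device itself would return:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SZ 4096

    static unsigned char dev_page[PAGE_SZ];   /* stand-in for sh->dev[i].page */

    static void drain_discard(void)
    {
            /* mirrors the zero-fill ops_run_biodrain() now does for REQ_DISCARD */
            memset(dev_page, 0, PAGE_SZ);
    }

    static void read_from_cache(unsigned char *buf, size_t off, size_t len)
    {
            memcpy(buf, dev_page + off, len);  /* later read hits the cache */
    }

    int main(void)
    {
            unsigned char buf[8];

            memset(dev_page, 0xab, PAGE_SZ);   /* stale data before the discard */
            drain_discard();
            read_from_cache(buf, 100, sizeof(buf));
            printf("first cached byte after discard: 0x%02x\n", buf[0]);
            return 0;
    }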

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
drivers/md/raid5.c
drivers/md/raid5.h

index 474d0d8b2296bc233f7a4d5a02c9cbb9454cc463..8a82169660abb0fac3a405c528df5a23eae562d6 100644 (file)
@@ -577,6 +577,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                                rw = WRITE_FUA;
                        else
                                rw = WRITE;
+                       if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+                               rw |= REQ_DISCARD;
                } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
                        rw = READ;
                else if (test_and_clear_bit(R5_WantReplace,
@@ -1200,8 +1202,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                                        set_bit(R5_WantFUA, &dev->flags);
                                if (wbi->bi_rw & REQ_SYNC)
                                        set_bit(R5_SyncIO, &dev->flags);
-                               tx = async_copy_data(1, wbi, dev->page,
-                                       dev->sector, tx);
+                               if (wbi->bi_rw & REQ_DISCARD) {
+                                       memset(page_address(dev->page), 0,
+                                               STRIPE_SECTORS << 9);
+                                       set_bit(R5_Discard, &dev->flags);
+                               } else
+                                       tx = async_copy_data(1, wbi, dev->page,
+                                               dev->sector, tx);
                                wbi = r5_next_bio(wbi, dev->sector);
                        }
                }
@@ -1267,6 +1274,20 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
 
+       for (i = 0; i < sh->disks; i++) {
+               if (pd_idx == i)
+                       continue;
+               if (!test_bit(R5_Discard, &sh->dev[i].flags))
+                       break;
+       }
+       if (i >= sh->disks) {
+               atomic_inc(&sh->count);
+               memset(page_address(sh->dev[pd_idx].page), 0,
+                       STRIPE_SECTORS << 9);
+               set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+               ops_complete_reconstruct(sh);
+               return;
+       }
        /* check if prexor is active which means only process blocks
         * that are part of a read-modify-write (written)
         */
@@ -1311,10 +1332,28 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 {
        struct async_submit_ctl submit;
        struct page **blocks = percpu->scribble;
-       int count;
+       int count, i;
 
        pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
+       for (i = 0; i < sh->disks; i++) {
+               if (sh->pd_idx == i || sh->qd_idx == i)
+                       continue;
+               if (!test_bit(R5_Discard, &sh->dev[i].flags))
+                       break;
+       }
+       if (i >= sh->disks) {
+               atomic_inc(&sh->count);
+               memset(page_address(sh->dev[sh->pd_idx].page), 0,
+                       STRIPE_SECTORS << 9);
+               memset(page_address(sh->dev[sh->qd_idx].page), 0,
+                       STRIPE_SECTORS << 9);
+               set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+               set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+               ops_complete_reconstruct(sh);
+               return;
+       }
+
        count = set_syndrome_sources(blocks, sh);
 
        atomic_inc(&sh->count);
@@ -4127,6 +4166,88 @@ static void release_stripe_plug(struct mddev *mddev,
                release_stripe(sh);
 }
 
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+       struct r5conf *conf = mddev->private;
+       sector_t logical_sector, last_sector;
+       struct stripe_head *sh;
+       int remaining;
+       int stripe_sectors;
+
+       if (mddev->reshape_position != MaxSector)
+               /* Skip discard while reshape is happening */
+               return;
+
+       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+       bi->bi_next = NULL;
+       bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+       stripe_sectors = conf->chunk_sectors *
+               (conf->raid_disks - conf->max_degraded);
+       logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+                                              stripe_sectors);
+       sector_div(last_sector, stripe_sectors);
+
+       logical_sector *= conf->chunk_sectors;
+       last_sector *= conf->chunk_sectors;
+
+       for (;logical_sector < last_sector;
+            logical_sector += STRIPE_SECTORS) {
+               DEFINE_WAIT(w);
+               int d;
+       again:
+               sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+               prepare_to_wait(&conf->wait_for_overlap, &w,
+                               TASK_UNINTERRUPTIBLE);
+               spin_lock_irq(&sh->stripe_lock);
+               for (d = 0; d < conf->raid_disks; d++) {
+                       if (d == sh->pd_idx || d == sh->qd_idx)
+                               continue;
+                       if (sh->dev[d].towrite || sh->dev[d].toread) {
+                               set_bit(R5_Overlap, &sh->dev[d].flags);
+                               spin_unlock_irq(&sh->stripe_lock);
+                               release_stripe(sh);
+                               schedule();
+                               goto again;
+                       }
+               }
+               finish_wait(&conf->wait_for_overlap, &w);
+               for (d = 0; d < conf->raid_disks; d++) {
+                       if (d == sh->pd_idx || d == sh->qd_idx)
+                               continue;
+                       sh->dev[d].towrite = bi;
+                       set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+                       raid5_inc_bi_active_stripes(bi);
+               }
+               spin_unlock_irq(&sh->stripe_lock);
+               if (conf->mddev->bitmap) {
+                       for (d = 0;
+                            d < conf->raid_disks - conf->max_degraded;
+                            d++)
+                               bitmap_startwrite(mddev->bitmap,
+                                                 sh->sector,
+                                                 STRIPE_SECTORS,
+                                                 0);
+                       sh->bm_seq = conf->seq_flush + 1;
+                       set_bit(STRIPE_BIT_DELAY, &sh->state);
+               }
+
+               set_bit(STRIPE_HANDLE, &sh->state);
+               clear_bit(STRIPE_DELAYED, &sh->state);
+               if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                       atomic_inc(&conf->preread_active_stripes);
+               release_stripe_plug(mddev, sh);
+       }
+
+       remaining = raid5_dec_bi_active_stripes(bi);
+       if (remaining == 0) {
+               md_write_end(mddev);
+               bio_endio(bi, 0);
+       }
+}
+
 static void make_request(struct mddev *mddev, struct bio * bi)
 {
        struct r5conf *conf = mddev->private;
@@ -4149,6 +4270,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
             chunk_aligned_read(mddev,bi))
                return;
 
+       if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+               make_discard_request(mddev, bi);
+               return;
+       }
+
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        last_sector = bi->bi_sector + (bi->bi_size>>9);
        bi->bi_next = NULL;
@@ -5729,6 +5855,7 @@ static int run(struct mddev *mddev)
 
        if (mddev->queue) {
                int chunk_size;
+               bool discard_supported = true;
                /* read-ahead size must cover two whole stripes, which
                 * is 2 * (datadisks) * chunksize where 'n' is the
                 * number of raid devices
@@ -5748,13 +5875,48 @@ static int run(struct mddev *mddev)
                blk_queue_io_min(mddev->queue, chunk_size);
                blk_queue_io_opt(mddev->queue, chunk_size *
                                 (conf->raid_disks - conf->max_degraded));
+               /*
+                * We can only discard a whole stripe. It doesn't make sense to
+                * discard data disk but write parity disk
+                */
+               stripe = stripe * PAGE_SIZE;
+               mddev->queue->limits.discard_alignment = stripe;
+               mddev->queue->limits.discard_granularity = stripe;
+               /*
+                * unaligned part of discard request will be ignored, so can't
+                * guarantee discard_zeroes_data
+                */
+               mddev->queue->limits.discard_zeroes_data = 0;
 
                rdev_for_each(rdev, mddev) {
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->data_offset << 9);
                        disk_stack_limits(mddev->gendisk, rdev->bdev,
                                          rdev->new_data_offset << 9);
+                       /*
+                        * discard_zeroes_data is required, otherwise data
+                        * could be lost. Consider a scenario: discard a stripe
+                        * (the stripe could be inconsistent if
+                        * discard_zeroes_data is 0); write one disk of the
+                        * stripe (the stripe could be inconsistent again
+                        * depending on which disks are used to calculate
+                        * parity); the disk is broken; The stripe data of this
+                        * disk is lost.
+                        */
+                       if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+                           !bdev_get_queue(rdev->bdev)->
+                                               limits.discard_zeroes_data)
+                               discard_supported = false;
                }
+
+               if (discard_supported &&
+                  mddev->queue->limits.max_discard_sectors >= stripe &&
+                  mddev->queue->limits.discard_granularity >= stripe)
+                       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+                                               mddev->queue);
+               else
+                       queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+                                               mddev->queue);
        }
 
        return 0;
index 813b6c18f2501211d407dd446f41ded9f0d01ca8..2afd8358556b6989668add1e25c187b9caa6b1ed 100644 (file)
@@ -299,6 +299,7 @@ enum r5dev_flags {
        R5_WantReplace, /* We need to update the replacement, we have read
                         * data in, and now is a good time to write it out.
                         */
+       R5_Discard,     /* Discard the stripe */
 };
 
 /*