git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - drivers/md/raid5.c
md/raid5: write errors should be recorded as bad blocks if possible.
[karo-tx-linux.git] / drivers / md / raid5.c
index 0cd591472e1f863137749d5027e7c3ba3d71c4cb..9768a7d67148225ccafad99f473c1d4bc107729a 100644 (file)
@@ -340,7 +340,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                               (unsigned long long)sh->sector, i, dev->toread,
                               dev->read, dev->towrite, dev->written,
                               test_bit(R5_LOCKED, &dev->flags));
-                       BUG();
+                       WARN_ON(1);
                }
                dev->flags = 0;
                raid5_build_block(sh, i, previous);
@@ -547,10 +547,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        bi->bi_io_vec[0].bv_offset = 0;
                        bi->bi_size = STRIPE_SIZE;
                        bi->bi_next = NULL;
-                       if ((rw & WRITE) &&
-                           test_bit(R5_ReWrite, &sh->dev[i].flags))
-                               atomic_add(STRIPE_SECTORS,
-                                       &rdev->corrected_errors);
                        generic_make_request(bi);
                } else {
                        if (rw & WRITE)
@@ -1590,6 +1586,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
                                (unsigned long long)(sh->sector
                                                     + rdev->data_offset),
                                bdevname(rdev->bdev, b));
+                       atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
                        clear_bit(R5_ReadError, &sh->dev[i].flags);
                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
                }
@@ -1661,8 +1658,10 @@ static void raid5_end_write_request(struct bio *bi, int error)
                return;
        }
 
-       if (!uptodate)
-               md_error(conf->mddev, conf->disks[i].rdev);
+       if (!uptodate) {
+               set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
+               set_bit(R5_WriteError, &sh->dev[i].flags);
+       }
 
        rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
        
@@ -1709,6 +1708,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
                 */
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
        }
+       set_bit(Blocked, &rdev->flags);
        set_bit(Faulty, &rdev->flags);
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
        printk(KERN_ALERT
@@ -2234,9 +2234,18 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
                        rcu_read_lock();
                        rdev = rcu_dereference(conf->disks[i].rdev);
                        if (rdev && test_bit(In_sync, &rdev->flags))
-                               /* multiple read failures in one stripe */
-                               md_error(conf->mddev, rdev);
+                               atomic_inc(&rdev->nr_pending);
+                       else
+                               rdev = NULL;
                        rcu_read_unlock();
+                       if (rdev) {
+                               if (!rdev_set_badblocks(
+                                           rdev,
+                                           sh->sector,
+                                           STRIPE_SECTORS, 0))
+                                       md_error(conf->mddev, rdev);
+                               rdev_dec_pending(rdev, conf->mddev);
+                       }
                }
                spin_lock_irq(&conf->device_lock);
                /* fail all writes first */
@@ -2304,6 +2313,10 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
                if (bitmap_end)
                        bitmap_endwrite(conf->mddev->bitmap, sh->sector,
                                        STRIPE_SECTORS, 0, 0);
+               /* If we were in the middle of a write the parity block might
+                * still be locked - so just clear all R5_LOCKED flags
+                */
+               clear_bit(R5_LOCKED, &sh->dev[i].flags);
        }
 
        if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
@@ -2311,6 +2324,41 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
                        md_wakeup_thread(conf->mddev->thread);
 }
 
+static void
+handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
+                  struct stripe_head_state *s)
+{
+       int abort = 0;
+       int i;
+
+       md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
+       clear_bit(STRIPE_SYNCING, &sh->state);
+       s->syncing = 0;
+       /* Called by handle_stripe() when a syncing stripe has more failed
+        * devices than max_degraded.  The steps above are all that is done for
+        * sync/check/repair; for recover, record a bad block on every
+        * device that is not in-sync, or abort the recovery */
+       if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
+               return;
+       /* During recovery devices cannot be removed, so conf->disks[i].rdev
+        * is read below without rcu locking or an nr_pending reference
+        */
+       for (i = 0; i < conf->raid_disks; i++) {
+               mdk_rdev_t *rdev = conf->disks[i].rdev;
+               if (!rdev
+                   || test_bit(Faulty, &rdev->flags)
+                   || test_bit(In_sync, &rdev->flags))
+                       continue; /* slot empty, device failed, or in sync */
+               if (!rdev_set_badblocks(rdev, sh->sector,
+                                       STRIPE_SECTORS, 0))
+                       abort = 1; /* presumably: bad block not recorded */
+       }
+       if (abort) { /* some rdev_set_badblocks() call returned 0 */
+               conf->recovery_disabled = conf->mddev->recovery_disabled;
+               set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
+       }
+}
+
 /* fetch_block - checks the given member device to see if its data needs
  * to be read or computed to satisfy a request.
  *
@@ -2921,6 +2969,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
        spin_lock_irq(&conf->device_lock);
        for (i=disks; i--; ) {
                mdk_rdev_t *rdev;
+               sector_t first_bad;
+               int bad_sectors;
+               int is_bad = 0;
 
                dev = &sh->dev[i];
 
@@ -2957,21 +3008,46 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
                if (dev->written)
                        s->written++;
                rdev = rcu_dereference(conf->disks[i].rdev);
-               if (s->blocked_rdev == NULL &&
-                   rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-                       s->blocked_rdev = rdev;
-                       atomic_inc(&rdev->nr_pending);
+               if (rdev) {
+                       is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+                                            &first_bad, &bad_sectors);
+                       if (s->blocked_rdev == NULL
+                           && (test_bit(Blocked, &rdev->flags)
+                               || is_bad < 0)) {
+                               if (is_bad < 0)
+                                       set_bit(BlockedBadBlocks,
+                                               &rdev->flags);
+                               s->blocked_rdev = rdev;
+                               atomic_inc(&rdev->nr_pending);
+                       }
                }
                clear_bit(R5_Insync, &dev->flags);
                if (!rdev)
                        /* Not in-sync */;
-               else if (test_bit(In_sync, &rdev->flags))
+               else if (is_bad) {
+                       /* also not in-sync */
+                       if (!test_bit(WriteErrorSeen, &rdev->flags)) {
+                               /* treat as in-sync, but with a read error
+                                * which we can now try to correct
+                                */
+                               set_bit(R5_Insync, &dev->flags);
+                               set_bit(R5_ReadError, &dev->flags);
+                       }
+               } else if (test_bit(In_sync, &rdev->flags))
                        set_bit(R5_Insync, &dev->flags);
                else {
                        /* in sync if before recovery_offset */
                        if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
                                set_bit(R5_Insync, &dev->flags);
                }
+               if (test_bit(R5_WriteError, &dev->flags)) {
+                       clear_bit(R5_Insync, &dev->flags);
+                       if (!test_bit(Faulty, &rdev->flags)) {
+                               s->handle_bad_blocks = 1;
+                               atomic_inc(&rdev->nr_pending);
+                       } else
+                               clear_bit(R5_WriteError, &dev->flags);
+               }
                if (!test_bit(R5_Insync, &dev->flags)) {
                        /* The ReadError flag will just be confusing now */
                        clear_bit(R5_ReadError, &dev->flags);
@@ -3020,6 +3096,11 @@ static void handle_stripe(struct stripe_head *sh)
 
        analyse_stripe(sh, &s);
 
+       if (s.handle_bad_blocks) {
+               set_bit(STRIPE_HANDLE, &sh->state);
+               goto finish;
+       }
+
        if (unlikely(s.blocked_rdev)) {
                if (s.syncing || s.expanding || s.expanded ||
                    s.to_write || s.written) {
@@ -3045,11 +3126,8 @@ static void handle_stripe(struct stripe_head *sh)
         */
        if (s.failed > conf->max_degraded && s.to_read+s.to_write+s.written)
                handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
-       if (s.failed > conf->max_degraded && s.syncing) {
-               md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
-               clear_bit(STRIPE_SYNCING, &sh->state);
-               s.syncing = 0;
-       }
+       if (s.failed > conf->max_degraded && s.syncing)
+               handle_failed_sync(conf, sh, &s);
 
        /*
         * might be able to return some write requests if the parity blocks
@@ -3220,6 +3298,20 @@ finish:
        if (unlikely(s.blocked_rdev))
                md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
 
+       if (s.handle_bad_blocks)
+               for (i = disks; i--; ) {
+                       mdk_rdev_t *rdev;
+                       struct r5dev *dev = &sh->dev[i];
+                       if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
+                               /* We own a safe reference to the rdev */
+                               rdev = conf->disks[i].rdev;
+                               if (!rdev_set_badblocks(rdev, sh->sector,
+                                                       STRIPE_SECTORS, 0))
+                                       md_error(conf->mddev, rdev);
+                               rdev_dec_pending(rdev, conf->mddev);
+                       }
+               }
+
        if (s.ops_request)
                raid_run_ops(sh, s.ops_request);
 
@@ -3469,6 +3561,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
        rcu_read_lock();
        rdev = rcu_dereference(conf->disks[dd_idx].rdev);
        if (rdev && test_bit(In_sync, &rdev->flags)) {
+               sector_t first_bad;
+               int bad_sectors;
+
                atomic_inc(&rdev->nr_pending);
                rcu_read_unlock();
                raid_bio->bi_next = (void*)rdev;
@@ -3476,8 +3571,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
                align_bi->bi_sector += rdev->data_offset;
 
-               if (!bio_fits_rdev(align_bi)) {
-                       /* too big in some way */
+               if (!bio_fits_rdev(align_bi) ||
+                   is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+                               &first_bad, &bad_sectors)) {
+                       /* too big in some way, or has a known bad block */
                        bio_put(align_bi);
                        rdev_dec_pending(rdev, mddev);
                        return 0;
@@ -4142,6 +4239,9 @@ static void raid5d(mddev_t *mddev)
                release_stripe(sh);
                cond_resched();
 
+               if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+                       md_check_recovery(mddev);
+
                spin_lock_irq(&conf->device_lock);
        }
        pr_debug("%d stripes handled\n", handled);
@@ -4946,6 +5046,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
                 * isn't possible.
                 */
                if (!test_bit(Faulty, &rdev->flags) &&
+                   mddev->recovery_disabled != conf->recovery_disabled &&
                    !has_failed(conf) &&
                    number < conf->raid_disks) {
                        err = -EBUSY;
@@ -4974,6 +5075,9 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
        int first = 0;
        int last = conf->raid_disks - 1;
 
+       if (mddev->recovery_disabled == conf->recovery_disabled)
+               return -EBUSY;
+
        if (has_failed(conf))
                /* no point adding a device */
                return -EINVAL;