From: NeilBrown Date: Tue, 26 Jul 2011 01:34:20 +0000 (+1000) Subject: md/raid5: replace sh->lock with an 'active' flag. X-Git-Url: https://git.karo-electronics.de/?a=commitdiff_plain;h=c4c1663be46b2ab94e59d3e0c583a8f6b188ff0c;p=linux-beck.git md/raid5: replace sh->lock with an 'active' flag. sh->lock is now mainly used to ensure that two threads aren't running in the locked part of handle_stripe[56] at the same time. That can more neatly be achieved with an 'active' flag which we set while running handle_stripe. If we find the flag is set, we simply requeue the stripe for later by setting STRIPE_HANDLE. For safety we take ->device_lock while examining the state of the stripe and creating a summary in 'stripe_head_state / r6_state'. This possibly isn't needed but as shared fields like ->toread, ->towrite are checked it is safer for now at least. We leave the label after the old 'unlock' called "unlock" because it will disappear in a few patches, so renaming seems pointless. This leaves the stripe 'locked' for longer as we clear STRIPE_ACTIVE later, but that is not a problem. Signed-off-by: NeilBrown Reviewed-by: Namhyung Kim --- diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9985138f4c04..f8275b5a6fbe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1020,14 +1020,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; - spin_lock(&sh->lock); spin_lock_irq(&sh->raid_conf->device_lock); chosen = dev->towrite; dev->towrite = NULL; BUG_ON(dev->written); wbi = dev->written = chosen; spin_unlock_irq(&sh->raid_conf->device_lock); - spin_unlock(&sh->lock); while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { @@ -1322,7 +1320,6 @@ static int grow_one_stripe(raid5_conf_t *conf) return 0; sh->raid_conf = conf; - spin_lock_init(&sh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&sh->ops.wait_for_ops); #endif @@ -1442,7 +1439,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) break; nsh->raid_conf = conf; - spin_lock_init(&nsh->lock); #ifdef CONFIG_MULTICORE_RAID456 init_waitqueue_head(&nsh->ops.wait_for_ops); #endif @@ -2148,7 +2144,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in (unsigned long long)sh->sector); - spin_lock(&sh->lock); spin_lock_irq(&conf->device_lock); if (forwrite) { bip = &sh->dev[dd_idx].towrite; @@ -2184,7 +2179,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } spin_unlock_irq(&conf->device_lock); - spin_unlock(&sh->lock); pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)(*bip)->bi_sector, @@ -2201,7 +2195,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in overlap: set_bit(R5_Overlap, &sh->dev[dd_idx].flags); spin_unlock_irq(&conf->device_lock); - spin_unlock(&sh->lock); return 0; } @@ -3023,12 +3016,10 @@ static void handle_stripe5(struct stripe_head *sh) atomic_read(&sh->count), sh->pd_idx, sh->check_state, sh->reconstruct_state); - spin_lock(&sh->lock); if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); } - clear_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); s.syncing = test_bit(STRIPE_SYNCING, &sh->state); @@ -3037,6 +3028,7 @@ static void handle_stripe5(struct stripe_head *sh) /* Now to look around and see what can be done */ rcu_read_lock(); + spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { mdk_rdev_t *rdev; @@ -3099,6 +3091,7 @@ static void handle_stripe5(struct stripe_head *sh) s.failed_num = i; } } + spin_unlock_irq(&conf->device_lock); rcu_read_unlock(); if (unlikely(blocked_rdev)) { @@ -3275,7 +3268,6 @@ static void handle_stripe5(struct stripe_head *sh) handle_stripe_expansion(conf, sh, NULL); unlock: - spin_unlock(&sh->lock); /* wait for this device to become unblocked */ if (unlikely(blocked_rdev)) @@ -3318,12 +3310,10 @@ static void handle_stripe6(struct stripe_head *sh) sh->check_state, sh->reconstruct_state); memset(&s, 0, sizeof(s)); - spin_lock(&sh->lock); if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); } - clear_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); s.syncing = test_bit(STRIPE_SYNCING, &sh->state); @@ -3332,6 +3322,7 @@ static void handle_stripe6(struct stripe_head *sh) /* Now to look around and see what can be done */ rcu_read_lock(); + spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; @@ -3395,6 +3386,7 @@ static void handle_stripe6(struct stripe_head *sh) s.failed++; } } + spin_unlock_irq(&conf->device_lock); rcu_read_unlock(); if (unlikely(blocked_rdev)) { @@ -3580,7 +3572,6 @@ static void handle_stripe6(struct stripe_head *sh) handle_stripe_expansion(conf, sh, &r6s); unlock: - spin_unlock(&sh->lock); /* wait for this device to become unblocked */ if (unlikely(blocked_rdev)) @@ -3608,10 +3599,19 @@ static void handle_stripe6(struct stripe_head *sh) static void handle_stripe(struct stripe_head *sh) { + clear_bit(STRIPE_HANDLE, &sh->state); + if (test_and_set_bit(STRIPE_ACTIVE, &sh->state)) { + /* already being handled, ensure it gets handled + * again when current action finishes */ + set_bit(STRIPE_HANDLE, &sh->state); + return; + } + if (sh->raid_conf->level == 6) handle_stripe6(sh); else handle_stripe5(sh); + clear_bit(STRIPE_ACTIVE, &sh->state); } static void raid5_activate_delayed(raid5_conf_t *conf) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a33001137bf8..bb246d9e0547 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -6,11 +6,11 @@ /* * - * Each stripe contains one buffer per disc. Each buffer can be in + * Each stripe contains one buffer per device. Each buffer can be in * one of a number of states stored in "flags". Changes between - * these states happen *almost* exclusively under a per-stripe - * spinlock. Some very specific changes can happen in bi_end_io, and - * these are not protected by the spin lock. + * these states happen *almost* exclusively under the protection of the + * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and + * these are not protected by STRIPE_ACTIVE. * * The flag bits that are used to represent these states are: * R5_UPTODATE and R5_LOCKED @@ -76,12 +76,10 @@ * block and the cached buffer are successfully written, any buffer on * a written list can be returned with b_end_io. * - * The write list and read list both act as fifos. The read list is - * protected by the device_lock. The write and written lists are - * protected by the stripe lock. The device_lock, which can be - * claimed while the stipe lock is held, is only for list - * manipulations and will only be held for a very short time. It can - * be claimed from interrupts. + * The write list and read list both act as fifos. The read list, + * write list and written list are protected by the device_lock. + * The device_lock is only for list manipulations and will only be + * held for a very short time. It can be claimed from interrupts. * * * Stripes in the stripe cache can be on one of two lists (or on @@ -96,7 +94,6 @@ * * The inactive_list, handle_list and hash bucket lists are all protected by the * device_lock. - * - stripes on the inactive_list never have their stripe_lock held. * - stripes have a reference counter. If count==0, they are on a list. * - If a stripe might need handling, STRIPE_HANDLE is set. * - When refcount reaches zero, then if STRIPE_HANDLE it is put on @@ -116,10 +113,10 @@ * attach a request to an active stripe (add_stripe_bh()) * lockdev attach-buffer unlockdev * handle a stripe (handle_stripe()) - * lockstripe clrSTRIPE_HANDLE ... + * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... * (lockdev check-buffers unlockdev) .. * change-state .. - * record io/ops needed unlockstripe schedule io/ops + * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops * release an active stripe (release_stripe()) * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev * @@ -128,8 +125,7 @@ * on a cached buffer, and plus one if the stripe is undergoing stripe * operations. * - * Stripe operations are performed outside the stripe lock, - * the stripe operations are: + * The stripe operations are: * -copying data between the stripe cache and user application buffers * -computing blocks to save a disk access, or to recover a missing block * -updating the parity on a write operation (reconstruct write and @@ -159,7 +155,8 @@ */ /* - * Operations state - intermediate states that are visible outside of sh->lock + * Operations state - intermediate states that are visible outside of + * STRIPE_ACTIVE. * In general _idle indicates nothing is running, _run indicates a data * processing operation is active, and _result means the data processing result * is stable and can be acted upon. For simple operations like biofill and @@ -209,7 +206,6 @@ struct stripe_head { short ddf_layout;/* use DDF ordering to calculate Q */ unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ - spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ enum check_states check_state; @@ -240,7 +236,7 @@ struct stripe_head { }; /* stripe_head_state - collects and tracks the dynamic state of a stripe_head - * for handle_stripe. It is only valid under spin_lock(sh->lock); + * for handle_stripe. */ struct stripe_head_state { int syncing, expanding, expanded; @@ -290,6 +286,7 @@ struct r6_state { * Stripe state */ enum { + STRIPE_ACTIVE, STRIPE_HANDLE, STRIPE_SYNC_REQUESTED, STRIPE_SYNCING, @@ -339,7 +336,7 @@ enum { * PREREAD_ACTIVE. * In stripe_handle, if we find pre-reading is necessary, we do it if * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. - * HANDLE gets cleared if stripe_handle leave nothing locked. + * HANDLE gets cleared if stripe_handle leaves nothing locked. */