X-Git-Url: https://git.karo-electronics.de/?a=blobdiff_plain;f=drivers%2Fmd%2Fraid5.c;h=a33dac7c2e2fb143d674a48b8ab19280cdd36dd8;hb=b5e98d65d34a1c11a2135ea8a9b2619dbc7216c8;hp=d89a25e7c17bb0d50b0bc78bb290877ddb31ea8a;hpb=d84e0f10d38393f617227f0c831a99c69294651f;p=karo-tx-linux.git diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d89a25e7c17b..a33dac7c2e2f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1822,7 +1822,79 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } } +static int +handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +{ + int i, pd_idx = sh->pd_idx, disks = sh->disks; + int locked = 0; + + if (rcw) { + /* if we are not expanding this is a proper write request, and + * there will be bios with new data to be drained into the + * stripe cache + */ + if (!expand) { + set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); + sh->ops.count++; + } + + set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + sh->ops.count++; + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + if (!expand) + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } else { + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); + set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + + sh->ops.count += 3; + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (i == pd_idx) + continue; + + /* For a read-modify write there may be blocks that are + * locked for reading while others are ready to be + * written so we distinguish these blocks by the + * R5_Wantprexor bit + */ + if (dev->towrite && + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantprexor, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + 
locked++; + } + } + } + + /* keep the parity disk locked while asynchronous operations + * are in flight + */ + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + locked++; + + pr_debug("%s: stripe %llu locked: %d pending: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + locked, sh->ops.pending); + + return locked; +} /* * Each stripe/dev can have one or more bion attached. @@ -1977,9 +2049,12 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, bi = bi2; } - /* fail any reads if this device is non-operational */ - if (!test_bit(R5_Insync, &sh->dev[i].flags) || - test_bit(R5_ReadError, &sh->dev[i].flags)) { + /* fail any reads if this device is non-operational and + * the data has not reached the cache yet. + */ + if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && + (!test_bit(R5_Insync, &sh->dev[i].flags) || + test_bit(R5_ReadError, &sh->dev[i].flags))) { bi = sh->dev[i].toread; sh->dev[i].toread = NULL; if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) @@ -2005,36 +2080,101 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, } +/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks + * to process + */ +static int __handle_issuing_new_read_requests5(struct stripe_head *sh, + struct stripe_head_state *s, int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; + struct r5dev *failed_dev = &sh->dev[s->failed_num]; + + /* don't schedule compute operations or reads on the parity block while + * a check is in flight + */ + if ((disk_idx == sh->pd_idx) && + test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + return ~0; + + /* is the data in this block needed, and can we get it? 
*/ + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || (s->failed && + (failed_dev->toread || (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags) + ))))) { + /* 1/ We would like to get this block, possibly by computing it, + * but we might not be able to. + * + * 2/ Since parity check operations potentially make the parity + * block !uptodate it will need to be refreshed before any + * compute operations on data disks are scheduled. + * + * 3/ We hold off parity block re-reads until check operations + * have quiesced. + */ + if ((s->uptodate == disks - 1) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + set_bit(R5_Wantcompute, &dev->flags); + sh->ops.target = disk_idx; + s->req_compute = 1; + sh->ops.count++; + /* Careful: from this point on 'uptodate' is in the eye + * of raid5_run_ops which services 'compute' operations + * before writes. R5_Wantcompute flags a block that will + * be R5_UPTODATE by the time it is needed for a + * subsequent operation. 
+ */ + s->uptodate++; + return 0; /* uptodate + compute == disks */ + } else if ((s->uptodate < disks - 1) && + test_bit(R5_Insync, &dev->flags)) { + /* Note: we hold off compute operations while checks are + * in flight, but we still prefer 'compute' over 'read' + * hence we only read if (uptodate < disks-1) + */ + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", disk_idx, + s->syncing); + } + } + + return ~0; +} + static void handle_issuing_new_read_requests5(struct stripe_head *sh, struct stripe_head_state *s, int disks) { int i; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->failed && (sh->dev[s->failed_num].toread || - (sh->dev[s->failed_num].towrite && - !test_bit(R5_OVERWRITE, &sh->dev[s->failed_num].flags)) - )))) { - /* we would like to get this block, possibly - * by computing it, but we might not be able to - */ - if (s->uptodate == disks-1) { - pr_debug("Computing block %d\n", i); - compute_block(sh, i); - s->uptodate++; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - pr_debug("Reading block %d (sync=%d)\n", - i, s->syncing); - } - } + + /* Clear completed compute operations. 
Parity recovery + * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled + * later on in this routine + */ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + } + + /* look for blocks to read/compute, skip this if a compute + * is already in flight, or if the stripe contents are in the + * midst of changing due to a write + */ + if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + for (i = disks; i--; ) + if (__handle_issuing_new_read_requests5( + sh, s, i, disks) == 0) + break; } set_bit(STRIPE_HANDLE, &sh->state); } @@ -2151,7 +2291,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags)) { + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rmw++; else @@ -2160,9 +2301,9 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, /* Would I have to read this buffer for reconstruct_write */ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags)) { - if (test_bit(R5_Insync, &dev->flags)) - rcw++; + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + if (test_bit(R5_Insync, &dev->flags)) rcw++; else rcw += 2*disks; } @@ -2176,7 +2317,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && + 
!(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -2198,7 +2340,8 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -2216,28 +2359,18 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, /* now if nothing is locked, and if we have enough data, * we can start a write request */ - if (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - pr_debug("Computing parity...\n"); - compute_parity5(sh, rcw == 0 ? - RECONSTRUCT_WRITE : READ_MODIFY_WRITE); - /* now every locked buffer is ready to be written */ - for (i = disks; i--; ) - if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { - pr_debug("Writing block %d\n", i); - s->locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - if (!test_bit(R5_Insync, &sh->dev[i].flags) - || (i == sh->pd_idx && s->failed == 0)) - set_bit(STRIPE_INSYNC, &sh->state); - } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - } + /* since handle_stripe can be called at any time we need to handle the + * case where a compute block operation has been submitted and then a + * subsequent call wants to start a write request. raid5_run_ops only + * handles the case where compute block and postxor are requested + * simultaneously. If this is not the case then new writes need to be + * held off until the compute completes. 
+ */ + if ((s->req_compute || + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) + s->locked += handle_write_operations5(sh, rcw == 0, 0); } static void handle_issuing_new_write_requests6(raid5_conf_t *conf, @@ -2341,26 +2474,67 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { set_bit(STRIPE_HANDLE, &sh->state); - if (s->failed == 0) { - BUG_ON(s->uptodate != disks); - compute_parity5(sh, CHECK_PARITY); - s->uptodate--; - if (page_is_zero(sh->dev[sh->pd_idx].page)) { - /* parity is correct (on disc, not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ + /* Take one of the following actions: + * 1/ start a check parity operation if (uptodate == disks) + * 2/ finish a check parity operation and act on the result + * 3/ skip to the writeback section if we previously + * initiated a recovery operation + */ + if (s->failed == 0 && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + BUG_ON(s->uptodate != disks); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + sh->ops.count++; + s->uptodate--; + } else if ( + test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { + clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); + clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ set_bit(STRIPE_INSYNC, &sh->state); else { - compute_block(sh, sh->pd_idx); - s->uptodate++; + conf->mddev->resync_mismatches += + STRIPE_SECTORS; + if (test_bit( + MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! 
*/ + set_bit(STRIPE_INSYNC, &sh->state); + else { + set_bit(STRIPE_OP_COMPUTE_BLK, + &sh->ops.pending); + set_bit(STRIPE_OP_MOD_REPAIR_PD, + &sh->ops.pending); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + sh->ops.count++; + s->uptodate++; + } } } } - if (!test_bit(STRIPE_INSYNC, &sh->state)) { + + /* check if we can clear a parity disk reconstruct */ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && + test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + + clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + } + + /* Wait for check parity and compute block operations to complete + * before write-back + */ + if (!test_bit(STRIPE_INSYNC, &sh->state) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { struct r5dev *dev; /* either failed parity check, or recovery is happening */ if (s->failed == 0) @@ -2569,36 +2743,27 @@ static void handle_stripe5(struct stripe_head *sh) struct r5dev *dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - pr_debug("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read */ - if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - pr_debug("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { - copy_data(0, rbi, dev->page, dev->sector); - rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { - rbi->bi_next = return_bi; - return_bi = rbi; - } 
- spin_unlock_irq(&conf->device_lock); - rbi = rbi2; - } - } + pr_debug("check %d: state 0x%lx toread %p read %p write %p " + "written %p\n", i, dev->flags, dev->toread, dev->read, + dev->towrite, dev->written); + + /* maybe we can request a biofill operation + * + * new wantfill requests are only permitted while + * STRIPE_OP_BIOFILL is clear + */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && + !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + set_bit(R5_Wantfill, &dev->flags); /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; - if (dev->toread) + if (test_bit(R5_Wantfill, &dev->flags)) + s.to_fill++; + else if (dev->toread) s.to_read++; if (dev->towrite) { s.to_write++; @@ -2621,6 +2786,10 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(R5_Insync, &dev->flags); } rcu_read_unlock(); + + if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + sh->ops.count++; + pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d\n", s.locked, s.uptodate, s.to_read, s.to_write, @@ -2653,20 +2822,88 @@ static void handle_stripe5(struct stripe_head *sh) * or to load a block that is being partially written. 
*/ if (s.to_read || s.non_overwrite || - (s.syncing && (s.uptodate < disks)) || s.expanding) + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || + test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) handle_issuing_new_read_requests5(sh, &s, disks); - /* now to consider writing and what else, if anything should be read */ - if (s.to_write) + /* Now we check to see if any write operations have recently + * completed + */ + + /* leave prexor set until postxor is done, allows us to distinguish + * a rmw from a rcw during biodrain + */ + if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + + clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + + for (i = disks; i--; ) + clear_bit(R5_Wantprexor, &sh->dev[i].flags); + } + + /* if only POSTXOR is set then this is an 'expand' postxor */ + if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + + clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); + clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); + clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); + + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + + /* All the 'written' buffers and the parity block are ready to + * be written back to disk + */ + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + for (i = disks; i--; ) { + dev = &sh->dev[i]; + if (test_bit(R5_LOCKED, &dev->flags) && + (i == sh->pd_idx || dev->written)) { + pr_debug("Writing block %d\n", i); + set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit( + STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + if (!test_bit(R5_Insync, &dev->flags) || + (i == sh->pd_idx && s.failed == 0)) + set_bit(STRIPE_INSYNC, &sh->state); + } + } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + 
atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + } + + /* Now to consider new write requests and what else, if anything + * should be read. We do not handle new writes when: + * 1/ A 'write' operation (copy+xor) is already in flight. + * 2/ A 'check' operation is in flight, as it may clobber the parity + * block. + */ + if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) handle_issuing_new_write_requests5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe - * Any reads will already have been scheduled, so we just see if enough data - * is available + * Any reads will already have been scheduled, so we just see if enough + * data is available. The parity check is held off while parity + * dependent operations are in flight. */ - if (s.syncing && s.locked == 0 && - !test_bit(STRIPE_INSYNC, &sh->state)) + if ((s.syncing && s.locked == 0 && + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_INSYNC, &sh->state)) || + test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || + test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) handle_parity_checks5(conf, sh, &s, disks); + if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS,1); clear_bit(STRIPE_SYNCING, &sh->state);