From: NeilBrown Date: Tue, 11 Oct 2011 05:50:01 +0000 (+1100) Subject: md: add proper write-congestion reporting to RAID1 and RAID10. X-Git-Url: https://git.karo-electronics.de/?a=commitdiff_plain;h=34db0cd60f8a1f4ab73d118a8be3797c20388223;p=linux-beck.git md: add proper write-congestion reporting to RAID1 and RAID10. RAID1 and RAID10 handle write requests by queuing them for handling by a separate thread. This is because when a write-intent-bitmap is active we might need to update the bitmap first, so it is good to queue a lot of writes, then do one big bitmap update for them all. However, writeback requires devices to appear to be congested after a while so it can make some guesstimate of throughput. The infinite queue defeats that (note that RAID5 already has a finite queue so it doesn't suffer from this problem). So impose a limit on the number of pending write requests. By default it is 1024 which seems to be generally suitable. Make it configurable via module option just in case someone finds a regression. Signed-off-by: NeilBrown --- diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e023a25acf54..d8957d74fd25 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -45,6 +45,11 @@ */ #define NR_RAID1_BIOS 256 +/* When there are this many requests queue to be written by + * the raid1 thread, we become 'congested' to provide back-pressure + * for writeback. 
+ */ +static int max_queued_requests = 1024; static void allow_barrier(struct r1conf *conf); static void lower_barrier(struct r1conf *conf); @@ -598,6 +603,10 @@ int md_raid1_congested(struct mddev *mddev, int bits) struct r1conf *conf = mddev->private; int i, ret = 0; + if ((bits & (1 << BDI_async_congested)) && + conf->pending_count >= max_queued_requests) + return 1; + rcu_read_lock(); for (i = 0; i < mddev->raid_disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); @@ -638,10 +647,12 @@ static void flush_pending_writes(struct r1conf *conf) if (conf->pending_bio_list.head) { struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); + conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to * disk before proceeding w/ I/O */ bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; @@ -945,6 +956,11 @@ read_again: /* * WRITE: */ + if (conf->pending_count >= max_queued_requests) { + md_wakeup_thread(mddev->thread); + wait_event(conf->wait_barrier, + conf->pending_count < max_queued_requests); + } /* first select target devices under rcu_lock and * inc refcount on their rdev. 
Record them by setting * bios[x] to bio @@ -1108,6 +1124,7 @@ read_again: atomic_inc(&r1_bio->remaining); spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); } /* Mustn't call r1_bio_write_done before this next test, @@ -2418,6 +2435,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_barrier); bio_list_init(&conf->pending_bio_list); + conf->pending_count = 0; conf->last_used = -1; for (i = 0; i < conf->raid_disks; i++) { @@ -2776,3 +2794,5 @@ MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); MODULE_ALIAS("md-personality-3"); /* RAID1 */ MODULE_ALIAS("md-raid1"); MODULE_ALIAS("md-level-1"); + +module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 5b16d09817df..c732b6cce935 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -46,6 +46,7 @@ struct r1conf { /* queue pending writes to be submitted on unplug */ struct bio_list pending_bio_list; + int pending_count; /* for use when syncing mirrors: * We don't allow both normal IO and resync/recovery IO at diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8427ff1c5af1..9496463ca5df 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -58,6 +58,12 @@ */ #define NR_RAID10_BIOS 256 +/* When there are this many requests queue to be written by + * the raid10 thread, we become 'congested' to provide back-pressure + * for writeback. 
+ */ +static int max_queued_requests = 1024; + static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); @@ -681,6 +687,10 @@ static int raid10_congested(void *data, int bits) struct r10conf *conf = mddev->private; int i, ret = 0; + if ((bits & (1 << BDI_async_congested)) && + conf->pending_count >= max_queued_requests) + return 1; + if (mddev_congested(mddev, bits)) return 1; rcu_read_lock(); @@ -706,10 +716,12 @@ static void flush_pending_writes(struct r10conf *conf) if (conf->pending_bio_list.head) { struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); + conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to disk * before proceeding w/ I/O */ bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; @@ -996,6 +1008,11 @@ read_again: /* * WRITE: */ + if (conf->pending_count >= max_queued_requests) { + md_wakeup_thread(mddev->thread); + wait_event(conf->wait_barrier, + conf->pending_count < max_queued_requests); + } /* first select target devices under rcu_lock and * inc refcount on their rdev. 
Record them by setting * bios[x] to bio @@ -1129,6 +1146,7 @@ retry_write: atomic_inc(&r10_bio->remaining); spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); } @@ -3086,3 +3104,5 @@ MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ MODULE_ALIAS("md-raid10"); MODULE_ALIAS("md-level-10"); + +module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 35489a569597..7facfdf841f4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -42,7 +42,7 @@ struct r10conf { struct list_head retry_list; /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; - + int pending_count; spinlock_t resync_lock; int nr_pending;