From 8b5eecf752dfe953923c41ce5acb6c9e6054577c Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Sat, 18 Aug 2012 10:03:23 +1000
Subject: [PATCH] raid5: create multiple threads to handle stripes

This is a new attempt to make raid5 handle stripes in multiple threads,
as suggested by Neil, to get maximum flexibility and better NUMA
binding. It is basically a combination of my first and second
generation patches.

By default, no auxiliary threads are created (all stripes are handled
by raid5d). An example to enable multiple threads:

#echo 3 > /sys/block/md0/md/auxthread_number

This will create 3 auxiliary threads to handle stripes. The threads can
run on any CPU and handle stripes produced by any CPU.

#echo 1-3 > /sys/block/md0/md/auxth0/cpulist

This will bind auxiliary thread 0 to CPUs 1-3, and this thread will
only handle stripes produced by CPUs 1-3. A user-space tool can further
change the thread's CPU affinity, but the thread will only handle
stripes produced by CPUs 1-3 until the sysfs entry is changed again.

If stripes produced by a CPU aren't handled by any auxiliary thread,
they are handled by raid5d. Otherwise, raid5d doesn't handle any
stripes.

Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/md.c    |   8 +-
 drivers/md/md.h    |   8 +
 drivers/md/raid5.c | 408 ++++++++++++++++++++++++++++++++++++++++++---
 drivers/md/raid5.h |  19 +++
 4 files changed, 419 insertions(+), 24 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 289987e0b7e2..d56d3b690ea1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -640,9 +640,9 @@ static struct mddev * mddev_find(dev_t unit)
 	goto retry;
 }
 
-static inline int mddev_lock(struct mddev * mddev)
+int md_queue_misc_work(struct work_struct *work)
 {
-	return mutex_lock_interruptible(&mddev->reconfig_mutex);
+	return queue_work(md_misc_wq, work);
 }
 
 static inline int mddev_is_locked(struct mddev *mddev)
@@ -657,7 +657,7 @@ static inline int mddev_trylock(struct mddev * mddev)
 
 static struct attribute_group md_redundancy_group;
 
-static void mddev_unlock(struct mddev * mddev)
+void mddev_unlock(struct mddev * mddev)
 {
 	if (mddev->to_remove) {
 		/* These cannot be removed under reconfig_mutex as
@@ -8572,6 +8572,8 @@ EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
 EXPORT_SYMBOL(md_check_recovery);
+EXPORT_SYMBOL(mddev_unlock);
+EXPORT_SYMBOL(md_queue_misc_work);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("MD RAID framework");
 MODULE_ALIAS("md");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 93ada214b50e..dce301ced932 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -636,4 +636,12 @@ static inline int mddev_check_plugged(struct mddev *mddev)
 	return !!blk_check_plugged(md_unplug, mddev,
 				   sizeof(struct blk_plug_cb));
 }
+
+static inline int mddev_lock(struct mddev * mddev)
+{
+	return mutex_lock_interruptible(&mddev->reconfig_mutex);
+}
+extern void mddev_unlock(struct mddev *mddev);
+extern int md_queue_misc_work(struct work_struct *work);
+
 #endif /* _MD_MD_H */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 81c02d63440b..7c8151a073c1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -196,6 +196,21 @@ static int stripe_operations_active(struct stripe_head *sh)
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
+static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct raid5_percpu *percpu;
+	int i, orphaned = 1;
+
+	percpu = per_cpu_ptr(conf->percpu, sh->cpu);
+	for_each_cpu(i, &percpu->handle_threads) {
+		md_wakeup_thread(conf->aux_threads[i]->thread);
+		orphaned = 0;
+	}
+	if (orphaned)
+		md_wakeup_thread(conf->mddev->thread);
+}
+
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
 	BUG_ON(!list_empty(&sh->lru));
@@ -208,9 +223,19 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 			   sh->bm_seq - conf->seq_write > 0)
 			list_add_tail(&sh->lru, &conf->bitmap_list);
 		else {
+			int cpu = sh->cpu;
+			struct raid5_percpu *percpu;
+			if (!cpu_online(cpu)) {
+				cpu = cpumask_any(cpu_online_mask);
+				sh->cpu = cpu;
+			}
+			percpu = per_cpu_ptr(conf->percpu, cpu);
+
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			clear_bit(STRIPE_BIT_DELAY, &sh->state);
-			list_add_tail(&sh->lru, &conf->handle_list);
+			list_add_tail(&sh->lru, &percpu->handle_list);
+			raid5_wakeup_stripe_thread(sh);
+			return;
 		}
 		md_wakeup_thread(conf->mddev->thread);
 	} else {
@@ -355,6 +380,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 		raid5_build_block(sh, i, previous);
 	}
 	insert_hash(conf, sh);
+	sh->cpu = smp_processor_id();
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -3689,12 +3715,19 @@ static void raid5_activate_delayed(struct r5conf *conf)
 		while (!list_empty(&conf->delayed_list)) {
 			struct list_head *l = conf->delayed_list.next;
 			struct stripe_head *sh;
+			int cpu;
 			sh = list_entry(l, struct stripe_head, lru);
 			list_del_init(l);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			list_add_tail(&sh->lru, &conf->hold_list);
+			cpu = sh->cpu;
+			if (!cpu_online(cpu)) {
+				cpu = cpumask_any(cpu_online_mask);
+				sh->cpu = cpu;
+			}
+			raid5_wakeup_stripe_thread(sh);
 		}
 	}
 }
@@ -3968,18 +4001,29 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
  * head of the hold_list has changed, i.e. the head was promoted to the
  * handle_list.
  */
-static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
-{
-	struct stripe_head *sh;
-
+static struct stripe_head *__get_priority_stripe(struct r5conf *conf,
+						 cpumask_t *mask)
+{
+	struct stripe_head *sh = NULL, *tmp;
+	struct list_head *handle_list = NULL;
+	int cpu;
+
+	/* Should we take action to avoid starvation of latter CPUs ? */
+	for_each_cpu(cpu, mask) {
+		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+		if (!list_empty(&percpu->handle_list)) {
+			handle_list = &percpu->handle_list;
+			break;
+		}
+	}
 	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
 		  __func__,
-		  list_empty(&conf->handle_list) ? "empty" : "busy",
+		  !handle_list ? "empty" : "busy",
 		  list_empty(&conf->hold_list) ? "empty" : "busy",
 		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
 
-	if (!list_empty(&conf->handle_list)) {
-		sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+	if (handle_list) {
+		sh = list_entry(handle_list->next, typeof(*sh), lru);
 
 		if (list_empty(&conf->hold_list))
 			conf->bypass_count = 0;
@@ -3997,12 +4041,23 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
 		   ((conf->bypass_threshold &&
 		     conf->bypass_count > conf->bypass_threshold) ||
 		    atomic_read(&conf->pending_full_writes) == 0)) {
-		sh = list_entry(conf->hold_list.next,
-				typeof(*sh), lru);
-		conf->bypass_count -= conf->bypass_threshold;
-		if (conf->bypass_count < 0)
-			conf->bypass_count = 0;
-	} else
+
+		list_for_each_entry(tmp, &conf->hold_list, lru) {
+			if (cpumask_test_cpu(tmp->cpu, mask) ||
+			    !cpu_online(tmp->cpu)) {
+				sh = tmp;
+				break;
+			}
+		}
+
+		if (sh) {
+			conf->bypass_count -= conf->bypass_threshold;
+			if (conf->bypass_count < 0)
+				conf->bypass_count = 0;
+		}
+	}
+
+	if (!sh)
 		return NULL;
 
 	list_del_init(&sh->lru);
@@ -4594,13 +4649,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 }
 
 #define MAX_STRIPE_BATCH 8
-static int handle_active_stripes(struct r5conf *conf)
+static int handle_active_stripes(struct r5conf *conf, cpumask_t *mask)
 {
 	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
 	int i, batch_size = 0;
 
 	while (batch_size < MAX_STRIPE_BATCH &&
-			(sh = __get_priority_stripe(conf)) != NULL)
+			(sh = __get_priority_stripe(conf, mask)) != NULL)
 		batch[batch_size++] = sh;
 
 	if (batch_size == 0)
@@ -4618,6 +4673,35 @@ static int handle_active_stripes(struct r5conf *conf)
 	return batch_size;
 }
 
+static void raid5auxd(struct md_thread *thread)
+{
+	struct mddev *mddev = thread->mddev;
+	struct r5conf *conf = mddev->private;
+	struct blk_plug plug;
+	int handled;
+	struct raid5_auxth *auxth = thread->private;
+
+	pr_debug("+++ raid5auxd active\n");
+
+	blk_start_plug(&plug);
+	handled = 0;
+	spin_lock_irq(&conf->device_lock);
+	while (1) {
+		int batch_size;
+
+		batch_size = handle_active_stripes(conf, &auxth->work_mask);
+		if (!batch_size)
+			break;
+		handled += batch_size;
+	}
+	pr_debug("%d stripes handled\n", handled);
+
+	spin_unlock_irq(&conf->device_lock);
+	blk_finish_plug(&plug);
+
+	pr_debug("--- raid5auxd inactive\n");
+}
+
 /*
  * This is our raid5 kernel thread.
  *
@@ -4665,7 +4749,7 @@ static void raid5d(struct md_thread *thread)
 			handled++;
 		}
-		batch_size = handle_active_stripes(conf);
+		batch_size = handle_active_stripes(conf, &conf->work_mask);
 		if (!batch_size)
 			break;
 		handled += batch_size;
 	}
@@ -4794,10 +4878,270 @@ stripe_cache_active_show(struct mddev *mddev, char *page)
 static struct md_sysfs_entry
 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
 
+static void raid5_update_threads_handle_mask(struct mddev *mddev)
+{
+	int cpu, i;
+	struct raid5_percpu *percpu;
+	struct r5conf *conf = mddev->private;
+
+	for_each_online_cpu(cpu) {
+		percpu = per_cpu_ptr(conf->percpu, cpu);
+		cpumask_clear(&percpu->handle_threads);
+	}
+	cpumask_copy(&conf->work_mask, cpu_online_mask);
+
+	for (i = 0; i < conf->aux_thread_num; i++) {
+		cpumask_t *work_mask = &conf->aux_threads[i]->work_mask;
+		for_each_cpu(cpu, work_mask) {
+			percpu = per_cpu_ptr(conf->percpu, cpu);
+			cpumask_set_cpu(i, &percpu->handle_threads);
+		}
+		cpumask_andnot(&conf->work_mask, &conf->work_mask,
+				work_mask);
+	}
+}
+
+struct raid5_auxth_sysfs {
+	struct attribute attr;
+	ssize_t (*show)(struct mddev *, struct raid5_auxth *, char *);
+	ssize_t (*store)(struct mddev *, struct raid5_auxth *,
+		const char *, size_t);
+};
+
+static ssize_t raid5_show_thread_cpulist(struct mddev *mddev,
+	struct raid5_auxth *thread, char *page)
+{
+	if (!mddev->private)
+		return 0;
+	return cpulist_scnprintf(page, PAGE_SIZE, &thread->work_mask);
+}
+
+static ssize_t
+raid5_store_thread_cpulist(struct mddev *mddev, struct raid5_auxth *thread,
+	const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	cpumask_var_t mask;
+
+	if (!conf)
+		return -ENODEV;
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	if (cpulist_parse(page, mask)) {
+		free_cpumask_var(mask);
+		return -EINVAL;
+	}
+
+	get_online_cpus();
+	spin_lock_irq(&conf->device_lock);
+	cpumask_copy(&thread->work_mask, mask);
+	raid5_update_threads_handle_mask(mddev);
+	spin_unlock_irq(&conf->device_lock);
+	put_online_cpus();
+	set_cpus_allowed_ptr(thread->thread->tsk, mask);
+
+	free_cpumask_var(mask);
+	return len;
+}
+
+static struct raid5_auxth_sysfs thread_cpulist =
+__ATTR(cpulist, S_IRUGO|S_IWUSR, raid5_show_thread_cpulist,
+	raid5_store_thread_cpulist);
+
+static struct attribute *auxth_attrs[] = {
+	&thread_cpulist.attr,
+	NULL,
+};
+
+static ssize_t
+raid5_auxth_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct raid5_auxth_sysfs *entry = container_of(attr,
+		struct raid5_auxth_sysfs, attr);
+	struct raid5_auxth *thread = container_of(kobj,
+		struct raid5_auxth, kobj);
+	struct mddev *mddev = thread->thread->mddev;
+	ssize_t ret;
+
+	if (!entry->show)
+		return -EIO;
+	mddev_lock(mddev);
+	ret = entry->show(mddev, thread, page);
+	mddev_unlock(mddev);
+	return ret;
+}
+
+static ssize_t
+raid5_auxth_attr_store(struct kobject *kobj, struct attribute *attr,
+	const char *page, size_t length)
+{
+	struct raid5_auxth_sysfs *entry = container_of(attr,
+		struct raid5_auxth_sysfs, attr);
+	struct raid5_auxth *thread = container_of(kobj,
+		struct raid5_auxth, kobj);
+	struct mddev *mddev = thread->thread->mddev;
+	ssize_t ret;
+
+	if (!entry->store)
+		return -EIO;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	mddev_lock(mddev);
+	ret = entry->store(mddev, thread, page, length);
+	mddev_unlock(mddev);
+	return ret;
+}
+
+static void raid5_auxth_release(struct kobject *kobj)
+{
+	struct raid5_auxth *thread = container_of(kobj,
+		struct raid5_auxth, kobj);
+	kfree(thread);
+}
+
+static const struct sysfs_ops raid5_auxth_sysfsops = {
+	.show = raid5_auxth_attr_show,
+	.store = raid5_auxth_attr_store,
+};
+static struct kobj_type raid5_auxth_ktype = {
+	.release = raid5_auxth_release,
+	.sysfs_ops = &raid5_auxth_sysfsops,
+	.default_attrs = auxth_attrs,
+};
+
+static ssize_t
+raid5_show_auxthread_number(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->aux_thread_num);
+	else
+		return 0;
+}
+
+static void raid5_auxth_delete(struct work_struct *ws)
+{
+	struct raid5_auxth *thread = container_of(ws, struct raid5_auxth,
+		del_work);
+
+	kobject_del(&thread->kobj);
+	kobject_put(&thread->kobj);
+}
+
+static void __free_aux_thread(struct mddev *mddev, struct raid5_auxth *thread)
+{
+	md_unregister_thread(&thread->thread);
+	INIT_WORK(&thread->del_work, raid5_auxth_delete);
+	kobject_get(&thread->kobj);
+	md_queue_misc_work(&thread->del_work);
+}
+
+static struct raid5_auxth *__create_aux_thread(struct mddev *mddev, int i)
+{
+	struct raid5_auxth *thread;
+	char name[10];
+
+	thread = kzalloc(sizeof(*thread), GFP_KERNEL);
+	if (!thread)
+		return NULL;
+	snprintf(name, 10, "aux%d", i);
+	thread->thread = md_register_thread(raid5auxd, mddev, name);
+	if (!thread->thread) {
+		kfree(thread);
+		return NULL;
+	}
+	thread->thread->private = thread;
+
+	cpumask_copy(&thread->work_mask, cpu_online_mask);
+
+	if (kobject_init_and_add(&thread->kobj, &raid5_auxth_ktype,
+	    &mddev->kobj, "auxth%d", i)) {
+		md_unregister_thread(&thread->thread);
+		kfree(thread);
+		return NULL;
+	}
+	return thread;
+}
+
+static ssize_t
+raid5_store_auxthread_number(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+	int i;
+	struct raid5_auxth **threads;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+
+	if (new == conf->aux_thread_num)
+		return len;
+
+	/* There is no point creating more threads than cpu number */
+	if (new > num_online_cpus())
+		return -EINVAL;
+
+	if (new > conf->aux_thread_num) {
+		threads = kzalloc(sizeof(struct raid5_auxth *) * new,
+				GFP_KERNEL);
+		if (!threads)
+			return -ENOMEM;
+
+		i = conf->aux_thread_num;
+		while (i < new) {
+			threads[i] = __create_aux_thread(mddev, i);
+			if (!threads[i])
+				goto error;
+
+			i++;
+		}
+		memcpy(threads, conf->aux_threads,
+			sizeof(struct raid5_auxth *) * conf->aux_thread_num);
+		get_online_cpus();
+		spin_lock_irq(&conf->device_lock);
+		kfree(conf->aux_threads);
+		conf->aux_threads = threads;
+		conf->aux_thread_num = new;
+		raid5_update_threads_handle_mask(mddev);
+		spin_unlock_irq(&conf->device_lock);
+		put_online_cpus();
+	} else {
+		int old = conf->aux_thread_num;
+
+		get_online_cpus();
+		spin_lock_irq(&conf->device_lock);
+		conf->aux_thread_num = new;
+		raid5_update_threads_handle_mask(mddev);
+		spin_unlock_irq(&conf->device_lock);
+		put_online_cpus();
+		for (i = new; i < old; i++)
+			__free_aux_thread(mddev, conf->aux_threads[i]);
+	}
+
+	return len;
+error:
+	while (--i >= conf->aux_thread_num)
+		__free_aux_thread(mddev, threads[i]);
+	kfree(threads);
+	return -ENOMEM;
+}
+
+static struct md_sysfs_entry
+raid5_auxthread_number = __ATTR(auxthread_number, S_IRUGO|S_IWUSR,
+				raid5_show_auxthread_number,
+				raid5_store_auxthread_number);
+
 static struct attribute *raid5_attrs[] =  {
 	&raid5_stripecache_size.attr,
 	&raid5_stripecache_active.attr,
 	&raid5_preread_bypass_threshold.attr,
+	&raid5_auxthread_number.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -4845,6 +5189,7 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	kfree(conf->aux_threads);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
 	kfree(conf->disks);
@@ -4857,7 +5202,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 			       void *hcpu)
 {
 	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
-	long cpu = (long)hcpu;
+	long cpu = (long)hcpu, anycpu;
 	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
 	switch (action) {
@@ -4876,9 +5221,17 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 			       __func__, cpu);
 			return notifier_from_errno(-ENOMEM);
 		}
+		INIT_LIST_HEAD(&(percpu->handle_list));
+		cpumask_clear(&percpu->handle_threads);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
+		spin_lock_irq(&conf->device_lock);
+		anycpu = cpumask_any(cpu_online_mask);
+		list_splice_tail_init(&percpu->handle_list,
+			&per_cpu_ptr(conf->percpu, anycpu)->handle_list);
+		spin_unlock_irq(&conf->device_lock);
+
 		safe_put_page(percpu->spare_page);
 		kfree(percpu->scribble);
 		percpu->spare_page = NULL;
@@ -4887,6 +5240,10 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
 	default:
 		break;
 	}
+
+	spin_lock_irq(&conf->device_lock);
+	raid5_update_threads_handle_mask(conf->mddev);
+	spin_unlock_irq(&conf->device_lock);
 	return NOTIFY_OK;
 }
 #endif
@@ -4907,20 +5264,24 @@ static int raid5_alloc_percpu(struct r5conf *conf)
 	get_online_cpus();
 	err = 0;
 	for_each_present_cpu(cpu) {
+		struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+
 		if (conf->level == 6) {
 			spare_page = alloc_page(GFP_KERNEL);
 			if (!spare_page) {
 				err = -ENOMEM;
 				break;
 			}
-			per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+			percpu->spare_page = spare_page;
 		}
 		scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
 		if (!scribble) {
 			err = -ENOMEM;
 			break;
 		}
-		per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
+		percpu->scribble = scribble;
+		INIT_LIST_HEAD(&percpu->handle_list);
+		cpumask_clear(&percpu->handle_threads);
 	}
 #ifdef CONFIG_HOTPLUG_CPU
 	conf->cpu_notify.notifier_call = raid456_cpu_notify;
@@ -4976,7 +5337,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
-	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
@@ -4987,6 +5347,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf->bypass_threshold = BYPASS_THRESHOLD;
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
+	cpumask_copy(&conf->work_mask, cpu_online_mask);
+
 	conf->raid_disks = mddev->raid_disks;
 	if (mddev->reshape_position == MaxSector)
 		conf->previous_raid_disks = mddev->raid_disks;
@@ -5403,6 +5765,10 @@ abort:
 static int stop(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
+	int i;
+
+	for (i = 0; i < conf->aux_thread_num; i++)
+		__free_aux_thread(mddev, conf->aux_threads[i]);
 
 	md_unregister_thread(&mddev->thread);
 	if (mddev->queue)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a9fc24901eda..813b6c18f250 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -211,6 +211,7 @@ struct stripe_head {
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
+	int			cpu;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -364,6 +365,14 @@ struct disk_info {
 	struct md_rdev	*rdev, *replacement;
 };
 
+struct raid5_auxth {
+	struct md_thread	*thread;
+	/* which CPUs should the auxiliary thread handle stripes from */
+	cpumask_t		work_mask;
+	struct kobject		kobj;
+	struct work_struct	del_work;
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
 	struct mddev		*mddev;
@@ -432,6 +441,12 @@ struct r5conf {
 					      * lists and performing address
 					      * conversions
 					      */
+		struct list_head handle_list;
+		cpumask_t	handle_threads; /* Which threads can the CPU's
+						 * stripes be handled. It really
+						 * is a bitmap to aux_threads[],
+						 * but has max bits NR_CPUS
+						 */
 	} __percpu *percpu;
 	size_t			scribble_len; /* size of scribble region must be
 					       * associated with conf to handle
@@ -459,6 +474,10 @@ struct r5conf {
 	 * the new thread here until we fully activate the array.
 	 */
 	struct md_thread	*thread;
+	int			aux_thread_num;
+	struct raid5_auxth	**aux_threads;
+	/* which CPUs should raid5d thread handle stripes from */
+	cpumask_t		work_mask;
 };
 
 /*
-- 
2.39.5
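
For reference, a minimal userspace sketch of the sysfs interface added by this
patch, assuming an array named md0 on a kernel carrying the patch; the
sysfs_write() helper and the choice of two threads are illustrative only. It
creates two auxiliary stripe-handling threads and binds auxth0 to CPUs 1-3,
matching the commit-message example.

#include <stdio.h>

/* Write a string to a sysfs attribute; returns 0 on success. */
static int sysfs_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	if (fputs(val, f) == EOF) {
		perror(path);
		fclose(f);
		return -1;
	}
	if (fclose(f) == EOF) {
		perror(path);
		return -1;
	}
	return 0;
}

int main(void)
{
	/* Create two auxiliary stripe-handling threads for md0. */
	if (sysfs_write("/sys/block/md0/md/auxthread_number", "2"))
		return 1;

	/*
	 * Bind auxth0 to CPUs 1-3; stripes produced on CPUs not covered by
	 * any auxiliary thread's cpulist are still handled by raid5d.
	 */
	if (sysfs_write("/sys/block/md0/md/auxth0/cpulist", "1-3"))
		return 1;

	return 0;
}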