#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/ctype.h>
#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))
-
#ifndef MODULE
static void autostart_arrays(int part);
#endif
static void md_print_devices(void);
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_wq;
+static struct workqueue_struct *md_misc_wq;
#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
static int start_readonly;
+/* bio_alloc_mddev / bio_clone_mddev
+ * like bio_alloc / bio_clone, but with a local bio set
+ *
+ * mddev_bio_destructor is installed as bi_destructor on every bio that
+ * those two helpers take from mddev->bio_set.  Each such bio has its
+ * owning mddev pointer stored in the pointer-sized slot immediately in
+ * front of the struct bio; the destructor reads that pointer back to
+ * find the bio set the bio must be handed to bio_free() with.
+ */
+
+static void mddev_bio_destructor(struct bio *bio)
+{
+ mddev_t *mddev, **mddevp;
+
+ mddevp = (void*)bio; /* treat the bio address as an mddev_t * array... */
+ mddev = mddevp[-1]; /* ...so [-1] is the slot just in front of the bio */
+
+ bio_free(bio, mddev->bio_set);
+}
+/* bio_alloc_mddev
+ * like bio_alloc, but with a local bio set
+ *
+ * @gfp_mask: allocation flags, passed through unchanged
+ * @nr_iovecs: number of bio_vecs requested, passed through unchanged
+ * @mddev: array whose bio_set should be used; may be NULL
+ *
+ * With no mddev, or an mddev that has no bio_set, this is a plain
+ * bio_alloc().  Otherwise the bio comes from mddev->bio_set, the mddev
+ * pointer is recorded just in front of the bio and bi_destructor is set
+ * to mddev_bio_destructor.  Returns NULL when bio_alloc_bioset()
+ * returns NULL.
+ *
+ * NOTE(review): writing mddevp[-1] relies on the bio set reserving at
+ * least sizeof(mddev_t *) bytes in front of each bio - presumably what
+ * the sizeof(mddev) argument to bioset_create() in this file provides;
+ * confirm against the bioset_create() documentation.
+ */
+struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+ mddev_t *mddev)
+{
+ struct bio *b;
+ mddev_t **mddevp;
+
+ if (!mddev || !mddev->bio_set)
+ return bio_alloc(gfp_mask, nr_iovecs);
+
+ b = bio_alloc_bioset(gfp_mask, nr_iovecs,
+ mddev->bio_set);
+ if (!b)
+ return NULL;
+ mddevp = (void*)b;
+ mddevp[-1] = mddev; /* read back by mddev_bio_destructor */
+ b->bi_destructor = mddev_bio_destructor;
+ return b;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+/* bio_clone_mddev
+ * like bio_clone, but with a local bio set
+ *
+ * @bio: bio to clone
+ * @gfp_mask: allocation flags, passed through unchanged
+ * @mddev: array whose bio_set should be used; may be NULL
+ *
+ * With no mddev, or an mddev that has no bio_set, this is a plain
+ * bio_clone().  Otherwise a bio with bio->bi_max_vecs vectors is taken
+ * from mddev->bio_set, tagged with the mddev pointer and
+ * mddev_bio_destructor, and filled in by __bio_clone().  If the source
+ * bio carries integrity data, that is cloned too, using the same bio
+ * set.  Returns NULL when the allocation returns NULL or when
+ * bio_integrity_clone() reports an error; in the latter case the
+ * partly built clone is released with bio_put() first.
+ */
+struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+ mddev_t *mddev)
+{
+ struct bio *b;
+ mddev_t **mddevp;
+
+ if (!mddev || !mddev->bio_set)
+ return bio_clone(bio, gfp_mask);
+
+ b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
+ mddev->bio_set);
+ if (!b)
+ return NULL;
+ mddevp = (void*)b;
+ mddevp[-1] = mddev; /* read back by mddev_bio_destructor */
+ b->bi_destructor = mddev_bio_destructor;
+ __bio_clone(b, bio);
+ if (bio_integrity(bio)) {
+ int ret;
+
+ ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
+
+ if (ret < 0) {
+ bio_put(b); /* drop the clone; its destructor is already set */
+ return NULL;
+ }
+ }
+
+ return b;
+}
+EXPORT_SYMBOL_GPL(bio_clone_mddev);
+
/*
* We have a system wide 'event count' that is incremented
* on any 'interesting' event, and readers of /proc/mdstat
return 0;
}
rcu_read_lock();
- if (mddev->suspended || mddev->barrier) {
+ if (mddev->suspended) {
DEFINE_WAIT(__wait);
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
TASK_UNINTERRUPTIBLE);
- if (!mddev->suspended && !mddev->barrier)
+ if (!mddev->suspended)
break;
rcu_read_unlock();
schedule();
int mddev_congested(mddev_t *mddev, int bits)
{
- if (mddev->barrier)
- return 1;
return mddev->suspended;
}
EXPORT_SYMBOL(mddev_congested);
/*
- * Generic barrier handling for md
+ * Generic flush handling for md
*/
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
{
mdk_rdev_t *rdev = bio->bi_private;
mddev_t *mddev = rdev->mddev;
- if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
- set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) {
- if (mddev->barrier == POST_REQUEST_BARRIER) {
- /* This was a post-request barrier */
- mddev->barrier = NULL;
- wake_up(&mddev->sb_wait);
- } else
- /* The pre-request barrier has finished */
- schedule_work(&mddev->barrier_work);
+ /* The pre-request flush has finished: flush_pending has
+  * dropped to zero, so the last outstanding flush bio is done.
+  * Let md_wq run flush_work, which md_flush_request() sets up
+  * to call md_submit_flush_data().
+  */
+ queue_work(md_wq, &mddev->flush_work);
}
bio_put(bio);
}
-static void submit_barriers(mddev_t *mddev)
+static void submit_flushes(mddev_t *mddev)
{
mdk_rdev_t *rdev;
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
- bi = bio_alloc(GFP_KERNEL, 0);
- bi->bi_end_io = md_end_barrier;
+ bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
+ bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
bi->bi_bdev = rdev->bdev;
atomic_inc(&mddev->flush_pending);
- submit_bio(WRITE_BARRIER, bi);
+ submit_bio(WRITE_FLUSH, bi);
rcu_read_lock();
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
}
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws) /* flush_work handler, see md_flush_request() */
{
- mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
- struct bio *bio = mddev->barrier;
+ mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+ struct bio *bio = mddev->flush_bio; /* the request parked by md_flush_request() */
atomic_set(&mddev->flush_pending, 1);
- if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
- bio_endio(bio, -EOPNOTSUPP);
- else if (bio->bi_size == 0)
+ if (bio->bi_size == 0)
/* an empty barrier - all done */
bio_endio(bio, 0);
else {
- bio->bi_rw &= ~REQ_HARDBARRIER;
+ bio->bi_rw &= ~REQ_FLUSH; /* submit_flushes() already ran; pass the bio on without the flag */
if (mddev->pers->make_request(mddev, bio))
generic_make_request(bio);
- mddev->barrier = POST_REQUEST_BARRIER;
- submit_barriers(mddev);
}
if (atomic_dec_and_test(&mddev->flush_pending)) {
- mddev->barrier = NULL;
+ mddev->flush_bio = NULL; /* clears the condition md_flush_request() waits on */
wake_up(&mddev->sb_wait);
}
}
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
{
spin_lock_irq(&mddev->write_lock);
wait_event_lock_irq(mddev->sb_wait,
- !mddev->barrier,
+ !mddev->flush_bio, /* wait until no other flush request is parked */
mddev->write_lock, /*nothing*/);
- mddev->barrier = bio;
+ mddev->flush_bio = bio; /* picked up later by md_submit_flush_data() */
spin_unlock_irq(&mddev->write_lock);
atomic_set(&mddev->flush_pending, 1);
- INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+ INIT_WORK(&mddev->flush_work, md_submit_flush_data);
- submit_barriers(mddev);
+ submit_flushes(mddev); /* raises flush_pending once per flush bio it submits */
if (atomic_dec_and_test(&mddev->flush_pending))
- schedule_work(&mddev->barrier_work);
+ queue_work(md_wq, &mddev->flush_work); /* count hit zero here, so no flush bio is outstanding */
}
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
/* Support for plugging.
* This mirrors the plugging support in request_queue, but does not
static void mddev_put(mddev_t *mddev)
{
+ struct bio_set *bs = NULL;
+
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
if (!mddev->raid_disks && list_empty(&mddev->disks) &&
/* Array is not configured at all, and not held active,
* so destroy it */
list_del(&mddev->all_mddevs);
+ bs = mddev->bio_set;
+ mddev->bio_set = NULL;
if (mddev->gendisk) {
- /* we did a probe so need to clean up.
- * Call schedule_work inside the spinlock
- * so that flush_scheduled_work() after
- * mddev_find will succeed in waiting for the
- * work to be done.
+ /* We did a probe so need to clean up. Call
+ * queue_work inside the spinlock so that
+ * flush_workqueue() after mddev_find will
+ * succeed in waiting for the work to be done.
*/
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
- schedule_work(&mddev->del_work);
+ queue_work(md_misc_wq, &mddev->del_work);
} else
kfree(mddev);
}
spin_unlock(&all_mddevs_lock);
+ if (bs)
+ bioset_free(bs);
}
void mddev_init(mddev_t *mddev)
bio_put(bio);
}
-static void super_written_barrier(struct bio *bio, int error)
-{
- struct bio *bio2 = bio->bi_private;
- mdk_rdev_t *rdev = bio2->bi_private;
- mddev_t *mddev = rdev->mddev;
-
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
- error == -EOPNOTSUPP) {
- unsigned long flags;
- /* barriers don't appear to be supported :-( */
- set_bit(BarriersNotsupp, &rdev->flags);
- mddev->barriers_work = 0;
- spin_lock_irqsave(&mddev->write_lock, flags);
- bio2->bi_next = mddev->biolist;
- mddev->biolist = bio2;
- spin_unlock_irqrestore(&mddev->write_lock, flags);
- wake_up(&mddev->sb_wait);
- bio_put(bio);
- } else {
- bio_put(bio2);
- bio->bi_private = rdev;
- super_written(bio, error);
- }
-}
-
void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page)
{
* and decrement it on completion, waking up sb_wait
* if zero is reached.
* If an error occurred, call md_error
- *
- * As we might need to resubmit the request if REQ_HARDBARRIER
- * causes ENOTSUPP, we allocate a spare bio...
*/
- struct bio *bio = bio_alloc(GFP_NOIO, 1);
- int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
+ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
bio->bi_bdev = rdev->bdev;
bio->bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
- bio->bi_rw = rw;
atomic_inc(&mddev->pending_writes);
- if (!test_bit(BarriersNotsupp, &rdev->flags)) {
- struct bio *rbio;
- rw |= REQ_HARDBARRIER;
- rbio = bio_clone(bio, GFP_NOIO);
- rbio->bi_private = bio;
- rbio->bi_end_io = super_written_barrier;
- submit_bio(rw, rbio);
- } else
- submit_bio(rw, bio);
+ submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+ bio);
}
void md_super_wait(mddev_t *mddev)
{
- /* wait for all superblock writes that were scheduled to complete.
- * if any had to be retried (due to BARRIER problems), retry them
- */
+ /* wait for all superblock writes that were scheduled to complete */
DEFINE_WAIT(wq);
for(;;) {
prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
if (atomic_read(&mddev->pending_writes)==0)
break;
- while (mddev->biolist) {
- struct bio *bio;
- spin_lock_irq(&mddev->write_lock);
- bio = mddev->biolist;
- mddev->biolist = bio->bi_next ;
- bio->bi_next = NULL;
- spin_unlock_irq(&mddev->write_lock);
- submit_bio(bio->bi_rw, bio);
- }
schedule();
}
finish_wait(&mddev->sb_wait, &wq);
complete((struct completion*)bio->bi_private);
}
-int sync_page_io(struct block_device *bdev, sector_t sector, int size,
- struct page *page, int rw)
+int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
+ struct page *page, int rw)
{
- struct bio *bio = bio_alloc(GFP_NOIO, 1);
+ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
struct completion event;
int ret;
rw |= REQ_SYNC | REQ_UNPLUG;
- bio->bi_bdev = bdev;
+ bio->bi_bdev = rdev->bdev;
bio->bi_sector = sector;
bio_add_page(bio, page, size, 0);
init_completion(&event);
return 0;
- if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
+ if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ))
goto fail;
rdev->sb_loaded = 1;
return 0;
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
- clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) {
mddev->major_version = 0;
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
- clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) {
mddev->major_version = 1;
synchronize_rcu();
INIT_WORK(&rdev->del_work, md_delayed_delete);
kobject_get(&rdev->kobj);
- schedule_work(&rdev->del_work);
+ queue_work(md_misc_wq, &rdev->del_work);
}
/*
if (!mddev->persistent) {
clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
clear_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (!mddev->external)
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
wake_up(&mddev->sb_wait);
return;
}
shift = partitioned ? MdpMinorShift : 0;
unit = MINOR(mddev->unit) >> shift;
- /* wait for any previous instance if this device
- * to be completed removed (mddev_delayed_delete).
+ /* wait for any previous instance of this device to be
+ * completely removed (mddev_delayed_delete).
*/
- flush_scheduled_work();
+ flush_workqueue(md_misc_wq);
mutex_lock(&disks_mutex);
error = -EEXIST;
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
+ if (mddev->bio_set == NULL)
+ mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
+
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
if (!pers || !try_module_get(pers->owner)) {
/* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors;
- mddev->barriers_work = 1;
mddev->ok_start_degraded = start_dirty_degraded;
if (start_readonly && mddev->ro == 0)
mddev->recovery = 0;
mddev->in_sync = 0;
mddev->degraded = 0;
- mddev->barriers_work = 0;
mddev->safemode = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
mddev_t *mddev = mddev_find(bdev->bd_dev);
int err;
- lock_kernel();
if (mddev->gendisk != bdev->bd_disk) {
/* we are racing with mddev_put which is discarding this
* bd_disk.
*/
mddev_put(mddev);
/* Wait until bdev->bd_disk is definitely gone */
- flush_scheduled_work();
+ flush_workqueue(md_misc_wq);
/* Then retry the open from the top */
- unlock_kernel();
return -ERESTARTSYS;
}
BUG_ON(mddev != bdev->bd_disk->private_data);
check_disk_size_change(mddev->gendisk, bdev);
out:
- unlock_kernel();
return err;
}
mddev_t *mddev = disk->private_data;
BUG_ON(!mddev);
- lock_kernel();
atomic_dec(&mddev->openers);
mddev_put(mddev);
- unlock_kernel();
return 0;
}
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
if (mddev->event_work.func)
- schedule_work(&mddev->event_work);
+ queue_work(md_misc_wq, &mddev->event_work);
md_new_event_inintr(mddev);
}
static int __init md_init(void)
{
- if (register_blkdev(MD_MAJOR, "md"))
- return -1;
- if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
- unregister_blkdev(MD_MAJOR, "md");
- return -1;
- }
+ int ret = -ENOMEM;
+
+ md_wq = alloc_workqueue("md", WQ_RESCUER, 0);
+ if (!md_wq)
+ goto err_wq;
+
+ md_misc_wq = alloc_workqueue("md_misc", 0, 0);
+ if (!md_misc_wq)
+ goto err_misc_wq;
+
+ if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+ goto err_md;
+
+ if ((ret = register_blkdev(0, "mdp")) < 0)
+ goto err_mdp;
+ mdp_major = ret;
+
blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
md_probe, NULL, NULL);
blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
md_geninit();
return 0;
-}
+err_mdp:
+ unregister_blkdev(MD_MAJOR, "md");
+err_md:
+ destroy_workqueue(md_misc_wq);
+err_misc_wq:
+ destroy_workqueue(md_wq);
+err_wq:
+ return ret;
+}
#ifndef MODULE
export_array(mddev);
mddev->hold_active = 0;
}
+ destroy_workqueue(md_misc_wq);
+ destroy_workqueue(md_wq);
}
subsys_initcall(md_init);