diff --git a/drivers/md/md.c b/drivers/md/md.c
index f20d13e717d55e0a04de747f62384d2ce4b0b310..4e957f3140a81c64a75982d8764436962d929a15 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
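
This blobdiff of drivers/md/md.c covers md's conversion from REQ_HARDBARRIER-based barrier handling to the REQ_FLUSH/REQ_FUA model, together with three related changes visible in the hunks below: per-mddev bio sets with the bio_alloc_mddev()/bio_clone_mddev() helpers, dedicated md_wq and md_misc_wq workqueues replacing schedule_work()/flush_scheduled_work(), and removal of the big kernel lock from md_open()/md_release().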
@@ -36,7 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/sysctl.h>
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/poll.h>
 #include <linux/ctype.h>
@@ -57,7 +57,6 @@
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
 
-
 #ifndef MODULE
 static void autostart_arrays(int part);
 #endif
@@ -68,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock);
 static void md_print_devices(void);
 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_wq;
+static struct workqueue_struct *md_misc_wq;
 
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 
@@ -148,6 +149,72 @@ static const struct block_device_operations md_fops;
 
 static int start_readonly;
 
+/* bio_clone_mddev
+ * like bio_clone, but with a local bio set
+ */
+
+static void mddev_bio_destructor(struct bio *bio)
+{
+       mddev_t *mddev, **mddevp;
+
+       mddevp = (void*)bio;
+       mddev = mddevp[-1];
+
+       bio_free(bio, mddev->bio_set);
+}
+
+struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+                           mddev_t *mddev)
+{
+       struct bio *b;
+       mddev_t **mddevp;
+
+       if (!mddev || !mddev->bio_set)
+               return bio_alloc(gfp_mask, nr_iovecs);
+
+       b = bio_alloc_bioset(gfp_mask, nr_iovecs,
+                            mddev->bio_set);
+       if (!b)
+               return NULL;
+       mddevp = (void*)b;
+       mddevp[-1] = mddev;
+       b->bi_destructor = mddev_bio_destructor;
+       return b;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+
+struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+                           mddev_t *mddev)
+{
+       struct bio *b;
+       mddev_t **mddevp;
+
+       if (!mddev || !mddev->bio_set)
+               return bio_clone(bio, gfp_mask);
+
+       b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
+                            mddev->bio_set);
+       if (!b)
+               return NULL;
+       mddevp = (void*)b;
+       mddevp[-1] = mddev;
+       b->bi_destructor = mddev_bio_destructor;
+       __bio_clone(b, bio);
+       if (bio_integrity(bio)) {
+               int ret;
+
+               ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
+
+               if (ret < 0) {
+                       bio_put(b);
+                       return NULL;
+               }
+       }
+
+       return b;
+}
+EXPORT_SYMBOL_GPL(bio_clone_mddev);
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
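
These two helpers depend on mddev->bio_set being created with a front pad of sizeof(mddev) bytes (see the md_run() hunk further down), so every bio carved out of the set is preceded by room for exactly one pointer: "mddevp[-1] = mddev" stashes the owner there, and mddev_bio_destructor() reads it back to find the bio set to free into. A minimal userspace sketch of the same front-pad idiom, with plain malloc() standing in for the bioset and all names (struct owner, obj_alloc, obj_free) invented for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    struct obj { int payload; };
    struct owner { const char *name; };

    /* Allocate one pointer's worth of front pad ahead of the object
     * and stash the owner there, as bio_alloc_mddev() does. */
    static struct obj *obj_alloc(struct owner *who)
    {
        void *mem = malloc(sizeof(struct owner *) + sizeof(struct obj));
        struct obj *o;
        struct owner **ownerp;

        if (!mem)
            return NULL;
        o = (struct obj *)((struct owner **)mem + 1);
        ownerp = (struct owner **)o;
        ownerp[-1] = who;                /* mirrors mddevp[-1] = mddev */
        return o;
    }

    /* The destructor recovers the owner from the word in front of
     * the object, as mddev_bio_destructor() recovers the mddev. */
    static void obj_free(struct obj *o)
    {
        struct owner **ownerp = (struct owner **)o;

        printf("freeing object owned by %s\n", ownerp[-1]->name);
        free(ownerp - 1);                /* free from the true block start */
    }

    int main(void)
    {
        struct owner md0 = { "md0" };
        struct obj *o = obj_alloc(&md0);

        if (o)
            obj_free(o);
        return 0;
    }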
@@ -226,12 +293,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
                return 0;
        }
        rcu_read_lock();
-       if (mddev->suspended || mddev->barrier) {
+       if (mddev->suspended) {
                DEFINE_WAIT(__wait);
                for (;;) {
                        prepare_to_wait(&mddev->sb_wait, &__wait,
                                        TASK_UNINTERRUPTIBLE);
-                       if (!mddev->suspended && !mddev->barrier)
+                       if (!mddev->suspended)
                                break;
                        rcu_read_unlock();
                        schedule();
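
With barriers gone, md_make_request() only has to stall while the array is suspended. The loop above is an open-coded wait_event(): register on the wait queue, re-check the condition, and only then sleep, dropping rcu_read_lock() around schedule() because sleeping inside an RCU read-side critical section is not allowed. A loose userspace analogue of the recheck-before-sleep discipline, built on a pthread condition variable (prepare_to_wait() has no direct pthread equivalent, so this is an approximation with invented names):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t sb_wait = PTHREAD_COND_INITIALIZER;
    static bool suspended = true;

    /* Re-check the condition after every wakeup and go back to
     * sleep if it does not hold, as the md loop does. */
    static void *make_request(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        while (suspended)
            pthread_cond_wait(&sb_wait, &lock);
        pthread_mutex_unlock(&lock);
        puts("array resumed; request proceeds");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, make_request, NULL);
        sleep(1);
        pthread_mutex_lock(&lock);
        suspended = false;
        pthread_cond_broadcast(&sb_wait);  /* like wake_up(&mddev->sb_wait) */
        pthread_mutex_unlock(&lock);
        pthread_join(&t, NULL);
        return 0;
    }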
@@ -282,40 +349,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
-       if (mddev->barrier)
-               return 1;
        return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
 /*
- * Generic barrier handling for md
+ * Generic flush handling for md
  */
 
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
 {
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;
-       if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-               set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
 
        rdev_dec_pending(rdev, mddev);
 
        if (atomic_dec_and_test(&mddev->flush_pending)) {
-               if (mddev->barrier == POST_REQUEST_BARRIER) {
-                       /* This was a post-request barrier */
-                       mddev->barrier = NULL;
-                       wake_up(&mddev->sb_wait);
-               } else
-                       /* The pre-request barrier has finished */
-                       schedule_work(&mddev->barrier_work);
+               /* The pre-request flush has finished */
+               queue_work(md_wq, &mddev->flush_work);
        }
        bio_put(bio);
 }
 
-static void submit_barriers(mddev_t *mddev)
+static void submit_flushes(mddev_t *mddev)
 {
        mdk_rdev_t *rdev;
 
@@ -331,61 +387,57 @@ static void submit_barriers(mddev_t *mddev)
                        atomic_inc(&rdev->nr_pending);
                        atomic_inc(&rdev->nr_pending);
                        rcu_read_unlock();
-                       bi = bio_alloc(GFP_KERNEL, 0);
-                       bi->bi_end_io = md_end_barrier;
+                       bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
+                       bi->bi_end_io = md_end_flush;
                        bi->bi_private = rdev;
                        bi->bi_bdev = rdev->bdev;
                        atomic_inc(&mddev->flush_pending);
-                       submit_bio(WRITE_BARRIER, bi);
+                       submit_bio(WRITE_FLUSH, bi);
                        rcu_read_lock();
                        rdev_dec_pending(rdev, mddev);
                }
        rcu_read_unlock();
 }
 
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
 {
-       mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-       struct bio *bio = mddev->barrier;
+       mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+       struct bio *bio = mddev->flush_bio;
 
        atomic_set(&mddev->flush_pending, 1);
 
-       if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-               bio_endio(bio, -EOPNOTSUPP);
-       else if (bio->bi_size == 0)
+       if (bio->bi_size == 0)
                /* an empty barrier - all done */
                bio_endio(bio, 0);
        else {
-               bio->bi_rw &= ~REQ_HARDBARRIER;
+               bio->bi_rw &= ~REQ_FLUSH;
                if (mddev->pers->make_request(mddev, bio))
                        generic_make_request(bio);
-               mddev->barrier = POST_REQUEST_BARRIER;
-               submit_barriers(mddev);
        }
        if (atomic_dec_and_test(&mddev->flush_pending)) {
-               mddev->barrier = NULL;
+               mddev->flush_bio = NULL;
                wake_up(&mddev->sb_wait);
        }
 }
 
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
 {
        spin_lock_irq(&mddev->write_lock);
        wait_event_lock_irq(mddev->sb_wait,
-                           !mddev->barrier,
+                           !mddev->flush_bio,
                            mddev->write_lock, /*nothing*/);
-       mddev->barrier = bio;
+       mddev->flush_bio = bio;
        spin_unlock_irq(&mddev->write_lock);
 
        atomic_set(&mddev->flush_pending, 1);
-       INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+       INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 
-       submit_barriers(mddev);
+       submit_flushes(mddev);
 
        if (atomic_dec_and_test(&mddev->flush_pending))
-               schedule_work(&mddev->barrier_work);
+               queue_work(md_wq, &mddev->flush_work);
 }
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
 
 /* Support for plugging.
  * This mirrors the plugging support in request_queue, but does not
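
The rewritten flush path uses a biased reference count: md_flush_request() sets flush_pending to 1 before submit_flushes() sends a zero-length WRITE_FLUSH bio to each active device, every md_end_flush() completion decrements the count, and whichever decrement reaches zero, including the submitter's own drop of the initial bias, queues flush_work exactly once. A self-contained sketch of the bias pattern with C11 atomics (single-threaded for brevity; the kernel version races completions against submission):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int pending;

    static void all_done(void)
    {
        puts("last reference dropped; queue follow-up work once");
    }

    /* Completion side: whoever drops the count to zero fires the
     * follow-up, like md_end_flush() queueing mddev->flush_work. */
    static void sub_request_done(void)
    {
        if (atomic_fetch_sub(&pending, 1) == 1)
            all_done();
    }

    int main(void)
    {
        int i, ndisks = 3;

        /* Bias the count at 1 so all_done() cannot fire while
         * sub-requests are still being submitted, exactly as
         * md_flush_request() biases mddev->flush_pending. */
        atomic_store(&pending, 1);
        for (i = 0; i < ndisks; i++) {
            atomic_fetch_add(&pending, 1);   /* take a ref, submit */
            sub_request_done();              /* completion arrives */
        }
        sub_request_done();   /* drop the bias; may fire all_done() */
        return 0;
    }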
@@ -442,6 +494,8 @@ static void mddev_delayed_delete(struct work_struct *ws);
 
 static void mddev_put(mddev_t *mddev)
 {
+       struct bio_set *bs = NULL;
+
        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -449,19 +503,22 @@ static void mddev_put(mddev_t *mddev)
                /* Array is not configured at all, and not held active,
                 * so destroy it */
                list_del(&mddev->all_mddevs);
+               bs = mddev->bio_set;
+               mddev->bio_set = NULL;
                if (mddev->gendisk) {
-                       /* we did a probe so need to clean up.
-                        * Call schedule_work inside the spinlock
-                        * so that flush_scheduled_work() after
-                        * mddev_find will succeed in waiting for the
-                        * work to be done.
+                       /* We did a probe so need to clean up.  Call
+                        * queue_work inside the spinlock so that
+                        * flush_workqueue() after mddev_find will
+                        * succeed in waiting for the work to be done.
                         */
                        INIT_WORK(&mddev->del_work, mddev_delayed_delete);
-                       schedule_work(&mddev->del_work);
+                       queue_work(md_misc_wq, &mddev->del_work);
                } else
                        kfree(mddev);
        }
        spin_unlock(&all_mddevs_lock);
+       if (bs)
+               bioset_free(bs);
 }
 
 void mddev_init(mddev_t *mddev)
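
mddev_put() now detaches mddev->bio_set while holding all_mddevs_lock but calls bioset_free() only after the spinlock is dropped: bioset_free() can sleep, and sleeping under a spinlock is forbidden. A userspace sketch of the detach-then-release move, with a pthread mutex standing in for the spinlock, free() for bioset_free(), and the struct names invented:

    #include <pthread.h>
    #include <stdlib.h>

    struct dev {
        pthread_mutex_t lock;
        void *resource;
    };

    /* Detach under the lock, release after unlocking: the release
     * may block, and blocking with the lock held would stall every
     * other user of it (a spinlock holder may not sleep at all). */
    static void dev_teardown(struct dev *d)
    {
        void *res;

        pthread_mutex_lock(&d->lock);
        res = d->resource;           /* like bs = mddev->bio_set   */
        d->resource = NULL;          /* like mddev->bio_set = NULL */
        pthread_mutex_unlock(&d->lock);

        free(res);                   /* like bioset_free(bs)       */
    }

    int main(void)
    {
        struct dev d = { PTHREAD_MUTEX_INITIALIZER, malloc(64) };

        dev_teardown(&d);
        return 0;
    }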
@@ -696,31 +753,6 @@ static void super_written(struct bio *bio, int error)
        bio_put(bio);
 }
 
-static void super_written_barrier(struct bio *bio, int error)
-{
-       struct bio *bio2 = bio->bi_private;
-       mdk_rdev_t *rdev = bio2->bi_private;
-       mddev_t *mddev = rdev->mddev;
-
-       if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-           error == -EOPNOTSUPP) {
-               unsigned long flags;
-               /* barriers don't appear to be supported :-( */
-               set_bit(BarriersNotsupp, &rdev->flags);
-               mddev->barriers_work = 0;
-               spin_lock_irqsave(&mddev->write_lock, flags);
-               bio2->bi_next = mddev->biolist;
-               mddev->biolist = bio2;
-               spin_unlock_irqrestore(&mddev->write_lock, flags);
-               wake_up(&mddev->sb_wait);
-               bio_put(bio);
-       } else {
-               bio_put(bio2);
-               bio->bi_private = rdev;
-               super_written(bio, error);
-       }
-}
-
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
 {
@@ -729,51 +761,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
-        *
-        * As we might need to resubmit the request if REQ_HARDBARRIER
-        * causes ENOTSUPP, we allocate a spare bio...
         */
-       struct bio *bio = bio_alloc(GFP_NOIO, 1);
-       int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
+       struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
-       bio->bi_rw = rw;
 
        atomic_inc(&mddev->pending_writes);
-       if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-               struct bio *rbio;
-               rw |= REQ_HARDBARRIER;
-               rbio = bio_clone(bio, GFP_NOIO);
-               rbio->bi_private = bio;
-               rbio->bi_end_io = super_written_barrier;
-               submit_bio(rw, rbio);
-       } else
-               submit_bio(rw, bio);
+       submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+                  bio);
 }
 
 void md_super_wait(mddev_t *mddev)
 {
-       /* wait for all superblock writes that were scheduled to complete.
-        * if any had to be retried (due to BARRIER problems), retry them
-        */
+       /* wait for all superblock writes that were scheduled to complete */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
-               while (mddev->biolist) {
-                       struct bio *bio;
-                       spin_lock_irq(&mddev->write_lock);
-                       bio = mddev->biolist;
-                       mddev->biolist = bio->bi_next ;
-                       bio->bi_next = NULL;
-                       spin_unlock_irq(&mddev->write_lock);
-                       submit_bio(bio->bi_rw, bio);
-               }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
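
With the barrier code gone, md_super_write() submits superblock writes with REQ_FLUSH|REQ_FUA in one shot: the block layer either supports flush/FUA natively or transparently emulates it, so the write can no longer come back with -EOPNOTSUPP. That is why the spare-bio clone, super_written_barrier(), the biolist retry list, and the drain loop in md_super_wait() could all be deleted; waiting for pending_writes to reach zero is now sufficient.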
@@ -784,16 +793,16 @@ static void bi_complete(struct bio *bio, int error)
        complete((struct completion*)bio->bi_private);
 }
 
-int sync_page_io(struct block_device *bdev, sector_t sector, int size,
-                  struct page *page, int rw)
+int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
+                struct page *page, int rw)
 {
-       struct bio *bio = bio_alloc(GFP_NOIO, 1);
+       struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
        struct completion event;
        int ret;
 
        rw |= REQ_SYNC | REQ_UNPLUG;
 
-       bio->bi_bdev = bdev;
+       bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
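
sync_page_io() now takes the rdev rather than a bare block_device, so the bio can come from the owning mddev's bio set. The call is synchronous by construction: the bio's end_io hook is bi_complete() (shown in the previous hunk), which fires the struct completion the caller blocks on; the wait_for_completion() call itself falls outside the diff context. A userspace sketch of that submit-then-wait shape, with a POSIX semaphore playing the completion and all names illustrative:

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    static sem_t event;          /* stands in for struct completion */

    static void end_io(void)     /* plays the role of bi_complete() */
    {
        sem_post(&event);        /* like complete(bio->bi_private)  */
    }

    static void *async_io(void *arg)
    {
        (void)arg;
        puts("device finished the I/O");
        end_io();
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        sem_init(&event, 0, 0);                   /* init_completion()     */
        pthread_create(&t, NULL, async_io, NULL); /* submit_bio()          */
        sem_wait(&event);                         /* wait_for_completion() */
        pthread_join(&t, NULL);
        sem_destroy(&event);
        return 0;
    }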
@@ -819,7 +828,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
                return 0;
 
 
-       if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
+       if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ))
                goto fail;
        rdev->sb_loaded = 1;
        return 0;
@@ -1070,7 +1079,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
-       clear_bit(BarriersNotsupp, &rdev->flags);
 
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
@@ -1485,7 +1493,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
        clear_bit(Faulty, &rdev->flags);
        clear_bit(In_sync, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
-       clear_bit(BarriersNotsupp, &rdev->flags);
 
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
@@ -1914,7 +1921,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
        synchronize_rcu();
        INIT_WORK(&rdev->del_work, md_delayed_delete);
        kobject_get(&rdev->kobj);
-       schedule_work(&rdev->del_work);
+       queue_work(md_misc_wq, &rdev->del_work);
 }
 
 /*
@@ -2172,6 +2179,8 @@ repeat:
        if (!mddev->persistent) {
                clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
                clear_bit(MD_CHANGE_DEVS, &mddev->flags);
+               if (!mddev->external)
+                       clear_bit(MD_CHANGE_PENDING, &mddev->flags);
                wake_up(&mddev->sb_wait);
                return;
        }
@@ -4256,10 +4265,10 @@ static int md_alloc(dev_t dev, char *name)
        shift = partitioned ? MdpMinorShift : 0;
        unit = MINOR(mddev->unit) >> shift;
 
-       /* wait for any previous instance if this device
-        * to be completed removed (mddev_delayed_delete).
+       /* wait for any previous instance of this device to be
+        * completely removed (mddev_delayed_delete).
         */
-       flush_scheduled_work();
+       flush_workqueue(md_misc_wq);
 
        mutex_lock(&disks_mutex);
        error = -EEXIST;
@@ -4442,6 +4451,9 @@ int md_run(mddev_t *mddev)
                sysfs_notify_dirent_safe(rdev->sysfs_state);
        }
 
+       if (mddev->bio_set == NULL)
+               mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
+
        spin_lock(&pers_lock);
        pers = find_pers(mddev->level, mddev->clevel);
        if (!pers || !try_module_get(pers->owner)) {
@@ -4504,7 +4516,6 @@ int md_run(mddev_t *mddev)
        /* may be over-ridden by personality */
        mddev->resync_max_sectors = mddev->dev_sectors;
 
-       mddev->barriers_work = 1;
        mddev->ok_start_degraded = start_dirty_degraded;
 
        if (start_readonly && mddev->ro == 0)
@@ -4683,7 +4694,6 @@ static void md_clean(mddev_t *mddev)
        mddev->recovery = 0;
        mddev->in_sync = 0;
        mddev->degraded = 0;
-       mddev->barriers_work = 0;
        mddev->safemode = 0;
        mddev->bitmap_info.offset = 0;
        mddev->bitmap_info.default_offset = 0;
@@ -5951,16 +5961,14 @@ static int md_open(struct block_device *bdev, fmode_t mode)
        mddev_t *mddev = mddev_find(bdev->bd_dev);
        int err;
 
-       lock_kernel();
        if (mddev->gendisk != bdev->bd_disk) {
                /* we are racing with mddev_put which is discarding this
                 * bd_disk.
                 */
                mddev_put(mddev);
                /* Wait until bdev->bd_disk is definitely gone */
-               flush_scheduled_work();
+               flush_workqueue(md_misc_wq);
                /* Then retry the open from the top */
-               unlock_kernel();
                return -ERESTARTSYS;
        }
        BUG_ON(mddev != bdev->bd_disk->private_data);
@@ -5974,7 +5982,6 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 
        check_disk_size_change(mddev->gendisk, bdev);
  out:
-       unlock_kernel();
        return err;
 }
 
@@ -5983,10 +5990,8 @@ static int md_release(struct gendisk *disk, fmode_t mode)
        mddev_t *mddev = disk->private_data;
 
        BUG_ON(!mddev);
-       lock_kernel();
        atomic_dec(&mddev->openers);
        mddev_put(mddev);
-       unlock_kernel();
 
        return 0;
 }
@@ -6118,7 +6123,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        if (mddev->event_work.func)
-               schedule_work(&mddev->event_work);
+               queue_work(md_misc_wq, &mddev->event_work);
        md_new_event_inintr(mddev);
 }
 
@@ -7278,12 +7283,23 @@ static void md_geninit(void)
 
 static int __init md_init(void)
 {
-       if (register_blkdev(MD_MAJOR, "md"))
-               return -1;
-       if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
-               unregister_blkdev(MD_MAJOR, "md");
-               return -1;
-       }
+       int ret = -ENOMEM;
+
+       md_wq = alloc_workqueue("md", WQ_RESCUER, 0);
+       if (!md_wq)
+               goto err_wq;
+
+       md_misc_wq = alloc_workqueue("md_misc", 0, 0);
+       if (!md_misc_wq)
+               goto err_misc_wq;
+
+       if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+               goto err_md;
+
+       if ((ret = register_blkdev(0, "mdp")) < 0)
+               goto err_mdp;
+       mdp_major = ret;
+
        blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
                            md_probe, NULL, NULL);
        blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
@@ -7294,8 +7310,16 @@ static int __init md_init(void)
 
        md_geninit();
        return 0;
-}
 
+err_mdp:
+       unregister_blkdev(MD_MAJOR, "md");
+err_md:
+       destroy_workqueue(md_misc_wq);
+err_misc_wq:
+       destroy_workqueue(md_wq);
+err_wq:
+       return ret;
+}
 
 #ifndef MODULE
 
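md_init() is rewritten into the canonical kernel error-unwind ladder: each setup step that fails jumps to a label that releases, in reverse order, only what the earlier steps acquired, and the function now propagates a real error code instead of a bare -1. The shape in a standalone sketch, with malloc() standing in for alloc_workqueue() and register_blkdev():

    #include <stdio.h>
    #include <stdlib.h>

    static int init_subsystems(void)
    {
        int ret = -1;            /* a real errno in the kernel      */
        void *a, *b, *c;

        a = malloc(16);          /* e.g. alloc_workqueue("md")      */
        if (!a)
            goto err_a;
        b = malloc(16);          /* e.g. alloc_workqueue("md_misc") */
        if (!b)
            goto err_b;
        c = malloc(16);          /* e.g. register_blkdev()          */
        if (!c)
            goto err_c;
        return 0;                /* success: handles stay live      */

        /* Unwind in reverse order; each label releases exactly
         * what the steps before the failing one acquired. */
    err_c:
        free(b);
    err_b:
        free(a);
    err_a:
        return ret;
    }

    int main(void)
    {
        return init_subsystems() ? EXIT_FAILURE : EXIT_SUCCESS;
    }
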
@@ -7382,6 +7406,8 @@ static __exit void md_exit(void)
                export_array(mddev);
                mddev->hold_active = 0;
        }
+       destroy_workqueue(md_misc_wq);
+       destroy_workqueue(md_wq);
 }
 
 subsys_initcall(md_init);