Merge remote-tracking branch 'md/for-next'
author    Thierry Reding <treding@nvidia.com>
          Thu, 24 Oct 2013 12:36:40 +0000 (14:36 +0200)
committer Thierry Reding <treding@nvidia.com>
          Thu, 24 Oct 2013 12:36:40 +0000 (14:36 +0200)
drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid1.h
drivers/md/raid5.c
drivers/md/raid5.h
include/linux/wait.h

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 561a65f82e26af05c8210f94b49367b6fa51e55b..628cd529343f0b5bc38d44899c89adbb17e69e65 100644
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
 
 static struct ctl_table_header *raid_table_header;
 
-static ctl_table raid_table[] = {
+static struct ctl_table raid_table[] = {
        {
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
        { }
 };
 
-static ctl_table raid_dir_table[] = {
+static struct ctl_table raid_dir_table[] = {
        {
                .procname       = "raid",
                .maxlen         = 0,
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
        { }
 };
 
-static ctl_table raid_root_table[] = {
+static struct ctl_table raid_root_table[] = {
        {
                .procname       = "dev",
                .maxlen         = 0,
@@ -5331,20 +5331,31 @@ EXPORT_SYMBOL_GPL(md_stop);
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 {
        int err = 0;
+       int did_freeze = 0;
+
+       if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+               did_freeze = 1;
+               set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+               md_wakeup_thread(mddev->thread);
+       }
+       if (mddev->sync_thread)
+               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+       mddev_unlock(mddev);
+       wait_event(resync_wait, mddev->sync_thread == NULL);
+       mddev_lock(mddev);
+
        mutex_lock(&mddev->open_mutex);
-       if (atomic_read(&mddev->openers) > !!bdev) {
+       if (atomic_read(&mddev->openers) > !!bdev ||
+           mddev->sync_thread ||
+           (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
                printk("md: %s still in use.\n",mdname(mddev));
+               if (did_freeze) {
+                       clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+                       md_wakeup_thread(mddev->thread);
+               }
                err = -EBUSY;
                goto out;
        }
-       if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-               /* Someone opened the device since we flushed it
-                * so page cache could be dirty and it is too late
-                * to flush.  So abort
-                */
-               mutex_unlock(&mddev->open_mutex);
-               return -EBUSY;
-       }
        if (mddev->pers) {
                __md_stop_writes(mddev);
 
@@ -5355,7 +5366,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
                set_disk_ro(mddev->gendisk, 1);
                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                sysfs_notify_dirent_safe(mddev->sysfs_state);
-               err = 0;        
+               err = 0;
        }
 out:
        mutex_unlock(&mddev->open_mutex);
@@ -5371,20 +5382,30 @@ static int do_md_stop(struct mddev * mddev, int mode,
 {
        struct gendisk *disk = mddev->gendisk;
        struct md_rdev *rdev;
+       int did_freeze = 0;
+
+       if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+               did_freeze = 1;
+               set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+               md_wakeup_thread(mddev->thread);
+       }
+       if (mddev->sync_thread)
+               set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+       mddev_unlock(mddev);
+       wait_event(resync_wait, mddev->sync_thread == NULL);
+       mddev_lock(mddev);
 
        mutex_lock(&mddev->open_mutex);
        if (atomic_read(&mddev->openers) > !!bdev ||
-           mddev->sysfs_active) {
+           mddev->sysfs_active ||
+           mddev->sync_thread ||
+           (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
                printk("md: %s still in use.\n",mdname(mddev));
                mutex_unlock(&mddev->open_mutex);
-               return -EBUSY;
-       }
-       if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
-               /* Someone opened the device since we flushed it
-                * so page cache could be dirty and it is too late
-                * to flush.  So abort
-                */
-               mutex_unlock(&mddev->open_mutex);
+               if (did_freeze) {
+                       clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+                       md_wakeup_thread(mddev->thread);
+               }
                return -EBUSY;
        }
        if (mddev->pers) {
@@ -7934,6 +7955,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 
        /* resync has finished, collect result */
        md_unregister_thread(&mddev->sync_thread);
+       wake_up(&resync_wait);
        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
            !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
                /* success...*/
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index aacf6bf352d87978a15633b05a0d51d3579ce16d..b4a6dcd4710a3b20a50e2d22572cb0ac942bb66f 100644
@@ -84,10 +84,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 }
 
 #define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_DEPTH 32
 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
@@ -814,8 +816,6 @@ static void flush_pending_writes(struct r1conf *conf)
  *    there is no normal IO happening.  It must arrange to call
  *    lower_barrier when the particular background IO completes.
  */
-#define RESYNC_DEPTH 32
-
 static void raise_barrier(struct r1conf *conf)
 {
        spin_lock_irq(&conf->resync_lock);
@@ -829,6 +829,7 @@ static void raise_barrier(struct r1conf *conf)
 
        /* Now wait for all pending IO to complete */
        wait_event_lock_irq(conf->wait_barrier,
+                           !conf->array_frozen &&
                            !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
                            conf->resync_lock);
 
@@ -860,10 +861,11 @@ static void wait_barrier(struct r1conf *conf)
                 * count down.
                 */
                wait_event_lock_irq(conf->wait_barrier,
-                                   !conf->barrier ||
+                                   !conf->array_frozen &&
+                                   (!conf->barrier ||
                                    (conf->nr_pending &&
                                     current->bio_list &&
-                                    !bio_list_empty(current->bio_list)),
+                                    !bio_list_empty(current->bio_list))),
                                    conf->resync_lock);
                conf->nr_waiting--;
        }
@@ -884,8 +886,7 @@ static void freeze_array(struct r1conf *conf, int extra)
 {
        /* stop syncio and normal IO and wait for everything to
         * go quiet.
-        * We increment barrier and nr_waiting, and then
-        * wait until nr_pending match nr_queued+extra
+        * We wait until nr_pending matches nr_queued+extra
         * This is called in the context of one normal IO request
         * that has failed. Thus any sync request that might be pending
         * will be blocked by nr_pending, and we need to wait for
@@ -895,8 +896,7 @@ static void freeze_array(struct r1conf *conf, int extra)
         * we continue.
         */
        spin_lock_irq(&conf->resync_lock);
-       conf->barrier++;
-       conf->nr_waiting++;
+       conf->array_frozen = 1;
        wait_event_lock_irq_cmd(conf->wait_barrier,
                                conf->nr_pending == conf->nr_queued+extra,
                                conf->resync_lock,
@@ -907,8 +907,7 @@ static void unfreeze_array(struct r1conf *conf)
 {
        /* reverse the effect of the freeze */
        spin_lock_irq(&conf->resync_lock);
-       conf->barrier--;
-       conf->nr_waiting--;
+       conf->array_frozen = 0;
        wake_up(&conf->wait_barrier);
        spin_unlock_irq(&conf->resync_lock);
 }
@@ -2871,8 +2870,8 @@ static int stop(struct mddev *mddev)
                           atomic_read(&bitmap->behind_writes) == 0);
        }
 
-       raise_barrier(conf);
-       lower_barrier(conf);
+       freeze_array(conf, 0);
+       unfreeze_array(conf);
 
        md_unregister_thread(&mddev->thread);
        if (conf->r1bio_pool)
@@ -3031,10 +3030,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
                wake_up(&conf->wait_barrier);
                break;
        case 1:
-               raise_barrier(conf);
+               freeze_array(conf, 0);
                break;
        case 0:
-               lower_barrier(conf);
+               unfreeze_array(conf);
                break;
        }
 }
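Since conf->array_frozen now gates both raise_barrier() and wait_barrier(), the freeze_array()/unfreeze_array() pair quiesces resync and normal I/O alike, which is why stop() and raid1_quiesce() switch over to it. A simplified sketch of the window this creates (freeze_array() additionally takes an extra count and flushes pending bios via the wait_event_lock_irq_cmd() command, omitted here):

	spin_lock_irq(&conf->resync_lock);
	conf->array_frozen = 1;		/* new barrier/wait callers block */
	wait_event_lock_irq(conf->wait_barrier,
			    conf->nr_pending == conf->nr_queued,
			    conf->resync_lock);	/* drain in-flight I/O */
	spin_unlock_irq(&conf->resync_lock);

	/* ... array quiescent: safe to stop threads or reconfigure ... */

	spin_lock_irq(&conf->resync_lock);
	conf->array_frozen = 0;
	wake_up(&conf->wait_barrier);
	spin_unlock_irq(&conf->resync_lock);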
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7eba5ec4fed61a9922276b07b363aff..331a98a231b4e3a3dff585055655f065c0f68de9 100644
@@ -65,6 +65,7 @@ struct r1conf {
        int                     nr_waiting;
        int                     nr_queued;
        int                     barrier;
+       int                     array_frozen;
 
        /* Set to 1 if a full sync is needed, (fresh device added).
         * Cleared when a sync completes.
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f8b9068439267a3539d671e6e59ceb0b47b90547..0e78fe370d44c936569f4b30cfb88b78e7272c15 100644
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
        return &conf->stripe_hashtbl[hash];
 }
 
+static inline int stripe_hash_locks_hash(sector_t sect)
+{
+       return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+}
+
+static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+{
+       spin_lock_irq(conf->hash_locks + hash);
+       spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+{
+       spin_unlock(&conf->device_lock);
+       spin_unlock_irq(conf->hash_locks + hash);
+}
+
+static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+       int i;
+       local_irq_disable();
+       spin_lock(conf->hash_locks);
+       for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+               spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
+       spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+       int i;
+       spin_unlock(&conf->device_lock);
+       for (i = NR_STRIPE_HASH_LOCKS; i; i--)
+               spin_unlock(conf->hash_locks + i - 1);
+       local_irq_enable();
+}
+
 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
  * order without overlap.  There may be several bio's per stripe+device, and
  * a bio could span several devices.
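These helpers pin down the lock ordering: a per-hash lock is always taken before device_lock, and taking all eight starts from lock 0 with the rest acquired via spin_lock_nest_lock() so lockdep treats them as a single nesting class. An illustrative use of the per-hash variant (not code from this patch):

	int hash = stripe_hash_locks_hash(sh->sector);

	lock_device_hash_lock(conf, hash);	/* hash lock first, then device_lock */
	/* ... touch conf->inactive_list + hash plus conf-wide state ... */
	unlock_device_hash_lock(conf, hash);	/* released in reverse order */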
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
        }
 }
 
-static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
+                             struct list_head *temp_inactive_list)
 {
        BUG_ON(!list_empty(&sh->lru));
        BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,19 +315,63 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
                            < IO_THRESHOLD)
                                md_wakeup_thread(conf->mddev->thread);
                atomic_dec(&conf->active_stripes);
-               if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
-                       list_add_tail(&sh->lru, &conf->inactive_list);
-                       wake_up(&conf->wait_for_stripe);
-                       if (conf->retry_read_aligned)
-                               md_wakeup_thread(conf->mddev->thread);
-               }
+               if (!test_bit(STRIPE_EXPANDING, &sh->state))
+                       list_add_tail(&sh->lru, temp_inactive_list);
        }
 }
 
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
+                            struct list_head *temp_inactive_list)
 {
        if (atomic_dec_and_test(&sh->count))
-               do_release_stripe(conf, sh);
+               do_release_stripe(conf, sh, temp_inactive_list);
+}
+
+/*
+ * @hash may be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list holds
+ * one list per hash lock.
+ *
+ * Be careful: only one task can add/delete stripes from a temp_inactive_list
+ * at a given time. Adding stripes only takes the device lock, while deleting
+ * stripes only takes the hash lock.
+ */
+static void release_inactive_stripe_list(struct r5conf *conf,
+                                        struct list_head *temp_inactive_list,
+                                        int hash)
+{
+       int size;
+       bool do_wakeup = false;
+       unsigned long flags;
+
+       if (hash == NR_STRIPE_HASH_LOCKS) {
+               size = NR_STRIPE_HASH_LOCKS;
+               hash = NR_STRIPE_HASH_LOCKS - 1;
+       } else
+               size = 1;
+       while (size) {
+               struct list_head *list = &temp_inactive_list[size - 1];
+
+               /*
+                * We don't hold any lock here yet, get_active_stripe() might
+                * remove stripes from the list
+                */
+               if (!list_empty_careful(list)) {
+                       spin_lock_irqsave(conf->hash_locks + hash, flags);
+                       if (list_empty(conf->inactive_list + hash) &&
+                           !list_empty(list))
+                               atomic_dec(&conf->empty_inactive_list_nr);
+                       list_splice_tail_init(list, conf->inactive_list + hash);
+                       do_wakeup = true;
+                       spin_unlock_irqrestore(conf->hash_locks + hash, flags);
+               }
+               size--;
+               hash--;
+       }
+
+       if (do_wakeup) {
+               wake_up(&conf->wait_for_stripe);
+               if (conf->retry_read_aligned)
+                       md_wakeup_thread(conf->mddev->thread);
+       }
 }
 
 static struct llist_node *llist_reverse_order(struct llist_node *head)
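The intended caller pattern, mirrored below in raid5_unplug() and handle_active_stripes(), is to batch stripes into per-hash temporary lists while holding device_lock and only then splice them back under the hash locks. A rough sketch, assuming a stripe_head *sh released under device_lock:

	struct list_head temp[NR_STRIPE_HASH_LOCKS];
	int i;

	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
		INIT_LIST_HEAD(temp + i);

	spin_lock_irq(&conf->device_lock);
	/* files the stripe on temp[sh->hash_lock_index] once its count drops */
	__release_stripe(conf, sh, &temp[sh->hash_lock_index]);
	spin_unlock_irq(&conf->device_lock);

	/* splice every non-empty temp list back, taking only the hash locks */
	release_inactive_stripe_list(conf, temp, NR_STRIPE_HASH_LOCKS);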
@@ -308,7 +389,8 @@ static struct llist_node *llist_reverse_order(struct llist_node *head)
 }
 
 /* should hold conf->device_lock already */
-static int release_stripe_list(struct r5conf *conf)
+static int release_stripe_list(struct r5conf *conf,
+                              struct list_head *temp_inactive_list)
 {
        struct stripe_head *sh;
        int count = 0;
@@ -317,6 +399,8 @@ static int release_stripe_list(struct r5conf *conf)
        head = llist_del_all(&conf->released_stripes);
        head = llist_reverse_order(head);
        while (head) {
+               int hash;
+
                sh = llist_entry(head, struct stripe_head, release_list);
                head = llist_next(head);
                /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
@@ -327,7 +411,8 @@ static int release_stripe_list(struct r5conf *conf)
                 * again, the count is always > 1. This is true for
                 * STRIPE_ON_UNPLUG_LIST bit too.
                 */
-               __release_stripe(conf, sh);
+               hash = sh->hash_lock_index;
+               __release_stripe(conf, sh, &temp_inactive_list[hash]);
                count++;
        }
 
@@ -338,6 +423,8 @@ static void release_stripe(struct stripe_head *sh)
 {
        struct r5conf *conf = sh->raid_conf;
        unsigned long flags;
+       struct list_head list;
+       int hash;
        bool wakeup;
 
        if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
@@ -350,8 +437,11 @@ slow_path:
        local_irq_save(flags);
        /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
        if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
-               do_release_stripe(conf, sh);
+               INIT_LIST_HEAD(&list);
+               hash = sh->hash_lock_index;
+               do_release_stripe(conf, sh, &list);
                spin_unlock(&conf->device_lock);
+               release_inactive_stripe_list(conf, &list, hash);
        }
        local_irq_restore(flags);
 }
@@ -376,18 +466,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 
 
 /* find an idle stripe, make sure it is unhashed, and return it. */
-static struct stripe_head *get_free_stripe(struct r5conf *conf)
+static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
 {
        struct stripe_head *sh = NULL;
        struct list_head *first;
 
-       if (list_empty(&conf->inactive_list))
+       if (list_empty(conf->inactive_list + hash))
                goto out;
-       first = conf->inactive_list.next;
+       first = (conf->inactive_list + hash)->next;
        sh = list_entry(first, struct stripe_head, lru);
        list_del_init(first);
        remove_hash(sh);
        atomic_inc(&conf->active_stripes);
+       BUG_ON(hash != sh->hash_lock_index);
+       if (list_empty(conf->inactive_list + hash))
+               atomic_inc(&conf->empty_inactive_list_nr);
 out:
        return sh;
 }
@@ -430,7 +523,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
        struct r5conf *conf = sh->raid_conf;
-       int i;
+       int i, seq;
 
        BUG_ON(atomic_read(&sh->count) != 0);
        BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -440,7 +533,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                (unsigned long long)sh->sector);
 
        remove_hash(sh);
-
+retry:
+       seq = read_seqcount_begin(&conf->gen_lock);
        sh->generation = conf->generation - previous;
        sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
        sh->sector = sector;
@@ -462,6 +556,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
                dev->flags = 0;
                raid5_build_block(sh, i, previous);
        }
+       if (read_seqcount_retry(&conf->gen_lock, seq))
+               goto retry;
        insert_hash(conf, sh);
        sh->cpu = smp_processor_id();
 }
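The retry loop pairs with the reshape code, which wraps its geometry updates in a write seqcount on conf->gen_lock (see the raid5_start_reshape hunk further down); a reader racing with such an update simply re-derives the stripe layout. The writer side, in sketch form:

	spin_lock_irq(&conf->device_lock);
	write_seqcount_begin(&conf->gen_lock);
	conf->generation++;		/* any concurrent init_stripe() retries */
	/* ... update raid_disks, chunk_sectors, algorithm ... */
	write_seqcount_end(&conf->gen_lock);
	spin_unlock_irq(&conf->device_lock);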
@@ -566,29 +662,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                  int previous, int noblock, int noquiesce)
 {
        struct stripe_head *sh;
+       int hash = stripe_hash_locks_hash(sector);
 
        pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 
-       spin_lock_irq(&conf->device_lock);
+       spin_lock_irq(conf->hash_locks + hash);
 
        do {
                wait_event_lock_irq(conf->wait_for_stripe,
                                    conf->quiesce == 0 || noquiesce,
-                                   conf->device_lock);
+                                   *(conf->hash_locks + hash));
                sh = __find_stripe(conf, sector, conf->generation - previous);
                if (!sh) {
                        if (!conf->inactive_blocked)
-                               sh = get_free_stripe(conf);
+                               sh = get_free_stripe(conf, hash);
                        if (noblock && sh == NULL)
                                break;
                        if (!sh) {
                                conf->inactive_blocked = 1;
-                               wait_event_lock_irq(conf->wait_for_stripe,
-                                                   !list_empty(&conf->inactive_list) &&
-                                                   (atomic_read(&conf->active_stripes)
-                                                    < (conf->max_nr_stripes *3/4)
-                                                    || !conf->inactive_blocked),
-                                                   conf->device_lock);
+                               wait_event_lock_irq(
+                                       conf->wait_for_stripe,
+                                       !list_empty(conf->inactive_list + hash) &&
+                                       (atomic_read(&conf->active_stripes)
+                                        < (conf->max_nr_stripes * 3 / 4)
+                                        || !conf->inactive_blocked),
+                                       *(conf->hash_locks + hash));
                                conf->inactive_blocked = 0;
                        } else
                                init_stripe(sh, sector, previous);
@@ -599,6 +697,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
                                    && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
                        } else {
+                               spin_lock(&conf->device_lock);
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
                                if (list_empty(&sh->lru) &&
@@ -609,6 +708,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
                                        sh->group->stripes_cnt--;
                                        sh->group = NULL;
                                }
+                               spin_unlock(&conf->device_lock);
                        }
                }
        } while (sh == NULL);
@@ -616,7 +716,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
        if (sh)
                atomic_inc(&sh->count);
 
-       spin_unlock_irq(&conf->device_lock);
+       spin_unlock_irq(conf->hash_locks + hash);
        return sh;
 }
 
@@ -1596,7 +1696,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
        put_cpu();
 }
 
-static int grow_one_stripe(struct r5conf *conf)
+static int grow_one_stripe(struct r5conf *conf, int hash)
 {
        struct stripe_head *sh;
        sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1612,6 +1712,7 @@ static int grow_one_stripe(struct r5conf *conf)
                kmem_cache_free(conf->slab_cache, sh);
                return 0;
        }
+       sh->hash_lock_index = hash;
        /* we just created an active stripe so... */
        atomic_set(&sh->count, 1);
        atomic_inc(&conf->active_stripes);
@@ -1624,6 +1725,7 @@ static int grow_stripes(struct r5conf *conf, int num)
 {
        struct kmem_cache *sc;
        int devs = max(conf->raid_disks, conf->previous_raid_disks);
+       int hash;
 
        if (conf->mddev->gendisk)
                sprintf(conf->cache_name[0],
@@ -1641,9 +1743,13 @@ static int grow_stripes(struct r5conf *conf, int num)
                return 1;
        conf->slab_cache = sc;
        conf->pool_size = devs;
-       while (num--)
-               if (!grow_one_stripe(conf))
+       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
+       while (num--) {
+               if (!grow_one_stripe(conf, hash))
                        return 1;
+               conf->max_nr_stripes++;
+               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
+       }
        return 0;
 }
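Because hash starts at conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS and advances with each allocation, stripes are distributed round-robin across the hash buckets: the initial NR_STRIPES (256) allocation puts 32 stripes in each of the 8 buckets, and a later grow via raid5_set_cache_size() resumes where the count left off, keeping the buckets within one stripe of each other.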
 
@@ -1701,6 +1807,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        int err;
        struct kmem_cache *sc;
        int i;
+       int hash, cnt;
 
        if (newsize <= conf->pool_size)
                return 0; /* never bother to shrink */
@@ -1740,19 +1847,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
         * OK, we have enough stripes, start collecting inactive
         * stripes and copying them over
         */
+       hash = 0;
+       cnt = 0;
        list_for_each_entry(nsh, &newstripes, lru) {
-               spin_lock_irq(&conf->device_lock);
-               wait_event_lock_irq(conf->wait_for_stripe,
-                                   !list_empty(&conf->inactive_list),
-                                   conf->device_lock);
-               osh = get_free_stripe(conf);
-               spin_unlock_irq(&conf->device_lock);
+               lock_device_hash_lock(conf, hash);
+               wait_event_cmd(conf->wait_for_stripe,
+                                   !list_empty(conf->inactive_list + hash),
+                                   unlock_device_hash_lock(conf, hash),
+                                   lock_device_hash_lock(conf, hash));
+               osh = get_free_stripe(conf, hash);
+               unlock_device_hash_lock(conf, hash);
                atomic_set(&nsh->count, 1);
                for(i=0; i<conf->pool_size; i++)
                        nsh->dev[i].page = osh->dev[i].page;
                for( ; i<newsize; i++)
                        nsh->dev[i].page = NULL;
+               nsh->hash_lock_index = hash;
                kmem_cache_free(conf->slab_cache, osh);
+               cnt++;
+               if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
+                   !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
+                       hash++;
+                       cnt = 0;
+               }
        }
        kmem_cache_destroy(conf->slab_cache);
 
@@ -1811,13 +1928,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
        return err;
 }
 
-static int drop_one_stripe(struct r5conf *conf)
+static int drop_one_stripe(struct r5conf *conf, int hash)
 {
        struct stripe_head *sh;
 
-       spin_lock_irq(&conf->device_lock);
-       sh = get_free_stripe(conf);
-       spin_unlock_irq(&conf->device_lock);
+       spin_lock_irq(conf->hash_locks + hash);
+       sh = get_free_stripe(conf, hash);
+       spin_unlock_irq(conf->hash_locks + hash);
        if (!sh)
                return 0;
        BUG_ON(atomic_read(&sh->count));
@@ -1829,8 +1946,10 @@ static int drop_one_stripe(struct r5conf *conf)
 
 static void shrink_stripes(struct r5conf *conf)
 {
-       while (drop_one_stripe(conf))
-               ;
+       int hash;
+       for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
+               while (drop_one_stripe(conf, hash))
+                       ;
 
        if (conf->slab_cache)
                kmem_cache_destroy(conf->slab_cache);
@@ -1935,6 +2054,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
                               mdname(conf->mddev), bdn);
                else
                        retry = 1;
+               if (set_bad && test_bit(In_sync, &rdev->flags)
+                   && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+                       retry = 1;
                if (retry)
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
                                set_bit(R5_ReadError, &sh->dev[i].flags);
@@ -3914,7 +4036,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
        }
 }
 
-static void activate_bit_delay(struct r5conf *conf)
+static void activate_bit_delay(struct r5conf *conf,
+       struct list_head *temp_inactive_list)
 {
        /* device_lock is held */
        struct list_head head;
@@ -3922,9 +4045,11 @@ static void activate_bit_delay(struct r5conf *conf)
        list_del_init(&conf->bitmap_list);
        while (!list_empty(&head)) {
                struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+               int hash;
                list_del_init(&sh->lru);
                atomic_inc(&sh->count);
-               __release_stripe(conf, sh);
+               hash = sh->hash_lock_index;
+               __release_stripe(conf, sh, &temp_inactive_list[hash]);
        }
 }
 
@@ -3940,7 +4065,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
                return 1;
        if (conf->quiesce)
                return 1;
-       if (list_empty_careful(&conf->inactive_list))
+       if (atomic_read(&conf->empty_inactive_list_nr))
                return 1;
 
        return 0;
@@ -4270,6 +4395,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 struct raid5_plug_cb {
        struct blk_plug_cb      cb;
        struct list_head        list;
+       struct list_head        temp_inactive_list[NR_STRIPE_HASH_LOCKS];
 };
 
 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4280,6 +4406,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
        struct mddev *mddev = cb->cb.data;
        struct r5conf *conf = mddev->private;
        int cnt = 0;
+       int hash;
 
        if (cb->list.next && !list_empty(&cb->list)) {
                spin_lock_irq(&conf->device_lock);
@@ -4297,11 +4424,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
                         * STRIPE_ON_RELEASE_LIST could be set here. In that
                         * case, the count is always > 1 here
                         */
-                       __release_stripe(conf, sh);
+                       hash = sh->hash_lock_index;
+                       __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
                        cnt++;
                }
                spin_unlock_irq(&conf->device_lock);
        }
+       release_inactive_stripe_list(conf, cb->temp_inactive_list,
+                                    NR_STRIPE_HASH_LOCKS);
        if (mddev->queue)
                trace_block_unplug(mddev->queue, cnt, !from_schedule);
        kfree(cb);
@@ -4322,8 +4452,12 @@ static void release_stripe_plug(struct mddev *mddev,
 
        cb = container_of(blk_cb, struct raid5_plug_cb, cb);
 
-       if (cb->list.next == NULL)
+       if (cb->list.next == NULL) {
+               int i;
                INIT_LIST_HEAD(&cb->list);
+               for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+                       INIT_LIST_HEAD(cb->temp_inactive_list + i);
+       }
 
        if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
                list_add_tail(&sh->lru, &cb->list);
@@ -4968,27 +5102,45 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 }
 
 static int handle_active_stripes(struct r5conf *conf, int group,
-                                struct r5worker *worker)
+                                struct r5worker *worker,
+                                struct list_head *temp_inactive_list)
 {
        struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
-       int i, batch_size = 0;
+       int i, batch_size = 0, hash;
+       bool release_inactive = false;
 
        while (batch_size < MAX_STRIPE_BATCH &&
                        (sh = __get_priority_stripe(conf, group)) != NULL)
                batch[batch_size++] = sh;
 
-       if (batch_size == 0)
-               return batch_size;
+       if (batch_size == 0) {
+               for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+                       if (!list_empty(temp_inactive_list + i))
+                               break;
+               if (i == NR_STRIPE_HASH_LOCKS)
+                       return batch_size;
+               release_inactive = true;
+       }
        spin_unlock_irq(&conf->device_lock);
 
+       release_inactive_stripe_list(conf, temp_inactive_list,
+                                    NR_STRIPE_HASH_LOCKS);
+
+       if (release_inactive) {
+               spin_lock_irq(&conf->device_lock);
+               return 0;
+       }
+
        for (i = 0; i < batch_size; i++)
                handle_stripe(batch[i]);
 
        cond_resched();
 
        spin_lock_irq(&conf->device_lock);
-       for (i = 0; i < batch_size; i++)
-               __release_stripe(conf, batch[i]);
+       for (i = 0; i < batch_size; i++) {
+               hash = batch[i]->hash_lock_index;
+               __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
+       }
        return batch_size;
 }
 
@@ -5009,9 +5161,10 @@ static void raid5_do_work(struct work_struct *work)
        while (1) {
                int batch_size, released;
 
-               released = release_stripe_list(conf);
+               released = release_stripe_list(conf, worker->temp_inactive_list);
 
-               batch_size = handle_active_stripes(conf, group_id, worker);
+               batch_size = handle_active_stripes(conf, group_id, worker,
+                                                  worker->temp_inactive_list);
                worker->working = false;
                if (!batch_size && !released)
                        break;
@@ -5050,7 +5203,7 @@ static void raid5d(struct md_thread *thread)
                struct bio *bio;
                int batch_size, released;
 
-               released = release_stripe_list(conf);
+               released = release_stripe_list(conf, conf->temp_inactive_list);
 
                if (
                    !list_empty(&conf->bitmap_list)) {
@@ -5060,7 +5213,7 @@ static void raid5d(struct md_thread *thread)
                        bitmap_unplug(mddev->bitmap);
                        spin_lock_irq(&conf->device_lock);
                        conf->seq_write = conf->seq_flush;
-                       activate_bit_delay(conf);
+                       activate_bit_delay(conf, conf->temp_inactive_list);
                }
                raid5_activate_delayed(conf);
 
@@ -5074,7 +5227,8 @@ static void raid5d(struct md_thread *thread)
                        handled++;
                }
 
-               batch_size = handle_active_stripes(conf, ANY_GROUP, NULL);
+               batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
+                                                  conf->temp_inactive_list);
                if (!batch_size && !released)
                        break;
                handled += batch_size;
@@ -5110,22 +5264,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 {
        struct r5conf *conf = mddev->private;
        int err;
+       int hash;
 
        if (size <= 16 || size > 32768)
                return -EINVAL;
+       hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
        while (size < conf->max_nr_stripes) {
-               if (drop_one_stripe(conf))
+               if (drop_one_stripe(conf, hash))
                        conf->max_nr_stripes--;
                else
                        break;
+               hash--;
+               if (hash < 0)
+                       hash = NR_STRIPE_HASH_LOCKS - 1;
        }
        err = md_allow_write(mddev);
        if (err)
                return err;
+       hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
        while (size > conf->max_nr_stripes) {
-               if (grow_one_stripe(conf))
+               if (grow_one_stripe(conf, hash))
                        conf->max_nr_stripes++;
                else break;
+               hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
        }
        return 0;
 }
@@ -5276,7 +5437,7 @@ static struct attribute_group raid5_attrs_group = {
 
 static int alloc_thread_groups(struct r5conf *conf, int cnt)
 {
-       int i, j;
+       int i, j, k;
        ssize_t size;
        struct r5worker *workers;
 
@@ -5306,8 +5467,12 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt)
                group->workers = workers + i * cnt;
 
                for (j = 0; j < cnt; j++) {
-                       group->workers[j].group = group;
-                       INIT_WORK(&group->workers[j].work, raid5_do_work);
+                       struct r5worker *worker = group->workers + j;
+                       worker->group = group;
+                       INIT_WORK(&worker->work, raid5_do_work);
+
+                       for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
+                               INIT_LIST_HEAD(worker->temp_inactive_list + k);
                }
        }
 
@@ -5458,6 +5623,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        struct md_rdev *rdev;
        struct disk_info *disk;
        char pers_name[6];
+       int i;
 
        if (mddev->new_level != 5
            && mddev->new_level != 4
@@ -5502,7 +5668,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        INIT_LIST_HEAD(&conf->hold_list);
        INIT_LIST_HEAD(&conf->delayed_list);
        INIT_LIST_HEAD(&conf->bitmap_list);
-       INIT_LIST_HEAD(&conf->inactive_list);
        init_llist_head(&conf->released_stripes);
        atomic_set(&conf->active_stripes, 0);
        atomic_set(&conf->preread_active_stripes, 0);
@@ -5528,6 +5693,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
                goto abort;
 
+       spin_lock_init(conf->hash_locks);
+       for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+               spin_lock_init(conf->hash_locks + i);
+
+       for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+               INIT_LIST_HEAD(conf->inactive_list + i);
+
+       for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+               INIT_LIST_HEAD(conf->temp_inactive_list + i);
+
        conf->level = mddev->new_level;
        if (raid5_alloc_percpu(conf) != 0)
                goto abort;
@@ -5568,7 +5743,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
        else
                conf->max_degraded = 1;
        conf->algorithm = mddev->new_layout;
-       conf->max_nr_stripes = NR_STRIPES;
        conf->reshape_progress = mddev->reshape_position;
        if (conf->reshape_progress != MaxSector) {
                conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5577,7 +5751,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
        memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
                 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
-       if (grow_stripes(conf, conf->max_nr_stripes)) {
+       atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
+       if (grow_stripes(conf, NR_STRIPES)) {
                printk(KERN_ERR
                       "md/raid:%s: couldn't allocate %dkB for buffers\n",
                       mdname(mddev), memory);
@@ -6383,12 +6558,18 @@ static int raid5_start_reshape(struct mddev *mddev)
        if (!mddev->sync_thread) {
                mddev->recovery = 0;
                spin_lock_irq(&conf->device_lock);
+               write_seqcount_begin(&conf->gen_lock);
                mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+               mddev->new_chunk_sectors =
+                       conf->chunk_sectors = conf->prev_chunk_sectors;
+               mddev->new_layout = conf->algorithm = conf->prev_algo;
                rdev_for_each(rdev, mddev)
                        rdev->new_data_offset = rdev->data_offset;
                smp_wmb();
+               conf->generation --;
                conf->reshape_progress = MaxSector;
                mddev->reshape_position = MaxSector;
+               write_seqcount_end(&conf->gen_lock);
                spin_unlock_irq(&conf->device_lock);
                return -EAGAIN;
        }
@@ -6476,27 +6657,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
                break;
 
        case 1: /* stop all writes */
-               spin_lock_irq(&conf->device_lock);
+               lock_all_device_hash_locks_irq(conf);
                /* '2' tells resync/reshape to pause so that all
                 * active stripes can drain
                 */
                conf->quiesce = 2;
-               wait_event_lock_irq(conf->wait_for_stripe,
+               wait_event_cmd(conf->wait_for_stripe,
                                    atomic_read(&conf->active_stripes) == 0 &&
                                    atomic_read(&conf->active_aligned_reads) == 0,
-                                   conf->device_lock);
+                                   unlock_all_device_hash_locks_irq(conf),
+                                   lock_all_device_hash_locks_irq(conf));
                conf->quiesce = 1;
-               spin_unlock_irq(&conf->device_lock);
+               unlock_all_device_hash_locks_irq(conf);
                /* allow reshape to continue */
                wake_up(&conf->wait_for_overlap);
                break;
 
        case 0: /* re-enable writes */
-               spin_lock_irq(&conf->device_lock);
+               lock_all_device_hash_locks_irq(conf);
                conf->quiesce = 0;
                wake_up(&conf->wait_for_stripe);
                wake_up(&conf->wait_for_overlap);
-               spin_unlock_irq(&conf->device_lock);
+               unlock_all_device_hash_locks_irq(conf);
                break;
        }
 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2113ffa82c7a2506d21ce36076c3f6bd057d922b..e4407388670a53dc2da94dfdadd3d9cd2188f332 100644
@@ -205,6 +205,7 @@ struct stripe_head {
        short                   pd_idx;         /* parity disk index */
        short                   qd_idx;         /* 'Q' disk index for raid6 */
        short                   ddf_layout;/* use DDF ordering to calculate Q */
+       short                   hash_lock_index;
        unsigned long           state;          /* state flags */
        atomic_t                count;        /* nr of active thread/requests */
        int                     bm_seq; /* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
        struct md_rdev  *rdev, *replacement;
 };
 
+/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
+ * This is because we sometimes take all the spinlocks
+ * and creating that much locking depth can cause
+ * problems.
+ */
+#define NR_STRIPE_HASH_LOCKS 8
+#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
+
 struct r5worker {
        struct work_struct work;
        struct r5worker_group *group;
+       struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
        bool working;
 };
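For a feel of the mapping, stripe_hash_locks_hash() in raid5.c computes (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; assuming 4 KiB pages (so STRIPE_SHIFT == PAGE_SHIFT - 9 == 3), stripe-aligned sectors rotate across the eight locks:

	(0  >> 3) & 7	/* sector 0  -> lock 0 */
	(8  >> 3) & 7	/* sector 8  -> lock 1 */
	(56 >> 3) & 7	/* sector 56 -> lock 7 */
	(64 >> 3) & 7	/* sector 64 -> lock 0 again: wraps every 8 stripes */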
 
@@ -382,6 +392,8 @@ struct r5worker_group {
 
 struct r5conf {
        struct hlist_head       *stripe_hashtbl;
+       /* each lock protects only the corresponding hash list and inactive_list */
+       spinlock_t              hash_locks[NR_STRIPE_HASH_LOCKS];
        struct mddev            *mddev;
        int                     chunk_sectors;
        int                     level, algorithm;
@@ -462,7 +474,8 @@ struct r5conf {
         * Free stripes pool
         */
        atomic_t                active_stripes;
-       struct list_head        inactive_list;
+       struct list_head        inactive_list[NR_STRIPE_HASH_LOCKS];
+       atomic_t                empty_inactive_list_nr;
        struct llist_head       released_stripes;
        wait_queue_head_t       wait_for_stripe;
        wait_queue_head_t       wait_for_overlap;
@@ -477,6 +490,7 @@ struct r5conf {
         * the new thread here until we fully activate the array.
         */
        struct md_thread        *thread;
+       struct list_head        temp_inactive_list[NR_STRIPE_HASH_LOCKS];
        struct r5worker_group   *worker_groups;
        int                     group_cnt;
        int                     worker_cnt_per_group;
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a67fc1635592fd34717ef559049924b1add707ca..d9eff54c69cfc1ba5696e205e8122e072019eb1c 100644
@@ -253,6 +253,42 @@ do {                                                                       \
        __ret;                                                          \
 })
 
+#define __wait_event_cmd(wq, condition, cmd1, cmd2)                    \
+do {                                                                   \
+       DEFINE_WAIT(__wait);                                            \
+                                                                       \
+       for (;;) {                                                      \
+               prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);    \
+               if (condition)                                          \
+                       break;                                          \
+               cmd1;                                                   \
+               schedule();                                             \
+               cmd2;                                                   \
+       }                                                               \
+       finish_wait(&wq, &__wait);                                      \
+} while (0)
+
+/**
+ * wait_event_cmd - sleep until a condition gets true
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @cmd1: the command to execute before sleeping
+ * @cmd2: the command to execute after waking
+ *
+ * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
+ * @condition evaluates to true. The @condition is checked each time
+ * the waitqueue @wq is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ */
+#define wait_event_cmd(wq, condition, cmd1, cmd2)                      \
+do {                                                                   \
+       if (condition)                                                  \
+               break;                                                  \
+       __wait_event_cmd(wq, condition, cmd1, cmd2);                    \
+} while (0)
+
 #define __wait_event_interruptible(wq, condition, ret)                 \
 do {                                                                   \
        DEFINE_WAIT(__wait);                                            \
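wait_event_cmd() exists so a caller can drop a lock across schedule() and retake it on wakeup, which is exactly how resize_stripes() and raid5_quiesce() above use it with the hash locks. A minimal self-contained sketch (the names here are illustrative, not part of the patch):

	static LIST_HEAD(work_list);
	static DEFINE_SPINLOCK(work_lock);
	static DECLARE_WAIT_QUEUE_HEAD(work_wq);

	static void wait_for_work(void)
	{
		spin_lock_irq(&work_lock);
		/* the condition is evaluated with work_lock held; the lock is
		 * dropped (cmd1) before schedule() and retaken (cmd2) after */
		wait_event_cmd(work_wq, !list_empty(&work_list),
			       spin_unlock_irq(&work_lock),
			       spin_lock_irq(&work_lock));
		/* work_lock held again and work_list non-empty at this point */
		spin_unlock_irq(&work_lock);
	}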