]> git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
block: fix accounting bug on cross partition merges
authorJerome Marchand <jmarchan@redhat.com>
Wed, 5 Jan 2011 15:57:38 +0000 (16:57 +0100)
committerGreg Kroah-Hartman <gregkh@suse.de>
Thu, 17 Feb 2011 23:14:17 +0000 (15:14 -0800)
commit 09e099d4bafea3b15be003d548bdf94b4b6e0e17 upstream.

/proc/diskstats would display a strange output as follows.

$ cat /proc/diskstats |grep sda
   8       0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
   8       1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                                ~~~~~~~~~~
   8       2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
   8       3 sda3 54 487 2188 92 0 0 0 0 0 88 92
   8       4 sda4 4 0 8 0 0 0 0 0 0 0 0
   8       5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.

The detailed root cause is as follows.

Assuming that there are two partition, sda1 and sda2.

1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
   is 0 and sda2's one is 1.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

2. A bio belongs to sda1 is issued and is merged into the request mentioned on
   step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
   from sda2 region to sda1 region. However the two partition's
   hd_struct->in_flight are not changed.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

3. The request is finished and blk_account_io_done() is called. In this case,
   sda2's hd_struct->in_flight, not a sda1's one, is decremented.

        | hd_struct->in_flight
   ---------------------------
   sda1 |         -1
   sda2 |          1
   ---------------------------

The patch fixes the problem by caching the partition lookup
inside the request structure, hence making sure that the increment
and decrement will always happen on the same partition struct. This
also speeds up IO with accounting enabled, since it cuts down on
the number of lookups we have to do.

Also add a refcount to struct hd_struct to keep the partition in
memory as long as users exist. We use kref_test_and_get() to ensure
we don't add a reference to a partition which is going away.

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
block/blk-core.c
block/blk-merge.c
block/genhd.c
fs/partitions/check.c
include/linux/blkdev.h
include/linux/genhd.h

index 4ce953f1b3909f38f213adcc0a39251a34875330..87675207448fdf0dbffc93d209829caf0e016f7f 100644 (file)
@@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io)
                return;
 
        cpu = part_stat_lock();
-       part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
-       if (!new_io)
+       if (!new_io) {
+               part = rq->part;
                part_stat_inc(cpu, part, merges[rw]);
-       else {
+       } else {
+               part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+               if (!kref_test_and_get(&part->ref)) {
+                       /*
+                        * The partition is already being removed,
+                        * the request will be accounted on the disk only
+                        *
+                        * We take a reference on disk->part0 although that
+                        * partition will never be deleted, so we can treat
+                        * it as any other partition.
+                        */
+                       part = &rq->rq_disk->part0;
+                       kref_get(&part->ref);
+               }
                part_round_stats(cpu, part);
                part_inc_in_flight(part, rw);
+               rq->part = part;
        }
 
        part_stat_unlock();
@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
        rq->ref_count = 1;
        rq->start_time = jiffies;
        set_start_time_ns(rq);
+       rq->part = NULL;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
                int cpu;
 
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
                part_stat_add(cpu, part, sectors[rw], bytes >> 9);
                part_stat_unlock();
        }
@@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req)
                int cpu;
 
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
 
                part_stat_inc(cpu, part, ios[rw]);
                part_stat_add(cpu, part, ticks[rw], duration);
                part_round_stats(cpu, part);
                part_dec_in_flight(part, rw);
 
+               kref_put(&part->ref, __delete_partition);
                part_stat_unlock();
        }
 }
index 74bc4a768f32e0f01e5c43f100f8663eea34c3a4..23ea74bc2ea47828b5c5fd5fc0acc0b6fc4211e1 100644 (file)
@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req)
                int cpu;
 
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
 
                part_round_stats(cpu, part);
                part_dec_in_flight(part, rq_data_dir(req));
 
+               kref_put(&part->ref, __delete_partition);
                part_stat_unlock();
        }
 }
index 5fa2b44a72ffd22994d2e648b3d247f3f4877c95..0c55eae59750ced1f2110db336f4ef659a90c6b6 100644 (file)
@@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
                        return NULL;
                }
                disk->part_tbl->part[0] = &disk->part0;
+               kref_init(&disk->part0.ref);
 
                disk->minors = minors;
                rand_initialize_disk(disk);
index 0a8b0ad0c7e25c803ca57a9a814934620ad80039..0123717e3d99cd85237775ee0211fcc631a3ac3c 100644 (file)
@@ -372,6 +372,13 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
        put_device(part_to_dev(part));
 }
 
+void __delete_partition(struct kref *ref)
+{
+       struct hd_struct *part = container_of(ref, struct hd_struct, ref);
+
+       call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+}
+
 void delete_partition(struct gendisk *disk, int partno)
 {
        struct disk_part_tbl *ptbl = disk->part_tbl;
@@ -390,7 +397,7 @@ void delete_partition(struct gendisk *disk, int partno)
        kobject_put(part->holder_dir);
        device_del(part_to_dev(part));
 
-       call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+       kref_put(&part->ref, __delete_partition);
 }
 
 static ssize_t whole_disk_show(struct device *dev,
@@ -489,6 +496,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        if (!dev_get_uevent_suppress(ddev))
                kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
+       kref_init(&p->ref);
        return p;
 
 out_free_info:
index 36ab42c9bb991566dc5004fad9ca3c0c958bfeb0..7572b19383d2c629e3c5f9809edac8ec6d4dc7c2 100644 (file)
@@ -115,6 +115,7 @@ struct request {
        void *elevator_private3;
 
        struct gendisk *rq_disk;
+       struct hd_struct *part;
        unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
        unsigned long long start_time_ns;
index 7a7b9c1644e4f26de810d54fd00a12302d56c927..2ba2792a3dd4e1f1d02fb1f0039825d43afbe8ac 100644 (file)
@@ -116,6 +116,7 @@ struct hd_struct {
        struct disk_stats dkstats;
 #endif
        struct rcu_head rcu_head;
+       struct kref ref;
 };
 
 #define GENHD_FL_REMOVABLE                     1
@@ -583,6 +584,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
                                                     sector_t len, int flags,
                                                     struct partition_meta_info
                                                       *info);
+extern void __delete_partition(struct kref *ref);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);