aoe: become I/O request queue handler for increased user control

author     Ed Cashin <ecashin@coraid.com>
           Wed, 26 Sep 2012 01:35:00 +0000 (11:35 +1000)
committer  Stephen Rothwell <sfr@canb.auug.org.au>
           Wed, 26 Sep 2012 05:45:41 +0000 (15:45 +1000)

diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h

index 0cd6c0f7a535d22ab50c3f9b6bf289e3038df382..8c4f6d942e05abf0ee237da95dec4be1dcc918d5 100644 (file)
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -90,7 +90,7 @@ enum {
         MIN_BUFS = 16,
         NTARGETS = 8,
         NAOEIFS = 8,
-       NSKBPOOLMAX = 128,
+       NSKBPOOLMAX = 256,
         NFACTIVE = 17,
  
         TIMERTICK = HZ / 10,
@@ -100,30 +100,26 @@ enum {
  };
  
  struct buf {
-       struct list_head bufs;
-       ulong stime;    /* for disk stats */
-       ulong flags;
         ulong nframesout;
         ulong resid;
         ulong bv_resid;
-       ulong bv_off;
         sector_t sector;
         struct bio *bio;
         struct bio_vec *bv;
+       struct request *rq;
  };
  
  struct frame {
         struct list_head head;
         u32 tag;
         ulong waited;
-       struct buf *buf;
         struct aoetgt *t;               /* parent target I belong to */
-       char *bufaddr;
-       ulong bcnt;
         sector_t lba;
         struct sk_buff *skb;            /* command skb freed on module exit */
         struct sk_buff *r_skb;          /* response skb for async processing */
+       struct buf *buf;
         struct bio_vec *bv;
+       ulong bcnt;
         ulong bv_off;
  };
  
@@ -161,6 +157,7 @@ struct aoedev {
         u16 rttavg;             /* round trip average of requests/responses */
         u16 mintimer;
         u16 fw_ver;             /* version of blade's firmware */
+       ulong ref;
         struct work_struct work;/* disk create work struct */
         struct gendisk *gd;
         struct request_queue *blkq;
@@ -168,11 +165,13 @@ struct aoedev {
         sector_t ssize;
         struct timer_list timer;
         spinlock_t lock;
-       struct sk_buff_head sendq;
         struct sk_buff_head skbpool;
         mempool_t *bufpool;     /* for deadlock-free Buf allocation */
-       struct list_head bufq;  /* queue of bios to work on */
-       struct buf *inprocess;  /* the one we're currently working on */
+       struct {                /* pointers to work in progress */
+               struct buf *buf;
+               struct bio *nxbio;
+               struct request *rq;
+       } ip;
         struct aoetgt *targets[NTARGETS];
         struct aoetgt **tgt;    /* target in use when working */
         struct aoetgt *htgt;    /* target needing rexmit assistance */
@@ -209,6 +208,8 @@ void aoecmd_exit(void);
  int aoecmd_init(void);
  struct sk_buff *aoecmd_ata_id(struct aoedev *);
  void aoe_freetframe(struct frame *);
+void aoe_flush_iocq(void);
+void aoe_end_request(struct aoedev *, struct request *, int);
  
  int aoedev_init(void);
  void aoedev_exit(void);
@@ -216,7 +217,8 @@ struct aoedev *aoedev_by_aoeaddr(int maj, int min);
  struct aoedev *aoedev_by_sysminor_m(ulong sysminor);
  void aoedev_downdev(struct aoedev *d);
  int aoedev_flush(const char __user *str, size_t size);
-void aoe_failbuf(struct aoedev *d, struct buf *buf);
+void aoe_failbuf(struct aoedev *, struct buf *);
+void aoedev_put(struct aoedev *);
  
  int aoenet_init(void);
  void aoenet_exit(void);
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c

index 3a8f0933cc7db9e8f4f3c40f05d0ac70c2580f26..7ec4b8fa28fdd4c617104fa847f312f90f049d6e 100644 (file)
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -161,68 +161,22 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
  }
  
  static void
-aoeblk_make_request(struct request_queue *q, struct bio *bio)
+aoeblk_request(struct request_queue *q)
  {
-       struct sk_buff_head queue;
         struct aoedev *d;
-       struct buf *buf;
-       ulong flags;
-
-       blk_queue_bounce(q, &bio);
-
-       if (bio == NULL) {
-               printk(KERN_ERR "aoe: bio is NULL\n");
-               BUG();
-               return;
-       }
-       d = bio->bi_bdev->bd_disk->private_data;
-       if (d == NULL) {
-               printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
-               BUG();
-               bio_endio(bio, -ENXIO);
-               return;
-       } else if (bio->bi_io_vec == NULL) {
-               printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
-               BUG();
-               bio_endio(bio, -ENXIO);
-               return;
-       }
-       buf = mempool_alloc(d->bufpool, GFP_NOIO);
-       if (buf == NULL) {
-               printk(KERN_INFO "aoe: buf allocation failure\n");
-               bio_endio(bio, -ENOMEM);
-               return;
-       }
-       memset(buf, 0, sizeof(*buf));
-       INIT_LIST_HEAD(&buf->bufs);
-       buf->stime = jiffies;
-       buf->bio = bio;
-       buf->resid = bio->bi_size;
-       buf->sector = bio->bi_sector;
-       buf->bv = &bio->bi_io_vec[bio->bi_idx];
-       buf->bv_resid = buf->bv->bv_len;
-       WARN_ON(buf->bv_resid == 0);
-       buf->bv_off = buf->bv->bv_offset;
-
-       spin_lock_irqsave(&d->lock, flags);
+       struct request *rq;
  
+       d = q->queuedata;
         if ((d->flags & DEVFL_UP) == 0) {
                 pr_info_ratelimited("aoe: device %ld.%d is not up\n",
                         d->aoemajor, d->aoeminor);
-               spin_unlock_irqrestore(&d->lock, flags);
-               mempool_free(buf, d->bufpool);
-               bio_endio(bio, -ENXIO);
+               while ((rq = blk_peek_request(q))) {
+                       blk_start_request(rq);
+                       aoe_end_request(d, rq, 1);
+               }
                 return;
         }
-
-       list_add_tail(&buf->bufs, &d->bufq);
-
         aoecmd_work(d);
-       __skb_queue_head_init(&queue);
-       skb_queue_splice_init(&d->sendq, &queue);
-
-       spin_unlock_irqrestore(&d->lock, flags);
-       aoenet_xmit(&queue);
  }
  
  static int
@@ -254,34 +208,46 @@ aoeblk_gdalloc(void *vp)
  {
         struct aoedev *d = vp;
         struct gendisk *gd;
-       enum { KB = 1024, MB = KB * KB, READ_AHEAD = MB, };
+       mempool_t *mp;
+       struct request_queue *q;
+       enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
         ulong flags;
  
         gd = alloc_disk(AOE_PARTITIONS);
         if (gd == NULL) {
-               printk(KERN_ERR
-                       "aoe: cannot allocate disk structure for %ld.%d\n",
+               pr_err("aoe: cannot allocate disk structure for %ld.%d\n",
                         d->aoemajor, d->aoeminor);
                 goto err;
         }
  
-       d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache);
-       if (d->bufpool == NULL) {
+       mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab,
+               buf_pool_cache);
+       if (mp == NULL) {
                 printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
                         d->aoemajor, d->aoeminor);
                 goto err_disk;
         }
+       q = blk_init_queue(aoeblk_request, &d->lock);
+       if (q == NULL) {
+               pr_err("aoe: cannot allocate block queue for %ld.%d\n",
+                       d->aoemajor, d->aoeminor);
+               mempool_destroy(mp);
+               goto err_disk;
+       }
  
         d->blkq = blk_alloc_queue(GFP_KERNEL);
         if (!d->blkq)
                 goto err_mempool;
-       blk_queue_make_request(d->blkq, aoeblk_make_request);
         d->blkq->backing_dev_info.name = "aoe";
         if (bdi_init(&d->blkq->backing_dev_info))
                 goto err_blkq;
         spin_lock_irqsave(&d->lock, flags);
         blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
-       d->blkq->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
+       q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
+       d->bufpool = mp;
+       d->blkq = gd->queue = q;
+       q->queuedata = d;
+       d->gd = gd;
         gd->major = AOE_MAJOR;
         gd->first_minor = d->sysminor * AOE_PARTITIONS;
         gd->fops = &aoe_bdops;
@@ -290,8 +256,6 @@ aoeblk_gdalloc(void *vp)
         snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
                 d->aoemajor, d->aoeminor);
  
-       gd->queue = d->blkq;
-       d->gd = gd;
         d->flags &= ~DEVFL_GDALLOC;
         d->flags |= DEVFL_UP;
  
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c

index f145388cb94ab9c7308750a275d9ec77877c8c5b..3557f0d04b46d9270195bbb565a43933d01f87c2 100644 (file)
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -106,6 +106,7 @@ loop:
                 spin_lock_irqsave(&d->lock, flags);
                 goto loop;
         }
+       aoedev_put(d);
         if (skb) {
                 struct sk_buff_head queue;
                 __skb_queue_head_init(&queue);
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c

index 59b333c902a6485225a69c78ba3ef8417fa2b778..5928a08c1f3f35a5a23fe799e4e24ac77ccef87b 100644 (file)
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -23,6 +23,8 @@
  
  static void ktcomplete(struct frame *, struct sk_buff *);
  
+static struct buf *nextbuf(struct aoedev *);
+
  static int aoe_deadsecs = 60 * 3;
  module_param(aoe_deadsecs, int, 0644);
  MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
@@ -283,17 +285,20 @@ aoecmd_ata_rw(struct aoedev *d)
         struct bio_vec *bv;
         struct aoetgt *t;
         struct sk_buff *skb;
+       struct sk_buff_head queue;
         ulong bcnt, fbcnt;
         char writebit, extbit;
  
         writebit = 0x10;
         extbit = 0x4;
  
+       buf = nextbuf(d);
+       if (buf == NULL)
+               return 0;
         f = newframe(d);
         if (f == NULL)
                 return 0;
         t = *d->tgt;
-       buf = d->inprocess;
         bv = buf->bv;
         bcnt = t->ifp->maxbcnt;
         if (bcnt == 0)
@@ -312,7 +317,7 @@ aoecmd_ata_rw(struct aoedev *d)
                 fbcnt -= buf->bv_resid;
                 buf->resid -= buf->bv_resid;
                 if (buf->resid == 0) {
-                       d->inprocess = NULL;
+                       d->ip.buf = NULL;
                         break;
                 }
                 buf->bv++;
@@ -364,8 +369,11 @@ aoecmd_ata_rw(struct aoedev *d)
  
         skb->dev = t->ifp->nd;
         skb = skb_clone(skb, GFP_ATOMIC);
-       if (skb)
-               __skb_queue_tail(&d->sendq, skb);
+       if (skb) {
+               __skb_queue_head_init(&queue);
+               __skb_queue_tail(&queue, skb);
+               aoenet_xmit(&queue);
+       }
         return 1;
  }
  
@@ -415,6 +423,7 @@ static void
  resend(struct aoedev *d, struct frame *f)
  {
         struct sk_buff *skb;
+       struct sk_buff_head queue;
         struct aoe_hdr *h;
         struct aoe_atahdr *ah;
         struct aoetgt *t;
@@ -444,7 +453,9 @@ resend(struct aoedev *d, struct frame *f)
         skb = skb_clone(skb, GFP_ATOMIC);
         if (skb == NULL)
                 return;
-       __skb_queue_tail(&d->sendq, skb);
+       __skb_queue_head_init(&queue);
+       __skb_queue_tail(&queue, skb);
+       aoenet_xmit(&queue);
  }
  
  static int
@@ -554,7 +565,6 @@ ata_scnt(unsigned char *packet) {
  static void
  rexmit_timer(ulong vp)
  {
-       struct sk_buff_head queue;
         struct aoedev *d;
         struct aoetgt *t, **tt, **te;
         struct aoeif *ifp;
@@ -603,6 +613,12 @@ rexmit_timer(ulong vp)
                 }
         }
  
+       if (!list_empty(&flist)) {      /* retransmissions necessary */
+               n = d->rttavg <<= 1;
+               if (n > MAXTIMER)
+                       d->rttavg = MAXTIMER;
+       }
+
         /* process expired frames */
         while (!list_empty(&flist)) {
                 pos = flist.next;
@@ -641,45 +657,131 @@ rexmit_timer(ulong vp)
                 resend(d, f);
         }
  
-       if (!skb_queue_empty(&d->sendq)) {
-               n = d->rttavg <<= 1;
-               if (n > MAXTIMER)
-                       d->rttavg = MAXTIMER;
-       }
-
-       if (d->flags & DEVFL_KICKME || d->htgt) {
+       if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) {
                 d->flags &= ~DEVFL_KICKME;
-               aoecmd_work(d);
+               d->blkq->request_fn(d->blkq);
         }
  
-       __skb_queue_head_init(&queue);
-       skb_queue_splice_init(&d->sendq, &queue);
-
         d->timer.expires = jiffies + TIMERTICK;
         add_timer(&d->timer);
  
         spin_unlock_irqrestore(&d->lock, flags);
+}
  
-       aoenet_xmit(&queue);
+static unsigned long
+rqbiocnt(struct request *r)
+{
+       struct bio *bio;
+       unsigned long n = 0;
+
+       __rq_for_each_bio(bio, r)
+               n++;
+       return n;
+}
+
+/* This can be removed if we are certain that no users of the block
+ * layer will ever use zero-count pages in bios.  Otherwise we have to
+ * protect against the put_page sometimes done by the network layer.
+ *
+ * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
+ * discussion.
+ *
+ * We cannot use get_page in the workaround, because it insists on a
+ * positive page count as a precondition.  So we use _count directly.
+ */
+static void
+bio_pageinc(struct bio *bio)
+{
+       struct bio_vec *bv;
+       struct page *page;
+       int i;
+
+       bio_for_each_segment(bv, bio, i) {
+               page = bv->bv_page;
+               /* Non-zero page count for non-head members of
+                * compound pages is no longer allowed by the kernel,
+                * but this has never been seen here.
+                */
+               if (unlikely(PageCompound(page)))
+                       if (compound_trans_head(page) != page) {
+                               pr_crit("page tail used for block I/O\n");
+                               BUG();
+                       }
+               atomic_inc(&page->_count);
+       }
+}
+
+static void
+bio_pagedec(struct bio *bio)
+{
+       struct bio_vec *bv;
+       int i;
+
+       bio_for_each_segment(bv, bio, i)
+               atomic_dec(&bv->bv_page->_count);
+}
+
+static void
+bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+{
+       struct bio_vec *bv;
+
+       memset(buf, 0, sizeof(*buf));
+       buf->rq = rq;
+       buf->bio = bio;
+       buf->resid = bio->bi_size;
+       buf->sector = bio->bi_sector;
+       bio_pageinc(bio);
+       buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
+       buf->bv_resid = bv->bv_len;
+       WARN_ON(buf->bv_resid == 0);
+}
+
+static struct buf *
+nextbuf(struct aoedev *d)
+{
+       struct request *rq;
+       struct request_queue *q;
+       struct buf *buf;
+       struct bio *bio;
+
+       q = d->blkq;
+       if (q == NULL)
+               return NULL;    /* initializing */
+       if (d->ip.buf)
+               return d->ip.buf;
+       rq = d->ip.rq;
+       if (rq == NULL) {
+               rq = blk_peek_request(q);
+               if (rq == NULL)
+                       return NULL;
+               blk_start_request(rq);
+               d->ip.rq = rq;
+               d->ip.nxbio = rq->bio;
+               rq->special = (void *) rqbiocnt(rq);
+       }
+       buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
+       if (buf == NULL) {
+               pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
+               return NULL;
+       }
+       bio = d->ip.nxbio;
+       bufinit(buf, rq, bio);
+       bio = bio->bi_next;
+       d->ip.nxbio = bio;
+       if (bio == NULL)
+               d->ip.rq = NULL;
+       return d->ip.buf = buf;
  }
  
  /* enters with d->lock held */
  void
  aoecmd_work(struct aoedev *d)
  {
-       struct buf *buf;
-loop:
         if (d->htgt && !sthtith(d))
                 return;
-       if (d->inprocess == NULL) {
-               if (list_empty(&d->bufq))
-                       return;
-               buf = container_of(d->bufq.next, struct buf, bufs);
-               list_del(d->bufq.next);
-               d->inprocess = buf;
-       }
-       if (aoecmd_ata_rw(d))
-               goto loop;
+       while (aoecmd_ata_rw(d))
+               ;
  }
  
  /* this function performs work that has been deferred until sleeping is OK
@@ -802,25 +904,6 @@ gettgt(struct aoedev *d, char *addr)
         return NULL;
  }
  
-static inline void
-diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
-{
-       unsigned long n_sect = bio->bi_size >> 9;
-       const int rw = bio_data_dir(bio);
-       struct hd_struct *part;
-       int cpu;
-
-       cpu = part_stat_lock();
-       part = disk_map_sector_rcu(disk, sector);
-
-       part_stat_inc(cpu, part, ios[rw]);
-       part_stat_add(cpu, part, ticks[rw], duration);
-       part_stat_add(cpu, part, sectors[rw], n_sect);
-       part_stat_add(cpu, part, io_ticks, duration);
-
-       part_stat_unlock();
-}
-
  static void
  bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt)
  {
@@ -842,6 +925,43 @@ loop:
         goto loop;
  }
  
+void
+aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
+{
+       struct bio *bio;
+       int bok;
+       struct request_queue *q;
+
+       q = d->blkq;
+       if (rq == d->ip.rq)
+               d->ip.rq = NULL;
+       do {
+               bio = rq->bio;
+               bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
+       } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size));
+
+       /* cf. http://lkml.org/lkml/2006/10/31/28 */
+       if (!fastfail)
+               q->request_fn(q);
+}
+
+static void
+aoe_end_buf(struct aoedev *d, struct buf *buf)
+{
+       struct request *rq;
+       unsigned long n;
+
+       if (buf == d->ip.buf)
+               d->ip.buf = NULL;
+       rq = buf->rq;
+       bio_pagedec(buf->bio);
+       mempool_free(buf, d->bufpool);
+       n = (unsigned long) rq->special;
+       rq->special = (void *) --n;
+       if (n == 0)
+               aoe_end_request(d, rq, 0);
+}
+
  static void
  ktiocomplete(struct frame *f)
  {
@@ -876,7 +996,7 @@ ktiocomplete(struct frame *f)
                         ahout->cmdstat, ahin->cmdstat,
                         d->aoemajor, d->aoeminor);
  noskb: if (buf)
-                       buf->flags |= BUFFL_FAIL;
+                       clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
                 goto badrsp;
         }
  
@@ -887,7 +1007,7 @@ noskb:     if (buf)
                 if (skb->len < n) {
                         pr_err("aoe: runt data size in read.  skb->len=%d need=%ld\n",
                                 skb->len, n);
-                       buf->flags |= BUFFL_FAIL;
+                       clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
                         break;
                 }
                 bvcpy(f->bv, f->bv_off, skb, n);
@@ -927,18 +1047,13 @@ badrsp:
  
         aoe_freetframe(f);
  
-       if (buf && --buf->nframesout == 0 && buf->resid == 0) {
-               struct bio *bio = buf->bio;
+       if (buf && --buf->nframesout == 0 && buf->resid == 0)
+               aoe_end_buf(d, buf);
  
-               diskstats(d->gd, bio, jiffies - buf->stime, buf->sector);
-               n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
-               mempool_free(buf, d->bufpool);
-               spin_unlock_irq(&d->lock);
-               if (n != -EIO)
-                       bio_flush_dcache_pages(buf->bio);
-               bio_endio(bio, n);
-       } else
-               spin_unlock_irq(&d->lock);
+       aoecmd_work(d);
+
+       spin_unlock_irq(&d->lock);
+       aoedev_put(d);
         dev_kfree_skb(skb);
  }
  
@@ -1061,12 +1176,14 @@ aoecmd_ata_rsp(struct sk_buff *skb)
                 printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
                        d->aoemajor, d->aoeminor, h->src);
                 spin_unlock_irqrestore(&d->lock, flags);
+               aoedev_put(d);
                 return skb;
         }
         f = getframe(t, n);
         if (f == NULL) {
                 calc_rttavg(d, -tsince(n));
                 spin_unlock_irqrestore(&d->lock, flags);
+               aoedev_put(d);
                 snprintf(ebuf, sizeof ebuf,
                         "%15s e%d.%d    tag=%08x@%08lx\n",
                         "unexpected rsp",
@@ -1185,8 +1302,10 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
         struct aoeif *ifp;
         ulong flags, sysminor, aoemajor;
         struct sk_buff *sl;
+       struct sk_buff_head queue;
         u16 n;
  
+       sl = NULL;
         h = (struct aoe_hdr *) skb_mac_header(skb);
         ch = (struct aoe_cfghdr *) (h+1);
  
@@ -1223,10 +1342,8 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
         t = gettgt(d, h->src);
         if (!t) {
                 t = addtgt(d, h->src, n);
-               if (!t) {
-                       spin_unlock_irqrestore(&d->lock, flags);
-                       return;
-               }
+               if (!t)
+                       goto bail;
         }
         ifp = getif(t, skb->dev);
         if (!ifp) {
@@ -1235,8 +1352,7 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
                         printk(KERN_INFO
                                 "aoe: device addif failure; "
                                 "too many interfaces?\n");
-                       spin_unlock_irqrestore(&d->lock, flags);
-                       return;
+                       goto bail;
                 }
         }
         if (ifp->maxbcnt) {
@@ -1257,18 +1373,14 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
         }
  
         /* don't change users' perspective */
-       if (d->nopen) {
-               spin_unlock_irqrestore(&d->lock, flags);
-               return;
+       if (d->nopen == 0) {
+               d->fw_ver = be16_to_cpu(ch->fwver);
+               sl = aoecmd_ata_id(d);
         }
-       d->fw_ver = be16_to_cpu(ch->fwver);
-
-       sl = aoecmd_ata_id(d);
-
+bail:
         spin_unlock_irqrestore(&d->lock, flags);
-
+       aoedev_put(d);
         if (sl) {
-               struct sk_buff_head queue;
                 __skb_queue_head_init(&queue);
                 __skb_queue_tail(&queue, sl);
                 aoenet_xmit(&queue);
@@ -1297,8 +1409,19 @@ aoecmd_cleanslate(struct aoedev *d)
         }
  }
  
-static void
-flush_iocq(void)
+void
+aoe_failbuf(struct aoedev *d, struct buf *buf)
+{
+       if (buf == NULL)
+               return;
+       buf->resid = 0;
+       clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+       if (buf->nframesout == 0)
+               aoe_end_buf(d, buf);
+}
+
+void
+aoe_flush_iocq(void)
  {
         struct frame *f;
         struct aoedev *d;
@@ -1324,6 +1447,7 @@ flush_iocq(void)
                 aoe_freetframe(f);
                 spin_unlock_irqrestore(&d->lock, flags);
                 dev_kfree_skb(skb);
+               aoedev_put(d);
         }
  }
  
@@ -1344,5 +1468,5 @@ void
  aoecmd_exit(void)
  {
         aoe_ktstop(&kts);
-       flush_iocq();
+       aoe_flush_iocq();
  }
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c

index 40bae1a1ff1e23ea1f3e13ad708c9e34c54c8faa..635dc986cf770d50d1c984ef934b3dc3c1cf132e 100644 (file)
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -19,6 +19,17 @@ static void skbpoolfree(struct aoedev *d);
  static struct aoedev *devlist;
  static DEFINE_SPINLOCK(devlist_lock);
  
+/*
+ * Users who grab a pointer to the device with aoedev_by_aoeaddr or
+ * aoedev_by_sysminor_m automatically get a reference count and must
+ * be responsible for performing a aoedev_put.  With the addition of
+ * async kthread processing I'm no longer confident that we can
+ * guarantee consistency in the face of device flushes.
+ *
+ * For the time being, we only bother to add extra references for
+ * frames sitting on the iocq.  When the kthreads finish processing
+ * these frames, they will aoedev_put the device.
+ */
  struct aoedev *
  aoedev_by_aoeaddr(int maj, int min)
  {
@@ -28,13 +39,25 @@ aoedev_by_aoeaddr(int maj, int min)
         spin_lock_irqsave(&devlist_lock, flags);
  
         for (d=devlist; d; d=d->next)
-               if (d->aoemajor == maj && d->aoeminor == min)
+               if (d->aoemajor == maj && d->aoeminor == min) {
+                       d->ref++;
                         break;
+               }
  
         spin_unlock_irqrestore(&devlist_lock, flags);
         return d;
  }
  
+void
+aoedev_put(struct aoedev *d)
+{
+       ulong flags;
+
+       spin_lock_irqsave(&devlist_lock, flags);
+       d->ref--;
+       spin_unlock_irqrestore(&devlist_lock, flags);
+}
+
  static void
  dummy_timer(ulong vp)
  {
@@ -47,21 +70,26 @@ dummy_timer(ulong vp)
         add_timer(&d->timer);
  }
  
-void
-aoe_failbuf(struct aoedev *d, struct buf *buf)
+static void
+aoe_failip(struct aoedev *d)
  {
+       struct request *rq;
         struct bio *bio;
+       unsigned long n;
+
+       aoe_failbuf(d, d->ip.buf);
  
-       if (buf == NULL)
+       rq = d->ip.rq;
+       if (rq == NULL)
                 return;
-       buf->flags |= BUFFL_FAIL;
-       if (buf->nframesout == 0) {
-               if (buf == d->inprocess) /* ensure we only process this once */
-                       d->inprocess = NULL;
-               bio = buf->bio;
-               mempool_free(buf, d->bufpool);
-               bio_endio(bio, -EIO);
+       while ((bio = d->ip.nxbio)) {
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+               d->ip.nxbio = bio->bi_next;
+               n = (unsigned long) rq->special;
+               rq->special = (void *) --n;
         }
+       if ((unsigned long) rq->special == 0)
+               aoe_end_request(d, rq, 0);
  }
  
  void
@@ -70,8 +98,11 @@ aoedev_downdev(struct aoedev *d)
         struct aoetgt *t, **tt, **te;
         struct frame *f;
         struct list_head *head, *pos, *nx;
+       struct request *rq;
         int i;
  
+       d->flags &= ~DEVFL_UP;
+
         /* clean out active buffers on all targets */
         tt = d->targets;
         te = tt + NTARGETS;
@@ -92,22 +123,20 @@ aoedev_downdev(struct aoedev *d)
                 t->nout = 0;
         }
  
-       /* clean out the in-process buffer (if any) */
-       aoe_failbuf(d, d->inprocess);
-       d->inprocess = NULL;
+       /* clean out the in-process request (if any) */
+       aoe_failip(d);
         d->htgt = NULL;
  
-       /* clean out all pending I/O */
-       while (!list_empty(&d->bufq)) {
-               struct buf *buf = container_of(d->bufq.next, struct buf, bufs);
-               list_del(d->bufq.next);
-               aoe_failbuf(d, buf);
+       /* fast fail all pending I/O */
+       if (d->blkq) {
+               while ((rq = blk_peek_request(d->blkq))) {
+                       blk_start_request(rq);
+                       aoe_end_request(d, rq, 1);
+               }
         }
  
         if (d->gd)
                 set_capacity(d->gd, 0);
-
-       d->flags &= ~DEVFL_UP;
  }
  
  static void
@@ -120,6 +149,7 @@ aoedev_freedev(struct aoedev *d)
                 aoedisk_rm_sysfs(d);
                 del_gendisk(d->gd);
                 put_disk(d->gd);
+               blk_cleanup_queue(d->blkq);
         }
         t = d->targets;
         e = t + NTARGETS;
@@ -128,7 +158,6 @@ aoedev_freedev(struct aoedev *d)
         if (d->bufpool)
                 mempool_destroy(d->bufpool);
         skbpoolfree(d);
-       blk_cleanup_queue(d->blkq);
         kfree(d);
  }
  
@@ -155,7 +184,8 @@ aoedev_flush(const char __user *str, size_t cnt)
                 spin_lock(&d->lock);
                 if ((!all && (d->flags & DEVFL_UP))
                 || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
-               || d->nopen) {
+               || d->nopen
+               || d->ref) {
                         spin_unlock(&d->lock);
                         dd = &d->next;
                         continue;
@@ -176,12 +206,15 @@ aoedev_flush(const char __user *str, size_t cnt)
         return 0;
  }
  
-/* I'm not really sure that this is a realistic problem, but if the
-network driver goes gonzo let's just leak memory after complaining. */
+/* This has been confirmed to occur once with Tms=3*1000 due to the
+ * driver changing link and not processing its transmit ring.  The
+ * problem is hard enough to solve by returning an error that I'm
+ * still punting on "solving" this.
+ */
  static void
  skbfree(struct sk_buff *skb)
  {
-       enum { Sms = 100, Tms = 3*1000};
+       enum { Sms = 250, Tms = 30 * 1000};
         int i = Tms / Sms;
  
         if (skb == NULL)
@@ -222,8 +255,10 @@ aoedev_by_sysminor_m(ulong sysminor)
         spin_lock_irqsave(&devlist_lock, flags);
  
         for (d=devlist; d; d=d->next)
-               if (d->sysminor == sysminor)
+               if (d->sysminor == sysminor) {
+                       d->ref++;
                         break;
+               }
         if (d)
                 goto out;
         d = kcalloc(1, sizeof *d, GFP_ATOMIC);
@@ -231,7 +266,6 @@ aoedev_by_sysminor_m(ulong sysminor)
                 goto out;
         INIT_WORK(&d->work, aoecmd_sleepwork);
         spin_lock_init(&d->lock);
-       skb_queue_head_init(&d->sendq);
         skb_queue_head_init(&d->skbpool);
         init_timer(&d->timer);
         d->timer.data = (ulong) d;
@@ -240,7 +274,7 @@ aoedev_by_sysminor_m(ulong sysminor)
         add_timer(&d->timer);
         d->bufpool = NULL;      /* defer to aoeblk_gdalloc */
         d->tgt = d->targets;
-       INIT_LIST_HEAD(&d->bufq);
+       d->ref = 1;
         d->sysminor = sysminor;
         d->aoemajor = AOEMAJOR(sysminor);
         d->aoeminor = AOEMINOR(sysminor);
@@ -274,6 +308,7 @@ aoedev_exit(void)
         struct aoedev *d;
         ulong flags;
  
+       aoe_flush_iocq();
         while ((d = devlist)) {
                 devlist = d->next;
author	Ed Cashin <ecashin@coraid.com>
	Wed, 26 Sep 2012 01:35:00 +0000 (11:35 +1000)
committer	Stephen Rothwell <sfr@canb.auug.org.au>
	Wed, 26 Sep 2012 05:45:41 +0000 (15:45 +1000)
drivers/block/aoe/aoe.h		patch | blob | history
drivers/block/aoe/aoeblk.c		patch | blob | history
drivers/block/aoe/aoechr.c		patch | blob | history
drivers/block/aoe/aoecmd.c		patch | blob | history
drivers/block/aoe/aoedev.c		patch | blob | history