1 /* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */
4 * Filesystem request handling methods
7 #include <linux/hdreg.h>
8 #include <linux/blkdev.h>
9 #include <linux/skbuff.h>
10 #include <linux/netdevice.h>
11 #include <linux/genhd.h>
12 #include <linux/moduleparam.h>
13 #include <net/net_namespace.h>
14 #include <asm/unaligned.h>
/*
 * aoe_deadsecs: number of seconds after which the driver gives up on a
 * device and fails it (per the parameter description below).
 * The default is 60 * 3 = 180.  It is registered as an int module
 * parameter with permission bits 0644.
 */
17 static int aoe_deadsecs = 60 * 3;
18 module_param(aoe_deadsecs, int, 0644);
19 MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
/*
 * Allocate and pre-initialize an sk_buff for an AoE frame.
 *
 * NOTE(review): the line carrying this function's name and parameters is
 * not visible in this excerpt; other code in this file calls
 * new_skb(len), which is presumably this helper -- confirm.
 *
 * Visible steps: allocate len bytes with GFP_ATOMIC, reset the MAC and
 * network header offsets, set the protocol to ETH_P_AOE, clear the
 * next/prev list links, and set ip_summed to CHECKSUM_NONE.
 * Any check of the alloc_skb() result is on a line not shown here.
 */
21 static struct sk_buff *
26 skb = alloc_skb(len, GFP_ATOMIC);
28 skb_reset_mac_header(skb);
29 skb_reset_network_header(skb);
30 skb->protocol = __constant_htons(ETH_P_AOE);
32 skb->next = skb->prev = NULL;
34 /* tell the network layer not to perform IP checksums
35 * or to get the NIC to do it
37 skb->ip_summed = CHECKSUM_NONE;
43 getframe(struct aoetgt *t, int tag)
56 * Leave the top bit clear so we have tagspace for userland.
57 * The bottom 16 bits are the xmit tick for rexmit/rttavg processing.
58 * This driver reserves tag -1 to mean "unused frame."
/*
 * newtag: produce the next tag value for target t.
 *
 * Pre-increments t->lasttag, keeps its low 15 bits (mask 0x7fff) and
 * shifts them into bits 16..30, then ORs that into n and returns the
 * result.  Because of the 15-bit mask this contribution never sets
 * bit 31.  The statement that gives n its initial value is not visible
 * in this excerpt.
 */
61 newtag(struct aoetgt *t)
66 return n |= (++t->lasttag & 0x7fff) << 16;
/*
 * aoehdr_atainit: fill in AoE header h for a command from device d to
 * target t.
 *
 * Visible effects on h:
 *   src   <- hardware address of the net device on t's current interface
 *   dst   <- t->addr
 *   type  <- ETH_P_AOE (big-endian)
 *   major <- d->aoemajor (big-endian), minor <- d->aoeminor
 *   tag   <- a fresh tag from newtag(t), stored big-endian
 *
 * NOTE(review): the return statement is not visible here.  Callers in
 * this file assign the result to f->tag, so presumably the host-order
 * tag (host_tag) is returned -- confirm.
 */
70 aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
72 u32 host_tag = newtag(t);
74 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
75 memcpy(h->dst, t->addr, sizeof h->dst);
76 h->type = __constant_cpu_to_be16(ETH_P_AOE);
78 h->major = cpu_to_be16(d->aoemajor);
79 h->minor = d->aoeminor;
81 h->tag = cpu_to_be32(host_tag);
87 put_lba(struct aoe_atahdr *ah, sector_t lba)
/*
 * ifrotate: select another network interface for target t.
 *
 * NOTE(review): only part of the body is visible in this excerpt; the
 * statements that advance or reset t->ifp are on lines not shown.
 */
98 ifrotate(struct aoetgt *t)
/* t->ifp is past the end of t->ifs, or points at a slot with no net device */
101 if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL)
/* the slot now selected has no net device: report it */
103 if (t->ifp->nd == NULL) {
104 printk(KERN_INFO "aoe: no interface to rotate to\n");
/*
 * freeframe: search d's targets for a frame that can be used for a new
 * command.  Returns a struct frame pointer.
 *
 * NOTE(review): the return statements and several loop lines are not
 * visible in this excerpt; the comments below describe only the lines
 * that are shown.
 */
109 static struct frame *
110 freeframe(struct aoedev *d)
116 if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */
117 printk(KERN_ERR "aoe: NULL TARGETS!\n");
/* part of a target test: the target must have fewer frames out than maxout */
124 && (*t)->nout < (*t)->maxout) {
/* frames whose tag is not FREETAG are singled out here (branch body not shown) */
129 if (f->tag != FREETAG)
/* inspects the data reference count of the frame's skb */
131 if (atomic_read(&skb_shinfo(f->skb)->dataref)
/* clear the skb's page-fragment count and paged-data length */
136 skb_shinfo(f->skb)->nr_frags = 0;
137 f->skb->data_len = 0;
143 if (n == 0) /* slow polling network card */
144 d->flags |= DEVFL_KICKME;
/* continue while t is inside d->targets and points at a non-NULL entry */
147 } while (t < &d->targets[NTARGETS] && *t);
/*
 * aoecmd_ata_rw: build one ATA read/write AoE frame for device d and
 * append a clone of it to d's send queue.
 *
 * NOTE(review): many lines (declarations, how buf/f/t/bv are obtained,
 * the return statements) are not visible in this excerpt.  A caller in
 * this file tests the return value in an if condition.
 */
152 aoecmd_ata_rw(struct aoedev *d)
156 struct aoe_atahdr *ah;
162 char writebit, extbit;
/* start from the current interface's maxbcnt, capped at buf->bv_resid */
173 bcnt = t->ifp->maxbcnt;
176 if (bcnt > buf->bv_resid)
177 bcnt = buf->bv_resid;
178 /* initialize the headers & frame */
/* the ATA header sits immediately after the AoE header at the skb's MAC header */
180 h = (struct aoe_hdr *) skb_mac_header(skb);
181 ah = (struct aoe_atahdr *) (h+1);
/* extend the skb to hold both headers, then zero that many bytes */
182 skb_put(skb, sizeof *h + sizeof *ah);
183 memset(h, 0, skb->len);
184 f->tag = aoehdr_atainit(d, t, h);
/* record the data address and starting sector on the frame */
188 f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
190 f->lba = buf->sector;
192 /* set up ata header */
/* bcnt >> 9 is bcnt / 512 */
193 ah->scnt = bcnt >> 9;
194 put_lba(ah, buf->sector);
195 if (d->flags & DEVFL_EXT) {
196 ah->aflags |= AOEAFL_EXT;
200 ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
/* writes attach bcnt bytes of the bio_vec page as fragment 0 of the skb */
202 if (bio_data_dir(buf->bio) == WRITE) {
203 skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
204 ah->aflags |= AOEAFL_WRITE;
206 skb->data_len = bcnt;
/* writebit and extbit are assigned on lines not shown here */
213 ah->cmdstat = WIN_READ | writebit | extbit;
215 /* mark all tracking fields and load out */
216 buf->nframesout += 1;
218 buf->bv_resid -= bcnt;
220 buf->sector += bcnt >> 9;
221 if (buf->resid == 0) {
/* current bio_vec used up: reload residual length and offset from bv */
223 } else if (buf->bv_resid == 0) {
225 buf->bv_resid = bv->bv_len;
226 WARN_ON(buf->bv_resid == 0);
227 buf->bv_off = bv->bv_offset;
/* send via the current interface's net device; a clone is what gets queued */
230 skb->dev = t->ifp->nd;
231 skb = skb_clone(skb, GFP_ATOMIC);
234 d->sendq_tl->next = skb;
242 /* some callers cannot sleep, and they can call this function,
243 * transmitting the packets later, when interrupts are on
/*
 * aoecmd_cfg_pkts: build AoE config query frames for address
 * aoemajor.aoeminor.
 *
 * Iterates over the net devices of init_net while holding the
 * dev_base_lock read lock.  For a device, an skb sized for an aoe_hdr
 * followed by an aoe_cfghdr is allocated, zeroed, addressed to the
 * all-0xff destination from the device's own address, and typed
 * ETH_P_AOE with the requested major.
 *
 * Returns a struct sk_buff pointer.
 * NOTE(review): how devices failing is_aoe_netif() are handled, how the
 * frames are chained, and how *tail is used are on lines not visible in
 * this excerpt.
 */
245 static struct sk_buff *
246 aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff **tail)
249 struct aoe_cfghdr *ch;
250 struct sk_buff *skb, *sl, *sl_tail;
251 struct net_device *ifp;
255 read_lock(&dev_base_lock);
256 for_each_netdev(&init_net, ifp) {
258 if (!is_aoe_netif(ifp))
261 skb = new_skb(sizeof *h + sizeof *ch);
/* message logged for an allocation failure (the test itself is not shown) */
263 printk(KERN_INFO "aoe: skb alloc failure\n");
266 skb_put(skb, sizeof *h + sizeof *ch);
270 h = (struct aoe_hdr *) skb_mac_header(skb);
271 memset(h, 0, sizeof *h + sizeof *ch);
/* destination is every byte 0xff; source is this device's address */
273 memset(h->dst, 0xff, sizeof h->dst);
274 memcpy(h->src, ifp->dev_addr, sizeof h->src);
275 h->type = __constant_cpu_to_be16(ETH_P_AOE);
277 h->major = cpu_to_be16(aoemajor);
286 read_unlock(&dev_base_lock);
/*
 * resend: retransmit frame f of device d to target t.
 *
 * Visible steps: format a "retransmit" trace line into buf, store the
 * new tag n in the header, rewrite the destination and source addresses
 * from t, re-attach the data page for writes, and append a clone of the
 * skb to d's send queue.
 *
 * NOTE(review): where n gets its value(s), what is done with buf, and
 * the bodies of the switch are on lines not visible in this excerpt.
 */
294 resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
298 struct aoe_atahdr *ah;
305 h = (struct aoe_hdr *) skb_mac_header(skb);
306 ah = (struct aoe_atahdr *) (h+1);
/*
 * NOTE(review): mac_addr() results are passed to %012llx without a
 * cast here, while other messages in this file cast them to
 * unsigned long long -- confirm mac_addr()'s return type.
 */
308 snprintf(buf, sizeof buf,
309 "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x "
310 "s=%012llx d=%012llx nout=%d\n",
311 "retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n,
312 mac_addr(h->src), mac_addr(h->dst), t->nout);
/* header gets the new tag and t's current addresses */
316 h->tag = cpu_to_be32(n);
317 memcpy(h->dst, t->addr, sizeof h->dst);
318 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
320 switch (ah->cmdstat) {
/* for writes, n is used as a byte count here (presumably reassigned above) */
333 if (ah->aflags & AOEAFL_WRITE) {
334 skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
335 offset_in_page(f->bufaddr), n);
336 skb->len = sizeof *h + sizeof *ah + n;
340 skb->dev = t->ifp->nd;
341 skb = skb_clone(skb, GFP_ATOMIC);
345 d->sendq_tl->next = skb;
356 n = jiffies & 0xffff;
363 static struct aoeif *
364 getif(struct aoetgt *t, struct net_device *nd)
/*
 * addif: add net device nd as an interface of target t and return the
 * struct aoeif pointer for it.
 *
 * NOTE(review): only one body line is visible in this excerpt; it sets
 * the entry's maxbcnt to DEFAULTBCNT.  How the slot p is chosen and what
 * is returned on failure are not shown.
 */
376 static struct aoeif *
377 addif(struct aoetgt *t, struct net_device *nd)
385 p->maxbcnt = DEFAULTBCNT;
/*
 * ejectif: remove interface entry ifp from target t's ifs array.
 *
 * The visible lines close the gap by sliding the entries after ifp down
 * one slot: e points at the last element of t->ifs, n is the byte size
 * of the entries from ifp+1 through e, and memmove copies them over ifp.
 * NOTE(review): anything done to the last slot or to ifp's net device is
 * on lines not visible here.
 */
392 ejectif(struct aoetgt *t, struct aoeif *ifp)
397 e = t->ifs + NAOEIFS - 1;
398 n = (e - ifp) * sizeof *ifp;
399 memmove(ifp, ifp+1, n);
/*
 * sthtith: operate on the target referenced by d->htgt (ht).
 *
 * Visible behavior: frames whose tag is FREETAG are tested for, a frame
 * nf is resent through the target *d->tgt, and finally ht's whole ifs
 * array is zeroed.
 * NOTE(review): the loop structure, how nf is obtained, and the return
 * value are on lines not visible in this excerpt.  A caller in this file
 * tests the result with "!".
 */
404 sthtith(struct aoedev *d)
406 struct frame *f, *e, *nf;
408 struct aoetgt *ht = *d->htgt;
413 if (f->tag == FREETAG)
425 resend(d, *d->tgt, nf);
427 /* he's clean, he's useless. take away his interfaces */
428 memset(ht->ifs, 0, sizeof ht->ifs);
/*
 * ata_scnt: given a pointer to the start of an AoE packet, locate the
 * ATA header that immediately follows the AoE header.
 *
 * NOTE(review): the return statement is not visible in this excerpt;
 * from the name and the unsigned char return type it presumably returns
 * the ATA header's scnt field -- confirm.
 */
433 static inline unsigned char
434 ata_scnt(unsigned char *packet) {
436 struct aoe_atahdr *ah;
438 h = (struct aoe_hdr *) packet;
439 ah = (struct aoe_atahdr *) (h+1);
/*
 * rexmit_timer: timer callback for an AoE device.
 *
 * vp is the struct aoedev pointer cast to ulong.  Under d->lock the
 * function walks the device's targets, looks at frames that are in use
 * and have been outstanding at least `timeout`, accumulates their wait
 * time, and reacts to long waits and lost frames.  It then re-arms
 * d->timer for TIMERTICK jiffies from now.
 *
 * NOTE(review): many lines (inner loops, branch bodies, the actual
 * resend/fail calls, and how the send queue is handed off) are not
 * visible in this excerpt; the comments below describe only what is
 * shown.
 */
444 rexmit_timer(ulong vp)
447 struct aoetgt *t, **tt, **te;
451 register long timeout;
454 d = (struct aoedev *) vp;
457 /* timeout is always ~150% of the moving average */
459 timeout += timeout >> 1;
461 spin_lock_irqsave(&d->lock, flags);
/* DEVFL_TKILL set: drop the lock (presumably returning without re-arming) */
463 if (d->flags & DEVFL_TKILL) {
464 spin_unlock_irqrestore(&d->lock, flags);
469 for (; tt < te && *tt; tt++) {
/* free frames and frames younger than timeout are tested for here */
474 if (f->tag == FREETAG
475 || tsince(f->tag) < timeout)
/* add this period to the frame's accumulated wait */
477 n = f->waited += timeout;
/* any unit conversion of n before this comparison is on a line not shown */
479 if (n > aoe_deadsecs) {
480 /* waited too long. device failure. */
/* second clause: this is not the first target, or a second target exists */
485 if (n > HELPWAIT /* see if another target can help */
486 && (tt != d->targets || d->targets[1]))
489 if (t->nout == t->maxout) {
492 t->lastwadj = jiffies;
495 ifp = getif(t, f->skb->dev);
/* too many losses on this interface and another interface exists */
496 if (ifp && ++ifp->lost > (t->nframes << 1)
497 && (ifp != t->ifs || t->ifs[1].nd)) {
/* frame carried more sectors than DEFAULTBCNT / 512 and many such were lost */
502 if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
503 && ifp && ++ifp->lostjumbo > (t->nframes << 1)
504 && ifp->maxbcnt != DEFAULTBCNT) {
507 "too many lost jumbo on "
509 "falling back to %d frames.\n",
510 d->aoemajor, d->aoeminor,
511 ifp->nd->name, mac_addr(t->addr),
/* window is full, below nframes, and over 10 seconds since the last adjustment */
519 if (t->nout == t->maxout
520 && t->maxout < t->nframes
521 && (jiffies - t->lastwadj)/HZ > 10) {
523 t->lastwadj = jiffies;
530 d->rttavg = MAXTIMER;
533 if (d->flags & DEVFL_KICKME || d->htgt) {
534 d->flags &= ~DEVFL_KICKME;
/* empty the device's send queue head/tail pointers */
539 d->sendq_hd = d->sendq_tl = NULL;
541 d->timer.expires = jiffies + TIMERTICK;
542 add_timer(&d->timer);
544 spin_unlock_irqrestore(&d->lock, flags);
/*
 * aoecmd_work: push pending I/O for device d.
 *
 * Visible logic: if d->htgt is set, sthtith(d) is consulted first.  When
 * no buffer is in process, the first entry of d->bufq (if any) is
 * unlinked from the list.  aoecmd_ata_rw(d) is then called and its
 * result tested.
 * NOTE(review): the branch bodies (returns, loops, where the dequeued
 * buf is stored) are on lines not visible in this excerpt.
 */
549 /* enters with d->lock held */
551 aoecmd_work(struct aoedev *d)
555 if (d->htgt && !sthtith(d))
557 if (d->inprocess == NULL) {
558 if (list_empty(&d->bufq))
/* take the buf at the head of the queue and unlink it */
560 buf = container_of(d->bufq.next, struct buf, bufs);
561 list_del(d->bufq.next);
564 if (aoecmd_ata_rw(d))
568 /* this function performs work that has been deferred until sleeping is OK
/*
 * aoecmd_sleepwork: work-queue handler for an AoE device; the device is
 * recovered from the embedded work_struct with container_of().
 *
 * Visible behavior:
 *  - DEVFL_GDALLOC is tested (the action taken is on a line not shown).
 *  - If DEVFL_NEWSIZE is set, the block device inode size is updated to
 *    the gendisk capacity in bytes (capacity << 9) under the inode's
 *    i_mutex, then, under d->lock, DEVFL_UP is set and DEVFL_NEWSIZE is
 *    cleared.
 * NOTE(review): any NULL check on the bdget_disk() result and the
 * matching release of bd are on lines not visible in this excerpt.
 */
571 aoecmd_sleepwork(struct work_struct *work)
573 struct aoedev *d = container_of(work, struct aoedev, work);
575 if (d->flags & DEVFL_GDALLOC)
578 if (d->flags & DEVFL_NEWSIZE) {
579 struct block_device *bd;
583 ssize = d->gd->capacity;
584 bd = bdget_disk(d->gd, 0);
587 mutex_lock(&bd->bd_inode->i_mutex);
588 i_size_write(bd->bd_inode, (loff_t)ssize<<9);
589 mutex_unlock(&bd->bd_inode->i_mutex);
592 spin_lock_irqsave(&d->lock, flags);
593 d->flags |= DEVFL_UP;
594 d->flags &= ~DEVFL_NEWSIZE;
595 spin_unlock_irqrestore(&d->lock, flags);
/*
 * ataid_complete: digest the ATA IDENTIFY data in id for device d, as
 * reported by target t.
 *
 * id is indexed by 16-bit word number (byte offset = word << 1) and read
 * little-endian with unaligned accessors.  Bit 10 of words 83|86 selects
 * LBA48: in that case the sector count is the 64-bit value at word 100
 * and cylinders are derived as ssize / (255 * 63); otherwise the count
 * is the 32-bit value at word 60 and geometry comes from words 54-56.
 * The visible tail logs a size change, stores the capacity in d->gd,
 * sets DEVFL_NEWSIZE, sets DEVFL_GDALLOC, and schedules d->work.
 *
 * NOTE(review): the conditions guarding the last few statements are on
 * lines not visible in this excerpt.
 */
600 ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
605 /* word 83: command set supported */
606 n = le16_to_cpu(get_unaligned((__le16 *) &id[83<<1]));
608 /* word 86: command set/feature enabled */
609 n |= le16_to_cpu(get_unaligned((__le16 *) &id[86<<1]));
611 if (n & (1<<10)) { /* bit 10: LBA 48 */
612 d->flags |= DEVFL_EXT;
614 /* word 100: number lba48 sectors */
615 ssize = le64_to_cpu(get_unaligned((__le64 *) &id[100<<1]));
617 /* set as in ide-disk.c:init_idedisk_capacity */
618 d->geo.cylinders = ssize;
619 d->geo.cylinders /= (255 * 63);
623 d->flags &= ~DEVFL_EXT;
625 /* number lba28 sectors */
626 ssize = le32_to_cpu(get_unaligned((__le32 *) &id[60<<1]));
628 /* NOTE: obsolete in ATA 6 */
629 d->geo.cylinders = le16_to_cpu(get_unaligned((__le16 *) &id[54<<1]));
630 d->geo.heads = le16_to_cpu(get_unaligned((__le16 *) &id[55<<1]));
631 d->geo.sectors = le16_to_cpu(get_unaligned((__le16 *) &id[56<<1]));
/*
 * NOTE(review): this message prints d->aoeminor with %lu, while other
 * messages in this file print the same field with %d; it also feeds a
 * (long long) cast to %llu.  Confirm the field type and fix whichever
 * specifier is wrong.
 */
634 if (d->ssize != ssize)
635 printk(KERN_INFO "aoe: %012llx e%lu.%lu v%04x has %llu sectors\n",
636 (unsigned long long)mac_addr(t->addr),
637 d->aoemajor, d->aoeminor,
638 d->fw_ver, (long long)ssize);
642 d->gd->capacity = ssize;
643 d->flags |= DEVFL_NEWSIZE;
645 d->flags |= DEVFL_GDALLOC;
/* the rest of the work is deferred to d->work */
646 schedule_work(&d->work);
/*
 * calc_rttavg: fold a new round-trip time sample rtt into device d's
 * timing state.
 *
 * NOTE(review): most of the body is missing from this excerpt.  The
 * visible pieces compare a working value n against MAXTIMER and
 * d->mintimer, and in one branch move d->mintimer halfway toward n
 * ((n - mintimer) >> 1).  The averaging statement itself is not shown.
 */
650 calc_rttavg(struct aoedev *d, int rtt)
659 else if (n > MAXTIMER)
661 d->mintimer += (n - d->mintimer) >> 1;
662 } else if (n < d->mintimer)
664 else if (n > MAXTIMER)
667 /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */
/*
 * gettgt: look up the target of device d whose address equals addr.
 *
 * Scans target pointers from t up to e, stopping at the first NULL
 * entry, and compares sizeof((*t)->addr) bytes of each target's addr
 * with addr.
 * NOTE(review): the initialization of t and e and the return statements
 * are on lines not visible in this excerpt; presumably the matching
 * target is returned, or NULL when none matches -- confirm.
 */
672 static struct aoetgt *
673 gettgt(struct aoedev *d, char *addr)
675 struct aoetgt **t, **e;
679 for (; t < e && *t; t++)
680 if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
/*
 * diskstats: account one bio against disk's statistics.
 *
 * n_sect is the bio size divided by 512 (bi_size >> 9) and rw is the
 * bio's data direction.  For that direction the I/O count is bumped by
 * one, and duration and n_sect are added to the ticks and sectors
 * counters; duration is also added to io_ticks.
 */
686 diskstats(struct gendisk *disk, struct bio *bio, ulong duration)
688 unsigned long n_sect = bio->bi_size >> 9;
689 const int rw = bio_data_dir(bio);
691 disk_stat_inc(disk, ios[rw]);
692 disk_stat_add(disk, ticks[rw], duration);
693 disk_stat_add(disk, sectors[rw], n_sect);
694 disk_stat_add(disk, io_ticks, duration);
/*
 * aoecmd_ata_rsp: handle an incoming ATA response frame skb.
 *
 * Visible flow: find the device from the major/minor in the incoming
 * header, take d->lock, find the target from the source address, match
 * the tag to an outstanding frame, update the round-trip average, check
 * the ATA status, then act on the command that was originally sent
 * (taken from the outgoing frame's header): copy read data into the
 * frame's buffer, or pass identify data to ataid_complete().  When the
 * last outstanding frame of a finished buf completes, the bio is
 * accounted, ended and the buf returned to d->bufpool.
 *
 * NOTE(review): the lookups' failure tests, the switch case labels and
 * several branch bodies are on lines not visible in this excerpt.
 */
698 aoecmd_ata_rsp(struct sk_buff *skb)
701 struct aoe_hdr *hin, *hout;
702 struct aoe_atahdr *ahin, *ahout;
713 hin = (struct aoe_hdr *) skb_mac_header(skb);
714 aoemajor = be16_to_cpu(get_unaligned(&hin->major));
715 d = aoedev_by_aoeaddr(aoemajor, hin->minor);
717 snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
718 "for unknown device %d.%d\n",
719 aoemajor, hin->minor);
724 spin_lock_irqsave(&d->lock, flags);
/* n holds the response's tag in host byte order */
726 n = be32_to_cpu(get_unaligned(&hin->tag));
727 t = gettgt(d, hin->src);
729 printk(KERN_INFO "aoe: can't find target e%ld.%d:%012llx\n",
730 d->aoemajor, d->aoeminor,
731 (unsigned long long) mac_addr(hin->src));
732 spin_unlock_irqrestore(&d->lock, flags);
/* note the negated sample passed here, unlike the call further down */
737 calc_rttavg(d, -tsince(n));
738 spin_unlock_irqrestore(&d->lock, flags);
739 snprintf(ebuf, sizeof ebuf,
740 "%15s e%d.%d tag=%08x@%08lx\n",
742 be16_to_cpu(get_unaligned(&hin->major)),
744 be32_to_cpu(get_unaligned(&hin->tag)),
750 calc_rttavg(d, tsince(f->tag));
/* ahin/ahout: ATA headers of the response and of the frame that was sent */
752 ahin = (struct aoe_atahdr *) (hin+1);
753 hout = (struct aoe_hdr *) skb_mac_header(f->skb);
754 ahout = (struct aoe_atahdr *) (hout+1);
/*
 * NOTE(review): the message below prints d->aoeminor with %ld, while
 * other messages in this file print the same field with %d -- confirm
 * the field type.
 */
757 if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
759 "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%ld\n",
760 ahout->cmdstat, ahin->cmdstat,
761 d->aoemajor, d->aoeminor);
763 buf->flags |= BUFFL_FAIL;
765 if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
/* expected payload size in bytes: sectors requested times 512 */
767 n = ahout->scnt << 9;
768 switch (ahout->cmdstat) {
/* payload shorter than requested */
771 if (skb->len - sizeof *hin - sizeof *ahin < n) {
773 "aoe: %s. skb->len=%d need=%ld\n",
774 "runt data size in read", skb->len, n);
775 /* fail frame f? just returning will rexmit. */
776 spin_unlock_irqrestore(&d->lock, flags);
/* payload starts right after the incoming ATA header */
779 memcpy(f->bufaddr, ahin+1, n);
782 ifp = getif(t, skb->dev);
/* identify responses must carry at least 512 bytes of data */
796 if (skb->len - sizeof *hin - sizeof *ahin < 512) {
798 "aoe: runt data size in ataid. skb->len=%d\n",
800 spin_unlock_irqrestore(&d->lock, flags);
803 ataid_complete(d, t, (char *) (ahin+1));
807 "aoe: unrecognized ata command %2.2Xh for %d.%d\n",
809 be16_to_cpu(get_unaligned(&hin->major)),
/* last outstanding frame of a buf with nothing left to send: finish the bio */
814 if (buf && --buf->nframesout == 0 && buf->resid == 0) {
815 diskstats(d->gd, buf->bio, jiffies - buf->stime);
816 n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
817 bio_endio(buf->bio, n);
818 mempool_free(buf, d->bufpool);
828 d->sendq_hd = d->sendq_tl = NULL;
830 spin_unlock_irqrestore(&d->lock, flags);
/*
 * aoecmd_cfg: obtain config query packets for aoemajor.aoeminor from
 * aoecmd_cfg_pkts(), passing NULL for its tail argument.
 * NOTE(review): what is done with the resulting list sl is on a line
 * not visible in this excerpt.
 */
835 aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
839 sl = aoecmd_cfg_pkts(aoemajor, aoeminor, NULL);
/*
 * aoecmd_ata_id: build an ATA IDENTIFY command frame for device d and
 * return a clone of its skb (allocated with GFP_ATOMIC).
 *
 * Visible steps: lay out and zero the AoE + ATA headers, fill the AoE
 * header and record the tag on the frame, set the command to
 * WIN_IDENTIFY, bind the skb to the current interface's net device,
 * reset d->rttavg to MAXTIMER, and install rexmit_timer as the timer
 * function.
 * NOTE(review): how f, t and skb are obtained, the remaining ATA header
 * fields, and the arming of the timer are on lines not visible here.
 */
845 aoecmd_ata_id(struct aoedev *d)
848 struct aoe_atahdr *ah;
859 /* initialize the headers & frame */
861 h = (struct aoe_hdr *) skb_mac_header(skb);
862 ah = (struct aoe_atahdr *) (h+1);
863 skb_put(skb, sizeof *h + sizeof *ah);
864 memset(h, 0, skb->len);
865 f->tag = aoehdr_atainit(d, t, h);
869 /* set up ata header */
871 ah->cmdstat = WIN_IDENTIFY;
874 skb->dev = t->ifp->nd;
876 d->rttavg = MAXTIMER;
877 d->timer.function = rexmit_timer;
879 return skb_clone(skb, GFP_ATOMIC);
/*
 * addtgt: create a new target with address addr and nframes frames for
 * device d, and return it.
 *
 * Visible steps: advance tt to the first NULL slot (or to te), allocate
 * a zeroed target and a zeroed array of nframes frames with GFP_ATOMIC,
 * give each frame an skb of ETH_ZLEN bytes, free already-allocated skbs
 * while walking f back toward t->frames (presumably the failure path),
 * copy the address in, and start with maxout equal to nframes.
 * NOTE(review): the allocation-failure checks, the "table full" check
 * and the return statements are on lines not visible in this excerpt.
 */
882 static struct aoetgt *
883 addtgt(struct aoedev *d, char *addr, ulong nframes)
885 struct aoetgt *t, **tt, **te;
890 for (; tt < te && *tt; tt++)
896 t = kcalloc(1, sizeof *t, GFP_ATOMIC);
897 f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
900 t->nframes = nframes;
905 f->skb = new_skb(ETH_ZLEN);
910 while (f > t->frames) {
912 dev_kfree_skb(f->skb);
916 memcpy(t->addr, addr, sizeof t->addr);
918 t->maxout = t->nframes;
/*
 * aoecmd_cfg_rsp: handle an incoming AoE config response frame skb.
 *
 * Visible flow: read the shelf (major) address and warn loudly when it
 * equals 0xfff; compute the system minor and reject values that would
 * exceed MINORMASK; read the target's buffer count (bounded by
 * MAXFRAMES); find the device; then, under d->lock, find or add the
 * target and the interface the frame arrived on, possibly log a new
 * per-interface data frame size, record the firmware version, and build
 * an ATA identify command with aoecmd_ata_id().
 *
 * NOTE(review): the early returns, the tests around each failure
 * message, the computation of n before it is turned into a byte count,
 * and the transmission of sl are on lines not visible in this excerpt.
 */
927 aoecmd_cfg_rsp(struct sk_buff *skb)
931 struct aoe_cfghdr *ch;
934 ulong flags, sysminor, aoemajor;
936 enum { MAXFRAMES = 16 };
939 h = (struct aoe_hdr *) skb_mac_header(skb);
940 ch = (struct aoe_cfghdr *) (h+1);
943 * Enough people have their dip switches set backwards to
944 * warrant a loud message for this special case.
946 aoemajor = be16_to_cpu(get_unaligned(&h->major));
947 if (aoemajor == 0xfff) {
948 printk(KERN_ERR "aoe: Warning: shelf address is all ones. "
949 "Check shelf dip switches.\n");
953 sysminor = SYSMINOR(aoemajor, h->minor);
/* the last partition minor of this device must not exceed MINORMASK */
954 if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
955 printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n",
956 aoemajor, (int) h->minor);
960 n = be16_to_cpu(ch->bufcnt);
961 if (n > MAXFRAMES) /* keep it reasonable */
964 d = aoedev_by_sysminor_m(sysminor);
966 printk(KERN_INFO "aoe: device sysminor_m failure\n");
970 spin_lock_irqsave(&d->lock, flags);
/* look the target up by source address; add it when missing */
972 t = gettgt(d, h->src);
974 t = addtgt(d, h->src, n);
977 "aoe: device addtgt failure; "
978 "too many targets?\n");
979 spin_unlock_irqrestore(&d->lock, flags);
/* same pattern for the interface the frame arrived on */
983 ifp = getif(t, skb->dev);
985 ifp = addif(t, skb->dev);
988 "aoe: device addif failure; "
989 "too many interfaces?\n");
990 spin_unlock_irqrestore(&d->lock, flags);
/* subtract the AoE and ATA header sizes from n */
996 n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
/* non-zero n is scaled by 512; zero falls back to DEFAULTBCNT */
1000 n = n ? n * 512 : DEFAULTBCNT;
1001 if (n != ifp->maxbcnt) {
1003 "aoe: e%ld.%d: setting %d%s%s:%012llx\n",
1004 d->aoemajor, d->aoeminor, n,
1005 " byte data frames on ", ifp->nd->name,
1006 (unsigned long long) mac_addr(t->addr));
1011 /* don't change users' perspective */
1013 spin_unlock_irqrestore(&d->lock, flags);
1016 d->fw_ver = be16_to_cpu(ch->fwver);
1018 sl = aoecmd_ata_id(d);
1020 spin_unlock_irqrestore(&d->lock, flags);
/*
 * aoecmd_cleanslate: reset device d's adaptive state to its defaults.
 *
 * Visible effects: d->mintimer returns to MINTIMER; for every non-NULL
 * target, maxout returns to nframes; for every interface entry visited,
 * maxbcnt returns to DEFAULTBCNT.
 * NOTE(review): the initialization of the t/te and p/e ranges and any
 * other fields reset are on lines not visible in this excerpt.
 */
1026 aoecmd_cleanslate(struct aoedev *d)
1028 struct aoetgt **t, **te;
1029 struct aoeif *p, *e;
1031 d->mintimer = MINTIMER;
1035 for (; t < te && *t; t++) {
1036 (*t)->maxout = (*t)->nframes;
1039 for (; p < e; p++) {
1042 p->maxbcnt = DEFAULTBCNT;