/*
 * XenLinux virtual block device driver.
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 * Copyright (c) 2005, XenSource Ltd
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation; or, when distributed
16 * separately from the Linux kernel or incorporated into other
17 * software packages, subject to the following license:
19 * Permission is hereby granted, free of charge, to any person obtaining a copy
20 * of this source file (the "Software"), to deal in the Software without
21 * restriction, including without limitation the rights to use, copy, modify,
22 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23 * and to permit persons to whom the Software is furnished to do so, subject to
24 * the following conditions:
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
38 #include <linux/interrupt.h>
39 #include <linux/blkdev.h>
40 #include <linux/hdreg.h>
41 #include <linux/cdrom.h>
42 #include <linux/module.h>
43 #include <linux/slab.h>
44 #include <linux/mutex.h>
45 #include <linux/scatterlist.h>
46 #include <linux/bitmap.h>
47 #include <linux/list.h>
50 #include <xen/xenbus.h>
51 #include <xen/grant_table.h>
52 #include <xen/events.h>
54 #include <xen/platform_pci.h>
56 #include <xen/interface/grant_table.h>
57 #include <xen/interface/io/blkif.h>
58 #include <xen/interface/io/protocols.h>
60 #include <asm/xen/hypervisor.h>
enum blkif_state {
	BLKIF_STATE_DISCONNECTED,
	BLKIF_STATE_CONNECTED,
	BLKIF_STATE_SUSPENDED,
};
struct grant {
	grant_ref_t gref;
	unsigned long pfn;
	struct list_head node;
};
struct blk_shadow {
	struct blkif_request req;
	struct request *request;
	struct grant **grants_used;
	struct grant **indirect_grants;
	struct scatterlist *sg;
};

struct split_bio {
	struct bio *bio;
	atomic_t pending;
	int err;
};
88 static DEFINE_MUTEX(blkfront_mutex);
89 static const struct block_device_operations xlvbd_block_fops;
/*
 * Maximum number of segments in indirect requests; the actual value used by
 * the frontend driver is the minimum of this value and the value provided
 * by the backend driver.
 */
static unsigned int xen_blkif_max_segments = 32;
module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
MODULE_PARM_DESC(max, "Maximum number of segments in indirect requests (default is 32)");
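/*
 * Number of request slots in the shared ring: as many ring entries as fit
 * in a single page (typically 32 with 4KiB pages, depending on the size of
 * the ring entries).
 */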
101 #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
/*
 * We have one of these per vbd, whether ide, scsi or 'other'.  They
 * hang in private_data off the gendisk structure. We may end up
 * putting all kinds of interesting stuff here :-)
 */
struct blkfront_info
{
	spinlock_t io_lock;
	struct mutex mutex;
	struct xenbus_device *xbdev;
	struct gendisk *gd;
	int vdevice;
	blkif_vdev_t handle;
	enum blkif_state connected;
	int ring_ref;
	struct blkif_front_ring ring;
	unsigned int evtchn, irq;
	struct request_queue *rq;
	struct work_struct work;
	struct gnttab_free_callback callback;
	struct blk_shadow shadow[BLK_RING_SIZE];
	struct list_head persistent_gnts;
	unsigned int persistent_gnts_c;
	unsigned long shadow_free;
	unsigned int feature_flush;
	unsigned int flush_op;
	unsigned int feature_discard:1;
	unsigned int feature_secdiscard:1;
	unsigned int discard_granularity;
	unsigned int discard_alignment;
	unsigned int feature_persistent:1;
	unsigned int max_indirect_segments;
	int is_ready;
};
138 static unsigned int nr_minors;
139 static unsigned long *minors;
140 static DEFINE_SPINLOCK(minor_lock);
142 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
143 (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
144 #define GRANT_INVALID_REF 0
146 #define PARTS_PER_DISK 16
147 #define PARTS_PER_EXT_DISK 256
149 #define BLKIF_MAJOR(dev) ((dev)>>8)
150 #define BLKIF_MINOR(dev) ((dev) & 0xff)
#define EXT_SHIFT 28
#define EXTENDED (1<<EXT_SHIFT)
154 #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
155 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
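/*
 * Worked example (illustrative): a vdevice of 0x10000300 has the EXTENDED
 * bit set, so BLKIF_MINOR_EXT() yields minor 0x300 = 768; with
 * PARTS_PER_EXT_DISK == 256 that is the fourth disk (offset 3, "xvdd"),
 * partition 0.
 */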
156 #define EMULATED_HD_DISK_MINOR_OFFSET (0)
157 #define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
158 #define EMULATED_SD_DISK_MINOR_OFFSET (0)
159 #define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
161 #define DEV_NAME "xvd" /* name in /dev */
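/*
 * SEGS_PER_INDIRECT_FRAME is the number of segment descriptors that fit in
 * one indirect page; INDIRECT_GREFS() rounds a segment count up to the
 * number of indirect pages (and hence extra grant references) needed to
 * describe it.
 */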
163 #define SEGS_PER_INDIRECT_FRAME \
164 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
165 #define INDIRECT_GREFS(_segs) \
166 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
168 static int blkfront_setup_indirect(struct blkfront_info *info);
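/*
 * The shadow array doubles as a free list: shadow_free is the index of the
 * first free entry and each free entry's req.u.rw.id holds the index of the
 * next, so request ids are allocated and released in O(1).
 */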
static int get_id_from_freelist(struct blkfront_info *info)
{
	unsigned long free = info->shadow_free;
	BUG_ON(free >= BLK_RING_SIZE);
	info->shadow_free = info->shadow[free].req.u.rw.id;
	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
	return free;
}
static int add_id_to_freelist(struct blkfront_info *info,
			      unsigned long id)
{
	if (info->shadow[id].req.u.rw.id != id)
		return -EINVAL;
	if (info->shadow[id].request == NULL)
		return -EINVAL;
	info->shadow[id].req.u.rw.id = info->shadow_free;
	info->shadow[id].request = NULL;
	info->shadow_free = id;
	return 0;
}
static int fill_grant_buffer(struct blkfront_info *info, int num)
{
	struct page *granted_page;
	struct grant *gnt_list_entry, *n;
	int i = 0;

	while (i < num) {
		gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
		if (!gnt_list_entry)
			goto out_of_memory;
		granted_page = alloc_page(GFP_NOIO);
		if (!granted_page) {
			kfree(gnt_list_entry);
			goto out_of_memory;
		}
		gnt_list_entry->pfn = page_to_pfn(granted_page);
		gnt_list_entry->gref = GRANT_INVALID_REF;
		list_add(&gnt_list_entry->node, &info->persistent_gnts);
		i++;
	}
	return 0;

out_of_memory:
	list_for_each_entry_safe(gnt_list_entry, n,
	                         &info->persistent_gnts, node) {
		list_del(&gnt_list_entry->node);
		__free_page(pfn_to_page(gnt_list_entry->pfn));
		kfree(gnt_list_entry);
		i--;
	}
	BUG_ON(i != 0);
	return -ENOMEM;
}
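/*
 * Take a grant from the persistent list. If the entry still carries a valid
 * gref the page is already shared with the backend and can be reused as is;
 * otherwise claim a fresh grant reference and grant the page to the backend
 * before handing the entry out.
 */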
static struct grant *get_grant(grant_ref_t *gref_head,
                               struct blkfront_info *info)
{
	struct grant *gnt_list_entry;
	unsigned long buffer_mfn;

	BUG_ON(list_empty(&info->persistent_gnts));
	gnt_list_entry = list_first_entry(&info->persistent_gnts, struct grant,
	                                  node);
	list_del(&gnt_list_entry->node);

	if (gnt_list_entry->gref != GRANT_INVALID_REF) {
		info->persistent_gnts_c--;
		return gnt_list_entry;
	}

	/* Assign a gref to this page */
	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
	BUG_ON(gnt_list_entry->gref == -ENOSPC);
	buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
	gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
	                                info->xbdev->otherend_id,
	                                buffer_mfn, 0);
	return gnt_list_entry;
}
static const char *op_name(int op)
{
	static const char *const names[] = {
		[BLKIF_OP_READ] = "read",
		[BLKIF_OP_WRITE] = "write",
		[BLKIF_OP_WRITE_BARRIER] = "barrier",
		[BLKIF_OP_FLUSH_DISKCACHE] = "flush",
		[BLKIF_OP_DISCARD] = "discard" };

	if (op < 0 || op >= ARRAY_SIZE(names))
		return "unknown";

	return names[op] ?: "reserved";
}
272 static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
274 unsigned int end = minor + nr;
277 if (end > nr_minors) {
278 unsigned long *bitmap, *old;
280 bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
285 spin_lock(&minor_lock);
286 if (end > nr_minors) {
288 memcpy(bitmap, minors,
289 BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
291 nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
294 spin_unlock(&minor_lock);
298 spin_lock(&minor_lock);
299 if (find_next_bit(minors, end, minor) >= end) {
300 bitmap_set(minors, minor, nr);
304 spin_unlock(&minor_lock);
309 static void xlbd_release_minors(unsigned int minor, unsigned int nr)
311 unsigned int end = minor + nr;
313 BUG_ON(end > nr_minors);
314 spin_lock(&minor_lock);
315 bitmap_clear(minors, minor, nr);
316 spin_unlock(&minor_lock);
319 static void blkif_restart_queue_callback(void *arg)
321 struct blkfront_info *info = (struct blkfront_info *)arg;
322 schedule_work(&info->work);
static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
{
	/* We don't have real geometry info, but let's at least return
	   values consistent with the size of the device */
	sector_t nsect = get_capacity(bd->bd_disk);
	sector_t cylinders = nsect;

	hg->heads = 0xff;
	hg->sectors = 0x3f;
	sector_div(cylinders, hg->heads * hg->sectors);
	hg->cylinders = cylinders;
	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
		hg->cylinders = 0xffff;
	return 0;
}
341 static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
342 unsigned command, unsigned long argument)
344 struct blkfront_info *info = bdev->bd_disk->private_data;
347 dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
348 command, (long)argument);
351 case CDROMMULTISESSION:
352 dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
353 for (i = 0; i < sizeof(struct cdrom_multisession); i++)
354 if (put_user(0, (char __user *)(argument + i)))
358 case CDROM_GET_CAPABILITY: {
359 struct gendisk *gd = info->gd;
360 if (gd->flags & GENHD_FL_CD)
366 /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
368 return -EINVAL; /* same return as native Linux */
375 * Generate a Xen blkfront IO request from a blk layer request. Reads
376 * and writes are handled as expected.
378 * @req: a request struct
380 static int blkif_queue_request(struct request *req)
382 struct blkfront_info *info = req->rq_disk->private_data;
383 struct blkif_request *ring_req;
385 unsigned int fsect, lsect;
387 struct blkif_request_segment_aligned *segments = NULL;
	/*
	 * Used to store if we are able to queue the request by just using
	 * existing persistent grants, or if we have to get new grants,
	 * because there are not enough free ones available.
	 */
394 bool new_persistent_gnts;
395 grant_ref_t gref_head;
396 struct grant *gnt_list_entry = NULL;
397 struct scatterlist *sg;
400 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
403 max_grefs = req->nr_phys_segments;
404 if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
406 * If we are using indirect segments we need to account
407 * for the indirect grefs used in the request.
409 max_grefs += INDIRECT_GREFS(req->nr_phys_segments);
	/* Check if we have enough grants to allocate a request */
412 if (info->persistent_gnts_c < max_grefs) {
413 new_persistent_gnts = 1;
414 if (gnttab_alloc_grant_references(
415 max_grefs - info->persistent_gnts_c,
417 gnttab_request_free_callback(
419 blkif_restart_queue_callback,
425 new_persistent_gnts = 0;
427 /* Fill out a communications ring structure. */
428 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
429 id = get_id_from_freelist(info);
430 info->shadow[id].request = req;
432 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
433 ring_req->operation = BLKIF_OP_DISCARD;
434 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
435 ring_req->u.discard.id = id;
436 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
437 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
438 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
440 ring_req->u.discard.flag = 0;
442 BUG_ON(info->max_indirect_segments == 0 &&
443 req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
444 BUG_ON(info->max_indirect_segments &&
445 req->nr_phys_segments > info->max_indirect_segments);
446 nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
447 ring_req->u.rw.id = id;
448 if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
		/*
		 * The indirect operation can only be a BLKIF_OP_READ or
		 * BLKIF_OP_WRITE.
		 */
453 BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
454 ring_req->operation = BLKIF_OP_INDIRECT;
455 ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
456 BLKIF_OP_WRITE : BLKIF_OP_READ;
457 ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
458 ring_req->u.indirect.handle = info->handle;
459 ring_req->u.indirect.nr_segments = nseg;
461 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
462 ring_req->u.rw.handle = info->handle;
463 ring_req->operation = rq_data_dir(req) ?
464 BLKIF_OP_WRITE : BLKIF_OP_READ;
465 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
			/*
			 * Ideally we can do an unordered flush-to-disk.
			 * In case the backend only supports barriers, use
			 * that. A barrier request is a superset of FUA, so
			 * we can implement it the same way.  (It's also a
			 * FLUSH+FUA, since it is guaranteed ordered WRT
			 * previous writes.)
			 */
473 ring_req->operation = info->flush_op;
475 ring_req->u.rw.nr_segments = nseg;
477 for_each_sg(info->shadow[id].sg, sg, nseg, i) {
478 fsect = sg->offset >> 9;
479 lsect = fsect + (sg->length >> 9) - 1;
481 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
482 (i % SEGS_PER_INDIRECT_FRAME == 0)) {
484 kunmap_atomic(segments);
486 n = i / SEGS_PER_INDIRECT_FRAME;
487 gnt_list_entry = get_grant(&gref_head, info);
488 info->shadow[id].indirect_grants[n] = gnt_list_entry;
489 segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
490 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
493 gnt_list_entry = get_grant(&gref_head, info);
494 ref = gnt_list_entry->gref;
496 info->shadow[id].grants_used[i] = gnt_list_entry;
498 if (rq_data_dir(req)) {
502 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
504 shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
505 bvec_data = kmap_atomic(sg_page(sg));
				/*
				 * This does not wipe data stored outside the
				 * range sg->offset..sg->offset+sg->length.
				 * Therefore, blkback *could* see data from
				 * previous requests. This is OK as long as
				 * persistent grants are shared with just one
				 * domain. It may need refactoring if this
				 * changes.
				 */
516 memcpy(shared_data + sg->offset,
517 bvec_data + sg->offset,
520 kunmap_atomic(bvec_data);
521 kunmap_atomic(shared_data);
			if (ring_req->operation != BLKIF_OP_INDIRECT) {
				ring_req->u.rw.seg[i] =
						(struct blkif_request_segment) {
							.gref       = ref,
							.first_sect = fsect,
							.last_sect  = lsect };
			} else {
				n = i % SEGS_PER_INDIRECT_FRAME;
				segments[n] =
					(struct blkif_request_segment_aligned) {
							.gref       = ref,
							.first_sect = fsect,
							.last_sect  = lsect };
			}
539 kunmap_atomic(segments);
542 info->ring.req_prod_pvt++;
544 /* Keep a private copy so we can reissue requests when recovering. */
545 info->shadow[id].req = *ring_req;
547 if (new_persistent_gnts)
548 gnttab_free_grant_references(gref_head);
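/*
 * Push the requests queued on the ring to the backend and notify it via the
 * event channel only if it actually asked for a notification.
 */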
554 static inline void flush_requests(struct blkfront_info *info)
558 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
561 notify_remote_via_irq(info->irq);
/*
 * do_blkif_request
 *  read or write a block; requests are taken from the request queue
 */
568 static void do_blkif_request(struct request_queue *rq)
570 struct blkfront_info *info = NULL;
574 pr_debug("Entered do_blkif_request\n");
578 while ((req = blk_peek_request(rq)) != NULL) {
579 info = req->rq_disk->private_data;
581 if (RING_FULL(&info->ring))
584 blk_start_request(req);
		if ((req->cmd_type != REQ_TYPE_FS) ||
		    ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
		    !info->flush_op)) {
			__blk_end_request_all(req, -EIO);
			continue;
		}
593 pr_debug("do_blk_req %p: cmd %p, sec %lx, "
594 "(%u/%u) buffer:%p [%s]\n",
595 req, req->cmd, (unsigned long)blk_rq_pos(req),
596 blk_rq_cur_sectors(req), blk_rq_sectors(req),
597 req->buffer, rq_data_dir(req) ? "write" : "read");
599 if (blkif_queue_request(req)) {
600 blk_requeue_request(rq, req);
602 /* Avoid pointless unplugs. */
611 flush_requests(info);
614 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
615 unsigned int physical_sector_size,
616 unsigned int segments)
618 struct request_queue *rq;
619 struct blkfront_info *info = gd->private_data;
621 rq = blk_init_queue(do_blkif_request, &info->io_lock);
625 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
627 if (info->feature_discard) {
628 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
629 blk_queue_max_discard_sectors(rq, get_capacity(gd));
630 rq->limits.discard_granularity = info->discard_granularity;
631 rq->limits.discard_alignment = info->discard_alignment;
632 if (info->feature_secdiscard)
633 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
636 /* Hard sector size and max sectors impersonate the equiv. hardware. */
637 blk_queue_logical_block_size(rq, sector_size);
638 blk_queue_physical_block_size(rq, physical_sector_size);
639 blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
641 /* Each segment in a request is up to an aligned page in size. */
642 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
643 blk_queue_max_segment_size(rq, PAGE_SIZE);
645 /* Ensure a merged request will fit in a single I/O ring slot. */
646 blk_queue_max_segments(rq, segments);
648 /* Make sure buffer addresses are sector-aligned. */
649 blk_queue_dma_alignment(rq, 511);
651 /* Make sure we don't use bounce buffers. */
652 blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);
660 static void xlvbd_flush(struct blkfront_info *info)
662 blk_queue_flush(info->rq, info->feature_flush);
663 printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
665 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
666 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
667 "flush diskcache" : "barrier or flush"),
668 info->feature_flush ? "enabled;" : "disabled;",
669 "persistent grants:",
670 info->feature_persistent ? "enabled;" : "disabled;",
671 "indirect descriptors:",
672 info->max_indirect_segments ? "enabled;" : "disabled;");
675 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
678 major = BLKIF_MAJOR(vdevice);
679 *minor = BLKIF_MINOR(vdevice);
682 *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
683 *minor = ((*minor / 64) * PARTS_PER_DISK) +
684 EMULATED_HD_DISK_MINOR_OFFSET;
687 *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
688 *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
689 EMULATED_HD_DISK_MINOR_OFFSET;
691 case XEN_SCSI_DISK0_MAJOR:
692 *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
693 *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
695 case XEN_SCSI_DISK1_MAJOR:
696 case XEN_SCSI_DISK2_MAJOR:
697 case XEN_SCSI_DISK3_MAJOR:
698 case XEN_SCSI_DISK4_MAJOR:
699 case XEN_SCSI_DISK5_MAJOR:
700 case XEN_SCSI_DISK6_MAJOR:
701 case XEN_SCSI_DISK7_MAJOR:
702 *offset = (*minor / PARTS_PER_DISK) +
703 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
704 EMULATED_SD_DISK_NAME_OFFSET;
706 ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
707 EMULATED_SD_DISK_MINOR_OFFSET;
709 case XEN_SCSI_DISK8_MAJOR:
710 case XEN_SCSI_DISK9_MAJOR:
711 case XEN_SCSI_DISK10_MAJOR:
712 case XEN_SCSI_DISK11_MAJOR:
713 case XEN_SCSI_DISK12_MAJOR:
714 case XEN_SCSI_DISK13_MAJOR:
715 case XEN_SCSI_DISK14_MAJOR:
716 case XEN_SCSI_DISK15_MAJOR:
717 *offset = (*minor / PARTS_PER_DISK) +
718 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
719 EMULATED_SD_DISK_NAME_OFFSET;
721 ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
722 EMULATED_SD_DISK_MINOR_OFFSET;
725 *offset = *minor / PARTS_PER_DISK;
728 printk(KERN_WARNING "blkfront: your disk configuration is "
729 "incorrect, please use an xvd device instead\n");
static char *encode_disk_name(char *ptr, unsigned int n)
{
	if (n >= 26)
		ptr = encode_disk_name(ptr, n / 26 - 1);
	*ptr = 'a' + n % 26;
	return ptr + 1;
}
743 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
744 struct blkfront_info *info,
745 u16 vdisk_info, u16 sector_size,
746 unsigned int physical_sector_size)
756 BUG_ON(info->gd != NULL);
757 BUG_ON(info->rq != NULL);
759 if ((info->vdevice>>EXT_SHIFT) > 1) {
760 /* this is above the extended range; something is wrong */
761 printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
765 if (!VDEV_IS_EXTENDED(info->vdevice)) {
766 err = xen_translate_vdev(info->vdevice, &minor, &offset);
769 nr_parts = PARTS_PER_DISK;
771 minor = BLKIF_MINOR_EXT(info->vdevice);
772 nr_parts = PARTS_PER_EXT_DISK;
773 offset = minor / nr_parts;
774 if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
		printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
			"emulated IDE disks,\n\t choose an xvd device name "
			"from xvde on\n", info->vdevice);
779 if (minor >> MINORBITS) {
780 pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
781 info->vdevice, minor);
785 if ((minor % nr_parts) == 0)
786 nr_minors = nr_parts;
788 err = xlbd_reserve_minors(minor, nr_minors);
793 gd = alloc_disk(nr_minors);
797 strcpy(gd->disk_name, DEV_NAME);
798 ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
799 BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
803 snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
804 "%d", minor & (nr_parts - 1));
806 gd->major = XENVBD_MAJOR;
807 gd->first_minor = minor;
808 gd->fops = &xlvbd_block_fops;
809 gd->private_data = info;
810 gd->driverfs_dev = &(info->xbdev->dev);
811 set_capacity(gd, capacity);
813 if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
814 info->max_indirect_segments ? :
815 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
820 info->rq = gd->queue;
825 if (vdisk_info & VDISK_READONLY)
828 if (vdisk_info & VDISK_REMOVABLE)
829 gd->flags |= GENHD_FL_REMOVABLE;
831 if (vdisk_info & VDISK_CDROM)
832 gd->flags |= GENHD_FL_CD;
837 xlbd_release_minors(minor, nr_minors);
842 static void xlvbd_release_gendisk(struct blkfront_info *info)
844 unsigned int minor, nr_minors;
847 if (info->rq == NULL)
850 spin_lock_irqsave(&info->io_lock, flags);
852 /* No more blkif_request(). */
853 blk_stop_queue(info->rq);
855 /* No more gnttab callback work. */
856 gnttab_cancel_free_callback(&info->callback);
857 spin_unlock_irqrestore(&info->io_lock, flags);
859 /* Flush gnttab callback work. Must be done with no locks held. */
860 flush_work(&info->work);
862 del_gendisk(info->gd);
864 minor = info->gd->first_minor;
865 nr_minors = info->gd->minors;
866 xlbd_release_minors(minor, nr_minors);
868 blk_cleanup_queue(info->rq);
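/*
 * Re-enable the block-layer queue when the ring is not full and immediately
 * try to push queued requests to the backend.
 */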
875 static void kick_pending_request_queues(struct blkfront_info *info)
877 if (!RING_FULL(&info->ring)) {
878 /* Re-enable calldowns. */
879 blk_start_queue(info->rq);
880 /* Kick things off immediately. */
881 do_blkif_request(info->rq);
885 static void blkif_restart_queue(struct work_struct *work)
887 struct blkfront_info *info = container_of(work, struct blkfront_info, work);
889 spin_lock_irq(&info->io_lock);
890 if (info->connected == BLKIF_STATE_CONNECTED)
891 kick_pending_request_queues(info);
892 spin_unlock_irq(&info->io_lock);
895 static void blkif_free(struct blkfront_info *info, int suspend)
897 struct grant *persistent_gnt;
901 /* Prevent new requests being issued until we fix things up. */
902 spin_lock_irq(&info->io_lock);
903 info->connected = suspend ?
904 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
905 /* No more blkif_request(). */
907 blk_stop_queue(info->rq);
909 /* Remove all persistent grants */
910 if (!list_empty(&info->persistent_gnts)) {
911 list_for_each_entry_safe(persistent_gnt, n,
912 &info->persistent_gnts, node) {
913 list_del(&persistent_gnt->node);
914 if (persistent_gnt->gref != GRANT_INVALID_REF) {
915 gnttab_end_foreign_access(persistent_gnt->gref,
917 info->persistent_gnts_c--;
919 __free_page(pfn_to_page(persistent_gnt->pfn));
920 kfree(persistent_gnt);
923 BUG_ON(info->persistent_gnts_c != 0);
925 for (i = 0; i < BLK_RING_SIZE; i++) {
		/*
		 * Clear persistent grants present in requests already
		 * on the shared ring
		 */
930 if (!info->shadow[i].request)
933 segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
934 info->shadow[i].req.u.indirect.nr_segments :
935 info->shadow[i].req.u.rw.nr_segments;
936 for (j = 0; j < segs; j++) {
937 persistent_gnt = info->shadow[i].grants_used[j];
938 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
939 __free_page(pfn_to_page(persistent_gnt->pfn));
940 kfree(persistent_gnt);
943 if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
945 * If this is not an indirect operation don't try to
946 * free indirect segments
950 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
951 persistent_gnt = info->shadow[i].indirect_grants[j];
952 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
953 __free_page(pfn_to_page(persistent_gnt->pfn));
954 kfree(persistent_gnt);
958 kfree(info->shadow[i].grants_used);
959 info->shadow[i].grants_used = NULL;
960 kfree(info->shadow[i].indirect_grants);
961 info->shadow[i].indirect_grants = NULL;
962 kfree(info->shadow[i].sg);
963 info->shadow[i].sg = NULL;
966 /* No more gnttab callback work. */
967 gnttab_cancel_free_callback(&info->callback);
968 spin_unlock_irq(&info->io_lock);
970 /* Flush gnttab callback work. Must be done with no locks held. */
971 flush_work(&info->work);
973 /* Free resources associated with old device channel. */
974 if (info->ring_ref != GRANT_INVALID_REF) {
975 gnttab_end_foreign_access(info->ring_ref, 0,
976 (unsigned long)info->ring.sring);
977 info->ring_ref = GRANT_INVALID_REF;
978 info->ring.sring = NULL;
981 unbind_from_irqhandler(info->irq, info);
982 info->evtchn = info->irq = 0;
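/*
 * Complete a shadow entry: for reads, copy the data from the persistently
 * granted pages back into the request's scatterlist; then either keep each
 * grant on the persistent list (if the backend still has it mapped) or end
 * foreign access and queue it at the tail for later reuse.
 */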
986 static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
987 struct blkif_response *bret)
990 struct scatterlist *sg;
995 nseg = s->req.operation == BLKIF_OP_INDIRECT ?
996 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
998 if (bret->operation == BLKIF_OP_READ) {
		/*
		 * Copy the data received from the backend into the bvec.
		 * Since bv_offset can be different from 0, and bv_len can be
		 * different from PAGE_SIZE, we have to keep track of the
		 * current offset, to be sure we are copying the data from
		 * the right shared page.
		 */
1005 for_each_sg(s->sg, sg, nseg, i) {
1006 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
1007 shared_data = kmap_atomic(
1008 pfn_to_page(s->grants_used[i]->pfn));
1009 bvec_data = kmap_atomic(sg_page(sg));
1010 memcpy(bvec_data + sg->offset,
1011 shared_data + sg->offset,
1013 kunmap_atomic(bvec_data);
1014 kunmap_atomic(shared_data);
1017 /* Add the persistent grant into the list of free grants */
1018 for (i = 0; i < nseg; i++) {
1019 if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
			/*
			 * If the grant is still mapped by the backend (the
			 * backend has chosen to make this grant persistent)
			 * we add it at the head of the list, so it will be
			 * reused first.
			 */
1026 list_add(&s->grants_used[i]->node, &info->persistent_gnts);
1027 info->persistent_gnts_c++;
1030 * If the grant is not mapped by the backend we end the
1031 * foreign access and add it to the tail of the list,
1032 * so it will not be picked again unless we run out of
1033 * persistent grants.
1035 gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
1036 s->grants_used[i]->gref = GRANT_INVALID_REF;
1037 list_add_tail(&s->grants_used[i]->node, &info->persistent_gnts);
1040 if (s->req.operation == BLKIF_OP_INDIRECT) {
1041 for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
1042 if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
1043 list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
1044 info->persistent_gnts_c++;
1046 gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
1047 s->indirect_grants[i]->gref = GRANT_INVALID_REF;
1048 list_add_tail(&s->indirect_grants[i]->node,
1049 &info->persistent_gnts);
1055 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1057 struct request *req;
1058 struct blkif_response *bret;
1060 unsigned long flags;
1061 struct blkfront_info *info = (struct blkfront_info *)dev_id;
1064 spin_lock_irqsave(&info->io_lock, flags);
1066 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1067 spin_unlock_irqrestore(&info->io_lock, flags);
1072 rp = info->ring.sring->rsp_prod;
1073 rmb(); /* Ensure we see queued responses up to 'rp'. */
1075 for (i = info->ring.rsp_cons; i != rp; i++) {
1078 bret = RING_GET_RESPONSE(&info->ring, i);
		id = bret->id;
		/*
		 * The backend has messed up and given us an id that we would
		 * never have given to it (we stamp it up to BLK_RING_SIZE -
		 * look in get_id_from_freelist).
		 */
1085 if (id >= BLK_RING_SIZE) {
1086 WARN(1, "%s: response to %s has incorrect id (%ld)\n",
1087 info->gd->disk_name, op_name(bret->operation), id);
1088 /* We can't safely get the 'struct request' as
1089 * the id is busted. */
1092 req = info->shadow[id].request;
1094 if (bret->operation != BLKIF_OP_DISCARD)
1095 blkif_completion(&info->shadow[id], info, bret);
1097 if (add_id_to_freelist(info, id)) {
1098 WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
1099 info->gd->disk_name, op_name(bret->operation), id);
1103 error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
1104 switch (bret->operation) {
1105 case BLKIF_OP_DISCARD:
1106 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
1107 struct request_queue *rq = info->rq;
1108 printk(KERN_WARNING "blkfront: %s: %s op failed\n",
1109 info->gd->disk_name, op_name(bret->operation));
1110 error = -EOPNOTSUPP;
1111 info->feature_discard = 0;
1112 info->feature_secdiscard = 0;
1113 queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
1114 queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
1116 __blk_end_request_all(req, error);
1118 case BLKIF_OP_FLUSH_DISKCACHE:
1119 case BLKIF_OP_WRITE_BARRIER:
1120 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
1121 printk(KERN_WARNING "blkfront: %s: %s op failed\n",
1122 info->gd->disk_name, op_name(bret->operation));
1123 error = -EOPNOTSUPP;
1125 if (unlikely(bret->status == BLKIF_RSP_ERROR &&
1126 info->shadow[id].req.u.rw.nr_segments == 0)) {
1127 printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
1128 info->gd->disk_name, op_name(bret->operation));
1129 error = -EOPNOTSUPP;
1131 if (unlikely(error)) {
1132 if (error == -EOPNOTSUPP)
1134 info->feature_flush = 0;
1140 case BLKIF_OP_WRITE:
1141 if (unlikely(bret->status != BLKIF_RSP_OKAY))
1142 dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
1143 "request: %x\n", bret->status);
1145 __blk_end_request_all(req, error);
1152 info->ring.rsp_cons = i;
1154 if (i != info->ring.req_prod_pvt) {
1156 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
1160 info->ring.sring->rsp_event = i + 1;
1162 kick_pending_request_queues(info);
1164 spin_unlock_irqrestore(&info->io_lock, flags);
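/*
 * Allocate the shared ring page, grant it to the backend and bind an event
 * channel plus irq handler for response notifications.
 */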
1170 static int setup_blkring(struct xenbus_device *dev,
1171 struct blkfront_info *info)
1173 struct blkif_sring *sring;
1176 info->ring_ref = GRANT_INVALID_REF;
1178 sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
1180 xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
1183 SHARED_RING_INIT(sring);
1184 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
1186 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
1188 free_page((unsigned long)sring);
1189 info->ring.sring = NULL;
1192 info->ring_ref = err;
1194 err = xenbus_alloc_evtchn(dev, &info->evtchn);
1198 err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
1201 xenbus_dev_fatal(dev, err,
1202 "bind_evtchn_to_irqhandler failed");
1209 blkif_free(info, 0);
1214 /* Common code used when first setting up, and when resuming. */
1215 static int talk_to_blkback(struct xenbus_device *dev,
1216 struct blkfront_info *info)
1218 const char *message = NULL;
1219 struct xenbus_transaction xbt;
1222 /* Create shared ring, alloc event channel. */
1223 err = setup_blkring(dev, info);
1228 err = xenbus_transaction_start(&xbt);
1230 xenbus_dev_fatal(dev, err, "starting transaction");
1231 goto destroy_blkring;
1234 err = xenbus_printf(xbt, dev->nodename,
1235 "ring-ref", "%u", info->ring_ref);
1237 message = "writing ring-ref";
1238 goto abort_transaction;
1240 err = xenbus_printf(xbt, dev->nodename,
1241 "event-channel", "%u", info->evtchn);
1243 message = "writing event-channel";
1244 goto abort_transaction;
1246 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
1247 XEN_IO_PROTO_ABI_NATIVE);
1249 message = "writing protocol";
1250 goto abort_transaction;
1252 err = xenbus_printf(xbt, dev->nodename,
1253 "feature-persistent", "%u", 1);
1256 "writing persistent grants feature to xenbus");
1258 err = xenbus_transaction_end(xbt, 0);
1262 xenbus_dev_fatal(dev, err, "completing transaction");
1263 goto destroy_blkring;
1266 xenbus_switch_state(dev, XenbusStateInitialised);
1271 xenbus_transaction_end(xbt, 1);
1273 xenbus_dev_fatal(dev, err, "%s", message);
1275 blkif_free(info, 0);
1281 * Entry point to this code when a new device is created. Allocate the basic
1282 * structures and the ring buffer for communication with the backend, and
1283 * inform the backend of the appropriate details for those. Switch to
1284 * Initialised state.
1286 static int blkfront_probe(struct xenbus_device *dev,
1287 const struct xenbus_device_id *id)
1289 int err, vdevice, i;
1290 struct blkfront_info *info;
1292 /* FIXME: Use dynamic device id if this is not set. */
1293 err = xenbus_scanf(XBT_NIL, dev->nodename,
1294 "virtual-device", "%i", &vdevice);
1296 /* go looking in the extended area instead */
1297 err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
1300 xenbus_dev_fatal(dev, err, "reading virtual-device");
1305 if (xen_hvm_domain()) {
1308 /* no unplug has been done: do not hook devices != xen vbds */
1309 if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
1312 if (!VDEV_IS_EXTENDED(vdevice))
1313 major = BLKIF_MAJOR(vdevice);
1315 major = XENVBD_MAJOR;
1317 if (major != XENVBD_MAJOR) {
1319 "%s: HVM does not support vbd %d as xen block device\n",
1320 __FUNCTION__, vdevice);
1324 /* do not create a PV cdrom device if we are an HVM guest */
1325 type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
1328 if (strncmp(type, "cdrom", 5) == 0) {
1334 info = kzalloc(sizeof(*info), GFP_KERNEL);
1336 xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
1340 mutex_init(&info->mutex);
1341 spin_lock_init(&info->io_lock);
1343 info->vdevice = vdevice;
1344 INIT_LIST_HEAD(&info->persistent_gnts);
1345 info->persistent_gnts_c = 0;
1346 info->connected = BLKIF_STATE_DISCONNECTED;
1347 INIT_WORK(&info->work, blkif_restart_queue);
1349 for (i = 0; i < BLK_RING_SIZE; i++)
1350 info->shadow[i].req.u.rw.id = i+1;
1351 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1353 /* Front end dir is a number, which is used as the id. */
1354 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
1355 dev_set_drvdata(&dev->dev, info);
1357 err = talk_to_blkback(dev, info);
1360 dev_set_drvdata(&dev->dev, NULL);
1368 * This is a clone of md_trim_bio, used to split a bio into smaller ones
1370 static void trim_bio(struct bio *bio, int offset, int size)
1372 /* 'bio' is a cloned bio which we need to trim to match
1373 * the given offset and size.
1374 * This requires adjusting bi_sector, bi_size, and bi_io_vec
1377 struct bio_vec *bvec;
1381 if (offset == 0 && size == bio->bi_size)
1384 bio->bi_sector += offset;
1385 bio->bi_size = size;
1387 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1389 while (bio->bi_idx < bio->bi_vcnt &&
1390 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
1391 /* remove this whole bio_vec */
1392 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
1395 if (bio->bi_idx < bio->bi_vcnt) {
1396 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
1397 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
		/* avoid any complications with bi_idx being non-zero */
1401 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
1402 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
1403 bio->bi_vcnt -= bio->bi_idx;
1406 /* Make sure vcnt and last bv are not too big */
1407 bio_for_each_segment(bvec, bio, i) {
1408 if (sofar + bvec->bv_len > size)
1409 bvec->bv_len = size - sofar;
1410 if (bvec->bv_len == 0) {
1414 sofar += bvec->bv_len;
1418 static void split_bio_end(struct bio *bio, int error)
1420 struct split_bio *split_bio = bio->bi_private;
1423 split_bio->err = error;
1425 if (atomic_dec_and_test(&split_bio->pending)) {
1426 split_bio->bio->bi_phys_segments = 0;
1427 bio_endio(split_bio->bio, split_bio->err);
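/*
 * Rebuild state after a resume or backend restart: reset the shadow free
 * list, renegotiate indirect segment support, and re-queue every bio and
 * request that was in flight, splitting bios that no longer fit within the
 * (possibly smaller) new segment limit.
 */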
1433 static int blkif_recover(struct blkfront_info *info)
1436 struct request *req, *n;
1437 struct blk_shadow *copy;
1439 struct bio *bio, *cloned_bio;
1440 struct bio_list bio_list, merge_bio;
1441 unsigned int segs, offset;
1443 struct split_bio *split_bio;
1444 struct list_head requests;
1446 /* Stage 1: Make a safe copy of the shadow state. */
1447 copy = kmemdup(info->shadow, sizeof(info->shadow),
1448 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
1452 /* Stage 2: Set up free list. */
1453 memset(&info->shadow, 0, sizeof(info->shadow));
1454 for (i = 0; i < BLK_RING_SIZE; i++)
1455 info->shadow[i].req.u.rw.id = i+1;
1456 info->shadow_free = info->ring.req_prod_pvt;
1457 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1459 rc = blkfront_setup_indirect(info);
1465 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
1466 blk_queue_max_segments(info->rq, segs);
1467 bio_list_init(&bio_list);
1468 INIT_LIST_HEAD(&requests);
1469 for (i = 0; i < BLK_RING_SIZE; i++) {
1471 if (!copy[i].request)
1475 * Get the bios in the request so we can re-queue them.
1477 if (copy[i].request->cmd_flags &
1478 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1480 * Flush operations don't contain bios, so
1481 * we need to requeue the whole request
1483 list_add(©[i].request->queuelist, &requests);
1486 merge_bio.head = copy[i].request->bio;
1487 merge_bio.tail = copy[i].request->biotail;
1488 bio_list_merge(&bio_list, &merge_bio);
1489 copy[i].request->bio = NULL;
1490 blk_put_request(copy[i].request);
	/*
	 * Empty the queue; this is important because we might have
	 * requests in the queue with more segments than what we
	 * can handle now.
	 */
1500 spin_lock_irq(&info->io_lock);
1501 while ((req = blk_fetch_request(info->rq)) != NULL) {
1502 if (req->cmd_flags &
1503 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1504 list_add(&req->queuelist, &requests);
1507 merge_bio.head = req->bio;
1508 merge_bio.tail = req->biotail;
1509 bio_list_merge(&bio_list, &merge_bio);
1511 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
1512 pr_alert("diskcache flush request found!\n");
1513 __blk_put_request(info->rq, req);
1515 spin_unlock_irq(&info->io_lock);
1517 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1519 spin_lock_irq(&info->io_lock);
1521 /* Now safe for us to use the shared ring */
1522 info->connected = BLKIF_STATE_CONNECTED;
1524 /* Kick any other new requests queued since we resumed */
1525 kick_pending_request_queues(info);
1527 list_for_each_entry_safe(req, n, &requests, queuelist) {
1528 /* Requeue pending requests (flush or discard) */
1529 list_del_init(&req->queuelist);
1530 BUG_ON(req->nr_phys_segments > segs);
1531 blk_requeue_request(info->rq, req);
1533 spin_unlock_irq(&info->io_lock);
1535 while ((bio = bio_list_pop(&bio_list)) != NULL) {
1536 /* Traverse the list of pending bios and re-queue them */
1537 if (bio_segments(bio) > segs) {
			/*
			 * This bio has more segments than what we can
			 * handle; we have to split it.
			 */
1542 pending = (bio_segments(bio) + segs - 1) / segs;
1543 split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
1544 BUG_ON(split_bio == NULL);
1545 atomic_set(&split_bio->pending, pending);
1546 split_bio->bio = bio;
1547 for (i = 0; i < pending; i++) {
1548 offset = (i * segs * PAGE_SIZE) >> 9;
1549 size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
1550 (unsigned int)(bio->bi_size >> 9) - offset);
1551 cloned_bio = bio_clone(bio, GFP_NOIO);
1552 BUG_ON(cloned_bio == NULL);
1553 trim_bio(cloned_bio, offset, size);
1554 cloned_bio->bi_private = split_bio;
1555 cloned_bio->bi_end_io = split_bio_end;
1556 submit_bio(cloned_bio->bi_rw, cloned_bio);
1559 * Now we have to wait for all those smaller bios to
1560 * end, so we can also end the "parent" bio.
1564 /* We don't need to split this bio */
1565 submit_bio(bio->bi_rw, bio);
1572 * We are reconnecting to the backend, due to a suspend/resume, or a backend
1573 * driver restart. We tear down our blkif structure and recreate it, but
1574 * leave the device-layer structures intact so that this is transparent to the
1575 * rest of the kernel.
1577 static int blkfront_resume(struct xenbus_device *dev)
1579 struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1582 dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
1584 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
1586 err = talk_to_blkback(dev, info);
1589 * We have to wait for the backend to switch to
1590 * connected state, since we want to read which
1591 * features it supports.
1598 blkfront_closing(struct blkfront_info *info)
1600 struct xenbus_device *xbdev = info->xbdev;
1601 struct block_device *bdev = NULL;
1603 mutex_lock(&info->mutex);
1605 if (xbdev->state == XenbusStateClosing) {
1606 mutex_unlock(&info->mutex);
1611 bdev = bdget_disk(info->gd, 0);
1613 mutex_unlock(&info->mutex);
1616 xenbus_frontend_closed(xbdev);
1620 mutex_lock(&bdev->bd_mutex);
1622 if (bdev->bd_openers) {
1623 xenbus_dev_error(xbdev, -EBUSY,
1624 "Device in use; refusing to close");
1625 xenbus_switch_state(xbdev, XenbusStateClosing);
1627 xlvbd_release_gendisk(info);
1628 xenbus_frontend_closed(xbdev);
1631 mutex_unlock(&bdev->bd_mutex);
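/*
 * Read the backend's discard parameters from xenstore and record them so
 * that the request queue can advertise (secure) discard support.
 */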
1635 static void blkfront_setup_discard(struct blkfront_info *info)
1639 unsigned int discard_granularity;
1640 unsigned int discard_alignment;
1641 unsigned int discard_secure;
1643 type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
1647 info->feature_secdiscard = 0;
1648 if (strncmp(type, "phy", 3) == 0) {
1649 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1650 "discard-granularity", "%u", &discard_granularity,
1651 "discard-alignment", "%u", &discard_alignment,
1654 info->feature_discard = 1;
1655 info->discard_granularity = discard_granularity;
1656 info->discard_alignment = discard_alignment;
1658 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1659 "discard-secure", "%d", &discard_secure,
1662 info->feature_secdiscard = discard_secure;
1664 } else if (strncmp(type, "file", 4) == 0)
1665 info->feature_discard = 1;
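/*
 * Negotiate indirect descriptor support with the backend and size the
 * per-request shadow buffers (scatterlists and grant arrays) accordingly,
 * pre-filling the pool of grants each request may need.
 */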
1670 static int blkfront_setup_indirect(struct blkfront_info *info)
1672 unsigned int indirect_segments, segs;
1675 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1676 "feature-max-indirect-segments", "%u", &indirect_segments,
1679 info->max_indirect_segments = 0;
1680 segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1682 info->max_indirect_segments = min(indirect_segments,
1683 xen_blkif_max_segments);
1684 segs = info->max_indirect_segments;
1687 err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
1691 for (i = 0; i < BLK_RING_SIZE; i++) {
1692 info->shadow[i].grants_used = kzalloc(
1693 sizeof(info->shadow[i].grants_used[0]) * segs,
1695 info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
1696 if (info->max_indirect_segments)
1697 info->shadow[i].indirect_grants = kzalloc(
1698 sizeof(info->shadow[i].indirect_grants[0]) *
1699 INDIRECT_GREFS(segs),
1701 if ((info->shadow[i].grants_used == NULL) ||
1702 (info->shadow[i].sg == NULL) ||
1703 (info->max_indirect_segments &&
1704 (info->shadow[i].indirect_grants == NULL)))
1706 sg_init_table(info->shadow[i].sg, segs);
1713 for (i = 0; i < BLK_RING_SIZE; i++) {
1714 kfree(info->shadow[i].grants_used);
1715 info->shadow[i].grants_used = NULL;
1716 kfree(info->shadow[i].sg);
1717 info->shadow[i].sg = NULL;
1718 kfree(info->shadow[i].indirect_grants);
1719 info->shadow[i].indirect_grants = NULL;
/*
 * Invoked when the backend is finally 'ready' (and has produced
 * the details about the physical device - #sectors, size, etc).
 */
1728 static void blkfront_connect(struct blkfront_info *info)
1730 unsigned long long sectors;
1731 unsigned long sector_size;
1732 unsigned int physical_sector_size;
1735 int barrier, flush, discard, persistent;
1737 switch (info->connected) {
1738 case BLKIF_STATE_CONNECTED:
1740 * Potentially, the back-end may be signalling
1741 * a capacity change; update the capacity.
1743 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1744 "sectors", "%Lu", §ors);
1745 if (XENBUS_EXIST_ERR(err))
1747 printk(KERN_INFO "Setting capacity to %Lu\n",
1749 set_capacity(info->gd, sectors);
1750 revalidate_disk(info->gd);
1753 case BLKIF_STATE_SUSPENDED:
		/*
		 * If we are recovering from suspension, we need to wait
		 * for the backend to announce its features before
		 * reconnecting; at least we need to know if the backend
		 * supports indirect descriptors, and how many.
		 */
1760 blkif_recover(info);
1767 dev_dbg(&info->xbdev->dev, "%s:%s.\n",
1768 __func__, info->xbdev->otherend);
1770 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1771 "sectors", "%llu", §ors,
1772 "info", "%u", &binfo,
1773 "sector-size", "%lu", §or_size,
1776 xenbus_dev_fatal(info->xbdev, err,
1777 "reading backend fields at %s",
1778 info->xbdev->otherend);
	/*
	 * physical-sector-size is a newer field, so old backends may not
	 * provide this. Assume physical sector size to be the same as
	 * sector_size in that case.
	 */
1787 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1788 "physical-sector-size", "%u", &physical_sector_size);
1790 physical_sector_size = sector_size;
1792 info->feature_flush = 0;
1795 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1796 "feature-barrier", "%d", &barrier,
1800 * If there's no "feature-barrier" defined, then it means
1801 * we're dealing with a very old backend which writes
1802 * synchronously; nothing to do.
1804 * If there are barriers, then we use flush.
1806 if (!err && barrier) {
1807 info->feature_flush = REQ_FLUSH | REQ_FUA;
1808 info->flush_op = BLKIF_OP_WRITE_BARRIER;
	/*
	 * And if there is "feature-flush-cache" use that above
	 * barriers.
	 */
1814 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1815 "feature-flush-cache", "%d", &flush,
1818 if (!err && flush) {
1819 info->feature_flush = REQ_FLUSH;
1820 info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
1823 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1824 "feature-discard", "%d", &discard,
1827 if (!err && discard)
1828 blkfront_setup_discard(info);
1830 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1831 "feature-persistent", "%u", &persistent,
1834 info->feature_persistent = 0;
1836 info->feature_persistent = persistent;
1838 err = blkfront_setup_indirect(info);
1840 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
1841 info->xbdev->otherend);
1845 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
1846 physical_sector_size);
1848 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
1849 info->xbdev->otherend);
1853 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1855 /* Kick pending requests. */
1856 spin_lock_irq(&info->io_lock);
1857 info->connected = BLKIF_STATE_CONNECTED;
1858 kick_pending_request_queues(info);
1859 spin_unlock_irq(&info->io_lock);
1867 * Callback received when the backend's state changes.
1869 static void blkback_changed(struct xenbus_device *dev,
1870 enum xenbus_state backend_state)
1872 struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1874 dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
1876 switch (backend_state) {
1877 case XenbusStateInitialising:
1878 case XenbusStateInitWait:
1879 case XenbusStateInitialised:
1880 case XenbusStateReconfiguring:
1881 case XenbusStateReconfigured:
1882 case XenbusStateUnknown:
1883 case XenbusStateClosed:
1886 case XenbusStateConnected:
1887 blkfront_connect(info);
1890 case XenbusStateClosing:
1891 blkfront_closing(info);
1896 static int blkfront_remove(struct xenbus_device *xbdev)
1898 struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
1899 struct block_device *bdev = NULL;
1900 struct gendisk *disk;
1902 dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
1904 blkif_free(info, 0);
1906 mutex_lock(&info->mutex);
1910 bdev = bdget_disk(disk, 0);
1913 mutex_unlock(&info->mutex);
1921 * The xbdev was removed before we reached the Closed
1922 * state. See if it's safe to remove the disk. If the bdev
1923 * isn't closed yet, we let release take care of it.
1926 mutex_lock(&bdev->bd_mutex);
1927 info = disk->private_data;
1929 dev_warn(disk_to_dev(disk),
1930 "%s was hot-unplugged, %d stale handles\n",
1931 xbdev->nodename, bdev->bd_openers);
1933 if (info && !bdev->bd_openers) {
1934 xlvbd_release_gendisk(info);
1935 disk->private_data = NULL;
1939 mutex_unlock(&bdev->bd_mutex);
1945 static int blkfront_is_ready(struct xenbus_device *dev)
1947 struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1949 return info->is_ready && info->xbdev;
1952 static int blkif_open(struct block_device *bdev, fmode_t mode)
1954 struct gendisk *disk = bdev->bd_disk;
1955 struct blkfront_info *info;
1958 mutex_lock(&blkfront_mutex);
1960 info = disk->private_data;
1967 mutex_lock(&info->mutex);
1970 /* xbdev is closed */
1973 mutex_unlock(&info->mutex);
1976 mutex_unlock(&blkfront_mutex);
1980 static void blkif_release(struct gendisk *disk, fmode_t mode)
1982 struct blkfront_info *info = disk->private_data;
1983 struct block_device *bdev;
1984 struct xenbus_device *xbdev;
1986 mutex_lock(&blkfront_mutex);
1988 bdev = bdget_disk(disk, 0);
1990 if (bdev->bd_openers)
1994 * Check if we have been instructed to close. We will have
1995 * deferred this request, because the bdev was still open.
1998 mutex_lock(&info->mutex);
1999 xbdev = info->xbdev;
2001 if (xbdev && xbdev->state == XenbusStateClosing) {
2002 /* pending switch to state closed */
2003 dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
2004 xlvbd_release_gendisk(info);
2005 xenbus_frontend_closed(info->xbdev);
2008 mutex_unlock(&info->mutex);
2011 /* sudden device removal */
2012 dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
2013 xlvbd_release_gendisk(info);
2014 disk->private_data = NULL;
2020 mutex_unlock(&blkfront_mutex);
2023 static const struct block_device_operations xlvbd_block_fops =
2025 .owner = THIS_MODULE,
2027 .release = blkif_release,
2028 .getgeo = blkif_getgeo,
2029 .ioctl = blkif_ioctl,
2033 static const struct xenbus_device_id blkfront_ids[] = {
2038 static DEFINE_XENBUS_DRIVER(blkfront, ,
2039 .probe = blkfront_probe,
2040 .remove = blkfront_remove,
2041 .resume = blkfront_resume,
2042 .otherend_changed = blkback_changed,
2043 .is_ready = blkfront_is_ready,
2046 static int __init xlblk_init(void)
2053 if (xen_hvm_domain() && !xen_platform_pci_unplug)
2056 if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
2057 printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
2058 XENVBD_MAJOR, DEV_NAME);
2062 ret = xenbus_register_frontend(&blkfront_driver);
2064 unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2070 module_init(xlblk_init);
2073 static void __exit xlblk_exit(void)
2075 xenbus_unregister_driver(&blkfront_driver);
2076 unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2079 module_exit(xlblk_exit);
2081 MODULE_DESCRIPTION("Xen virtual block device frontend");
2082 MODULE_LICENSE("GPL");
2083 MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
2084 MODULE_ALIAS("xen:vbd");
2085 MODULE_ALIAS("xenblk");