/******************************************************************************
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/block/xen-blkfront.c
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/bitmap.h>

#include <xen/events.h>
#include <xen/page.h>
#include <xen/xen.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/balloon.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 *
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
static int xen_blkif_reqs = 64;
module_param_named(reqs, xen_blkif_reqs, int, 0);
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");

/* Run-time switchable: /sys/module/blkback/parameters/ */
static unsigned int log_stats;
module_param(log_stats, int, 0644);

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
struct pending_req {
        struct xen_blkif        *blkif;
        u64                     id;
        int                     nr_pages;
        atomic_t                pendcnt;
        unsigned short          operation;
        int                     status;
        struct list_head        free_list;
        /* Segments mapped with a regular (non-persistent) grant; these must
         * be unmapped when the request completes. */
        DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
};

#define BLKBACK_INVALID_HANDLE (~0)

struct xen_blkbk {
        struct pending_req      *pending_reqs;
        /* List of all 'pending_req' available */
        struct list_head        pending_free;
        /* And its spinlock. */
        spinlock_t              pending_free_lock;
        wait_queue_head_t       pending_free_wq;
        /* The list of all pages that are available. */
        struct page             **pending_pages;
        /* And the grant handles that are available. */
        grant_handle_t          *pending_grant_handles;
};

static struct xen_blkbk *blkbk;
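
/*
 * Sizing note: with the default xen_blkif_reqs of 64 and
 * BLKIF_MAX_SEGMENTS_PER_REQUEST of 11, xen_blkif_init() below allocates
 * 64 * 11 = 704 pending pages and grant handles, one per possible
 * in-flight segment.
 */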

/*
 * Maximum number of grant pages that can be mapped in blkback.
 * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
 * pages that blkback will persistently map.
 * Currently, this is:
 * RING_SIZE = 32 (for all known ring types)
 * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
 * sizeof(struct persistent_gnt) = 48
 * So the maximum memory used to store the grants is:
 * 32 * 11 * 48 = 16896 bytes
 */
static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
{
        switch (protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                return __CONST_RING_SIZE(blkif, PAGE_SIZE) *
                        BLKIF_MAX_SEGMENTS_PER_REQUEST;
        case BLKIF_PROTOCOL_X86_32:
                return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) *
                        BLKIF_MAX_SEGMENTS_PER_REQUEST;
        case BLKIF_PROTOCOL_X86_64:
                return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
                        BLKIF_MAX_SEGMENTS_PER_REQUEST;
        default:
                BUG();
        }
        return 0;
}

/*
 * Little helpful macro to figure out the index and virtual address of the
 * pending_pages[..]. For each 'pending_req' we have up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
 * 10 and would index into the pending_pages[..].
 */
static inline int vaddr_pagenr(struct pending_req *req, int seg)
{
        return (req - blkbk->pending_reqs) *
                BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}
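
/*
 * For example, segment 3 of the third pending_req (index 2) lands at
 * pending_pages[2 * 11 + 3] = pending_pages[25].
 */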

#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]

static inline unsigned long vaddr(struct pending_req *req, int seg)
{
        unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
        return (unsigned long)pfn_to_kaddr(pfn);
}

#define pending_handle(_req, _seg) \
        (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])

static int do_block_io_op(struct xen_blkif *blkif);
static int dispatch_rw_block_io(struct xen_blkif *blkif,
                                struct blkif_request *req,
                                struct pending_req *pending_req);
static void make_response(struct xen_blkif *blkif, u64 id,
                          unsigned short op, int st);
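
/*
 * In-order walk of a red-black tree of persistent grants that is safe
 * against removal of the current node: the next node is looked up before
 * the loop body runs, so rb_erase() on 'pos' is fine.
 */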
#define foreach_grant_safe(pos, n, rbtree, node) \
        for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
             (n) = rb_next(&(pos)->node); \
             &(pos)->node != NULL; \
             (pos) = container_of(n, typeof(*(pos)), node), \
             (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
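
/* Insert a persistent grant into the rb-tree, keyed by grant reference. */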
static void add_persistent_gnt(struct rb_root *root,
                               struct persistent_gnt *persistent_gnt)
{
        struct rb_node **new = &(root->rb_node), *parent = NULL;
        struct persistent_gnt *this;

        /* Figure out where to put new node */
        while (*new) {
                this = container_of(*new, struct persistent_gnt, node);

                parent = *new;
                if (persistent_gnt->gnt < this->gnt)
                        new = &((*new)->rb_left);
                else if (persistent_gnt->gnt > this->gnt)
                        new = &((*new)->rb_right);
                else {
                        pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n");
                        BUG();
                }
        }

        /* Add new node and rebalance tree. */
        rb_link_node(&(persistent_gnt->node), parent, new);
        rb_insert_color(&(persistent_gnt->node), root);
}
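
/* Look up a persistent grant by grant reference; returns NULL if not found. */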
static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
                                                 grant_ref_t gref)
{
        struct persistent_gnt *data;
        struct rb_node *node = root->rb_node;

        while (node) {
                data = container_of(node, struct persistent_gnt, node);

                if (gref < data->gnt)
                        node = node->rb_left;
                else if (gref > data->gnt)
                        node = node->rb_right;
                else
                        return data;
        }
        return NULL;
}
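
/*
 * Unmap and free every persistent grant in the tree, batching the unmap
 * hypercall in groups of up to BLKIF_MAX_SEGMENTS_PER_REQUEST entries.
 */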
static void free_persistent_gnts(struct rb_root *root, unsigned int num)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt;
        struct rb_node *n;
        int ret = 0;
        int segs_to_unmap = 0;

        foreach_grant_safe(persistent_gnt, n, root, node) {
                BUG_ON(persistent_gnt->handle ==
                        BLKBACK_INVALID_HANDLE);
                gnttab_set_unmap_op(&unmap[segs_to_unmap],
                        (unsigned long) pfn_to_kaddr(page_to_pfn(
                                persistent_gnt->page)),
                        GNTMAP_host_map,
                        persistent_gnt->handle);

                pages[segs_to_unmap] = persistent_gnt->page;

                if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
                    !rb_next(&persistent_gnt->node)) {
                        ret = gnttab_unmap_refs(unmap, NULL, pages,
                                segs_to_unmap);
                        BUG_ON(ret);
                        free_xenballooned_pages(segs_to_unmap, pages);
                        segs_to_unmap = 0;
                }

                rb_erase(&persistent_gnt->node, root);
                kfree(persistent_gnt);
                num--;
        }
        BUG_ON(num != 0);
}

/*
 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
 */
static struct pending_req *alloc_req(void)
{
        struct pending_req *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&blkbk->pending_free_lock, flags);
        if (!list_empty(&blkbk->pending_free)) {
                req = list_entry(blkbk->pending_free.next, struct pending_req,
                                 free_list);
                list_del(&req->free_list);
        }
        spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
        return req;
}

/*
 * Return the 'pending_req' structure back to the free pool. We also
 * wake up the thread if it was waiting for a free page.
 */
static void free_req(struct pending_req *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&blkbk->pending_free_lock, flags);
        was_empty = list_empty(&blkbk->pending_free);
        list_add(&req->free_list, &blkbk->pending_free);
        spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
        if (was_empty)
                wake_up(&blkbk->pending_free_wq);
}

/*
 * Routines for managing virtual block devices (vbds).
 */
static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
                             int operation)
{
        struct xen_vbd *vbd = &blkif->vbd;
        int rc = -EACCES;

        if ((operation != READ) && vbd->readonly)
                goto out;

        if (likely(req->nr_sects)) {
                blkif_sector_t end = req->sector_number + req->nr_sects;

                if (unlikely(end < req->sector_number))
                        goto out;
                if (unlikely(end > vbd_sz(vbd)))
                        goto out;
        }

        req->dev  = vbd->pdevice;
        req->bdev = vbd->bdev;
        rc = 0;

 out:
        return rc;
}
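
/*
 * Propagate a change of the backing device's size to the frontend by
 * rewriting the "sectors" and "state" nodes in a xenbus transaction.
 */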
static void xen_vbd_resize(struct xen_blkif *blkif)
{
        struct xen_vbd *vbd = &blkif->vbd;
        struct xenbus_transaction xbt;
        int err;
        struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
        unsigned long long new_size = vbd_sz(vbd);

        pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
                blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
        pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
        vbd->size = new_size;
again:
        err = xenbus_transaction_start(&xbt);
        if (err) {
                pr_warn(DRV_PFX "Error starting transaction");
                return;
        }
        err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
                            (unsigned long long)vbd_sz(vbd));
        if (err) {
                pr_warn(DRV_PFX "Error writing new size");
                goto abort;
        }
        /*
         * Write the current state; we will use this to synchronize
         * the front-end. If the current state is "connected" the
         * front-end will get the new size information online.
         */
        err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
        if (err) {
                pr_warn(DRV_PFX "Error writing the state");
                goto abort;
        }

        err = xenbus_transaction_end(xbt, 0);
        if (err == -EAGAIN)
                goto again;
        if (err)
                pr_warn(DRV_PFX "Error ending transaction");
        return;
abort:
        xenbus_transaction_end(xbt, 1);
}

/*
 * Notification from the guest OS.
 */
static void blkif_notify_work(struct xen_blkif *blkif)
{
        blkif->waiting_reqs = 1;
        wake_up(&blkif->wq);
}

irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
{
        blkif_notify_work(dev_id);
        return IRQ_HANDLED;
}

/*
 * SCHEDULER FUNCTIONS
 */

static void print_stats(struct xen_blkif *blkif)
{
        pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d"
                " | ds %4d\n",
                current->comm, blkif->st_oo_req,
                blkif->st_rd_req, blkif->st_wr_req,
                blkif->st_f_req, blkif->st_ds_req);
        blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
        blkif->st_rd_req = 0;
        blkif->st_wr_req = 0;
        blkif->st_oo_req = 0;
        blkif->st_ds_req = 0;
}
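
/*
 * Main loop of the per-device kernel thread: sleep until the frontend
 * kicks us or a pending_req becomes free, then drain the ring via
 * do_block_io_op() and periodically log statistics.
 */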
int xen_blkif_schedule(void *arg)
{
        struct xen_blkif *blkif = arg;
        struct xen_vbd *vbd = &blkif->vbd;

        xen_blkif_get(blkif);

        while (!kthread_should_stop()) {
                if (try_to_freeze())
                        continue;
                if (unlikely(vbd->size != vbd_sz(vbd)))
                        xen_vbd_resize(blkif);

                wait_event_interruptible(
                        blkif->wq,
                        blkif->waiting_reqs || kthread_should_stop());
                wait_event_interruptible(
                        blkbk->pending_free_wq,
                        !list_empty(&blkbk->pending_free) ||
                        kthread_should_stop());

                blkif->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */

                if (do_block_io_op(blkif))
                        blkif->waiting_reqs = 1;

                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
        }

        /* Free all persistent grant pages */
        if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
                free_persistent_gnts(&blkif->persistent_gnts,
                        blkif->persistent_gnt_c);

        BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
        blkif->persistent_gnt_c = 0;

        if (log_stats)
                print_stats(blkif);

        blkif->xenblkd = NULL;
        xen_blkif_put(blkif);

        return 0;
}

/*
 * Unmap the grant references, and also remove the M2P over-rides
 * used in the 'pending_req'.
 */
static void xen_blkbk_unmap(struct pending_req *req)
{
        struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int i, invcount = 0;
        grant_handle_t handle;
        int ret;

        for (i = 0; i < req->nr_pages; i++) {
                if (!test_bit(i, req->unmap_seg))
                        continue;
                handle = pending_handle(req, i);
                if (handle == BLKBACK_INVALID_HANDLE)
                        continue;
                gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
                                    GNTMAP_host_map, handle);
                pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
                pages[invcount] = virt_to_page(vaddr(req, i));
                invcount++;
        }

        ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
        BUG_ON(ret);
}
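
/*
 * Map the grant references of a request into this domain, reusing grants
 * that are already persistently mapped and falling back to the per-request
 * pending pages otherwise.
 */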
static int xen_blkbk_map(struct blkif_request *req,
                         struct pending_req *pending_req,
                         struct seg_buf seg[],
                         struct page *pages[])
{
        struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        struct persistent_gnt *persistent_gnt = NULL;
        struct xen_blkif *blkif = pending_req->blkif;
        phys_addr_t addr = 0;
        int i, j;
        bool new_map;
        int nseg = req->u.rw.nr_segments;
        int segs_to_map = 0;
        int ret = 0;
        int use_persistent_gnts;

        use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);

        BUG_ON(blkif->persistent_gnt_c >
                   max_mapped_grant_pages(pending_req->blkif->blk_protocol));

        /*
         * Fill out preq.nr_sects with the proper number of sectors, and set up
         * map[..] with the PFN of the page in our domain with the
         * corresponding grant reference for each page.
         */
        for (i = 0; i < nseg; i++) {
                uint32_t flags;

                if (use_persistent_gnts)
                        persistent_gnt = get_persistent_gnt(
                                &blkif->persistent_gnts,
                                req->u.rw.seg[i].gref);

                if (persistent_gnt) {
                        /*
                         * We are using persistent grants and
                         * the grant is already mapped
                         */
                        new_map = false;
                } else if (use_persistent_gnts &&
                           blkif->persistent_gnt_c <
                           max_mapped_grant_pages(blkif->blk_protocol)) {
                        /*
                         * We are using persistent grants, the grant is
                         * not mapped but we have room for it
                         */
                        new_map = true;
                        persistent_gnt = kmalloc(
                                sizeof(struct persistent_gnt),
                                GFP_KERNEL);
                        if (!persistent_gnt)
                                return -ENOMEM;
                        if (alloc_xenballooned_pages(1, &persistent_gnt->page,
                            false)) {
                                kfree(persistent_gnt);
                                return -ENOMEM;
                        }
                        persistent_gnt->gnt = req->u.rw.seg[i].gref;
                        persistent_gnt->handle = BLKBACK_INVALID_HANDLE;

                        pages_to_gnt[segs_to_map] =
                                persistent_gnt->page;
                        addr = (unsigned long) pfn_to_kaddr(
                                page_to_pfn(persistent_gnt->page));

                        add_persistent_gnt(&blkif->persistent_gnts,
                                persistent_gnt);
                        blkif->persistent_gnt_c++;
                        pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
                                 persistent_gnt->gnt, blkif->persistent_gnt_c,
                                 max_mapped_grant_pages(blkif->blk_protocol));
                } else {
                        /*
                         * We are either using persistent grants and
                         * hit the maximum limit of grants mapped,
                         * or we are not using persistent grants.
                         */
                        if (use_persistent_gnts &&
                            !blkif->vbd.overflow_max_grants) {
                                blkif->vbd.overflow_max_grants = 1;
                                pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
                                         blkif->domid, blkif->vbd.handle);
                        }
                        new_map = true;
                        pages[i] = blkbk->pending_page(pending_req, i);
                        addr = vaddr(pending_req, i);
                        pages_to_gnt[segs_to_map] =
                                blkbk->pending_page(pending_req, i);
                }

                if (persistent_gnt) {
                        pages[i] = persistent_gnt->page;
                        persistent_gnts[i] = persistent_gnt;
                } else {
                        persistent_gnts[i] = NULL;
                }

                if (new_map) {
                        flags = GNTMAP_host_map;
                        if (!persistent_gnt &&
                            (pending_req->operation != BLKIF_OP_READ))
                                flags |= GNTMAP_readonly;
                        gnttab_set_map_op(&map[segs_to_map++], addr,
                                          flags, req->u.rw.seg[i].gref,
                                          blkif->domid);
                }
        }

        if (segs_to_map) {
                ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
                BUG_ON(ret);
        }

        /*
         * Now swizzle the MFN in our domain with the MFN from the other domain
         * so that when we access vaddr(pending_req,i) it has the contents of
         * the page from the other domain.
         */
        bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
        for (i = 0, j = 0; i < nseg; i++) {
                if (!persistent_gnts[i] ||
                    persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
                        /* This is a newly mapped grant */
                        BUG_ON(j >= segs_to_map);
                        if (unlikely(map[j].status != 0)) {
                                pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
                                map[j].handle = BLKBACK_INVALID_HANDLE;
                                ret |= 1;
                                if (persistent_gnts[i]) {
                                        rb_erase(&persistent_gnts[i]->node,
                                                 &blkif->persistent_gnts);
                                        blkif->persistent_gnt_c--;
                                        kfree(persistent_gnts[i]);
                                        persistent_gnts[i] = NULL;
                                }
                        }
                }
                if (persistent_gnts[i]) {
                        if (persistent_gnts[i]->handle ==
                            BLKBACK_INVALID_HANDLE) {
                                /*
                                 * If this is a new persistent grant
                                 * save the handle
                                 */
                                persistent_gnts[i]->handle = map[j].handle;
                                persistent_gnts[i]->dev_bus_addr =
                                        map[j++].dev_bus_addr;
                        }
                        pending_handle(pending_req, i) =
                                persistent_gnts[i]->handle;

                        if (ret)
                                continue;

                        seg[i].buf = persistent_gnts[i]->dev_bus_addr |
                                (req->u.rw.seg[i].first_sect << 9);
                } else {
                        pending_handle(pending_req, i) = map[j].handle;
                        bitmap_set(pending_req->unmap_seg, i, 1);

                        if (ret) {
                                j++;
                                continue;
                        }

                        seg[i].buf = map[j++].dev_bus_addr |
                                (req->u.rw.seg[i].first_sect << 9);
                }
        }
        return ret;
}
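
/* Handle a BLKIF_OP_DISCARD request by forwarding it to blkdev_issue_discard(). */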
static int dispatch_discard_io(struct xen_blkif *blkif,
                               struct blkif_request *req)
{
        int err = 0;
        int status = BLKIF_RSP_OKAY;
        struct block_device *bdev = blkif->vbd.bdev;
        unsigned long secure;

        blkif->st_ds_req++;

        xen_blkif_get(blkif);
        secure = (blkif->vbd.discard_secure &&
                 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
                 BLKDEV_DISCARD_SECURE : 0;

        err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
                                   req->u.discard.nr_sectors,
                                   GFP_KERNEL, secure);

        if (err == -EOPNOTSUPP) {
                pr_debug(DRV_PFX "discard op failed, not supported\n");
                status = BLKIF_RSP_EOPNOTSUPP;
        } else if (err)
                status = BLKIF_RSP_ERROR;

        make_response(blkif, req->u.discard.id, req->operation, status);
        xen_blkif_put(blkif);
        return err;
}
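
/*
 * Wait for all in-flight requests on this interface to complete; used to
 * drain I/O before issuing the WRITE_FLUSH for a barrier request.
 */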
static void xen_blk_drain_io(struct xen_blkif *blkif)
{
        atomic_set(&blkif->drain, 1);
        do {
                /* The initial value is one, and one refcnt taken at the
                 * start of the xen_blkif_schedule thread. */
                if (atomic_read(&blkif->refcnt) <= 2)
                        break;
                wait_for_completion_interruptible_timeout(
                                &blkif->drain_complete, HZ);

                if (!atomic_read(&blkif->drain))
                        break;
        } while (!kthread_should_stop());
        atomic_set(&blkif->drain, 0);
}

/*
 * Completion callback on the bio's. Called as bh->b_end_io()
 */

static void __end_block_io_op(struct pending_req *pending_req, int error)
{
        /* An error fails the entire request. */
        if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
            (error == -EOPNOTSUPP)) {
                pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
                xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
                   (error == -EOPNOTSUPP)) {
                pr_debug(DRV_PFX "write barrier op failed, not supported\n");
                xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
                pending_req->status = BLKIF_RSP_EOPNOTSUPP;
        } else if (error) {
                pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
                         " error=%d\n", error);
                pending_req->status = BLKIF_RSP_ERROR;
        }

        /*
         * If all of the bio's have completed it is time to unmap
         * the grant references associated with 'request' and provide
         * the proper response on the ring.
         */
        if (atomic_dec_and_test(&pending_req->pendcnt)) {
                xen_blkbk_unmap(pending_req);
                make_response(pending_req->blkif, pending_req->id,
                              pending_req->operation, pending_req->status);
                xen_blkif_put(pending_req->blkif);
                if (atomic_read(&pending_req->blkif->refcnt) <= 2) {
                        if (atomic_read(&pending_req->blkif->drain))
                                complete(&pending_req->blkif->drain_complete);
                }
                free_req(pending_req);
        }
}

/*
 * bio callback.
 */
static void end_block_io_op(struct bio *bio, int error)
{
        __end_block_io_op(bio->bi_private, error);
        bio_put(bio);
}

/*
 * Function to copy from the ring buffer the 'struct blkif_request'
 * (which has the sectors we want, number of them, grant references, etc),
 * and transmute it to the block API to hand it over to the proper block disk.
 */
static int
__do_block_io_op(struct xen_blkif *blkif)
{
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        struct blkif_request req;
        struct pending_req *pending_req;
        RING_IDX rc, rp;
        int more_to_do = 0;

        rc = blk_rings->common.req_cons;
        rp = blk_rings->common.sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */

        while (rc != rp) {

                if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
                        break;

                if (kthread_should_stop()) {
                        more_to_do = 1;
                        break;
                }

                pending_req = alloc_req();
                if (NULL == pending_req) {
                        blkif->st_oo_req++;
                        more_to_do = 1;
                        break;
                }

                switch (blkif->blk_protocol) {
                case BLKIF_PROTOCOL_NATIVE:
                        memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
                        break;
                case BLKIF_PROTOCOL_X86_32:
                        blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
                        break;
                case BLKIF_PROTOCOL_X86_64:
                        blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
                        break;
                default:
                        BUG();
                }
                blk_rings->common.req_cons = ++rc; /* before make_response() */

                /* Apply all sanity checks to /private copy/ of request. */
                barrier();

                if (unlikely(req.operation == BLKIF_OP_DISCARD)) {
                        free_req(pending_req);
                        if (dispatch_discard_io(blkif, &req))
                                break;
                } else if (dispatch_rw_block_io(blkif, &req, pending_req))
                        break;

                /* Yield point for this unbounded loop. */
                cond_resched();
        }

        return more_to_do;
}
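
/*
 * Keep consuming requests until the ring is empty, then use
 * RING_FINAL_CHECK_FOR_REQUESTS to close the race with the frontend
 * posting a new request just as we finish.
 */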
static int
do_block_io_op(struct xen_blkif *blkif)
{
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        int more_to_do;

        do {
                more_to_do = __do_block_io_op(blkif);
                if (more_to_do)
                        break;

                RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
        } while (more_to_do);

        return more_to_do;
}

/*
 * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
 * and call the 'submit_bio' to pass it to the underlying storage.
 */
static int dispatch_rw_block_io(struct xen_blkif *blkif,
                                struct blkif_request *req,
                                struct pending_req *pending_req)
{
        struct phys_req preq;
        struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int nseg;
        struct bio *bio = NULL;
        struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        int i, nbio = 0;
        int operation;
        struct blk_plug plug;
        bool drain = false;
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];

        switch (req->operation) {
        case BLKIF_OP_READ:
                blkif->st_rd_req++;
                operation = READ;
                break;
        case BLKIF_OP_WRITE:
                blkif->st_wr_req++;
                operation = WRITE_ODIRECT;
                break;
        case BLKIF_OP_WRITE_BARRIER:
                drain = true;
                /* fall through */
        case BLKIF_OP_FLUSH_DISKCACHE:
                blkif->st_f_req++;
                operation = WRITE_FLUSH;
                break;
        default:
                operation = 0; /* make gcc happy */
                goto fail_response;
                break;
        }

        /* Check that the number of segments is sane. */
        nseg = req->u.rw.nr_segments;

        if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
            unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
                pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
                         nseg);
                /* Haven't submitted any bio's yet. */
                goto fail_response;
        }

        preq.sector_number = req->u.rw.sector_number;
        preq.nr_sects      = 0;

        pending_req->blkif     = blkif;
        pending_req->id        = req->u.rw.id;
        pending_req->operation = req->operation;
        pending_req->status    = BLKIF_RSP_OKAY;
        pending_req->nr_pages  = nseg;

        for (i = 0; i < nseg; i++) {
                seg[i].nsec = req->u.rw.seg[i].last_sect -
                        req->u.rw.seg[i].first_sect + 1;
                if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
                    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
                        goto fail_response;
                preq.nr_sects += seg[i].nsec;
        }

        if (xen_vbd_translate(&preq, blkif, operation) != 0) {
                pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
                         operation == READ ? "read" : "write",
                         preq.sector_number,
                         preq.sector_number + preq.nr_sects, preq.dev);
                goto fail_response;
        }

        /*
         * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
         * is set there.
         */
        for (i = 0; i < nseg; i++) {
                if (((int)preq.sector_number|(int)seg[i].nsec) &
                    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
                        pr_debug(DRV_PFX "Misaligned I/O request from domain %d",
                                 blkif->domid);
                        goto fail_response;
                }
        }

        /* Wait on all outstanding I/O's and once that has been completed
         * issue the WRITE_FLUSH.
         */
        if (drain)
                xen_blk_drain_io(pending_req->blkif);

        /*
         * If we have failed at this point, we need to undo the M2P override,
         * set gnttab_set_unmap_op on all of the grant references and perform
         * the hypercall to unmap the grants - that is all done in
         * xen_blkbk_unmap.
         */
        if (xen_blkbk_map(req, pending_req, seg, pages))
                goto fail_flush;

        /*
         * The corresponding xen_blkif_put is done in __end_block_io_op().
         */
        xen_blkif_get(blkif);

        for (i = 0; i < nseg; i++) {
                while ((bio == NULL) ||
                       (bio_add_page(bio,
                                     pages[i],
                                     seg[i].nsec << 9,
                                     seg[i].buf & ~PAGE_MASK) == 0)) {

                        bio = bio_alloc(GFP_KERNEL, nseg-i);
                        if (unlikely(bio == NULL))
                                goto fail_put_bio;

                        biolist[nbio++] = bio;
                        bio->bi_bdev    = preq.bdev;
                        bio->bi_private = pending_req;
                        bio->bi_end_io  = end_block_io_op;
                        bio->bi_sector  = preq.sector_number;
                }

                preq.sector_number += seg[i].nsec;
        }

        /* This will be hit if the operation was a flush. */
        if (!bio) {
                BUG_ON(operation != WRITE_FLUSH);

                bio = bio_alloc(GFP_KERNEL, 0);
                if (unlikely(bio == NULL))
                        goto fail_put_bio;

                biolist[nbio++] = bio;
                bio->bi_bdev    = preq.bdev;
                bio->bi_private = pending_req;
                bio->bi_end_io  = end_block_io_op;
        }

        /*
         * We set it up front so that the last submit_bio does not have to
         * call atomic_inc.
         */
        atomic_set(&pending_req->pendcnt, nbio);

        /* Get a reference count for the disk queue and start sending I/O */
        blk_start_plug(&plug);

        for (i = 0; i < nbio; i++)
                submit_bio(operation, biolist[i]);

        /* Let the I/Os go.. */
        blk_finish_plug(&plug);

        if (operation == READ)
                blkif->st_rd_sect += preq.nr_sects;
        else if (operation & WRITE)
                blkif->st_wr_sect += preq.nr_sects;

        return 0;

 fail_flush:
        xen_blkbk_unmap(pending_req);
 fail_response:
        /* Haven't submitted any bio's yet. */
        make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
        free_req(pending_req);
        msleep(1); /* back off a bit */
        return -EIO;

 fail_put_bio:
        for (i = 0; i < nbio; i++)
                bio_put(biolist[i]);
        __end_block_io_op(pending_req, -EINVAL);
        msleep(1); /* back off a bit */
        return -EIO;
}

/*
 * Put a response on the ring on how the operation fared.
 */
static void make_response(struct xen_blkif *blkif, u64 id,
                          unsigned short op, int st)
{
        struct blkif_response resp;
        unsigned long flags;
        union blkif_back_rings *blk_rings = &blkif->blk_rings;
        int notify;

        resp.id        = id;
        resp.operation = op;
        resp.status    = st;

        spin_lock_irqsave(&blkif->blk_ring_lock, flags);
        /* Place on the response ring for the relevant domain. */
        switch (blkif->blk_protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_32:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        case BLKIF_PROTOCOL_X86_64:
                memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
                       &resp, sizeof(resp));
                break;
        default:
                BUG();
        }
        blk_rings->common.rsp_prod_pvt++;
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
        spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
        if (notify)
                notify_remote_via_irq(blkif->irq);
}
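
/*
 * Module initialisation: allocate the pending_req pool with its pages and
 * grant handles, then register the blkif interface and the xenbus backend.
 */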
static int __init xen_blkif_init(void)
{
        int i, mmap_pages;
        int rc = 0;

        if (!xen_domain())
                return -ENODEV;

        blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
        if (!blkbk) {
                pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
                return -ENOMEM;
        }

        mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

        blkbk->pending_reqs          = kzalloc(sizeof(blkbk->pending_reqs[0]) *
                                        xen_blkif_reqs, GFP_KERNEL);
        blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
                                        mmap_pages, GFP_KERNEL);
        blkbk->pending_pages         = kzalloc(sizeof(blkbk->pending_pages[0]) *
                                        mmap_pages, GFP_KERNEL);

        if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
            !blkbk->pending_pages) {
                rc = -ENOMEM;
                goto out_of_memory;
        }

        for (i = 0; i < mmap_pages; i++) {
                blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
                blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
                if (blkbk->pending_pages[i] == NULL) {
                        rc = -ENOMEM;
                        goto out_of_memory;
                }
        }
        rc = xen_blkif_interface_init();
        if (rc)
                goto failed_init;

        INIT_LIST_HEAD(&blkbk->pending_free);
        spin_lock_init(&blkbk->pending_free_lock);
        init_waitqueue_head(&blkbk->pending_free_wq);

        for (i = 0; i < xen_blkif_reqs; i++)
                list_add_tail(&blkbk->pending_reqs[i].free_list,
                              &blkbk->pending_free);

        rc = xen_blkif_xenbus_init();
        if (rc)
                goto failed_init;

        return 0;

 out_of_memory:
        pr_alert(DRV_PFX "%s: out of memory\n", __func__);
 failed_init:
        kfree(blkbk->pending_reqs);
        kfree(blkbk->pending_grant_handles);
        if (blkbk->pending_pages) {
                for (i = 0; i < mmap_pages; i++) {
                        if (blkbk->pending_pages[i])
                                __free_page(blkbk->pending_pages[i]);
                }
                kfree(blkbk->pending_pages);
        }
        kfree(blkbk);
        blkbk = NULL;
        return rc;
}

module_init(xen_blkif_init);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("xen-backend:vbd");