drivers/block/rbd.c

   1 /*
   2    rbd.c -- Export ceph rados objects as a Linux block device
   3
   4
   5    based on drivers/block/osdblk.c:
   6
   7    Copyright 2009 Red Hat, Inc.
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation.
  12
  13    This program is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program; see the file COPYING.  If not, write to
  20    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  21
  22
  23
  24    For usage instructions, please refer to:
  25
  26                  Documentation/ABI/testing/sysfs-bus-rbd
  27
  28  */
  29
  30 #include <linux/ceph/libceph.h>
  31 #include <linux/ceph/osd_client.h>
  32 #include <linux/ceph/mon_client.h>
  33 #include <linux/ceph/decode.h>
  34 #include <linux/parser.h>
  35
  36 #include <linux/kernel.h>
  37 #include <linux/device.h>
  38 #include <linux/module.h>
  39 #include <linux/fs.h>
  40 #include <linux/blkdev.h>
  41
  42 #include "rbd_types.h"
  43
#define RBD_DEBUG       /* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define SECTOR_SHIFT    9
#define SECTOR_SIZE     (1ULL << SECTOR_SHIFT)

/* It might be useful to have these defined elsewhere */

/* Each is an all-ones pattern narrowed to the named unsigned type */
#define U8_MAX  ((u8)   (~0U))
#define U16_MAX ((u16)  (~0U))
#define U32_MAX ((u32)  (~0U))
#define U64_MAX ((u64)  (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR    256             /* max minors per blkdev */

/*
 * Longest snapshot name: NAME_MAX less the length of the "snap_"
 * prefix (sizeof - 1 drops the prefix's terminating NUL).
 */
#define RBD_SNAP_DEV_NAME_PREFIX        "snap_"
#define RBD_MAX_SNAP_NAME_LEN   \
                        (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT      510     /* allows max snapc to fit in 4KB */

/* Snapshot name meaning "no snapshot" (maps to CEPH_NOSNAP) */
#define RBD_SNAP_HEAD_NAME      "-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX  (PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX    64

#define RBD_OBJ_PREFIX_LEN_MAX  64

/* Feature bits */

#define RBD_FEATURE_LAYERING      1

/*
 * Features supported by this (client software) implementation.
 * The mask is currently empty, so RBD_FEATURE_LAYERING is not part
 * of the supported set.
 */

#define RBD_FEATURES_ALL          (0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 *
 * MAX_INT_FORMAT_WIDTH budgets 2.5 characters per byte of an int
 * plus one more (11 for a 4-byte int).
 */
#define DEV_NAME_LEN            32
#define MAX_INT_FORMAT_WIDTH    ((5 * sizeof (int)) / 2 + 1)
  97
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
        /* These five fields never change for a given rbd image */
        char *object_prefix;    /* prefix of every data object's name */
        u64 features;           /* feature bits; set to 0 for v1 headers */
        __u8 obj_order;         /* object size is 2^obj_order bytes */
        __u8 crypt_type;
        __u8 comp_type;

        /* The remaining fields need to be updated occasionally */
        u64 image_size;
        struct ceph_snap_context *snapc;        /* snapshot ids and seq */
        char *snap_names;       /* copy of the on-disk snapshot name data */
        u64 *snap_sizes;        /* image size recorded for each snapshot */

        u64 obj_version;
};
 117
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
        /* Each id is paired with the name it corresponds to */
        u64             pool_id;
        char            *pool_name;

        char            *image_id;
        char            *image_name;    /* may be a null pointer */

        u64             snap_id;        /* CEPH_NOSNAP when mapping the head */
        char            *snap_name;

        struct kref     kref;           /* spec may be shared (see above) */
};
 155
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
        struct ceph_client      *client;        /* underlying libceph client */
        struct kref             kref;           /* released via rbd_client_release() */
        struct list_head        node;           /* entry in rbd_client_list */
};
 164
/* Callback type stored in an image request's "callback" field */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/*
 * NOTE(review): presumably a sentinel for rbd_obj_request.which meaning
 * "not at any position in an image request" -- its users are outside
 * this part of the file, confirm there.
 */
#define BAD_WHICH       U32_MAX         /* Good which or bad which, which? */

/* Callback type stored in an object request's "callback" field */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* Kind of data buffer attached to an object request (none, bio chain, pages) */
enum obj_request_type {
        OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
 176
/*
 * A request against a single named object.  It may stand alone or be
 * one of the pieces of an image request (img_request / links / which).
 */
struct rbd_obj_request {
        const char              *object_name;
        u64                     offset;         /* object start byte */
        u64                     length;         /* bytes from offset */

        struct rbd_img_request  *img_request;
        struct list_head        links;          /* img_request->obj_requests */
        u32                     which;          /* posn image request list */

        /*
         * NOTE(review): "type" presumably selects the live union member
         * (OBJ_REQUEST_BIO -> bio_list, OBJ_REQUEST_PAGES -> pages and
         * page_count); the code that sets it is outside this view.
         */
        enum obj_request_type   type;
        union {
                struct bio      *bio_list;
                struct {
                        struct page     **pages;
                        u32             page_count;
                };
        };

        struct ceph_osd_request *osd_req;

        u64                     xferred;        /* bytes transferred */
        u64                     version;
        int                     result;
        atomic_t                done;

        rbd_obj_callback_t      callback;
        struct completion       completion;

        struct kref             kref;
};
 207
/*
 * A request against a byte range of an image.  It owns a list of the
 * object requests (obj_requests) that it is made up of.
 */
struct rbd_img_request {
        struct request          *rq;
        struct rbd_device       *rbd_dev;
        u64                     offset; /* starting image byte offset */
        u64                     length; /* byte count from offset */
        bool                    write_request;  /* false for read */
        /* Which member is meaningful depends on write_request */
        union {
                struct ceph_snap_context *snapc;        /* for writes */
                u64             snap_id;                /* for reads */
        };
        spinlock_t              completion_lock;/* protects next_completion */
        u32                     next_completion;
        rbd_img_callback_t      callback;

        u32                     obj_request_count;
        struct list_head        obj_requests;   /* rbd_obj_request structs */

        struct kref             kref;
};
 227
/*
 * Iterators over the object requests of image request "ireq".
 *
 * for_each_obj_request()       walks the whole list, front to back.
 * for_each_obj_request_from()  continues forward from the current "oreq".
 * for_each_obj_request_safe()  tolerates removal of "oreq" while
 *                              iterating; note that it walks the list
 *                              in REVERSE (back to front).
 */
#define for_each_obj_request(ireq, oreq) \
        list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
        list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
        list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
 234
/* One snapshot of an image, kept on its device's "snaps" list */
struct rbd_snap {
        struct  device          dev;
        const char              *name;
        u64                     size;           /* copied to mapping.size when mapped */
        struct list_head        node;           /* entry in rbd_device.snaps */
        u64                     id;             /* snapshot id */
        u64                     features;       /* copied to mapping.features when mapped */
};
 243
/*
 * Properties of what is actually mapped: either the image head or one
 * of its snapshots (filled in by rbd_dev_set_mapping()).
 */
struct rbd_mapping {
        u64                     size;
        u64                     features;
        bool                    read_only;      /* forced true for snapshot mappings */
};
 249
/*
 * a single device
 */
struct rbd_device {
        int                     dev_id;         /* blkdev unique id */

        int                     major;          /* blkdev assigned major */
        struct gendisk          *disk;          /* blkdev's gendisk and rq */

        u32                     image_format;   /* Either 1 or 2 */
        struct rbd_client       *rbd_client;

        char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

        spinlock_t              lock;           /* queue, flags, open_count */

        struct rbd_image_header header;
        unsigned long           flags;          /* possibly lock protected */
        struct rbd_spec         *spec;          /* identity of the mapped image */

        char                    *header_name;

        struct ceph_file_layout layout;

        struct ceph_osd_event   *watch_event;
        struct rbd_obj_request  *watch_request;

        struct rbd_spec         *parent_spec;   /* non-null for a layered child */
        u64                     parent_overlap;

        /* protects updating the header */
        struct rw_semaphore     header_rwsem;

        struct rbd_mapping      mapping;

        struct list_head        node;

        /* list of snapshots */
        struct list_head        snaps;

        /* sysfs related */
        struct device           dev;
        unsigned long           open_count;     /* protected by lock */
};
 294
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 *
 * The enumerators are bit numbers (used with test_bit()/set_bit()),
 * not masks.
 */
enum rbd_dev_flags {
        RBD_DEV_FLAG_EXISTS,    /* mapped snapshot has not been deleted */
        RBD_DEV_FLAG_REMOVING,  /* this mapping is being removed */
};
 306
static DEFINE_MUTEX(ctl_mutex);   /* Serialize open/close/setup/teardown */

/* NOTE(review): rbd_dev_list_lock presumably guards rbd_dev_list -- confirm */
static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* rbd_client_list is only touched with rbd_client_list_lock held */
static LIST_HEAD(rbd_client_list);              /* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for functions defined later in this file */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

/* Store handlers for the bus-level "add" and "remove" sysfs attributes */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
                       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);
 325
/*
 * Bus-level sysfs attributes.  Both are write-only for the owner
 * (S_IWUSR, no show method); writes are handled by rbd_add() and
 * rbd_remove() respectively.
 */
static struct bus_attribute rbd_bus_attrs[] = {
        __ATTR(add, S_IWUSR, NULL, rbd_add),
        __ATTR(remove, S_IWUSR, NULL, rbd_remove),
        __ATTR_NULL
};

/* The "rbd" bus, carrying the attributes above */
static struct bus_type rbd_bus_type = {
        .name           = "rbd",
        .bus_attrs      = rbd_bus_attrs,
};
 336
/*
 * Release callback for rbd_root_dev.  Intentionally empty: the device
 * below is a static object, so there is nothing to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically allocated root device named "rbd" */
static struct device rbd_root_dev = {
        .init_name =    "rbd",
        .release =      rbd_root_dev_release,
};
 345
 346 static __printf(2, 3)
 347 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 348 {
 349         struct va_format vaf;
 350         va_list args;
 351
 352         va_start(args, fmt);
 353         vaf.fmt = fmt;
 354         vaf.va = &args;
 355
 356         if (!rbd_dev)
 357                 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 358         else if (rbd_dev->disk)
 359                 printk(KERN_WARNING "%s: %s: %pV\n",
 360                         RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 361         else if (rbd_dev->spec && rbd_dev->spec->image_name)
 362                 printk(KERN_WARNING "%s: image %s: %pV\n",
 363                         RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 364         else if (rbd_dev->spec && rbd_dev->spec->image_id)
 365                 printk(KERN_WARNING "%s: id %s: %pV\n",
 366                         RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 367         else    /* punt */
 368                 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 369                         RBD_DRV_NAME, rbd_dev, &vaf);
 370         va_end(args);
 371 }
 372
 373 #ifdef RBD_DEBUG
 374 #define rbd_assert(expr)                                                \
 375                 if (unlikely(!(expr))) {                                \
 376                         printk(KERN_ERR "\nAssertion failure in %s() "  \
 377                                                 "at line %d:\n\n"       \
 378                                         "\trbd_assert(%s);\n\n",        \
 379                                         __func__, __LINE__, #expr);     \
 380                         BUG();                                          \
 381                 }
 382 #else /* !RBD_DEBUG */
 383 #  define rbd_assert(expr)      ((void) 0)
 384 #endif /* !RBD_DEBUG */
 385
/* Forward declarations; both are defined later in this file */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 388
 389 static int rbd_open(struct block_device *bdev, fmode_t mode)
 390 {
 391         struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 392         bool removing = false;
 393
 394         if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 395                 return -EROFS;
 396
 397         spin_lock_irq(&rbd_dev->lock);
 398         if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 399                 removing = true;
 400         else
 401                 rbd_dev->open_count++;
 402         spin_unlock_irq(&rbd_dev->lock);
 403         if (removing)
 404                 return -ENOENT;
 405
 406         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 407         (void) get_device(&rbd_dev->dev);
 408         set_device_ro(bdev, rbd_dev->mapping.read_only);
 409         mutex_unlock(&ctl_mutex);
 410
 411         return 0;
 412 }
 413
 414 static int rbd_release(struct gendisk *disk, fmode_t mode)
 415 {
 416         struct rbd_device *rbd_dev = disk->private_data;
 417         unsigned long open_count_before;
 418
 419         spin_lock_irq(&rbd_dev->lock);
 420         open_count_before = rbd_dev->open_count--;
 421         spin_unlock_irq(&rbd_dev->lock);
 422         rbd_assert(open_count_before > 0);
 423
 424         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 425         put_device(&rbd_dev->dev);
 426         mutex_unlock(&ctl_mutex);
 427
 428         return 0;
 429 }
 430
/* Block device operations: only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
        .owner                  = THIS_MODULE,
        .open                   = rbd_open,
        .release                = rbd_release,
};
 436
 437 /*
 438  * Initialize an rbd client instance.
 439  * We own *ceph_opts.
 440  */
 441 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 442 {
 443         struct rbd_client *rbdc;
 444         int ret = -ENOMEM;
 445
 446         dout("%s:\n", __func__);
 447         rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 448         if (!rbdc)
 449                 goto out_opt;
 450
 451         kref_init(&rbdc->kref);
 452         INIT_LIST_HEAD(&rbdc->node);
 453
 454         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
 455
 456         rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 457         if (IS_ERR(rbdc->client))
 458                 goto out_mutex;
 459         ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 460
 461         ret = ceph_open_session(rbdc->client);
 462         if (ret < 0)
 463                 goto out_err;
 464
 465         spin_lock(&rbd_client_list_lock);
 466         list_add_tail(&rbdc->node, &rbd_client_list);
 467         spin_unlock(&rbd_client_list_lock);
 468
 469         mutex_unlock(&ctl_mutex);
 470         dout("%s: rbdc %p\n", __func__, rbdc);
 471
 472         return rbdc;
 473
 474 out_err:
 475         ceph_destroy_client(rbdc->client);
 476 out_mutex:
 477         mutex_unlock(&ctl_mutex);
 478         kfree(rbdc);
 479 out_opt:
 480         if (ceph_opts)
 481                 ceph_destroy_options(ceph_opts);
 482         dout("%s: error %d\n", __func__, ret);
 483
 484         return ERR_PTR(ret);
 485 }
 486
 487 /*
 488  * Find a ceph client with specific addr and configuration.  If
 489  * found, bump its reference count.
 490  */
 491 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 492 {
 493         struct rbd_client *client_node;
 494         bool found = false;
 495
 496         if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 497                 return NULL;
 498
 499         spin_lock(&rbd_client_list_lock);
 500         list_for_each_entry(client_node, &rbd_client_list, node) {
 501                 if (!ceph_compare_options(ceph_opts, client_node->client)) {
 502                         kref_get(&client_node->kref);
 503                         found = true;
 504                         break;
 505                 }
 506         }
 507         spin_unlock(&rbd_client_list_lock);
 508
 509         return found ? client_node : NULL;
 510 }
 511
/*
 * mount options
 *
 * The Opt_last_* enumerators are not options themselves; they split
 * the token values into ranges by argument kind (int, string,
 * Boolean), which parse_rbd_opts_token() tests against.  No int or
 * string options are currently defined, so those ranges are empty.
 */
enum {
        Opt_last_int,
        /* int args above */
        Opt_last_string,
        /* string args above */
        Opt_read_only,
        Opt_read_write,
        /* Boolean args above */
        Opt_last_bool,
};

/* Option spellings accepted by match_token(); terminated by {-1, NULL} */
static match_table_t rbd_opts_tokens = {
        /* int args above */
        /* string args above */
        {Opt_read_only, "read_only"},
        {Opt_read_only, "ro"},          /* Alternate spelling */
        {Opt_read_write, "read_write"},
        {Opt_read_write, "rw"},         /* Alternate spelling */
        /* Boolean args above */
        {-1, NULL}
};

/* Parsed rbd-specific options */
struct rbd_options {
        bool    read_only;
};

#define RBD_READ_ONLY_DEFAULT   false
 542
 543 static int parse_rbd_opts_token(char *c, void *private)
 544 {
 545         struct rbd_options *rbd_opts = private;
 546         substring_t argstr[MAX_OPT_ARGS];
 547         int token, intval, ret;
 548
 549         token = match_token(c, rbd_opts_tokens, argstr);
 550         if (token < 0)
 551                 return -EINVAL;
 552
 553         if (token < Opt_last_int) {
 554                 ret = match_int(&argstr[0], &intval);
 555                 if (ret < 0) {
 556                         pr_err("bad mount option arg (not int) "
 557                                "at '%s'\n", c);
 558                         return ret;
 559                 }
 560                 dout("got int token %d val %d\n", token, intval);
 561         } else if (token > Opt_last_int && token < Opt_last_string) {
 562                 dout("got string token %d val %s\n", token,
 563                      argstr[0].from);
 564         } else if (token > Opt_last_string && token < Opt_last_bool) {
 565                 dout("got Boolean token %d\n", token);
 566         } else {
 567                 dout("got token %d\n", token);
 568         }
 569
 570         switch (token) {
 571         case Opt_read_only:
 572                 rbd_opts->read_only = true;
 573                 break;
 574         case Opt_read_write:
 575                 rbd_opts->read_only = false;
 576                 break;
 577         default:
 578                 rbd_assert(false);
 579                 break;
 580         }
 581         return 0;
 582 }
 583
 584 /*
 585  * Get a ceph client with specific addr and configuration, if one does
 586  * not exist create it.
 587  */
 588 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 589 {
 590         struct rbd_client *rbdc;
 591
 592         rbdc = rbd_client_find(ceph_opts);
 593         if (rbdc)       /* using an existing client */
 594                 ceph_destroy_options(ceph_opts);
 595         else
 596                 rbdc = rbd_client_create(ceph_opts);
 597
 598         return rbdc;
 599 }
 600
 601 /*
 602  * Destroy ceph client
 603  *
 604  * Caller must hold rbd_client_list_lock.
 605  */
 606 static void rbd_client_release(struct kref *kref)
 607 {
 608         struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 609
 610         dout("%s: rbdc %p\n", __func__, rbdc);
 611         spin_lock(&rbd_client_list_lock);
 612         list_del(&rbdc->node);
 613         spin_unlock(&rbd_client_list_lock);
 614
 615         ceph_destroy_client(rbdc->client);
 616         kfree(rbdc);
 617 }
 618
 619 /*
 620  * Drop reference to ceph client node. If it's not referenced anymore, release
 621  * it.
 622  */
 623 static void rbd_put_client(struct rbd_client *rbdc)
 624 {
 625         if (rbdc)
 626                 kref_put(&rbdc->kref, rbd_client_release);
 627 }
 628
 629 static bool rbd_image_format_valid(u32 image_format)
 630 {
 631         return image_format == 1 || image_format == 2;
 632 }
 633
 634 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 635 {
 636         size_t size;
 637         u32 snap_count;
 638
 639         /* The header has to start with the magic rbd header text */
 640         if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 641                 return false;
 642
 643         /* The bio layer requires at least sector-sized I/O */
 644
 645         if (ondisk->options.order < SECTOR_SHIFT)
 646                 return false;
 647
 648         /* If we use u64 in a few spots we may be able to loosen this */
 649
 650         if (ondisk->options.order > 8 * sizeof (int) - 1)
 651                 return false;
 652
 653         /*
 654          * The size of a snapshot header has to fit in a size_t, and
 655          * that limits the number of snapshots.
 656          */
 657         snap_count = le32_to_cpu(ondisk->snap_count);
 658         size = SIZE_MAX - sizeof (struct ceph_snap_context);
 659         if (snap_count > size / sizeof (__le64))
 660                 return false;
 661
 662         /*
 663          * Not only that, but the size of the entire the snapshot
 664          * header must also be representable in a size_t.
 665          */
 666         size -= snap_count * sizeof (__le64);
 667         if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
 668                 return false;
 669
 670         return true;
 671 }
 672
 673 /*
 674  * Create a new header structure, translate header format from the on-disk
 675  * header.
 676  */
 677 static int rbd_header_from_disk(struct rbd_image_header *header,
 678                                  struct rbd_image_header_ondisk *ondisk)
 679 {
 680         u32 snap_count;
 681         size_t len;
 682         size_t size;
 683         u32 i;
 684
 685         memset(header, 0, sizeof (*header));
 686
 687         snap_count = le32_to_cpu(ondisk->snap_count);
 688
 689         len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
 690         header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
 691         if (!header->object_prefix)
 692                 return -ENOMEM;
 693         memcpy(header->object_prefix, ondisk->object_prefix, len);
 694         header->object_prefix[len] = '\0';
 695
 696         if (snap_count) {
 697                 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 698
 699                 /* Save a copy of the snapshot names */
 700
 701                 if (snap_names_len > (u64) SIZE_MAX)
 702                         return -EIO;
 703                 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
 704                 if (!header->snap_names)
 705                         goto out_err;
 706                 /*
 707                  * Note that rbd_dev_v1_header_read() guarantees
 708                  * the ondisk buffer we're working with has
 709                  * snap_names_len bytes beyond the end of the
 710                  * snapshot id array, this memcpy() is safe.
 711                  */
 712                 memcpy(header->snap_names, &ondisk->snaps[snap_count],
 713                         snap_names_len);
 714
 715                 /* Record each snapshot's size */
 716
 717                 size = snap_count * sizeof (*header->snap_sizes);
 718                 header->snap_sizes = kmalloc(size, GFP_KERNEL);
 719                 if (!header->snap_sizes)
 720                         goto out_err;
 721                 for (i = 0; i < snap_count; i++)
 722                         header->snap_sizes[i] =
 723                                 le64_to_cpu(ondisk->snaps[i].image_size);
 724         } else {
 725                 WARN_ON(ondisk->snap_names_len);
 726                 header->snap_names = NULL;
 727                 header->snap_sizes = NULL;
 728         }
 729
 730         header->features = 0;   /* No features support in v1 images */
 731         header->obj_order = ondisk->options.order;
 732         header->crypt_type = ondisk->options.crypt_type;
 733         header->comp_type = ondisk->options.comp_type;
 734
 735         /* Allocate and fill in the snapshot context */
 736
 737         header->image_size = le64_to_cpu(ondisk->image_size);
 738         size = sizeof (struct ceph_snap_context);
 739         size += snap_count * sizeof (header->snapc->snaps[0]);
 740         header->snapc = kzalloc(size, GFP_KERNEL);
 741         if (!header->snapc)
 742                 goto out_err;
 743
 744         atomic_set(&header->snapc->nref, 1);
 745         header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 746         header->snapc->num_snaps = snap_count;
 747         for (i = 0; i < snap_count; i++)
 748                 header->snapc->snaps[i] =
 749                         le64_to_cpu(ondisk->snaps[i].id);
 750
 751         return 0;
 752
 753 out_err:
 754         kfree(header->snap_sizes);
 755         header->snap_sizes = NULL;
 756         kfree(header->snap_names);
 757         header->snap_names = NULL;
 758         kfree(header->object_prefix);
 759         header->object_prefix = NULL;
 760
 761         return -ENOMEM;
 762 }
 763
 764 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 765 {
 766         struct rbd_snap *snap;
 767
 768         if (snap_id == CEPH_NOSNAP)
 769                 return RBD_SNAP_HEAD_NAME;
 770
 771         list_for_each_entry(snap, &rbd_dev->snaps, node)
 772                 if (snap_id == snap->id)
 773                         return snap->name;
 774
 775         return NULL;
 776 }
 777
 778 static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 779 {
 780
 781         struct rbd_snap *snap;
 782
 783         list_for_each_entry(snap, &rbd_dev->snaps, node) {
 784                 if (!strcmp(snap_name, snap->name)) {
 785                         rbd_dev->spec->snap_id = snap->id;
 786                         rbd_dev->mapping.size = snap->size;
 787                         rbd_dev->mapping.features = snap->features;
 788
 789                         return 0;
 790                 }
 791         }
 792
 793         return -ENOENT;
 794 }
 795
 796 static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 797 {
 798         int ret;
 799
 800         if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
 801                     sizeof (RBD_SNAP_HEAD_NAME))) {
 802                 rbd_dev->spec->snap_id = CEPH_NOSNAP;
 803                 rbd_dev->mapping.size = rbd_dev->header.image_size;
 804                 rbd_dev->mapping.features = rbd_dev->header.features;
 805                 ret = 0;
 806         } else {
 807                 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
 808                 if (ret < 0)
 809                         goto done;
 810                 rbd_dev->mapping.read_only = true;
 811         }
 812         set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 813
 814 done:
 815         return ret;
 816 }
 817
 818 static void rbd_header_free(struct rbd_image_header *header)
 819 {
 820         kfree(header->object_prefix);
 821         header->object_prefix = NULL;
 822         kfree(header->snap_sizes);
 823         header->snap_sizes = NULL;
 824         kfree(header->snap_names);
 825         header->snap_names = NULL;
 826         ceph_put_snap_context(header->snapc);
 827         header->snapc = NULL;
 828 }
 829
 830 static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
 831 {
 832         char *name;
 833         u64 segment;
 834         int ret;
 835
 836         name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
 837         if (!name)
 838                 return NULL;
 839         segment = offset >> rbd_dev->header.obj_order;
 840         ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
 841                         rbd_dev->header.object_prefix, segment);
 842         if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
 843                 pr_err("error formatting segment name for #%llu (%d)\n",
 844                         segment, ret);
 845                 kfree(name);
 846                 name = NULL;
 847         }
 848
 849         return name;
 850 }
 851
 852 static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
 853 {
 854         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 855
 856         return offset & (segment_size - 1);
 857 }
 858
 859 static u64 rbd_segment_length(struct rbd_device *rbd_dev,
 860                                 u64 offset, u64 length)
 861 {
 862         u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
 863
 864         offset &= segment_size - 1;
 865
 866         rbd_assert(length <= U64_MAX - offset);
 867         if (offset + length > segment_size)
 868                 length = segment_size - offset;
 869
 870         return length;
 871 }
 872
 873 /*
 874  * returns the size of an object in the image
 875  */
 876 static u64 rbd_obj_bytes(struct rbd_image_header *header)
 877 {
 878         return 1 << header->obj_order;
 879 }
 880
 881 /*
 882  * bio helpers
 883  */
 884
 885 static void bio_chain_put(struct bio *chain)
 886 {
 887         struct bio *tmp;
 888
 889         while (chain) {
 890                 tmp = chain;
 891                 chain = chain->bi_next;
 892                 bio_put(tmp);
 893         }
 894 }
 895
 896 /*
 897  * zeros a bio chain, starting at specific offset
 898  */
 899 static void zero_bio_chain(struct bio *chain, int start_ofs)
 900 {
 901         struct bio_vec *bv;
 902         unsigned long flags;
 903         void *buf;
 904         int i;
 905         int pos = 0;
 906
 907         while (chain) {
 908                 bio_for_each_segment(bv, chain, i) {
 909                         if (pos + bv->bv_len > start_ofs) {
 910                                 int remainder = max(start_ofs - pos, 0);
 911                                 buf = bvec_kmap_irq(bv, &flags);
 912                                 memset(buf + remainder, 0,
 913                                        bv->bv_len - remainder);
 914                                 bvec_kunmap_irq(buf, &flags);
 915                         }
 916                         pos += bv->bv_len;
 917                 }
 918
 919                 chain = chain->bi_next;
 920         }
 921 }
 922
 923 /*
 924  * Clone a portion of a bio, starting at the given byte offset
 925  * and continuing for the number of bytes indicated.
 926  */
 927 static struct bio *bio_clone_range(struct bio *bio_src,
 928                                         unsigned int offset,
 929                                         unsigned int len,
 930                                         gfp_t gfpmask)
 931 {
 932         struct bio_vec *bv;
 933         unsigned int resid;
 934         unsigned short idx;
 935         unsigned int voff;
 936         unsigned short end_idx;
 937         unsigned short vcnt;
 938         struct bio *bio;
 939
 940         /* Handle the easy case for the caller */
 941
 942         if (!offset && len == bio_src->bi_size)
 943                 return bio_clone(bio_src, gfpmask);
 944
 945         if (WARN_ON_ONCE(!len))
 946                 return NULL;
 947         if (WARN_ON_ONCE(len > bio_src->bi_size))
 948                 return NULL;
 949         if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
 950                 return NULL;
 951
 952         /* Find first affected segment... */
 953
 954         resid = offset;
 955         __bio_for_each_segment(bv, bio_src, idx, 0) {
 956                 if (resid < bv->bv_len)
 957                         break;
 958                 resid -= bv->bv_len;
 959         }
 960         voff = resid;
 961
 962         /* ...and the last affected segment */
 963
 964         resid += len;
 965         __bio_for_each_segment(bv, bio_src, end_idx, idx) {
 966                 if (resid <= bv->bv_len)
 967                         break;
 968                 resid -= bv->bv_len;
 969         }
 970         vcnt = end_idx - idx + 1;
 971
 972         /* Build the clone */
 973
 974         bio = bio_alloc(gfpmask, (unsigned int) vcnt);
 975         if (!bio)
 976                 return NULL;    /* ENOMEM */
 977
 978         bio->bi_bdev = bio_src->bi_bdev;
 979         bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
 980         bio->bi_rw = bio_src->bi_rw;
 981         bio->bi_flags |= 1 << BIO_CLONED;
 982
 983         /*
 984          * Copy over our part of the bio_vec, then update the first
 985          * and last (or only) entries.
 986          */
 987         memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
 988                         vcnt * sizeof (struct bio_vec));
 989         bio->bi_io_vec[0].bv_offset += voff;
 990         if (vcnt > 1) {
 991                 bio->bi_io_vec[0].bv_len -= voff;
 992                 bio->bi_io_vec[vcnt - 1].bv_len = resid;
 993         } else {
 994                 bio->bi_io_vec[0].bv_len = len;
 995         }
 996
 997         bio->bi_vcnt = vcnt;
 998         bio->bi_size = len;
 999         bio->bi_idx = 0;
1000
1001         return bio;
1002 }
1003
1004 /*
1005  * Clone a portion of a bio chain, starting at the given byte offset
1006  * into the first bio in the source chain and continuing for the
1007  * number of bytes indicated.  The result is another bio chain of
1008  * exactly the given length, or a null pointer on error.
1009  *
1010  * The bio_src and offset parameters are both in-out.  On entry they
1011  * refer to the first source bio and the offset into that bio where
1012  * the start of data to be cloned is located.
1013  *
1014  * On return, bio_src is updated to refer to the bio in the source
1015  * chain that contains first un-cloned byte, and *offset will
1016  * contain the offset of that byte within that bio.
1017  */
1018 static struct bio *bio_chain_clone_range(struct bio **bio_src,
1019                                         unsigned int *offset,
1020                                         unsigned int len,
1021                                         gfp_t gfpmask)
1022 {
1023         struct bio *bi = *bio_src;
1024         unsigned int off = *offset;
1025         struct bio *chain = NULL;
1026         struct bio **end;
1027
1028         /* Build up a chain of clone bios up to the limit */
1029
1030         if (!bi || off >= bi->bi_size || !len)
1031                 return NULL;            /* Nothing to clone */
1032
1033         end = &chain;
1034         while (len) {
1035                 unsigned int bi_size;
1036                 struct bio *bio;
1037
1038                 if (!bi) {
1039                         rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1040                         goto out_err;   /* EINVAL; ran out of bio's */
1041                 }
1042                 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1043                 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1044                 if (!bio)
1045                         goto out_err;   /* ENOMEM */
1046
1047                 *end = bio;
1048                 end = &bio->bi_next;
1049
1050                 off += bi_size;
1051                 if (off == bi->bi_size) {
1052                         bi = bi->bi_next;
1053                         off = 0;
1054                 }
1055                 len -= bi_size;
1056         }
1057         *bio_src = bi;
1058         *offset = off;
1059
1060         return chain;
1061 out_err:
1062         bio_chain_put(chain);
1063
1064         return NULL;
1065 }
1066
1067 static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1068 {
1069         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1070                 atomic_read(&obj_request->kref.refcount));
1071         kref_get(&obj_request->kref);
1072 }
1073
1074 static void rbd_obj_request_destroy(struct kref *kref);
1075 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1076 {
1077         rbd_assert(obj_request != NULL);
1078         dout("%s: obj %p (was %d)\n", __func__, obj_request,
1079                 atomic_read(&obj_request->kref.refcount));
1080         kref_put(&obj_request->kref, rbd_obj_request_destroy);
1081 }
1082
1083 static void rbd_img_request_get(struct rbd_img_request *img_request)
1084 {
1085         dout("%s: img %p (was %d)\n", __func__, img_request,
1086                 atomic_read(&img_request->kref.refcount));
1087         kref_get(&img_request->kref);
1088 }
1089
1090 static void rbd_img_request_destroy(struct kref *kref);
1091 static void rbd_img_request_put(struct rbd_img_request *img_request)
1092 {
1093         rbd_assert(img_request != NULL);
1094         dout("%s: img %p (was %d)\n", __func__, img_request,
1095                 atomic_read(&img_request->kref.refcount));
1096         kref_put(&img_request->kref, rbd_img_request_destroy);
1097 }
1098
1099 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1100                                         struct rbd_obj_request *obj_request)
1101 {
1102         rbd_assert(obj_request->img_request == NULL);
1103
1104         rbd_obj_request_get(obj_request);
1105         obj_request->img_request = img_request;
1106         obj_request->which = img_request->obj_request_count;
1107         rbd_assert(obj_request->which != BAD_WHICH);
1108         img_request->obj_request_count++;
1109         list_add_tail(&obj_request->links, &img_request->obj_requests);
1110         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1111                 obj_request->which);
1112 }
1113
1114 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1115                                         struct rbd_obj_request *obj_request)
1116 {
1117         rbd_assert(obj_request->which != BAD_WHICH);
1118
1119         dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1120                 obj_request->which);
1121         list_del(&obj_request->links);
1122         rbd_assert(img_request->obj_request_count > 0);
1123         img_request->obj_request_count--;
1124         rbd_assert(obj_request->which == img_request->obj_request_count);
1125         obj_request->which = BAD_WHICH;
1126         rbd_assert(obj_request->img_request == img_request);
1127         obj_request->img_request = NULL;
1128         obj_request->callback = NULL;
1129         rbd_obj_request_put(obj_request);
1130 }
1131
1132 static bool obj_request_type_valid(enum obj_request_type type)
1133 {
1134         switch (type) {
1135         case OBJ_REQUEST_NODATA:
1136         case OBJ_REQUEST_BIO:
1137         case OBJ_REQUEST_PAGES:
1138                 return true;
1139         default:
1140                 return false;
1141         }
1142 }
1143
1144 static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1145 {
1146         struct ceph_osd_req_op *op;
1147         va_list args;
1148         size_t size;
1149
1150         op = kzalloc(sizeof (*op), GFP_NOIO);
1151         if (!op)
1152                 return NULL;
1153         op->op = opcode;
1154         va_start(args, opcode);
1155         switch (opcode) {
1156         case CEPH_OSD_OP_READ:
1157         case CEPH_OSD_OP_WRITE:
1158                 /* rbd_osd_req_op_create(READ, offset, length) */
1159                 /* rbd_osd_req_op_create(WRITE, offset, length) */
1160                 op->extent.offset = va_arg(args, u64);
1161                 op->extent.length = va_arg(args, u64);
1162                 if (opcode == CEPH_OSD_OP_WRITE)
1163                         op->payload_len = op->extent.length;
1164                 break;
1165         case CEPH_OSD_OP_STAT:
1166                 break;
1167         case CEPH_OSD_OP_CALL:
1168                 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1169                 op->cls.class_name = va_arg(args, char *);
1170                 size = strlen(op->cls.class_name);
1171                 rbd_assert(size <= (size_t) U8_MAX);
1172                 op->cls.class_len = size;
1173                 op->payload_len = size;
1174
1175                 op->cls.method_name = va_arg(args, char *);
1176                 size = strlen(op->cls.method_name);
1177                 rbd_assert(size <= (size_t) U8_MAX);
1178                 op->cls.method_len = size;
1179                 op->payload_len += size;
1180
1181                 op->cls.argc = 0;
1182                 op->cls.indata = va_arg(args, void *);
1183                 size = va_arg(args, size_t);
1184                 rbd_assert(size <= (size_t) U32_MAX);
1185                 op->cls.indata_len = (u32) size;
1186                 op->payload_len += size;
1187                 break;
1188         case CEPH_OSD_OP_NOTIFY_ACK:
1189         case CEPH_OSD_OP_WATCH:
1190                 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1191                 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1192                 op->watch.cookie = va_arg(args, u64);
1193                 op->watch.ver = va_arg(args, u64);
1194                 op->watch.ver = cpu_to_le64(op->watch.ver);
1195                 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1196                         op->watch.flag = (u8) 1;
1197                 break;
1198         default:
1199                 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1200                 kfree(op);
1201                 op = NULL;
1202                 break;
1203         }
1204         va_end(args);
1205
1206         return op;
1207 }
1208
/* Free an op; a null pointer is accepted, as it is by kfree() */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
        kfree(op);
}
1213
1214 static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1215                                 struct rbd_obj_request *obj_request)
1216 {
1217         dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1218
1219         return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1220 }
1221
1222 static void rbd_img_request_complete(struct rbd_img_request *img_request)
1223 {
1224         dout("%s: img %p\n", __func__, img_request);
1225         if (img_request->callback)
1226                 img_request->callback(img_request);
1227         else
1228                 rbd_img_request_put(img_request);
1229 }
1230
1231 /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1232
1233 static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1234 {
1235         dout("%s: obj %p\n", __func__, obj_request);
1236
1237         return wait_for_completion_interruptible(&obj_request->completion);
1238 }
1239
1240 static void obj_request_done_init(struct rbd_obj_request *obj_request)
1241 {
1242         atomic_set(&obj_request->done, 0);
1243         smp_wmb();
1244 }
1245
1246 static void obj_request_done_set(struct rbd_obj_request *obj_request)
1247 {
1248         int done;
1249
1250         done = atomic_inc_return(&obj_request->done);
1251         if (done > 1) {
1252                 struct rbd_img_request *img_request = obj_request->img_request;
1253                 struct rbd_device *rbd_dev;
1254
1255                 rbd_dev = img_request ? img_request->rbd_dev : NULL;
1256                 rbd_warn(rbd_dev, "obj_request %p was already done\n",
1257                         obj_request);
1258         }
1259 }
1260
1261 static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1262 {
1263         smp_mb();
1264         return atomic_read(&obj_request->done) != 0;
1265 }
1266
1267 static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1268 {
1269         dout("%s: obj %p cb %p\n", __func__, obj_request,
1270                 obj_request->callback);
1271         if (obj_request->callback)
1272                 obj_request->callback(obj_request);
1273         else
1274                 complete_all(&obj_request->completion);
1275 }
1276
/* Nothing needs doing for this op beyond marking the object request done */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p\n", __func__, obj_request);

        obj_request_done_set(obj_request);
}
1282
1283 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1284 {
1285         dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
1286                 obj_request->result, obj_request->xferred, obj_request->length);
1287         /*
1288          * ENOENT means a hole in the object.  We zero-fill the
1289          * entire length of the request.  A short read also implies
1290          * zero-fill to the end of the request.  Either way we
1291          * update the xferred count to indicate the whole request
1292          * was satisfied.
1293          */
1294         if (obj_request->result == -ENOENT) {
1295                 zero_bio_chain(obj_request->bio_list, 0);
1296                 obj_request->result = 0;
1297                 obj_request->xferred = obj_request->length;
1298         } else if (obj_request->xferred < obj_request->length &&
1299                         !obj_request->result) {
1300                 zero_bio_chain(obj_request->bio_list, obj_request->xferred);
1301                 obj_request->xferred = obj_request->length;
1302         }
1303         obj_request_done_set(obj_request);
1304 }
1305
1306 static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1307 {
1308         dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1309                 obj_request->result, obj_request->length);
1310         /*
1311          * There is no such thing as a successful short write.
1312          * Our xferred value is the number of bytes transferred
1313          * back.  Set it to our originally-requested length.
1314          */
1315         obj_request->xferred = obj_request->length;
1316         obj_request_done_set(obj_request);
1317 }
1318
/*
 * A simple stat call requires nothing more than marking the object
 * request done.  More will be done here when a stat is part of a
 * write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
        dout("%s: obj %p\n", __func__, obj_request);

        obj_request_done_set(obj_request);
}
1328
1329 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1330                                 struct ceph_msg *msg)
1331 {
1332         struct rbd_obj_request *obj_request = osd_req->r_priv;
1333         u16 opcode;
1334
1335         dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1336         rbd_assert(osd_req == obj_request->osd_req);
1337         rbd_assert(!!obj_request->img_request ^
1338                                 (obj_request->which == BAD_WHICH));
1339
1340         if (osd_req->r_result < 0)
1341                 obj_request->result = osd_req->r_result;
1342         obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1343
1344         WARN_ON(osd_req->r_num_ops != 1);       /* For now */
1345
1346         /*
1347          * We support a 64-bit length, but ultimately it has to be
1348          * passed to blk_end_request(), which takes an unsigned int.
1349          */
1350         obj_request->xferred = osd_req->r_reply_op_len[0];
1351         rbd_assert(obj_request->xferred < (u64) UINT_MAX);
1352         opcode = osd_req->r_request_ops[0].op;
1353         switch (opcode) {
1354         case CEPH_OSD_OP_READ:
1355                 rbd_osd_read_callback(obj_request);
1356                 break;
1357         case CEPH_OSD_OP_WRITE:
1358                 rbd_osd_write_callback(obj_request);
1359                 break;
1360         case CEPH_OSD_OP_STAT:
1361                 rbd_osd_stat_callback(obj_request);
1362                 break;
1363         case CEPH_OSD_OP_CALL:
1364         case CEPH_OSD_OP_NOTIFY_ACK:
1365         case CEPH_OSD_OP_WATCH:
1366                 rbd_osd_trivial_callback(obj_request);
1367                 break;
1368         default:
1369                 rbd_warn(NULL, "%s: unsupported op %hu\n",
1370                         obj_request->object_name, (unsigned short) opcode);
1371                 break;
1372         }
1373
1374         if (obj_request_done_test(obj_request))
1375                 rbd_obj_request_complete(obj_request);
1376 }
1377
1378 static struct ceph_osd_request *rbd_osd_req_create(
1379                                         struct rbd_device *rbd_dev,
1380                                         bool write_request,
1381                                         struct rbd_obj_request *obj_request,
1382                                         struct ceph_osd_req_op *op)
1383 {
1384         struct rbd_img_request *img_request = obj_request->img_request;
1385         struct ceph_snap_context *snapc = NULL;
1386         struct ceph_osd_client *osdc;
1387         struct ceph_osd_request *osd_req;
1388         struct timespec now;
1389         struct timespec *mtime;
1390         u64 snap_id = CEPH_NOSNAP;
1391         u64 offset = obj_request->offset;
1392         u64 length = obj_request->length;
1393
1394         if (img_request) {
1395                 rbd_assert(img_request->write_request == write_request);
1396                 if (img_request->write_request)
1397                         snapc = img_request->snapc;
1398                 else
1399                         snap_id = img_request->snap_id;
1400         }
1401
1402         /* Allocate and initialize the request, for the single op */
1403
1404         osdc = &rbd_dev->rbd_client->client->osdc;
1405         osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1406         if (!osd_req)
1407                 return NULL;    /* ENOMEM */
1408
1409         rbd_assert(obj_request_type_valid(obj_request->type));
1410         switch (obj_request->type) {
1411         case OBJ_REQUEST_NODATA:
1412                 break;          /* Nothing to do */
1413         case OBJ_REQUEST_BIO:
1414                 rbd_assert(obj_request->bio_list != NULL);
1415                 osd_req->r_bio = obj_request->bio_list;
1416                 break;
1417         case OBJ_REQUEST_PAGES:
1418                 osd_req->r_pages = obj_request->pages;
1419                 osd_req->r_num_pages = obj_request->page_count;
1420                 osd_req->r_page_alignment = offset & ~PAGE_MASK;
1421                 break;
1422         }
1423
1424         if (write_request) {
1425                 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1426                 now = CURRENT_TIME;
1427                 mtime = &now;
1428         } else {
1429                 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1430                 mtime = NULL;   /* not needed for reads */
1431                 offset = 0;     /* These are not used... */
1432                 length = 0;     /* ...for osd read requests */
1433         }
1434
1435         osd_req->r_callback = rbd_osd_req_callback;
1436         osd_req->r_priv = obj_request;
1437
1438         osd_req->r_oid_len = strlen(obj_request->object_name);
1439         rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1440         memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1441
1442         osd_req->r_file_layout = rbd_dev->layout;       /* struct */
1443
1444         /* osd_req will get its own reference to snapc (if non-null) */
1445
1446         ceph_osdc_build_request(osd_req, offset, length, 1, op,
1447                                 snapc, snap_id, mtime);
1448
1449         return osd_req;
1450 }
1451
/* Drop a reference to an osd request via ceph_osdc_put_request() */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
        ceph_osdc_put_request(osd_req);
}
1456
1457 /* object_name is assumed to be a non-null pointer and NUL-terminated */
1458
1459 static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1460                                                 u64 offset, u64 length,
1461                                                 enum obj_request_type type)
1462 {
1463         struct rbd_obj_request *obj_request;
1464         size_t size;
1465         char *name;
1466
1467         rbd_assert(obj_request_type_valid(type));
1468
1469         size = strlen(object_name) + 1;
1470         obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1471         if (!obj_request)
1472                 return NULL;
1473
1474         name = (char *)(obj_request + 1);
1475         obj_request->object_name = memcpy(name, object_name, size);
1476         obj_request->offset = offset;
1477         obj_request->length = length;
1478         obj_request->which = BAD_WHICH;
1479         obj_request->type = type;
1480         INIT_LIST_HEAD(&obj_request->links);
1481         obj_request_done_init(obj_request);
1482         init_completion(&obj_request->completion);
1483         kref_init(&obj_request->kref);
1484
1485         dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1486                 offset, length, (int)type, obj_request);
1487
1488         return obj_request;
1489 }
1490
1491 static void rbd_obj_request_destroy(struct kref *kref)
1492 {
1493         struct rbd_obj_request *obj_request;
1494
1495         obj_request = container_of(kref, struct rbd_obj_request, kref);
1496
1497         dout("%s: obj %p\n", __func__, obj_request);
1498
1499         rbd_assert(obj_request->img_request == NULL);
1500         rbd_assert(obj_request->which == BAD_WHICH);
1501
1502         if (obj_request->osd_req)
1503                 rbd_osd_req_destroy(obj_request->osd_req);
1504
1505         rbd_assert(obj_request_type_valid(obj_request->type));
1506         switch (obj_request->type) {
1507         case OBJ_REQUEST_NODATA:
1508                 break;          /* Nothing to do */
1509         case OBJ_REQUEST_BIO:
1510                 if (obj_request->bio_list)
1511                         bio_chain_put(obj_request->bio_list);
1512                 break;
1513         case OBJ_REQUEST_PAGES:
1514                 if (obj_request->pages)
1515                         ceph_release_page_vector(obj_request->pages,
1516                                                 obj_request->page_count);
1517                 break;
1518         }
1519
1520         kfree(obj_request);
1521 }
1522
1523 /*
1524  * Caller is responsible for filling in the list of object requests
1525  * that comprises the image request, and the Linux request pointer
1526  * (if there is one).
1527  */
1528 static struct rbd_img_request *rbd_img_request_create(
1529                                         struct rbd_device *rbd_dev,
1530                                         u64 offset, u64 length,
1531                                         bool write_request)
1532 {
1533         struct rbd_img_request *img_request;
1534         struct ceph_snap_context *snapc = NULL;
1535
1536         img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1537         if (!img_request)
1538                 return NULL;
1539
1540         if (write_request) {
1541                 down_read(&rbd_dev->header_rwsem);
1542                 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1543                 up_read(&rbd_dev->header_rwsem);
1544                 if (WARN_ON(!snapc)) {
1545                         kfree(img_request);
1546                         return NULL;    /* Shouldn't happen */
1547                 }
1548         }
1549
1550         img_request->rq = NULL;
1551         img_request->rbd_dev = rbd_dev;
1552         img_request->offset = offset;
1553         img_request->length = length;
1554         img_request->write_request = write_request;
1555         if (write_request)
1556                 img_request->snapc = snapc;
1557         else
1558                 img_request->snap_id = rbd_dev->spec->snap_id;
1559         spin_lock_init(&img_request->completion_lock);
1560         img_request->next_completion = 0;
1561         img_request->callback = NULL;
1562         img_request->obj_request_count = 0;
1563         INIT_LIST_HEAD(&img_request->obj_requests);
1564         kref_init(&img_request->kref);
1565
1566         rbd_img_request_get(img_request);       /* Avoid a warning */
1567         rbd_img_request_put(img_request);       /* TEMPORARY */
1568
1569         dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1570                 write_request ? "write" : "read", offset, length,
1571                 img_request);
1572
1573         return img_request;
1574 }
1575
1576 static void rbd_img_request_destroy(struct kref *kref)
1577 {
1578         struct rbd_img_request *img_request;
1579         struct rbd_obj_request *obj_request;
1580         struct rbd_obj_request *next_obj_request;
1581
1582         img_request = container_of(kref, struct rbd_img_request, kref);
1583
1584         dout("%s: img %p\n", __func__, img_request);
1585
1586         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1587                 rbd_img_obj_request_del(img_request, obj_request);
1588         rbd_assert(img_request->obj_request_count == 0);
1589
1590         if (img_request->write_request)
1591                 ceph_put_snap_context(img_request->snapc);
1592
1593         kfree(img_request);
1594 }
1595
1596 static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1597                                         struct bio *bio_list)
1598 {
1599         struct rbd_device *rbd_dev = img_request->rbd_dev;
1600         struct rbd_obj_request *obj_request = NULL;
1601         struct rbd_obj_request *next_obj_request;
1602         unsigned int bio_offset;
1603         u64 image_offset;
1604         u64 resid;
1605         u16 opcode;
1606
1607         dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
1608
1609         opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1610                                               : CEPH_OSD_OP_READ;
1611         bio_offset = 0;
1612         image_offset = img_request->offset;
1613         rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1614         resid = img_request->length;
1615         rbd_assert(resid > 0);
1616         while (resid) {
1617                 const char *object_name;
1618                 unsigned int clone_size;
1619                 struct ceph_osd_req_op *op;
1620                 u64 offset;
1621                 u64 length;
1622
1623                 object_name = rbd_segment_name(rbd_dev, image_offset);
1624                 if (!object_name)
1625                         goto out_unwind;
1626                 offset = rbd_segment_offset(rbd_dev, image_offset);
1627                 length = rbd_segment_length(rbd_dev, image_offset, resid);
1628                 obj_request = rbd_obj_request_create(object_name,
1629                                                 offset, length,
1630                                                 OBJ_REQUEST_BIO);
1631                 kfree(object_name);     /* object request has its own copy */
1632                 if (!obj_request)
1633                         goto out_unwind;
1634
1635                 rbd_assert(length <= (u64) UINT_MAX);
1636                 clone_size = (unsigned int) length;
1637                 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1638                                                 &bio_offset, clone_size,
1639                                                 GFP_ATOMIC);
1640                 if (!obj_request->bio_list)
1641                         goto out_partial;
1642
1643                 /*
1644                  * Build up the op to use in building the osd
1645                  * request.  Note that the contents of the op are
1646                  * copied by rbd_osd_req_create().
1647                  */
1648                 op = rbd_osd_req_op_create(opcode, offset, length);
1649                 if (!op)
1650                         goto out_partial;
1651                 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1652                                                 img_request->write_request,
1653                                                 obj_request, op);
1654                 rbd_osd_req_op_destroy(op);
1655                 if (!obj_request->osd_req)
1656                         goto out_partial;
1657                 /* status and version are initially zero-filled */
1658
1659                 rbd_img_obj_request_add(img_request, obj_request);
1660
1661                 image_offset += length;
1662                 resid -= length;
1663         }
1664
1665         return 0;
1666
1667 out_partial:
1668         rbd_obj_request_put(obj_request);
1669 out_unwind:
1670         for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1671                 rbd_obj_request_put(obj_request);
1672
1673         return -ENOMEM;
1674 }
1675
1676 static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1677 {
1678         struct rbd_img_request *img_request;
1679         u32 which = obj_request->which;
1680         bool more = true;
1681
1682         img_request = obj_request->img_request;
1683
1684         dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1685         rbd_assert(img_request != NULL);
1686         rbd_assert(img_request->rq != NULL);
1687         rbd_assert(img_request->obj_request_count > 0);
1688         rbd_assert(which != BAD_WHICH);
1689         rbd_assert(which < img_request->obj_request_count);
1690         rbd_assert(which >= img_request->next_completion);
1691
1692         spin_lock_irq(&img_request->completion_lock);
1693         if (which != img_request->next_completion)
1694                 goto out;
1695
1696         for_each_obj_request_from(img_request, obj_request) {
1697                 unsigned int xferred;
1698                 int result;
1699
1700                 rbd_assert(more);
1701                 rbd_assert(which < img_request->obj_request_count);
1702
1703                 if (!obj_request_done_test(obj_request))
1704                         break;
1705
1706                 rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
1707                 xferred = (unsigned int) obj_request->xferred;
1708                 result = (int) obj_request->result;
1709                 if (result)
1710                         rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
1711                                 img_request->write_request ? "write" : "read",
1712                                 result, xferred);
1713
1714                 more = blk_end_request(img_request->rq, result, xferred);
1715                 which++;
1716         }
1717
1718         rbd_assert(more ^ (which == img_request->obj_request_count));
1719         img_request->next_completion = which;
1720 out:
1721         spin_unlock_irq(&img_request->completion_lock);
1722
1723         if (!more)
1724                 rbd_img_request_complete(img_request);
1725 }
1726
1727 static int rbd_img_request_submit(struct rbd_img_request *img_request)
1728 {
1729         struct rbd_device *rbd_dev = img_request->rbd_dev;
1730         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1731         struct rbd_obj_request *obj_request;
1732
1733         dout("%s: img %p\n", __func__, img_request);
1734         for_each_obj_request(img_request, obj_request) {
1735                 int ret;
1736
1737                 obj_request->callback = rbd_img_obj_callback;
1738                 ret = rbd_obj_request_submit(osdc, obj_request);
1739                 if (ret)
1740                         return ret;
1741                 /*
1742                  * The image request has its own reference to each
1743                  * of its object requests, so we can safely drop the
1744                  * initial one here.
1745                  */
1746                 rbd_obj_request_put(obj_request);
1747         }
1748
1749         return 0;
1750 }
1751
1752 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1753                                    u64 ver, u64 notify_id)
1754 {
1755         struct rbd_obj_request *obj_request;
1756         struct ceph_osd_req_op *op;
1757         struct ceph_osd_client *osdc;
1758         int ret;
1759
1760         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1761                                                         OBJ_REQUEST_NODATA);
1762         if (!obj_request)
1763                 return -ENOMEM;
1764
1765         ret = -ENOMEM;
1766         op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1767         if (!op)
1768                 goto out;
1769         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1770                                                 obj_request, op);
1771         rbd_osd_req_op_destroy(op);
1772         if (!obj_request->osd_req)
1773                 goto out;
1774
1775         osdc = &rbd_dev->rbd_client->client->osdc;
1776         obj_request->callback = rbd_obj_request_put;
1777         ret = rbd_obj_request_submit(osdc, obj_request);
1778 out:
1779         if (ret)
1780                 rbd_obj_request_put(obj_request);
1781
1782         return ret;
1783 }
1784
1785 static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1786 {
1787         struct rbd_device *rbd_dev = (struct rbd_device *)data;
1788         u64 hver;
1789         int rc;
1790
1791         if (!rbd_dev)
1792                 return;
1793
1794         dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1795                 rbd_dev->header_name, (unsigned long long) notify_id,
1796                 (unsigned int) opcode);
1797         rc = rbd_dev_refresh(rbd_dev, &hver);
1798         if (rc)
1799                 rbd_warn(rbd_dev, "got notification but failed to "
1800                            " update snaps: %d\n", rc);
1801
1802         rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1803 }
1804
1805 /*
1806  * Request sync osd watch/unwatch.  The value of "start" determines
1807  * whether a watch request is being initiated or torn down.
1808  */
1809 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1810 {
1811         struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1812         struct rbd_obj_request *obj_request;
1813         struct ceph_osd_req_op *op;
1814         int ret;
1815
1816         rbd_assert(start ^ !!rbd_dev->watch_event);
1817         rbd_assert(start ^ !!rbd_dev->watch_request);
1818
1819         if (start) {
1820                 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
1821                                                 &rbd_dev->watch_event);
1822                 if (ret < 0)
1823                         return ret;
1824                 rbd_assert(rbd_dev->watch_event != NULL);
1825         }
1826
1827         ret = -ENOMEM;
1828         obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1829                                                         OBJ_REQUEST_NODATA);
1830         if (!obj_request)
1831                 goto out_cancel;
1832
1833         op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1834                                 rbd_dev->watch_event->cookie,
1835                                 rbd_dev->header.obj_version, start);
1836         if (!op)
1837                 goto out_cancel;
1838         obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
1839                                                         obj_request, op);
1840         rbd_osd_req_op_destroy(op);
1841         if (!obj_request->osd_req)
1842                 goto out_cancel;
1843
1844         if (start)
1845                 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
1846         else
1847                 ceph_osdc_unregister_linger_request(osdc,
1848                                         rbd_dev->watch_request->osd_req);
1849         ret = rbd_obj_request_submit(osdc, obj_request);
1850         if (ret)
1851                 goto out_cancel;
1852         ret = rbd_obj_request_wait(obj_request);
1853         if (ret)
1854                 goto out_cancel;
1855         ret = obj_request->result;
1856         if (ret)
1857                 goto out_cancel;
1858
1859         /*
1860          * A watch request is set to linger, so the underlying osd
1861          * request won't go away until we unregister it.  We retain
1862          * a pointer to the object request during that time (in
1863          * rbd_dev->watch_request), so we'll keep a reference to
1864          * it.  We'll drop that reference (below) after we've
1865          * unregistered it.
1866          */
1867         if (start) {
1868                 rbd_dev->watch_request = obj_request;
1869
1870                 return 0;
1871         }
1872
1873         /* We have successfully torn down the watch request */
1874
1875         rbd_obj_request_put(rbd_dev->watch_request);
1876         rbd_dev->watch_request = NULL;
1877 out_cancel:
1878         /* Cancel the event if we're tearing down, or on error */
1879         ceph_osdc_cancel_event(rbd_dev->watch_event);
1880         rbd_dev->watch_event = NULL;
1881         if (obj_request)
1882                 rbd_obj_request_put(obj_request);
1883
1884         return ret;
1885 }
1886
1887 /*
1888  * Synchronous osd object method call
1889  */
1890 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1891                              const char *object_name,
1892                              const char *class_name,
1893                              const char *method_name,
1894                              const char *outbound,
1895                              size_t outbound_size,
1896                              char *inbound,
1897                              size_t inbound_size,
1898                              u64 *version)
1899 {
1900         struct rbd_obj_request *obj_request;
1901         struct ceph_osd_client *osdc;
1902         struct ceph_osd_req_op *op;
1903         struct page **pages;
1904         u32 page_count;
1905         int ret;
1906
1907         /*
1908          * Method calls are ultimately read operations but they
1909          * don't involve object data (so no offset or length).
1910          * The result should placed into the inbound buffer
1911          * provided.  They also supply outbound data--parameters for
1912          * the object method.  Currently if this is present it will
1913          * be a snapshot id.
1914          */
1915         page_count = (u32) calc_pages_for(0, inbound_size);
1916         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1917         if (IS_ERR(pages))
1918                 return PTR_ERR(pages);
1919
1920         ret = -ENOMEM;
1921         obj_request = rbd_obj_request_create(object_name, 0, 0,
1922                                                         OBJ_REQUEST_PAGES);
1923         if (!obj_request)
1924                 goto out;
1925
1926         obj_request->pages = pages;
1927         obj_request->page_count = page_count;
1928
1929         op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1930                                         method_name, outbound, outbound_size);
1931         if (!op)
1932                 goto out;
1933         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1934                                                 obj_request, op);
1935         rbd_osd_req_op_destroy(op);
1936         if (!obj_request->osd_req)
1937                 goto out;
1938
1939         osdc = &rbd_dev->rbd_client->client->osdc;
1940         ret = rbd_obj_request_submit(osdc, obj_request);
1941         if (ret)
1942                 goto out;
1943         ret = rbd_obj_request_wait(obj_request);
1944         if (ret)
1945                 goto out;
1946
1947         ret = obj_request->result;
1948         if (ret < 0)
1949                 goto out;
1950         ret = 0;
1951         ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
1952         if (version)
1953                 *version = obj_request->version;
1954 out:
1955         if (obj_request)
1956                 rbd_obj_request_put(obj_request);
1957         else
1958                 ceph_release_page_vector(pages, page_count);
1959
1960         return ret;
1961 }
1962
1963 static void rbd_request_fn(struct request_queue *q)
1964                 __releases(q->queue_lock) __acquires(q->queue_lock)
1965 {
1966         struct rbd_device *rbd_dev = q->queuedata;
1967         bool read_only = rbd_dev->mapping.read_only;
1968         struct request *rq;
1969         int result;
1970
1971         while ((rq = blk_fetch_request(q))) {
1972                 bool write_request = rq_data_dir(rq) == WRITE;
1973                 struct rbd_img_request *img_request;
1974                 u64 offset;
1975                 u64 length;
1976
1977                 /* Ignore any non-FS requests that filter through. */
1978
1979                 if (rq->cmd_type != REQ_TYPE_FS) {
1980                         dout("%s: non-fs request type %d\n", __func__,
1981                                 (int) rq->cmd_type);
1982                         __blk_end_request_all(rq, 0);
1983                         continue;
1984                 }
1985
1986                 /* Ignore/skip any zero-length requests */
1987
1988                 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
1989                 length = (u64) blk_rq_bytes(rq);
1990
1991                 if (!length) {
1992                         dout("%s: zero-length request\n", __func__);
1993                         __blk_end_request_all(rq, 0);
1994                         continue;
1995                 }
1996
1997                 spin_unlock_irq(q->queue_lock);
1998
1999                 /* Disallow writes to a read-only device */
2000
2001                 if (write_request) {
2002                         result = -EROFS;
2003                         if (read_only)
2004                                 goto end_request;
2005                         rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2006                 }
2007
2008                 /*
2009                  * Quit early if the mapped snapshot no longer
2010                  * exists.  It's still possible the snapshot will
2011                  * have disappeared by the time our request arrives
2012                  * at the osd, but there's no sense in sending it if
2013                  * we already know.
2014                  */
2015                 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2016                         dout("request for non-existent snapshot");
2017                         rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2018                         result = -ENXIO;
2019                         goto end_request;
2020                 }
2021
2022                 result = -EINVAL;
2023                 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2024                         goto end_request;       /* Shouldn't happen */
2025
2026                 result = -ENOMEM;
2027                 img_request = rbd_img_request_create(rbd_dev, offset, length,
2028                                                         write_request);
2029                 if (!img_request)
2030                         goto end_request;
2031
2032                 img_request->rq = rq;
2033
2034                 result = rbd_img_request_fill_bio(img_request, rq->bio);
2035                 if (!result)
2036                         result = rbd_img_request_submit(img_request);
2037                 if (result)
2038                         rbd_img_request_put(img_request);
2039 end_request:
2040                 spin_lock_irq(q->queue_lock);
2041                 if (result < 0) {
2042                         rbd_warn(rbd_dev, "obj_request %s result %d\n",
2043                                 write_request ? "write" : "read", result);
2044                         __blk_end_request_all(rq, result);
2045                 }
2046         }
2047 }
2048
2049 /*
2050  * a queue callback. Makes sure that we don't create a bio that spans across
2051  * multiple osd objects. One exception would be with a single page bios,
2052  * which we handle later at bio_chain_clone_range()
2053  */
2054 static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2055                           struct bio_vec *bvec)
2056 {
2057         struct rbd_device *rbd_dev = q->queuedata;
2058         sector_t sector_offset;
2059         sector_t sectors_per_obj;
2060         sector_t obj_sector_offset;
2061         int ret;
2062
2063         /*
2064          * Find how far into its rbd object the partition-relative
2065          * bio start sector is to offset relative to the enclosing
2066          * device.
2067          */
2068         sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2069         sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2070         obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2071
2072         /*
2073          * Compute the number of bytes from that offset to the end
2074          * of the object.  Account for what's already used by the bio.
2075          */
2076         ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2077         if (ret > bmd->bi_size)
2078                 ret -= bmd->bi_size;
2079         else
2080                 ret = 0;
2081
2082         /*
2083          * Don't send back more than was asked for.  And if the bio
2084          * was empty, let the whole thing through because:  "Note
2085          * that a block device *must* allow a single page to be
2086          * added to an empty bio."
2087          */
2088         rbd_assert(bvec->bv_len <= PAGE_SIZE);
2089         if (ret > (int) bvec->bv_len || !bmd->bi_size)
2090                 ret = (int) bvec->bv_len;
2091
2092         return ret;
2093 }
2094
2095 static void rbd_free_disk(struct rbd_device *rbd_dev)
2096 {
2097         struct gendisk *disk = rbd_dev->disk;
2098
2099         if (!disk)
2100                 return;
2101
2102         if (disk->flags & GENHD_FL_UP)
2103                 del_gendisk(disk);
2104         if (disk->queue)
2105                 blk_cleanup_queue(disk->queue);
2106         put_disk(disk);
2107 }
2108
2109 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2110                                 const char *object_name,
2111                                 u64 offset, u64 length,
2112                                 char *buf, u64 *version)
2113
2114 {
2115         struct ceph_osd_req_op *op;
2116         struct rbd_obj_request *obj_request;
2117         struct ceph_osd_client *osdc;
2118         struct page **pages = NULL;
2119         u32 page_count;
2120         size_t size;
2121         int ret;
2122
2123         page_count = (u32) calc_pages_for(offset, length);
2124         pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2125         if (IS_ERR(pages))
2126                 ret = PTR_ERR(pages);
2127
2128         ret = -ENOMEM;
2129         obj_request = rbd_obj_request_create(object_name, offset, length,
2130                                                         OBJ_REQUEST_PAGES);
2131         if (!obj_request)
2132                 goto out;
2133
2134         obj_request->pages = pages;
2135         obj_request->page_count = page_count;
2136
2137         op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2138         if (!op)
2139                 goto out;
2140         obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2141                                                 obj_request, op);
2142         rbd_osd_req_op_destroy(op);
2143         if (!obj_request->osd_req)
2144                 goto out;
2145
2146         osdc = &rbd_dev->rbd_client->client->osdc;
2147         ret = rbd_obj_request_submit(osdc, obj_request);
2148         if (ret)
2149                 goto out;
2150         ret = rbd_obj_request_wait(obj_request);
2151         if (ret)
2152                 goto out;
2153
2154         ret = obj_request->result;
2155         if (ret < 0)
2156                 goto out;
2157
2158         rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2159         size = (size_t) obj_request->xferred;
2160         ceph_copy_from_page_vector(pages, buf, 0, size);
2161         rbd_assert(size <= (size_t) INT_MAX);
2162         ret = (int) size;
2163         if (version)
2164                 *version = obj_request->version;
2165 out:
2166         if (obj_request)
2167                 rbd_obj_request_put(obj_request);
2168         else
2169                 ceph_release_page_vector(pages, page_count);
2170
2171         return ret;
2172 }
2173
2174 /*
2175  * Read the complete header for the given rbd device.
2176  *
2177  * Returns a pointer to a dynamically-allocated buffer containing
2178  * the complete and validated header.  Caller can pass the address
2179  * of a variable that will be filled in with the version of the
2180  * header object at the time it was read.
2181  *
2182  * Returns a pointer-coded errno if a failure occurs.
2183  */
2184 static struct rbd_image_header_ondisk *
2185 rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2186 {
2187         struct rbd_image_header_ondisk *ondisk = NULL;
2188         u32 snap_count = 0;
2189         u64 names_size = 0;
2190         u32 want_count;
2191         int ret;
2192
2193         /*
2194          * The complete header will include an array of its 64-bit
2195          * snapshot ids, followed by the names of those snapshots as
2196          * a contiguous block of NUL-terminated strings.  Note that
2197          * the number of snapshots could change by the time we read
2198          * it in, in which case we re-read it.
2199          */
2200         do {
2201                 size_t size;
2202
2203                 kfree(ondisk);
2204
2205                 size = sizeof (*ondisk);
2206                 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2207                 size += names_size;
2208                 ondisk = kmalloc(size, GFP_KERNEL);
2209                 if (!ondisk)
2210                         return ERR_PTR(-ENOMEM);
2211
2212                 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
2213                                        0, size,
2214                                        (char *) ondisk, version);
2215                 if (ret < 0)
2216                         goto out_err;
2217                 if (WARN_ON((size_t) ret < size)) {
2218                         ret = -ENXIO;
2219                         rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2220                                 size, ret);
2221                         goto out_err;
2222                 }
2223                 if (!rbd_dev_ondisk_valid(ondisk)) {
2224                         ret = -ENXIO;
2225                         rbd_warn(rbd_dev, "invalid header");
2226                         goto out_err;
2227                 }
2228
2229                 names_size = le64_to_cpu(ondisk->snap_names_len);
2230                 want_count = snap_count;
2231                 snap_count = le32_to_cpu(ondisk->snap_count);
2232         } while (snap_count != want_count);
2233
2234         return ondisk;
2235
2236 out_err:
2237         kfree(ondisk);
2238
2239         return ERR_PTR(ret);
2240 }
2241
2242 /*
2243  * reload the ondisk the header
2244  */
2245 static int rbd_read_header(struct rbd_device *rbd_dev,
2246                            struct rbd_image_header *header)
2247 {
2248         struct rbd_image_header_ondisk *ondisk;
2249         u64 ver = 0;
2250         int ret;
2251
2252         ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2253         if (IS_ERR(ondisk))
2254                 return PTR_ERR(ondisk);
2255         ret = rbd_header_from_disk(header, ondisk);
2256         if (ret >= 0)
2257                 header->obj_version = ver;
2258         kfree(ondisk);
2259
2260         return ret;
2261 }
2262
2263 static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2264 {
2265         struct rbd_snap *snap;
2266         struct rbd_snap *next;
2267
2268         list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
2269                 rbd_remove_snap_dev(snap);
2270 }
2271
2272 static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2273 {
2274         sector_t size;
2275
2276         if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
2277                 return;
2278
2279         size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2280         dout("setting size to %llu sectors", (unsigned long long) size);
2281         rbd_dev->mapping.size = (u64) size;
2282         set_capacity(rbd_dev->disk, size);
2283 }
2284
2285 /*
2286  * only read the first part of the ondisk header, without the snaps info
2287  */
2288 static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2289 {
2290         int ret;
2291         struct rbd_image_header h;
2292
2293         ret = rbd_read_header(rbd_dev, &h);
2294         if (ret < 0)
2295                 return ret;
2296
2297         down_write(&rbd_dev->header_rwsem);
2298
2299         /* Update image size, and check for resize of mapped image */
2300         rbd_dev->header.image_size = h.image_size;
2301         rbd_update_mapping_size(rbd_dev);
2302
2303         /* rbd_dev->header.object_prefix shouldn't change */
2304         kfree(rbd_dev->header.snap_sizes);
2305         kfree(rbd_dev->header.snap_names);
2306         /* osd requests may still refer to snapc */
2307         ceph_put_snap_context(rbd_dev->header.snapc);
2308
2309         if (hver)
2310                 *hver = h.obj_version;
2311         rbd_dev->header.obj_version = h.obj_version;
2312         rbd_dev->header.image_size = h.image_size;
2313         rbd_dev->header.snapc = h.snapc;
2314         rbd_dev->header.snap_names = h.snap_names;
2315         rbd_dev->header.snap_sizes = h.snap_sizes;
2316         /* Free the extra copy of the object prefix */
2317         WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2318         kfree(h.object_prefix);
2319
2320         ret = rbd_dev_snaps_update(rbd_dev);
2321         if (!ret)
2322                 ret = rbd_dev_snaps_register(rbd_dev);
2323
2324         up_write(&rbd_dev->header_rwsem);
2325
2326         return ret;
2327 }
2328
2329 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
2330 {
2331         int ret;
2332
2333         rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
2334         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2335         if (rbd_dev->image_format == 1)
2336                 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2337         else
2338                 ret = rbd_dev_v2_refresh(rbd_dev, hver);
2339         mutex_unlock(&ctl_mutex);
2340
2341         return ret;
2342 }
2343
2344 static int rbd_init_disk(struct rbd_device *rbd_dev)
2345 {
2346         struct gendisk *disk;
2347         struct request_queue *q;
2348         u64 segment_size;
2349
2350         /* create gendisk info */
2351         disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2352         if (!disk)
2353                 return -ENOMEM;
2354
2355         snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2356                  rbd_dev->dev_id);
2357         disk->major = rbd_dev->major;
2358         disk->first_minor = 0;
2359         disk->fops = &rbd_bd_ops;
2360         disk->private_data = rbd_dev;
2361
2362         q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2363         if (!q)
2364                 goto out_disk;
2365
2366         /* We use the default size, but let's be explicit about it. */
2367         blk_queue_physical_block_size(q, SECTOR_SIZE);
2368
2369         /* set io sizes to object size */
2370         segment_size = rbd_obj_bytes(&rbd_dev->header);
2371         blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2372         blk_queue_max_segment_size(q, segment_size);
2373         blk_queue_io_min(q, segment_size);
2374         blk_queue_io_opt(q, segment_size);
2375
2376         blk_queue_merge_bvec(q, rbd_merge_bvec);
2377         disk->queue = q;
2378
2379         q->queuedata = rbd_dev;
2380
2381         rbd_dev->disk = disk;
2382
2383         set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2384
2385         return 0;
2386 out_disk:
2387         put_disk(disk);
2388
2389         return -ENOMEM;
2390 }
2391
2392 /*
2393   sysfs
2394 */
2395
2396 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2397 {
2398         return container_of(dev, struct rbd_device, dev);
2399 }
2400
2401 static ssize_t rbd_size_show(struct device *dev,
2402                              struct device_attribute *attr, char *buf)
2403 {
2404         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2405         sector_t size;
2406
2407         down_read(&rbd_dev->header_rwsem);
2408         size = get_capacity(rbd_dev->disk);
2409         up_read(&rbd_dev->header_rwsem);
2410
2411         return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2412 }
2413
2414 /*
2415  * Note this shows the features for whatever's mapped, which is not
2416  * necessarily the base image.
2417  */
2418 static ssize_t rbd_features_show(struct device *dev,
2419                              struct device_attribute *attr, char *buf)
2420 {
2421         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2422
2423         return sprintf(buf, "0x%016llx\n",
2424                         (unsigned long long) rbd_dev->mapping.features);
2425 }
2426
2427 static ssize_t rbd_major_show(struct device *dev,
2428                               struct device_attribute *attr, char *buf)
2429 {
2430         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2431
2432         return sprintf(buf, "%d\n", rbd_dev->major);
2433 }
2434
2435 static ssize_t rbd_client_id_show(struct device *dev,
2436                                   struct device_attribute *attr, char *buf)
2437 {
2438         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2439
2440         return sprintf(buf, "client%lld\n",
2441                         ceph_client_id(rbd_dev->rbd_client->client));
2442 }
2443
2444 static ssize_t rbd_pool_show(struct device *dev,
2445                              struct device_attribute *attr, char *buf)
2446 {
2447         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2448
2449         return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2450 }
2451
2452 static ssize_t rbd_pool_id_show(struct device *dev,
2453                              struct device_attribute *attr, char *buf)
2454 {
2455         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2456
2457         return sprintf(buf, "%llu\n",
2458                 (unsigned long long) rbd_dev->spec->pool_id);
2459 }
2460
2461 static ssize_t rbd_name_show(struct device *dev,
2462                              struct device_attribute *attr, char *buf)
2463 {
2464         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2465
2466         if (rbd_dev->spec->image_name)
2467                 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2468
2469         return sprintf(buf, "(unknown)\n");
2470 }
2471
2472 static ssize_t rbd_image_id_show(struct device *dev,
2473                              struct device_attribute *attr, char *buf)
2474 {
2475         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2476
2477         return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2478 }
2479
2480 /*
2481  * Shows the name of the currently-mapped snapshot (or
2482  * RBD_SNAP_HEAD_NAME for the base image).
2483  */
2484 static ssize_t rbd_snap_show(struct device *dev,
2485                              struct device_attribute *attr,
2486                              char *buf)
2487 {
2488         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2489
2490         return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2491 }
2492
2493 /*
2494  * For an rbd v2 image, shows the pool id, image id, and snapshot id
2495  * for the parent image.  If there is no parent, simply shows
2496  * "(no parent image)".
2497  */
2498 static ssize_t rbd_parent_show(struct device *dev,
2499                              struct device_attribute *attr,
2500                              char *buf)
2501 {
2502         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2503         struct rbd_spec *spec = rbd_dev->parent_spec;
2504         int count;
2505         char *bufp = buf;
2506
2507         if (!spec)
2508                 return sprintf(buf, "(no parent image)\n");
2509
2510         count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2511                         (unsigned long long) spec->pool_id, spec->pool_name);
2512         if (count < 0)
2513                 return count;
2514         bufp += count;
2515
2516         count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2517                         spec->image_name ? spec->image_name : "(unknown)");
2518         if (count < 0)
2519                 return count;
2520         bufp += count;
2521
2522         count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2523                         (unsigned long long) spec->snap_id, spec->snap_name);
2524         if (count < 0)
2525                 return count;
2526         bufp += count;
2527
2528         count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2529         if (count < 0)
2530                 return count;
2531         bufp += count;
2532
2533         return (ssize_t) (bufp - buf);
2534 }
2535
2536 static ssize_t rbd_image_refresh(struct device *dev,
2537                                  struct device_attribute *attr,
2538                                  const char *buf,
2539                                  size_t size)
2540 {
2541         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2542         int ret;
2543
2544         ret = rbd_dev_refresh(rbd_dev, NULL);
2545
2546         return ret < 0 ? ret : size;
2547 }
2548
/*
 * sysfs attributes of a mapped rbd device.  Every attribute is
 * read-only (S_IRUGO) with a show handler only, except "refresh",
 * which is owner-writable (S_IWUSR) and has a store handler only.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
2560
/* NULL-terminated list of the rbd device attributes, used by rbd_attr_group */
static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
        &dev_attr_features.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
        &dev_attr_name.attr,
        &dev_attr_image_id.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_parent.attr,
        &dev_attr_refresh.attr,
        NULL
};
2575
/* Unnamed attribute group wrapping rbd_attrs */
static struct attribute_group rbd_attr_group = {
        .attrs = rbd_attrs,
};
2579
/* NULL-terminated group list referenced by rbd_device_type.groups */
static const struct attribute_group *rbd_attr_groups[] = {
        &rbd_attr_group,
        NULL
};
2584
/*
 * Release callback installed in rbd_device_type; it deliberately
 * does nothing.
 *
 * NOTE(review): rbd_bus_add_dev() also sets dev->release to
 * rbd_dev_release, which presumably performs the real teardown --
 * confirm which callback the driver core ends up invoking.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
2588
/*
 * Device type for mapped rbd devices: supplies the type name, the
 * sysfs attribute groups and the (empty) type-level release callback.
 */
static struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_sysfs_dev_release,
};
2594
2595
2596 /*
2597   sysfs - snapshots
2598 */
2599
2600 static ssize_t rbd_snap_size_show(struct device *dev,
2601                                   struct device_attribute *attr,
2602                                   char *buf)
2603 {
2604         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2605
2606         return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2607 }
2608
2609 static ssize_t rbd_snap_id_show(struct device *dev,
2610                                 struct device_attribute *attr,
2611                                 char *buf)
2612 {
2613         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2614
2615         return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2616 }
2617
2618 static ssize_t rbd_snap_features_show(struct device *dev,
2619                                 struct device_attribute *attr,
2620                                 char *buf)
2621 {
2622         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2623
2624         return sprintf(buf, "0x%016llx\n",
2625                         (unsigned long long) snap->features);
2626 }
2627
/* Read-only sysfs attributes exposed for each snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2631
/* NULL-terminated list of the snapshot attributes, used by rbd_snap_attr_group */
static struct attribute *rbd_snap_attrs[] = {
        &dev_attr_snap_size.attr,
        &dev_attr_snap_id.attr,
        &dev_attr_snap_features.attr,
        NULL,
};
2638
/* Unnamed attribute group wrapping rbd_snap_attrs */
static struct attribute_group rbd_snap_attr_group = {
        .attrs = rbd_snap_attrs,
};
2642
2643 static void rbd_snap_dev_release(struct device *dev)
2644 {
2645         struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2646         kfree(snap->name);
2647         kfree(snap);
2648 }
2649
/* NULL-terminated group list referenced by rbd_snap_device_type.groups */
static const struct attribute_group *rbd_snap_attr_groups[] = {
        &rbd_snap_attr_group,
        NULL
};
2654
/*
 * Device type for snapshot devices.  Its address also serves as the
 * "registration was attempted" marker tested by rbd_snap_registered().
 */
static struct device_type rbd_snap_device_type = {
        .groups         = rbd_snap_attr_groups,
        .release        = rbd_snap_dev_release,
};
2659
2660 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2661 {
2662         kref_get(&spec->kref);
2663
2664         return spec;
2665 }
2666
2667 static void rbd_spec_free(struct kref *kref);
2668 static void rbd_spec_put(struct rbd_spec *spec)
2669 {
2670         if (spec)
2671                 kref_put(&spec->kref, rbd_spec_free);
2672 }
2673
2674 static struct rbd_spec *rbd_spec_alloc(void)
2675 {
2676         struct rbd_spec *spec;
2677
2678         spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2679         if (!spec)
2680                 return NULL;
2681         kref_init(&spec->kref);
2682
2683         rbd_spec_put(rbd_spec_get(spec));       /* TEMPORARY */
2684
2685         return spec;
2686 }
2687
2688 static void rbd_spec_free(struct kref *kref)
2689 {
2690         struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2691
2692         kfree(spec->pool_name);
2693         kfree(spec->image_id);
2694         kfree(spec->image_name);
2695         kfree(spec->snap_name);
2696         kfree(spec);
2697 }
2698
2699 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2700                                 struct rbd_spec *spec)
2701 {
2702         struct rbd_device *rbd_dev;
2703
2704         rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2705         if (!rbd_dev)
2706                 return NULL;
2707
2708         spin_lock_init(&rbd_dev->lock);
2709         rbd_dev->flags = 0;
2710         INIT_LIST_HEAD(&rbd_dev->node);
2711         INIT_LIST_HEAD(&rbd_dev->snaps);
2712         init_rwsem(&rbd_dev->header_rwsem);
2713
2714         rbd_dev->spec = spec;
2715         rbd_dev->rbd_client = rbdc;
2716
2717         /* Initialize the layout used for all rbd requests */
2718
2719         rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2720         rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2721         rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2722         rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2723
2724         return rbd_dev;
2725 }
2726
/*
 * Tear down an rbd_device: drop the references it holds on its
 * parent spec (may be NULL), its client and its own spec, free the
 * header object name, then free the device structure itself.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
        rbd_spec_put(rbd_dev->parent_spec);
        kfree(rbd_dev->header_name);
        rbd_put_client(rbd_dev->rbd_client);
        rbd_spec_put(rbd_dev->spec);
        kfree(rbd_dev);
}
2735
2736 static bool rbd_snap_registered(struct rbd_snap *snap)
2737 {
2738         bool ret = snap->dev.type == &rbd_snap_device_type;
2739         bool reg = device_is_registered(&snap->dev);
2740
2741         rbd_assert(!ret ^ reg);
2742
2743         return ret;
2744 }
2745
/*
 * Unlink a snapshot from its device's snapshot list and, if it was
 * registered with the driver core, unregister it.  Unregistering
 * drops a device reference, so rbd_snap_dev_release() may free snap
 * before this returns; callers must not touch snap afterwards.
 *
 * NOTE(review): a snapshot that was never registered is only
 * unlinked here -- nothing visible frees it or its name.  Confirm
 * whether callers handle that case or whether this is a leak.
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
        list_del(&snap->node);
        if (device_is_registered(&snap->dev))
                device_unregister(&snap->dev);
}
2752
2753 static int rbd_register_snap_dev(struct rbd_snap *snap,
2754                                   struct device *parent)
2755 {
2756         struct device *dev = &snap->dev;
2757         int ret;
2758
2759         dev->type = &rbd_snap_device_type;
2760         dev->parent = parent;
2761         dev->release = rbd_snap_dev_release;
2762         dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2763         dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2764
2765         ret = device_register(dev);
2766
2767         return ret;
2768 }
2769
2770 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2771                                                 const char *snap_name,
2772                                                 u64 snap_id, u64 snap_size,
2773                                                 u64 snap_features)
2774 {
2775         struct rbd_snap *snap;
2776         int ret;
2777
2778         snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2779         if (!snap)
2780                 return ERR_PTR(-ENOMEM);
2781
2782         ret = -ENOMEM;
2783         snap->name = kstrdup(snap_name, GFP_KERNEL);
2784         if (!snap->name)
2785                 goto err;
2786
2787         snap->id = snap_id;
2788         snap->size = snap_size;
2789         snap->features = snap_features;
2790
2791         return snap;
2792
2793 err:
2794         kfree(snap->name);
2795         kfree(snap);
2796
2797         return ERR_PTR(ret);
2798 }
2799
2800 static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2801                 u64 *snap_size, u64 *snap_features)
2802 {
2803         char *snap_name;
2804
2805         rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2806
2807         *snap_size = rbd_dev->header.snap_sizes[which];
2808         *snap_features = 0;     /* No features for v1 */
2809
2810         /* Skip over names until we find the one we are looking for */
2811
2812         snap_name = rbd_dev->header.snap_names;
2813         while (which--)
2814                 snap_name += strlen(snap_name) + 1;
2815
2816         return snap_name;
2817 }
2818
2819 /*
2820  * Get the size and object order for an image snapshot, or if
2821  * snap_id is CEPH_NOSNAP, gets this information for the base
2822  * image.
2823  */
2824 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2825                                 u8 *order, u64 *snap_size)
2826 {
2827         __le64 snapid = cpu_to_le64(snap_id);
2828         int ret;
2829         struct {
2830                 u8 order;
2831                 __le64 size;
2832         } __attribute__ ((packed)) size_buf = { 0 };
2833
2834         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2835                                 "rbd", "get_size",
2836                                 (char *) &snapid, sizeof (snapid),
2837                                 (char *) &size_buf, sizeof (size_buf), NULL);
2838         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2839         if (ret < 0)
2840                 return ret;
2841
2842         *order = size_buf.order;
2843         *snap_size = le64_to_cpu(size_buf.size);
2844
2845         dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
2846                 (unsigned long long) snap_id, (unsigned int) *order,
2847                 (unsigned long long) *snap_size);
2848
2849         return 0;
2850 }
2851
2852 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2853 {
2854         return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2855                                         &rbd_dev->header.obj_order,
2856                                         &rbd_dev->header.image_size);
2857 }
2858
2859 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2860 {
2861         void *reply_buf;
2862         int ret;
2863         void *p;
2864
2865         reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2866         if (!reply_buf)
2867                 return -ENOMEM;
2868
2869         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2870                                 "rbd", "get_object_prefix",
2871                                 NULL, 0,
2872                                 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
2873         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2874         if (ret < 0)
2875                 goto out;
2876
2877         p = reply_buf;
2878         rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2879                                                 p + RBD_OBJ_PREFIX_LEN_MAX,
2880                                                 NULL, GFP_NOIO);
2881
2882         if (IS_ERR(rbd_dev->header.object_prefix)) {
2883                 ret = PTR_ERR(rbd_dev->header.object_prefix);
2884                 rbd_dev->header.object_prefix = NULL;
2885         } else {
2886                 dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
2887         }
2888
2889 out:
2890         kfree(reply_buf);
2891
2892         return ret;
2893 }
2894
2895 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2896                 u64 *snap_features)
2897 {
2898         __le64 snapid = cpu_to_le64(snap_id);
2899         struct {
2900                 __le64 features;
2901                 __le64 incompat;
2902         } features_buf = { 0 };
2903         u64 incompat;
2904         int ret;
2905
2906         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2907                                 "rbd", "get_features",
2908                                 (char *) &snapid, sizeof (snapid),
2909                                 (char *) &features_buf, sizeof (features_buf),
2910                                 NULL);
2911         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2912         if (ret < 0)
2913                 return ret;
2914
2915         incompat = le64_to_cpu(features_buf.incompat);
2916         if (incompat & ~RBD_FEATURES_ALL)
2917                 return -ENXIO;
2918
2919         *snap_features = le64_to_cpu(features_buf.features);
2920
2921         dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2922                 (unsigned long long) snap_id,
2923                 (unsigned long long) *snap_features,
2924                 (unsigned long long) le64_to_cpu(features_buf.incompat));
2925
2926         return 0;
2927 }
2928
2929 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2930 {
2931         return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2932                                                 &rbd_dev->header.features);
2933 }
2934
2935 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2936 {
2937         struct rbd_spec *parent_spec;
2938         size_t size;
2939         void *reply_buf = NULL;
2940         __le64 snapid;
2941         void *p;
2942         void *end;
2943         char *image_id;
2944         u64 overlap;
2945         int ret;
2946
2947         parent_spec = rbd_spec_alloc();
2948         if (!parent_spec)
2949                 return -ENOMEM;
2950
2951         size = sizeof (__le64) +                                /* pool_id */
2952                 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +        /* image_id */
2953                 sizeof (__le64) +                               /* snap_id */
2954                 sizeof (__le64);                                /* overlap */
2955         reply_buf = kmalloc(size, GFP_KERNEL);
2956         if (!reply_buf) {
2957                 ret = -ENOMEM;
2958                 goto out_err;
2959         }
2960
2961         snapid = cpu_to_le64(CEPH_NOSNAP);
2962         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2963                                 "rbd", "get_parent",
2964                                 (char *) &snapid, sizeof (snapid),
2965                                 (char *) reply_buf, size, NULL);
2966         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2967         if (ret < 0)
2968                 goto out_err;
2969
2970         ret = -ERANGE;
2971         p = reply_buf;
2972         end = (char *) reply_buf + size;
2973         ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2974         if (parent_spec->pool_id == CEPH_NOPOOL)
2975                 goto out;       /* No parent?  No problem. */
2976
2977         /* The ceph file layout needs to fit pool id in 32 bits */
2978
2979         ret = -EIO;
2980         if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2981                 goto out;
2982
2983         image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2984         if (IS_ERR(image_id)) {
2985                 ret = PTR_ERR(image_id);
2986                 goto out_err;
2987         }
2988         parent_spec->image_id = image_id;
2989         ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2990         ceph_decode_64_safe(&p, end, overlap, out_err);
2991
2992         rbd_dev->parent_overlap = overlap;
2993         rbd_dev->parent_spec = parent_spec;
2994         parent_spec = NULL;     /* rbd_dev now owns this */
2995 out:
2996         ret = 0;
2997 out_err:
2998         kfree(reply_buf);
2999         rbd_spec_put(parent_spec);
3000
3001         return ret;
3002 }
3003
3004 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3005 {
3006         size_t image_id_size;
3007         char *image_id;
3008         void *p;
3009         void *end;
3010         size_t size;
3011         void *reply_buf = NULL;
3012         size_t len = 0;
3013         char *image_name = NULL;
3014         int ret;
3015
3016         rbd_assert(!rbd_dev->spec->image_name);
3017
3018         len = strlen(rbd_dev->spec->image_id);
3019         image_id_size = sizeof (__le32) + len;
3020         image_id = kmalloc(image_id_size, GFP_KERNEL);
3021         if (!image_id)
3022                 return NULL;
3023
3024         p = image_id;
3025         end = (char *) image_id + image_id_size;
3026         ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
3027
3028         size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3029         reply_buf = kmalloc(size, GFP_KERNEL);
3030         if (!reply_buf)
3031                 goto out;
3032
3033         ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
3034                                 "rbd", "dir_get_name",
3035                                 image_id, image_id_size,
3036                                 (char *) reply_buf, size, NULL);
3037         if (ret < 0)
3038                 goto out;
3039         p = reply_buf;
3040         end = (char *) reply_buf + size;
3041         image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3042         if (IS_ERR(image_name))
3043                 image_name = NULL;
3044         else
3045                 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3046 out:
3047         kfree(reply_buf);
3048         kfree(image_id);
3049
3050         return image_name;
3051 }
3052
3053 /*
3054  * When a parent image gets probed, we only have the pool, image,
3055  * and snapshot ids but not the names of any of them.  This call
3056  * is made later to fill in those names.  It has to be done after
3057  * rbd_dev_snaps_update() has completed because some of the
3058  * information (in particular, snapshot name) is not available
3059  * until then.
3060  */
3061 static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3062 {
3063         struct ceph_osd_client *osdc;
3064         const char *name;
3065         void *reply_buf = NULL;
3066         int ret;
3067
3068         if (rbd_dev->spec->pool_name)
3069                 return 0;       /* Already have the names */
3070
3071         /* Look up the pool name */
3072
3073         osdc = &rbd_dev->rbd_client->client->osdc;
3074         name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3075         if (!name) {
3076                 rbd_warn(rbd_dev, "there is no pool with id %llu",
3077                         rbd_dev->spec->pool_id);        /* Really a BUG() */
3078                 return -EIO;
3079         }
3080
3081         rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3082         if (!rbd_dev->spec->pool_name)
3083                 return -ENOMEM;
3084
3085         /* Fetch the image name; tolerate failure here */
3086
3087         name = rbd_dev_image_name(rbd_dev);
3088         if (name)
3089                 rbd_dev->spec->image_name = (char *) name;
3090         else
3091                 rbd_warn(rbd_dev, "unable to get image name");
3092
3093         /* Look up the snapshot name. */
3094
3095         name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3096         if (!name) {
3097                 rbd_warn(rbd_dev, "no snapshot with id %llu",
3098                         rbd_dev->spec->snap_id);        /* Really a BUG() */
3099                 ret = -EIO;
3100                 goto out_err;
3101         }
3102         rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3103         if(!rbd_dev->spec->snap_name)
3104                 goto out_err;
3105
3106         return 0;
3107 out_err:
3108         kfree(reply_buf);
3109         kfree(rbd_dev->spec->pool_name);
3110         rbd_dev->spec->pool_name = NULL;
3111
3112         return ret;
3113 }
3114
3115 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
3116 {
3117         size_t size;
3118         int ret;
3119         void *reply_buf;
3120         void *p;
3121         void *end;
3122         u64 seq;
3123         u32 snap_count;
3124         struct ceph_snap_context *snapc;
3125         u32 i;
3126
3127         /*
3128          * We'll need room for the seq value (maximum snapshot id),
3129          * snapshot count, and array of that many snapshot ids.
3130          * For now we have a fixed upper limit on the number we're
3131          * prepared to receive.
3132          */
3133         size = sizeof (__le64) + sizeof (__le32) +
3134                         RBD_MAX_SNAP_COUNT * sizeof (__le64);
3135         reply_buf = kzalloc(size, GFP_KERNEL);
3136         if (!reply_buf)
3137                 return -ENOMEM;
3138
3139         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3140                                 "rbd", "get_snapcontext",
3141                                 NULL, 0,
3142                                 reply_buf, size, ver);
3143         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3144         if (ret < 0)
3145                 goto out;
3146
3147         ret = -ERANGE;
3148         p = reply_buf;
3149         end = (char *) reply_buf + size;
3150         ceph_decode_64_safe(&p, end, seq, out);
3151         ceph_decode_32_safe(&p, end, snap_count, out);
3152
3153         /*
3154          * Make sure the reported number of snapshot ids wouldn't go
3155          * beyond the end of our buffer.  But before checking that,
3156          * make sure the computed size of the snapshot context we
3157          * allocate is representable in a size_t.
3158          */
3159         if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3160                                  / sizeof (u64)) {
3161                 ret = -EINVAL;
3162                 goto out;
3163         }
3164         if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3165                 goto out;
3166
3167         size = sizeof (struct ceph_snap_context) +
3168                                 snap_count * sizeof (snapc->snaps[0]);
3169         snapc = kmalloc(size, GFP_KERNEL);
3170         if (!snapc) {
3171                 ret = -ENOMEM;
3172                 goto out;
3173         }
3174
3175         atomic_set(&snapc->nref, 1);
3176         snapc->seq = seq;
3177         snapc->num_snaps = snap_count;
3178         for (i = 0; i < snap_count; i++)
3179                 snapc->snaps[i] = ceph_decode_64(&p);
3180
3181         rbd_dev->header.snapc = snapc;
3182
3183         dout("  snap context seq = %llu, snap_count = %u\n",
3184                 (unsigned long long) seq, (unsigned int) snap_count);
3185
3186 out:
3187         kfree(reply_buf);
3188
3189         return 0;
3190 }
3191
3192 static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3193 {
3194         size_t size;
3195         void *reply_buf;
3196         __le64 snap_id;
3197         int ret;
3198         void *p;
3199         void *end;
3200         char *snap_name;
3201
3202         size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3203         reply_buf = kmalloc(size, GFP_KERNEL);
3204         if (!reply_buf)
3205                 return ERR_PTR(-ENOMEM);
3206
3207         snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
3208         ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3209                                 "rbd", "get_snapshot_name",
3210                                 (char *) &snap_id, sizeof (snap_id),
3211                                 reply_buf, size, NULL);
3212         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3213         if (ret < 0)
3214                 goto out;
3215
3216         p = reply_buf;
3217         end = (char *) reply_buf + size;
3218         snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3219         if (IS_ERR(snap_name)) {
3220                 ret = PTR_ERR(snap_name);
3221                 goto out;
3222         } else {
3223                 dout("  snap_id 0x%016llx snap_name = %s\n",
3224                         (unsigned long long) le64_to_cpu(snap_id), snap_name);
3225         }
3226         kfree(reply_buf);
3227
3228         return snap_name;
3229 out:
3230         kfree(reply_buf);
3231
3232         return ERR_PTR(ret);
3233 }
3234
3235 static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3236                 u64 *snap_size, u64 *snap_features)
3237 {
3238         u64 snap_id;
3239         u8 order;
3240         int ret;
3241
3242         snap_id = rbd_dev->header.snapc->snaps[which];
3243         ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3244         if (ret)
3245                 return ERR_PTR(ret);
3246         ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3247         if (ret)
3248                 return ERR_PTR(ret);
3249
3250         return rbd_dev_v2_snap_name(rbd_dev, which);
3251 }
3252
3253 static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3254                 u64 *snap_size, u64 *snap_features)
3255 {
3256         if (rbd_dev->image_format == 1)
3257                 return rbd_dev_v1_snap_info(rbd_dev, which,
3258                                         snap_size, snap_features);
3259         if (rbd_dev->image_format == 2)
3260                 return rbd_dev_v2_snap_info(rbd_dev, which,
3261                                         snap_size, snap_features);
3262         return ERR_PTR(-EINVAL);
3263 }
3264
3265 static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3266 {
3267         int ret;
3268         __u8 obj_order;
3269
3270         down_write(&rbd_dev->header_rwsem);
3271
3272         /* Grab old order first, to see if it changes */
3273
3274         obj_order = rbd_dev->header.obj_order,
3275         ret = rbd_dev_v2_image_size(rbd_dev);
3276         if (ret)
3277                 goto out;
3278         if (rbd_dev->header.obj_order != obj_order) {
3279                 ret = -EIO;
3280                 goto out;
3281         }
3282         rbd_update_mapping_size(rbd_dev);
3283
3284         ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3285         dout("rbd_dev_v2_snap_context returned %d\n", ret);
3286         if (ret)
3287                 goto out;
3288         ret = rbd_dev_snaps_update(rbd_dev);
3289         dout("rbd_dev_snaps_update returned %d\n", ret);
3290         if (ret)
3291                 goto out;
3292         ret = rbd_dev_snaps_register(rbd_dev);
3293         dout("rbd_dev_snaps_register returned %d\n", ret);
3294 out:
3295         up_write(&rbd_dev->header_rwsem);
3296
3297         return ret;
3298 }
3299
3300 /*
3301  * Scan the rbd device's current snapshot list and compare it to the
3302  * newly-received snapshot context.  Remove any existing snapshots
3303  * not present in the new snapshot context.  Add a new snapshot for
3304  * any snaphots in the snapshot context not in the current list.
3305  * And verify there are no changes to snapshots we already know
3306  * about.
3307  *
3308  * Assumes the snapshots in the snapshot context are sorted by
3309  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
3310  * are also maintained in that order.)
3311  */
3312 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3313 {
3314         struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3315         const u32 snap_count = snapc->num_snaps;
3316         struct list_head *head = &rbd_dev->snaps;
3317         struct list_head *links = head->next;
3318         u32 index = 0;
3319
3320         dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
3321         while (index < snap_count || links != head) {
3322                 u64 snap_id;
3323                 struct rbd_snap *snap;
3324                 char *snap_name;
3325                 u64 snap_size = 0;
3326                 u64 snap_features = 0;
3327
3328                 snap_id = index < snap_count ? snapc->snaps[index]
3329                                              : CEPH_NOSNAP;
3330                 snap = links != head ? list_entry(links, struct rbd_snap, node)
3331                                      : NULL;
3332                 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3333
3334                 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3335                         struct list_head *next = links->next;
3336
3337                         /*
3338                          * A previously-existing snapshot is not in
3339                          * the new snap context.
3340                          *
3341                          * If the now missing snapshot is the one the
3342                          * image is mapped to, clear its exists flag
3343                          * so we can avoid sending any more requests
3344                          * to it.
3345                          */
3346                         if (rbd_dev->spec->snap_id == snap->id)
3347                                 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3348                         rbd_remove_snap_dev(snap);
3349                         dout("%ssnap id %llu has been removed\n",
3350                                 rbd_dev->spec->snap_id == snap->id ?
3351                                                         "mapped " : "",
3352                                 (unsigned long long) snap->id);
3353
3354                         /* Done with this list entry; advance */
3355
3356                         links = next;
3357                         continue;
3358                 }
3359
3360                 snap_name = rbd_dev_snap_info(rbd_dev, index,
3361                                         &snap_size, &snap_features);
3362                 if (IS_ERR(snap_name))
3363                         return PTR_ERR(snap_name);
3364
3365                 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3366                         (unsigned long long) snap_id);
3367                 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3368                         struct rbd_snap *new_snap;
3369
3370                         /* We haven't seen this snapshot before */
3371
3372                         new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3373                                         snap_id, snap_size, snap_features);
3374                         if (IS_ERR(new_snap)) {
3375                                 int err = PTR_ERR(new_snap);
3376
3377                                 dout("  failed to add dev, error %d\n", err);
3378
3379                                 return err;
3380                         }
3381
3382                         /* New goes before existing, or at end of list */
3383
3384                         dout("  added dev%s\n", snap ? "" : " at end\n");
3385                         if (snap)
3386                                 list_add_tail(&new_snap->node, &snap->node);
3387                         else
3388                                 list_add_tail(&new_snap->node, head);
3389                 } else {
3390                         /* Already have this one */
3391
3392                         dout("  already present\n");
3393
3394                         rbd_assert(snap->size == snap_size);
3395                         rbd_assert(!strcmp(snap->name, snap_name));
3396                         rbd_assert(snap->features == snap_features);
3397
3398                         /* Done with this list entry; advance */
3399
3400                         links = links->next;
3401                 }
3402
3403                 /* Advance to the next entry in the snapshot context */
3404
3405                 index++;
3406         }
3407         dout("%s: done\n", __func__);
3408
3409         return 0;
3410 }
3411
3412 /*
3413  * Scan the list of snapshots and register the devices for any that
3414  * have not already been registered.
3415  */
3416 static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3417 {
3418         struct rbd_snap *snap;
3419         int ret = 0;
3420
3421         dout("%s:\n", __func__);
3422         if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3423                 return -EIO;
3424
3425         list_for_each_entry(snap, &rbd_dev->snaps, node) {
3426                 if (!rbd_snap_registered(snap)) {
3427                         ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3428                         if (ret < 0)
3429                                 break;
3430                 }
3431         }
3432         dout("%s: returning %d\n", __func__, ret);
3433
3434         return ret;
3435 }
3436
3437 static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3438 {
3439         struct device *dev;
3440         int ret;
3441
3442         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3443
3444         dev = &rbd_dev->dev;
3445         dev->bus = &rbd_bus_type;
3446         dev->type = &rbd_device_type;
3447         dev->parent = &rbd_root_dev;
3448         dev->release = rbd_dev_release;
3449         dev_set_name(dev, "%d", rbd_dev->dev_id);
3450         ret = device_register(dev);
3451
3452         mutex_unlock(&ctl_mutex);
3453
3454         return ret;
3455 }
3456
3457 static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3458 {
3459         device_unregister(&rbd_dev->dev);
3460 }
3461
3462 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
3463
3464 /*
3465  * Get a unique rbd identifier for the given new rbd_dev, and add
3466  * the rbd_dev to the global list.  The minimum rbd id is 1.
3467  */
3468 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3469 {
3470         rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3471
3472         spin_lock(&rbd_dev_list_lock);
3473         list_add_tail(&rbd_dev->node, &rbd_dev_list);
3474         spin_unlock(&rbd_dev_list_lock);
3475         dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3476                 (unsigned long long) rbd_dev->dev_id);
3477 }
3478
3479 /*
3480  * Remove an rbd_dev from the global list, and record that its
3481  * identifier is no longer in use.
3482  */
3483 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
3484 {
3485         struct list_head *tmp;
3486         int rbd_id = rbd_dev->dev_id;
3487         int max_id;
3488
3489         rbd_assert(rbd_id > 0);
3490
3491         dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3492                 (unsigned long long) rbd_dev->dev_id);
3493         spin_lock(&rbd_dev_list_lock);
3494         list_del_init(&rbd_dev->node);
3495
3496         /*
3497          * If the id being "put" is not the current maximum, there
3498          * is nothing special we need to do.
3499          */
3500         if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3501                 spin_unlock(&rbd_dev_list_lock);
3502                 return;
3503         }
3504
3505         /*
3506          * We need to update the current maximum id.  Search the
3507          * list to find out what it is.  We're more likely to find
3508          * the maximum at the end, so search the list backward.
3509          */
3510         max_id = 0;
3511         list_for_each_prev(tmp, &rbd_dev_list) {
3512                 struct rbd_device *rbd_dev;
3513
3514                 rbd_dev = list_entry(tmp, struct rbd_device, node);
3515                 if (rbd_dev->dev_id > max_id)
3516                         max_id = rbd_dev->dev_id;
3517         }
3518         spin_unlock(&rbd_dev_list_lock);
3519
3520         /*
3521          * The max id could have been updated by rbd_dev_id_get(), in
3522          * which case it now accurately reflects the new maximum.
3523          * Be careful not to overwrite the maximum value in that
3524          * case.
3525          */
3526         atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3527         dout("  max dev id has been reset\n");
3528 }
3529
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token that starts there, i.e.
 * the number of characters up to the next white space or '\0'.
 *
 * *buf must be a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
        /*
         * Exactly the characters for which isspace() is nonzero in
         * the "C" and "POSIX" locales.
         */
        static const char whitespace[] = " \f\n\r\t\v";
        const char *start = *buf + strspn(*buf, whitespace);

        *buf = start;                           /* Token begins here */

        return strcspn(start, whitespace);      /* Token length */
}
3548
/*
 * Locate the next token in *buf and, provided it fits (including a
 * terminating '\0') in the caller's buffer of token_size bytes, copy
 * it there and terminate it.  *buf must be '\0'-terminated on entry.
 *
 * The return value is the length of the token found, not counting
 * the '\0'.  It is 0 when there is no token, and it is >= token_size
 * when the token was too long and therefore not copied.
 *
 * In every case *buf is left pointing just past the token that was
 * found, even if the token was not copied.
 */
static inline size_t copy_token(const char **buf,
                                char *token,
                                size_t token_size)
{
        const char *start;
        size_t token_len;

        token_len = next_token(buf);
        start = *buf;

        /* Copy only if there is room for the token plus its '\0' */
        if (token_len < token_size) {
                memcpy(token, start, token_len);
                token[token_len] = '\0';
        }
        *buf = start + token_len;

        return token_len;
}
3578
3579 /*
3580  * Finds the next token in *buf, dynamically allocates a buffer big
3581  * enough to hold a copy of it, and copies the token into the new
3582  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3583  * that a duplicate buffer is created even for a zero-length token.
3584  *
3585  * Returns a pointer to the newly-allocated duplicate, or a null
3586  * pointer if memory for the duplicate was not available.  If
3587  * the lenp argument is a non-null pointer, the length of the token
3588  * (not including the '\0') is returned in *lenp.
3589  *
3590  * If successful, the *buf pointer will be updated to point beyond
3591  * the end of the found token.
3592  *
3593  * Note: uses GFP_KERNEL for allocation.
3594  */
3595 static inline char *dup_token(const char **buf, size_t *lenp)
3596 {
3597         char *dup;
3598         size_t len;
3599
3600         len = next_token(buf);
3601         dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3602         if (!dup)
3603                 return NULL;
3604         *(dup + len) = '\0';
3605         *buf += len;
3606
3607         if (lenp)
3608                 *lenp = len;
3609
3610         return dup;
3611 }
3612
3613 /*
3614  * Parse the options provided for an "rbd add" (i.e., rbd image
3615  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3616  * and the data written is passed here via a NUL-terminated buffer.
3617  * Returns 0 if successful or an error code otherwise.
3618  *
3619  * The information extracted from these options is recorded in
3620  * the other parameters which return dynamically-allocated
3621  * structures:
3622  *  ceph_opts
3623  *      The address of a pointer that will refer to a ceph options
3624  *      structure.  Caller must release the returned pointer using
3625  *      ceph_destroy_options() when it is no longer needed.
3626  *  rbd_opts
3627  *      Address of an rbd options pointer.  Fully initialized by
3628  *      this function; caller must release with kfree().
3629  *  spec
3630  *      Address of an rbd image specification pointer.  Fully
3631  *      initialized by this function based on parsed options.
3632  *      Caller must release with rbd_spec_put().
3633  *
3634  * The options passed take this form:
3635  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3636  * where:
3637  *  <mon_addrs>
3638  *      A comma-separated list of one or more monitor addresses.
3639  *      A monitor address is an ip address, optionally followed
3640  *      by a port number (separated by a colon).
3641  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3642  *  <options>
3643  *      A comma-separated list of ceph and/or rbd options.
3644  *  <pool_name>
3645  *      The name of the rados pool containing the rbd image.
3646  *  <image_name>
3647  *      The name of the image in that pool to map.
3648  *  <snap_id>
3649  *      An optional snapshot id.  If provided, the mapping will
3650  *      present data from the image at the time that snapshot was
3651  *      created.  The image head is used if no snapshot id is
3652  *      provided.  Snapshot mappings are always read-only.
3653  */
3654 static int rbd_add_parse_args(const char *buf,
3655                                 struct ceph_options **ceph_opts,
3656                                 struct rbd_options **opts,
3657                                 struct rbd_spec **rbd_spec)
3658 {
3659         size_t len;
3660         char *options;
3661         const char *mon_addrs;
3662         size_t mon_addrs_size;
3663         struct rbd_spec *spec = NULL;
3664         struct rbd_options *rbd_opts = NULL;
3665         struct ceph_options *copts;
3666         int ret;
3667
3668         /* The first four tokens are required */
3669
3670         len = next_token(&buf);
3671         if (!len) {
3672                 rbd_warn(NULL, "no monitor address(es) provided");
3673                 return -EINVAL;
3674         }
3675         mon_addrs = buf;
3676         mon_addrs_size = len + 1;
3677         buf += len;
3678
3679         ret = -EINVAL;
3680         options = dup_token(&buf, NULL);
3681         if (!options)
3682                 return -ENOMEM;
3683         if (!*options) {
3684                 rbd_warn(NULL, "no options provided");
3685                 goto out_err;
3686         }
3687
3688         spec = rbd_spec_alloc();
3689         if (!spec)
3690                 goto out_mem;
3691
3692         spec->pool_name = dup_token(&buf, NULL);
3693         if (!spec->pool_name)
3694                 goto out_mem;
3695         if (!*spec->pool_name) {
3696                 rbd_warn(NULL, "no pool name provided");
3697                 goto out_err;
3698         }
3699
3700         spec->image_name = dup_token(&buf, NULL);
3701         if (!spec->image_name)
3702                 goto out_mem;
3703         if (!*spec->image_name) {
3704                 rbd_warn(NULL, "no image name provided");
3705                 goto out_err;
3706         }
3707
3708         /*
3709          * Snapshot name is optional; default is to use "-"
3710          * (indicating the head/no snapshot).
3711          */
3712         len = next_token(&buf);
3713         if (!len) {
3714                 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3715                 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3716         } else if (len > RBD_MAX_SNAP_NAME_LEN) {
3717                 ret = -ENAMETOOLONG;
3718                 goto out_err;
3719         }
3720         spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3721         if (!spec->snap_name)
3722                 goto out_mem;
3723         *(spec->snap_name + len) = '\0';
3724
3725         /* Initialize all rbd options to the defaults */
3726
3727         rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3728         if (!rbd_opts)
3729                 goto out_mem;
3730
3731         rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3732
3733         copts = ceph_parse_options(options, mon_addrs,
3734                                         mon_addrs + mon_addrs_size - 1,
3735                                         parse_rbd_opts_token, rbd_opts);
3736         if (IS_ERR(copts)) {
3737                 ret = PTR_ERR(copts);
3738                 goto out_err;
3739         }
3740         kfree(options);
3741
3742         *ceph_opts = copts;
3743         *opts = rbd_opts;
3744         *rbd_spec = spec;
3745
3746         return 0;
3747 out_mem:
3748         ret = -ENOMEM;
3749 out_err:
3750         kfree(rbd_opts);
3751         rbd_spec_put(spec);
3752         kfree(options);
3753
3754         return ret;
3755 }
3756
3757 /*
3758  * An rbd format 2 image has a unique identifier, distinct from the
3759  * name given to it by the user.  Internally, that identifier is
3760  * what's used to specify the names of objects related to the image.
3761  *
3762  * A special "rbd id" object is used to map an rbd image name to its
3763  * id.  If that object doesn't exist, then there is no v2 rbd image
3764  * with the supplied name.
3765  *
3766  * This function will record the given rbd_dev's image_id field if
3767  * it can be determined, and in that case will return 0.  If any
3768  * errors occur a negative errno will be returned and the rbd_dev's
3769  * image_id field will be unchanged (and should be NULL).
3770  */
3771 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3772 {
3773         int ret;
3774         size_t size;
3775         char *object_name;
3776         void *response;
3777         void *p;
3778
3779         /*
3780          * When probing a parent image, the image id is already
3781          * known (and the image name likely is not).  There's no
3782          * need to fetch the image id again in this case.
3783          */
3784         if (rbd_dev->spec->image_id)
3785                 return 0;
3786
3787         /*
3788          * First, see if the format 2 image id file exists, and if
3789          * so, get the image's persistent id from it.
3790          */
3791         size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3792         object_name = kmalloc(size, GFP_NOIO);
3793         if (!object_name)
3794                 return -ENOMEM;
3795         sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3796         dout("rbd id object name is %s\n", object_name);
3797
3798         /* Response will be an encoded string, which includes a length */
3799
3800         size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3801         response = kzalloc(size, GFP_NOIO);
3802         if (!response) {
3803                 ret = -ENOMEM;
3804                 goto out;
3805         }
3806
3807         ret = rbd_obj_method_sync(rbd_dev, object_name,
3808                                 "rbd", "get_id",
3809                                 NULL, 0,
3810                                 response, RBD_IMAGE_ID_LEN_MAX, NULL);
3811         dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3812         if (ret < 0)
3813                 goto out;
3814
3815         p = response;
3816         rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3817                                                 p + RBD_IMAGE_ID_LEN_MAX,
3818                                                 NULL, GFP_NOIO);
3819         if (IS_ERR(rbd_dev->spec->image_id)) {
3820                 ret = PTR_ERR(rbd_dev->spec->image_id);
3821                 rbd_dev->spec->image_id = NULL;
3822         } else {
3823                 dout("image_id is %s\n", rbd_dev->spec->image_id);
3824         }
3825 out:
3826         kfree(response);
3827         kfree(object_name);
3828
3829         return ret;
3830 }
3831
3832 static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3833 {
3834         int ret;
3835         size_t size;
3836
3837         /* Version 1 images have no id; empty string is used */
3838
3839         rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3840         if (!rbd_dev->spec->image_id)
3841                 return -ENOMEM;
3842
3843         /* Record the header object name for this rbd image. */
3844
3845         size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3846         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3847         if (!rbd_dev->header_name) {
3848                 ret = -ENOMEM;
3849                 goto out_err;
3850         }
3851         sprintf(rbd_dev->header_name, "%s%s",
3852                 rbd_dev->spec->image_name, RBD_SUFFIX);
3853
3854         /* Populate rbd image metadata */
3855
3856         ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3857         if (ret < 0)
3858                 goto out_err;
3859
3860         /* Version 1 images have no parent (no layering) */
3861
3862         rbd_dev->parent_spec = NULL;
3863         rbd_dev->parent_overlap = 0;
3864
3865         rbd_dev->image_format = 1;
3866
3867         dout("discovered version 1 image, header name is %s\n",
3868                 rbd_dev->header_name);
3869
3870         return 0;
3871
3872 out_err:
3873         kfree(rbd_dev->header_name);
3874         rbd_dev->header_name = NULL;
3875         kfree(rbd_dev->spec->image_id);
3876         rbd_dev->spec->image_id = NULL;
3877
3878         return ret;
3879 }
3880
3881 static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3882 {
3883         size_t size;
3884         int ret;
3885         u64 ver = 0;
3886
3887         /*
3888          * Image id was filled in by the caller.  Record the header
3889          * object name for this rbd image.
3890          */
3891         size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3892         rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3893         if (!rbd_dev->header_name)
3894                 return -ENOMEM;
3895         sprintf(rbd_dev->header_name, "%s%s",
3896                         RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
3897
3898         /* Get the size and object order for the image */
3899
3900         ret = rbd_dev_v2_image_size(rbd_dev);
3901         if (ret < 0)
3902                 goto out_err;
3903
3904         /* Get the object prefix (a.k.a. block_name) for the image */
3905
3906         ret = rbd_dev_v2_object_prefix(rbd_dev);
3907         if (ret < 0)
3908                 goto out_err;
3909
3910         /* Get the and check features for the image */
3911
3912         ret = rbd_dev_v2_features(rbd_dev);
3913         if (ret < 0)
3914                 goto out_err;
3915
3916         /* If the image supports layering, get the parent info */
3917
3918         if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3919                 ret = rbd_dev_v2_parent_info(rbd_dev);
3920                 if (ret < 0)
3921                         goto out_err;
3922         }
3923
3924         /* crypto and compression type aren't (yet) supported for v2 images */
3925
3926         rbd_dev->header.crypt_type = 0;
3927         rbd_dev->header.comp_type = 0;
3928
3929         /* Get the snapshot context, plus the header version */
3930
3931         ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
3932         if (ret)
3933                 goto out_err;
3934         rbd_dev->header.obj_version = ver;
3935
3936         rbd_dev->image_format = 2;
3937
3938         dout("discovered version 2 image, header name is %s\n",
3939                 rbd_dev->header_name);
3940
3941         return 0;
3942 out_err:
3943         rbd_dev->parent_overlap = 0;
3944         rbd_spec_put(rbd_dev->parent_spec);
3945         rbd_dev->parent_spec = NULL;
3946         kfree(rbd_dev->header_name);
3947         rbd_dev->header_name = NULL;
3948         kfree(rbd_dev->header.object_prefix);
3949         rbd_dev->header.object_prefix = NULL;
3950
3951         return ret;
3952 }
3953
3954 static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3955 {
3956         int ret;
3957
3958         /* no need to lock here, as rbd_dev is not registered yet */
3959         ret = rbd_dev_snaps_update(rbd_dev);
3960         if (ret)
3961                 return ret;
3962
3963         ret = rbd_dev_probe_update_spec(rbd_dev);
3964         if (ret)
3965                 goto err_out_snaps;
3966
3967         ret = rbd_dev_set_mapping(rbd_dev);
3968         if (ret)
3969                 goto err_out_snaps;
3970
3971         /* generate unique id: find highest unique id, add one */
3972         rbd_dev_id_get(rbd_dev);
3973
3974         /* Fill in the device name, now that we have its id. */
3975         BUILD_BUG_ON(DEV_NAME_LEN
3976                         < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3977         sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3978
3979         /* Get our block major device number. */
3980
3981         ret = register_blkdev(0, rbd_dev->name);
3982         if (ret < 0)
3983                 goto err_out_id;
3984         rbd_dev->major = ret;
3985
3986         /* Set up the blkdev mapping. */
3987
3988         ret = rbd_init_disk(rbd_dev);
3989         if (ret)
3990                 goto err_out_blkdev;
3991
3992         ret = rbd_bus_add_dev(rbd_dev);
3993         if (ret)
3994                 goto err_out_disk;
3995
3996         /*
3997          * At this point cleanup in the event of an error is the job
3998          * of the sysfs code (initiated by rbd_bus_del_dev()).
3999          */
4000         down_write(&rbd_dev->header_rwsem);
4001         ret = rbd_dev_snaps_register(rbd_dev);
4002         up_write(&rbd_dev->header_rwsem);
4003         if (ret)
4004                 goto err_out_bus;
4005
4006         ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4007         if (ret)
4008                 goto err_out_bus;
4009
4010         /* Everything's ready.  Announce the disk to the world. */
4011
4012         add_disk(rbd_dev->disk);
4013
4014         pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4015                 (unsigned long long) rbd_dev->mapping.size);
4016
4017         return ret;
4018 err_out_bus:
4019         /* this will also clean up rest of rbd_dev stuff */
4020
4021         rbd_bus_del_dev(rbd_dev);
4022
4023         return ret;
4024 err_out_disk:
4025         rbd_free_disk(rbd_dev);
4026 err_out_blkdev:
4027         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4028 err_out_id:
4029         rbd_dev_id_put(rbd_dev);
4030 err_out_snaps:
4031         rbd_remove_all_snaps(rbd_dev);
4032
4033         return ret;
4034 }
4035
4036 /*
4037  * Probe for the existence of the header object for the given rbd
4038  * device.  For format 2 images this includes determining the image
4039  * id.
4040  */
4041 static int rbd_dev_probe(struct rbd_device *rbd_dev)
4042 {
4043         int ret;
4044
4045         /*
4046          * Get the id from the image id object.  If it's not a
4047          * format 2 image, we'll get ENOENT back, and we'll assume
4048          * it's a format 1 image.
4049          */
4050         ret = rbd_dev_image_id(rbd_dev);
4051         if (ret)
4052                 ret = rbd_dev_v1_probe(rbd_dev);
4053         else
4054                 ret = rbd_dev_v2_probe(rbd_dev);
4055         if (ret) {
4056                 dout("probe failed, returning %d\n", ret);
4057
4058                 return ret;
4059         }
4060
4061         ret = rbd_dev_probe_finish(rbd_dev);
4062         if (ret)
4063                 rbd_header_free(&rbd_dev->header);
4064
4065         return ret;
4066 }
4067
4068 static ssize_t rbd_add(struct bus_type *bus,
4069                        const char *buf,
4070                        size_t count)
4071 {
4072         struct rbd_device *rbd_dev = NULL;
4073         struct ceph_options *ceph_opts = NULL;
4074         struct rbd_options *rbd_opts = NULL;
4075         struct rbd_spec *spec = NULL;
4076         struct rbd_client *rbdc;
4077         struct ceph_osd_client *osdc;
4078         int rc = -ENOMEM;
4079
4080         if (!try_module_get(THIS_MODULE))
4081                 return -ENODEV;
4082
4083         /* parse add command */
4084         rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4085         if (rc < 0)
4086                 goto err_out_module;
4087
4088         rbdc = rbd_get_client(ceph_opts);
4089         if (IS_ERR(rbdc)) {
4090                 rc = PTR_ERR(rbdc);
4091                 goto err_out_args;
4092         }
4093         ceph_opts = NULL;       /* rbd_dev client now owns this */
4094
4095         /* pick the pool */
4096         osdc = &rbdc->client->osdc;
4097         rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4098         if (rc < 0)
4099                 goto err_out_client;
4100         spec->pool_id = (u64) rc;
4101
4102         /* The ceph file layout needs to fit pool id in 32 bits */
4103
4104         if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4105                 rc = -EIO;
4106                 goto err_out_client;
4107         }
4108
4109         rbd_dev = rbd_dev_create(rbdc, spec);
4110         if (!rbd_dev)
4111                 goto err_out_client;
4112         rbdc = NULL;            /* rbd_dev now owns this */
4113         spec = NULL;            /* rbd_dev now owns this */
4114
4115         rbd_dev->mapping.read_only = rbd_opts->read_only;
4116         kfree(rbd_opts);
4117         rbd_opts = NULL;        /* done with this */
4118
4119         rc = rbd_dev_probe(rbd_dev);
4120         if (rc < 0)
4121                 goto err_out_rbd_dev;
4122
4123         return count;
4124 err_out_rbd_dev:
4125         rbd_dev_destroy(rbd_dev);
4126 err_out_client:
4127         rbd_put_client(rbdc);
4128 err_out_args:
4129         if (ceph_opts)
4130                 ceph_destroy_options(ceph_opts);
4131         kfree(rbd_opts);
4132         rbd_spec_put(spec);
4133 err_out_module:
4134         module_put(THIS_MODULE);
4135
4136         dout("Error adding device %s\n", buf);
4137
4138         return (ssize_t) rc;
4139 }
4140
4141 static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4142 {
4143         struct list_head *tmp;
4144         struct rbd_device *rbd_dev;
4145
4146         spin_lock(&rbd_dev_list_lock);
4147         list_for_each(tmp, &rbd_dev_list) {
4148                 rbd_dev = list_entry(tmp, struct rbd_device, node);
4149                 if (rbd_dev->dev_id == dev_id) {
4150                         spin_unlock(&rbd_dev_list_lock);
4151                         return rbd_dev;
4152                 }
4153         }
4154         spin_unlock(&rbd_dev_list_lock);
4155         return NULL;
4156 }
4157
4158 static void rbd_dev_release(struct device *dev)
4159 {
4160         struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4161
4162         if (rbd_dev->watch_event)
4163                 rbd_dev_header_watch_sync(rbd_dev, 0);
4164
4165         /* clean up and free blkdev */
4166         rbd_free_disk(rbd_dev);
4167         unregister_blkdev(rbd_dev->major, rbd_dev->name);
4168
4169         /* release allocated disk header fields */
4170         rbd_header_free(&rbd_dev->header);
4171
4172         /* done with the id, and with the rbd_dev */
4173         rbd_dev_id_put(rbd_dev);
4174         rbd_assert(rbd_dev->rbd_client != NULL);
4175         rbd_dev_destroy(rbd_dev);
4176
4177         /* release module ref */
4178         module_put(THIS_MODULE);
4179 }
4180
4181 static ssize_t rbd_remove(struct bus_type *bus,
4182                           const char *buf,
4183                           size_t count)
4184 {
4185         struct rbd_device *rbd_dev = NULL;
4186         int target_id, rc;
4187         unsigned long ul;
4188         int ret = count;
4189
4190         rc = strict_strtoul(buf, 10, &ul);
4191         if (rc)
4192                 return rc;
4193
4194         /* convert to int; abort if we lost anything in the conversion */
4195         target_id = (int) ul;
4196         if (target_id != ul)
4197                 return -EINVAL;
4198
4199         mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4200
4201         rbd_dev = __rbd_get_dev(target_id);
4202         if (!rbd_dev) {
4203                 ret = -ENOENT;
4204                 goto done;
4205         }
4206
4207         spin_lock_irq(&rbd_dev->lock);
4208         if (rbd_dev->open_count)
4209                 ret = -EBUSY;
4210         else
4211                 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4212         spin_unlock_irq(&rbd_dev->lock);
4213         if (ret < 0)
4214                 goto done;
4215
4216         rbd_remove_all_snaps(rbd_dev);
4217         rbd_bus_del_dev(rbd_dev);
4218
4219 done:
4220         mutex_unlock(&ctl_mutex);
4221
4222         return ret;
4223 }
4224
4225 /*
4226  * create control files in sysfs
4227  * /sys/bus/rbd/...
4228  */
4229 static int rbd_sysfs_init(void)
4230 {
4231         int ret;
4232
4233         ret = device_register(&rbd_root_dev);
4234         if (ret < 0)
4235                 return ret;
4236
4237         ret = bus_register(&rbd_bus_type);
4238         if (ret < 0)
4239                 device_unregister(&rbd_root_dev);
4240
4241         return ret;
4242 }
4243
4244 static void rbd_sysfs_cleanup(void)
4245 {
4246         bus_unregister(&rbd_bus_type);
4247         device_unregister(&rbd_root_dev);
4248 }
4249
4250 static int __init rbd_init(void)
4251 {
4252         int rc;
4253
4254         if (!libceph_compatible(NULL)) {
4255                 rbd_warn(NULL, "libceph incompatibility (quitting)");
4256
4257                 return -EINVAL;
4258         }
4259         rc = rbd_sysfs_init();
4260         if (rc)
4261                 return rc;
4262         pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4263         return 0;
4264 }
4265
4266 static void __exit rbd_exit(void)
4267 {
4268         rbd_sysfs_cleanup();
4269 }
4270
/* Module entry and exit points */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");