drivers/md/dm-thin.c

   1 /*
   2  * Copyright (C) 2011-2012 Red Hat UK.
   3  *
   4  * This file is released under the GPL.
   5  */
   6
   7 #include "dm-thin-metadata.h"
   8 #include "dm-bio-prison.h"
   9 #include "dm.h"
  10
  11 #include <linux/device-mapper.h>
  12 #include <linux/dm-io.h>
  13 #include <linux/dm-kcopyd.h>
  14 #include <linux/list.h>
  15 #include <linux/rculist.h>
  16 #include <linux/init.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/rbtree.h>
  20
  21 #define DM_MSG_PREFIX   "thin"
  22
  23 /*
  24  * Tunable constants
  25  */
  26 #define ENDIO_HOOK_POOL_SIZE 1024
  27 #define MAPPING_POOL_SIZE 1024
  28 #define PRISON_CELLS 1024
  29 #define COMMIT_PERIOD HZ
  30 #define NO_SPACE_TIMEOUT_SECS 60
  31
  32 static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
  33
  34 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
  35                 "A percentage of time allocated for copy on write");
  36
  37 /*
  38  * The block size of the device holding pool data must be
  39  * between 64KB and 1GB.
  40  */
  41 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
  42 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
  43
  44 /*
  45  * Device id is restricted to 24 bits.
  46  */
  47 #define MAX_DEV_ID ((1 << 24) - 1)
  48
  49 /*
  50  * How do we handle breaking sharing of data blocks?
  51  * =================================================
  52  *
  53  * We use a standard copy-on-write btree to store the mappings for the
  54  * devices (note I'm talking about copy-on-write of the metadata here, not
  55  * the data).  When you take an internal snapshot you clone the root node
  56  * of the origin btree.  After this there is no concept of an origin or a
  57  * snapshot.  They are just two device trees that happen to point to the
  58  * same data blocks.
  59  *
  60  * When we get a write in we decide if it's to a shared data block using
  61  * some timestamp magic.  If it is, we have to break sharing.
  62  *
  63  * Let's say we write to a shared block in what was the origin.  The
  64  * steps are:
  65  *
  66  * i) plug io further to this physical block. (see bio_prison code).
  67  *
  68  * ii) quiesce any read io to that shared data block.  Obviously
  69  * including all devices that share this block.  (see dm_deferred_set code)
  70  *
  71  * iii) copy the data block to a newly allocate block.  This step can be
  72  * missed out if the io covers the block. (schedule_copy).
  73  *
  74  * iv) insert the new mapping into the origin's btree
  75  * (process_prepared_mapping).  This act of inserting breaks some
  76  * sharing of btree nodes between the two devices.  Breaking sharing only
  77  * effects the btree of that specific device.  Btrees for the other
  78  * devices that share the block never change.  The btree for the origin
  79  * device as it was after the last commit is untouched, ie. we're using
  80  * persistent data structures in the functional programming sense.
  81  *
  82  * v) unplug io to this physical block, including the io that triggered
  83  * the breaking of sharing.
  84  *
  85  * Steps (ii) and (iii) occur in parallel.
  86  *
  87  * The metadata _doesn't_ need to be committed before the io continues.  We
  88  * get away with this because the io is always written to a _new_ block.
  89  * If there's a crash, then:
  90  *
  91  * - The origin mapping will point to the old origin block (the shared
  92  * one).  This will contain the data as it was before the io that triggered
  93  * the breaking of sharing came in.
  94  *
  95  * - The snap mapping still points to the old block.  As it would after
  96  * the commit.
  97  *
  98  * The downside of this scheme is the timestamp magic isn't perfect, and
  99  * will continue to think that data block in the snapshot device is shared
 100  * even after the write to the origin has broken sharing.  I suspect data
 101  * blocks will typically be shared by many different devices, so we're
 102  * breaking sharing n + 1 times, rather than n, where n is the number of
 103  * devices that reference this data block.  At the moment I think the
 104  * benefits far, far outweigh the disadvantages.
 105  */
 106
 107 /*----------------------------------------------------------------*/
 108
 109 /*
 110  * Key building.
 111  */
 112 static void build_data_key(struct dm_thin_device *td,
 113                            dm_block_t b, struct dm_cell_key *key)
 114 {
 115         key->virtual = 0;
 116         key->dev = dm_thin_dev_id(td);
 117         key->block = b;
 118 }
 119
 120 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 121                               struct dm_cell_key *key)
 122 {
 123         key->virtual = 1;
 124         key->dev = dm_thin_dev_id(td);
 125         key->block = b;
 126 }
 127
 128 /*----------------------------------------------------------------*/
 129
 130 /*
 131  * A pool device ties together a metadata device and a data device.  It
 132  * also provides the interface for creating and destroying internal
 133  * devices.
 134  */
 135 struct dm_thin_new_mapping;
 136
 137 /*
 138  * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
 139  */
 140 enum pool_mode {
 141         PM_WRITE,               /* metadata may be changed */
 142         PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
 143         PM_READ_ONLY,           /* metadata may not be changed */
 144         PM_FAIL,                /* all I/O fails */
 145 };
 146
 147 struct pool_features {
 148         enum pool_mode mode;
 149
 150         bool zero_new_blocks:1;
 151         bool discard_enabled:1;
 152         bool discard_passdown:1;
 153         bool error_if_no_space:1;
 154 };
 155
 156 struct thin_c;
 157 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
 158 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
 159
 160 struct pool {
 161         struct list_head list;
 162         struct dm_target *ti;   /* Only set if a pool target is bound */
 163
 164         struct mapped_device *pool_md;
 165         struct block_device *md_dev;
 166         struct dm_pool_metadata *pmd;
 167
 168         dm_block_t low_water_blocks;
 169         uint32_t sectors_per_block;
 170         int sectors_per_block_shift;
 171
 172         struct pool_features pf;
 173         bool low_water_triggered:1;     /* A dm event has been sent */
 174
 175         struct dm_bio_prison *prison;
 176         struct dm_kcopyd_client *copier;
 177
 178         struct workqueue_struct *wq;
 179         struct work_struct worker;
 180         struct delayed_work waker;
 181         struct delayed_work no_space_timeout;
 182
 183         unsigned long last_commit_jiffies;
 184         unsigned ref_count;
 185
 186         spinlock_t lock;
 187         struct bio_list deferred_flush_bios;
 188         struct list_head prepared_mappings;
 189         struct list_head prepared_discards;
 190         struct list_head active_thins;
 191
 192         struct dm_deferred_set *shared_read_ds;
 193         struct dm_deferred_set *all_io_ds;
 194
 195         struct dm_thin_new_mapping *next_mapping;
 196         mempool_t *mapping_pool;
 197
 198         process_bio_fn process_bio;
 199         process_bio_fn process_discard;
 200
 201         process_mapping_fn process_prepared_mapping;
 202         process_mapping_fn process_prepared_discard;
 203 };
 204
 205 static enum pool_mode get_pool_mode(struct pool *pool);
 206 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 207
 208 /*
 209  * Target context for a pool.
 210  */
 211 struct pool_c {
 212         struct dm_target *ti;
 213         struct pool *pool;
 214         struct dm_dev *data_dev;
 215         struct dm_dev *metadata_dev;
 216         struct dm_target_callbacks callbacks;
 217
 218         dm_block_t low_water_blocks;
 219         struct pool_features requested_pf; /* Features requested during table load */
 220         struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
 221 };
 222
 223 /*
 224  * Target context for a thin.
 225  */
 226 struct thin_c {
 227         struct list_head list;
 228         struct dm_dev *pool_dev;
 229         struct dm_dev *origin_dev;
 230         dm_thin_id dev_id;
 231
 232         struct pool *pool;
 233         struct dm_thin_device *td;
 234         bool requeue_mode:1;
 235         spinlock_t lock;
 236         struct bio_list deferred_bio_list;
 237         struct bio_list retry_on_resume_list;
 238         struct rb_root sort_bio_list; /* sorted list of deferred bios */
 239
 240         /*
 241          * Ensures the thin is not destroyed until the worker has finished
 242          * iterating the active_thins list.
 243          */
 244         atomic_t refcount;
 245         struct completion can_destroy;
 246 };
 247
 248 /*----------------------------------------------------------------*/
 249
 250 /*
 251  * wake_worker() is used when new work is queued and when pool_resume is
 252  * ready to continue deferred IO processing.
 253  */
 254 static void wake_worker(struct pool *pool)
 255 {
 256         queue_work(pool->wq, &pool->worker);
 257 }
 258
 259 /*----------------------------------------------------------------*/
 260
 261 static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
 262                       struct dm_bio_prison_cell **cell_result)
 263 {
 264         int r;
 265         struct dm_bio_prison_cell *cell_prealloc;
 266
 267         /*
 268          * Allocate a cell from the prison's mempool.
 269          * This might block but it can't fail.
 270          */
 271         cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
 272
 273         r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
 274         if (r)
 275                 /*
 276                  * We reused an old cell; we can get rid of
 277                  * the new one.
 278                  */
 279                 dm_bio_prison_free_cell(pool->prison, cell_prealloc);
 280
 281         return r;
 282 }
 283
 284 static void cell_release(struct pool *pool,
 285                          struct dm_bio_prison_cell *cell,
 286                          struct bio_list *bios)
 287 {
 288         dm_cell_release(pool->prison, cell, bios);
 289         dm_bio_prison_free_cell(pool->prison, cell);
 290 }
 291
 292 static void cell_release_no_holder(struct pool *pool,
 293                                    struct dm_bio_prison_cell *cell,
 294                                    struct bio_list *bios)
 295 {
 296         dm_cell_release_no_holder(pool->prison, cell, bios);
 297         dm_bio_prison_free_cell(pool->prison, cell);
 298 }
 299
 300 static void cell_defer_no_holder_no_free(struct thin_c *tc,
 301                                          struct dm_bio_prison_cell *cell)
 302 {
 303         struct pool *pool = tc->pool;
 304         unsigned long flags;
 305
 306         spin_lock_irqsave(&tc->lock, flags);
 307         dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
 308         spin_unlock_irqrestore(&tc->lock, flags);
 309
 310         wake_worker(pool);
 311 }
 312
 313 static void cell_error(struct pool *pool,
 314                        struct dm_bio_prison_cell *cell)
 315 {
 316         dm_cell_error(pool->prison, cell);
 317         dm_bio_prison_free_cell(pool->prison, cell);
 318 }
 319
 320 /*----------------------------------------------------------------*/
 321
 322 /*
 323  * A global list of pools that uses a struct mapped_device as a key.
 324  */
 325 static struct dm_thin_pool_table {
 326         struct mutex mutex;
 327         struct list_head pools;
 328 } dm_thin_pool_table;
 329
 330 static void pool_table_init(void)
 331 {
 332         mutex_init(&dm_thin_pool_table.mutex);
 333         INIT_LIST_HEAD(&dm_thin_pool_table.pools);
 334 }
 335
 336 static void __pool_table_insert(struct pool *pool)
 337 {
 338         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 339         list_add(&pool->list, &dm_thin_pool_table.pools);
 340 }
 341
 342 static void __pool_table_remove(struct pool *pool)
 343 {
 344         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 345         list_del(&pool->list);
 346 }
 347
 348 static struct pool *__pool_table_lookup(struct mapped_device *md)
 349 {
 350         struct pool *pool = NULL, *tmp;
 351
 352         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 353
 354         list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 355                 if (tmp->pool_md == md) {
 356                         pool = tmp;
 357                         break;
 358                 }
 359         }
 360
 361         return pool;
 362 }
 363
 364 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
 365 {
 366         struct pool *pool = NULL, *tmp;
 367
 368         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 369
 370         list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 371                 if (tmp->md_dev == md_dev) {
 372                         pool = tmp;
 373                         break;
 374                 }
 375         }
 376
 377         return pool;
 378 }
 379
 380 /*----------------------------------------------------------------*/
 381
 382 struct dm_thin_endio_hook {
 383         struct thin_c *tc;
 384         struct dm_deferred_entry *shared_read_entry;
 385         struct dm_deferred_entry *all_io_entry;
 386         struct dm_thin_new_mapping *overwrite_mapping;
 387         struct rb_node rb_node;
 388 };
 389
 390 static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 391 {
 392         struct bio *bio;
 393         struct bio_list bios;
 394         unsigned long flags;
 395
 396         bio_list_init(&bios);
 397
 398         spin_lock_irqsave(&tc->lock, flags);
 399         bio_list_merge(&bios, master);
 400         bio_list_init(master);
 401         spin_unlock_irqrestore(&tc->lock, flags);
 402
 403         while ((bio = bio_list_pop(&bios)))
 404                 bio_endio(bio, DM_ENDIO_REQUEUE);
 405 }
 406
 407 static void requeue_io(struct thin_c *tc)
 408 {
 409         requeue_bio_list(tc, &tc->deferred_bio_list);
 410         requeue_bio_list(tc, &tc->retry_on_resume_list);
 411 }
 412
 413 static void error_thin_retry_list(struct thin_c *tc)
 414 {
 415         struct bio *bio;
 416         unsigned long flags;
 417         struct bio_list bios;
 418
 419         bio_list_init(&bios);
 420
 421         spin_lock_irqsave(&tc->lock, flags);
 422         bio_list_merge(&bios, &tc->retry_on_resume_list);
 423         bio_list_init(&tc->retry_on_resume_list);
 424         spin_unlock_irqrestore(&tc->lock, flags);
 425
 426         while ((bio = bio_list_pop(&bios)))
 427                 bio_io_error(bio);
 428 }
 429
 430 static void error_retry_list(struct pool *pool)
 431 {
 432         struct thin_c *tc;
 433
 434         rcu_read_lock();
 435         list_for_each_entry_rcu(tc, &pool->active_thins, list)
 436                 error_thin_retry_list(tc);
 437         rcu_read_unlock();
 438 }
 439
 440 /*
 441  * This section of code contains the logic for processing a thin device's IO.
 442  * Much of the code depends on pool object resources (lists, workqueues, etc)
 443  * but most is exclusively called from the thin target rather than the thin-pool
 444  * target.
 445  */
 446
 447 static bool block_size_is_power_of_two(struct pool *pool)
 448 {
 449         return pool->sectors_per_block_shift >= 0;
 450 }
 451
 452 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 453 {
 454         struct pool *pool = tc->pool;
 455         sector_t block_nr = bio->bi_iter.bi_sector;
 456
 457         if (block_size_is_power_of_two(pool))
 458                 block_nr >>= pool->sectors_per_block_shift;
 459         else
 460                 (void) sector_div(block_nr, pool->sectors_per_block);
 461
 462         return block_nr;
 463 }
 464
 465 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 466 {
 467         struct pool *pool = tc->pool;
 468         sector_t bi_sector = bio->bi_iter.bi_sector;
 469
 470         bio->bi_bdev = tc->pool_dev->bdev;
 471         if (block_size_is_power_of_two(pool))
 472                 bio->bi_iter.bi_sector =
 473                         (block << pool->sectors_per_block_shift) |
 474                         (bi_sector & (pool->sectors_per_block - 1));
 475         else
 476                 bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
 477                                  sector_div(bi_sector, pool->sectors_per_block);
 478 }
 479
 480 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 481 {
 482         bio->bi_bdev = tc->origin_dev->bdev;
 483 }
 484
 485 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 486 {
 487         return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
 488                 dm_thin_changed_this_transaction(tc->td);
 489 }
 490
 491 static void inc_all_io_entry(struct pool *pool, struct bio *bio)
 492 {
 493         struct dm_thin_endio_hook *h;
 494
 495         if (bio->bi_rw & REQ_DISCARD)
 496                 return;
 497
 498         h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 499         h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
 500 }
 501
 502 static void issue(struct thin_c *tc, struct bio *bio)
 503 {
 504         struct pool *pool = tc->pool;
 505         unsigned long flags;
 506
 507         if (!bio_triggers_commit(tc, bio)) {
 508                 generic_make_request(bio);
 509                 return;
 510         }
 511
 512         /*
 513          * Complete bio with an error if earlier I/O caused changes to
 514          * the metadata that can't be committed e.g, due to I/O errors
 515          * on the metadata device.
 516          */
 517         if (dm_thin_aborted_changes(tc->td)) {
 518                 bio_io_error(bio);
 519                 return;
 520         }
 521
 522         /*
 523          * Batch together any bios that trigger commits and then issue a
 524          * single commit for them in process_deferred_bios().
 525          */
 526         spin_lock_irqsave(&pool->lock, flags);
 527         bio_list_add(&pool->deferred_flush_bios, bio);
 528         spin_unlock_irqrestore(&pool->lock, flags);
 529 }
 530
 531 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
 532 {
 533         remap_to_origin(tc, bio);
 534         issue(tc, bio);
 535 }
 536
 537 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 538                             dm_block_t block)
 539 {
 540         remap(tc, bio, block);
 541         issue(tc, bio);
 542 }
 543
 544 /*----------------------------------------------------------------*/
 545
 546 /*
 547  * Bio endio functions.
 548  */
 549 struct dm_thin_new_mapping {
 550         struct list_head list;
 551
 552         bool quiesced:1;
 553         bool prepared:1;
 554         bool pass_discard:1;
 555         bool definitely_not_shared:1;
 556
 557         int err;
 558         struct thin_c *tc;
 559         dm_block_t virt_block;
 560         dm_block_t data_block;
 561         struct dm_bio_prison_cell *cell, *cell2;
 562
 563         /*
 564          * If the bio covers the whole area of a block then we can avoid
 565          * zeroing or copying.  Instead this bio is hooked.  The bio will
 566          * still be in the cell, so care has to be taken to avoid issuing
 567          * the bio twice.
 568          */
 569         struct bio *bio;
 570         bio_end_io_t *saved_bi_end_io;
 571 };
 572
 573 static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
 574 {
 575         struct pool *pool = m->tc->pool;
 576
 577         if (m->quiesced && m->prepared) {
 578                 list_add_tail(&m->list, &pool->prepared_mappings);
 579                 wake_worker(pool);
 580         }
 581 }
 582
 583 static void copy_complete(int read_err, unsigned long write_err, void *context)
 584 {
 585         unsigned long flags;
 586         struct dm_thin_new_mapping *m = context;
 587         struct pool *pool = m->tc->pool;
 588
 589         m->err = read_err || write_err ? -EIO : 0;
 590
 591         spin_lock_irqsave(&pool->lock, flags);
 592         m->prepared = true;
 593         __maybe_add_mapping(m);
 594         spin_unlock_irqrestore(&pool->lock, flags);
 595 }
 596
 597 static void overwrite_endio(struct bio *bio, int err)
 598 {
 599         unsigned long flags;
 600         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 601         struct dm_thin_new_mapping *m = h->overwrite_mapping;
 602         struct pool *pool = m->tc->pool;
 603
 604         m->err = err;
 605
 606         spin_lock_irqsave(&pool->lock, flags);
 607         m->prepared = true;
 608         __maybe_add_mapping(m);
 609         spin_unlock_irqrestore(&pool->lock, flags);
 610 }
 611
 612 /*----------------------------------------------------------------*/
 613
 614 /*
 615  * Workqueue.
 616  */
 617
 618 /*
 619  * Prepared mapping jobs.
 620  */
 621
 622 /*
 623  * This sends the bios in the cell back to the deferred_bios list.
 624  */
 625 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 626 {
 627         struct pool *pool = tc->pool;
 628         unsigned long flags;
 629
 630         spin_lock_irqsave(&tc->lock, flags);
 631         cell_release(pool, cell, &tc->deferred_bio_list);
 632         spin_unlock_irqrestore(&tc->lock, flags);
 633
 634         wake_worker(pool);
 635 }
 636
 637 /*
 638  * Same as cell_defer above, except it omits the original holder of the cell.
 639  */
 640 static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 641 {
 642         struct pool *pool = tc->pool;
 643         unsigned long flags;
 644
 645         spin_lock_irqsave(&tc->lock, flags);
 646         cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
 647         spin_unlock_irqrestore(&tc->lock, flags);
 648
 649         wake_worker(pool);
 650 }
 651
 652 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 653 {
 654         if (m->bio) {
 655                 m->bio->bi_end_io = m->saved_bi_end_io;
 656                 atomic_inc(&m->bio->bi_remaining);
 657         }
 658         cell_error(m->tc->pool, m->cell);
 659         list_del(&m->list);
 660         mempool_free(m, m->tc->pool->mapping_pool);
 661 }
 662
 663 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 664 {
 665         struct thin_c *tc = m->tc;
 666         struct pool *pool = tc->pool;
 667         struct bio *bio;
 668         int r;
 669
 670         bio = m->bio;
 671         if (bio) {
 672                 bio->bi_end_io = m->saved_bi_end_io;
 673                 atomic_inc(&bio->bi_remaining);
 674         }
 675
 676         if (m->err) {
 677                 cell_error(pool, m->cell);
 678                 goto out;
 679         }
 680
 681         /*
 682          * Commit the prepared block into the mapping btree.
 683          * Any I/O for this block arriving after this point will get
 684          * remapped to it directly.
 685          */
 686         r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 687         if (r) {
 688                 metadata_operation_failed(pool, "dm_thin_insert_block", r);
 689                 cell_error(pool, m->cell);
 690                 goto out;
 691         }
 692
 693         /*
 694          * Release any bios held while the block was being provisioned.
 695          * If we are processing a write bio that completely covers the block,
 696          * we already processed it so can ignore it now when processing
 697          * the bios in the cell.
 698          */
 699         if (bio) {
 700                 cell_defer_no_holder(tc, m->cell);
 701                 bio_endio(bio, 0);
 702         } else
 703                 cell_defer(tc, m->cell);
 704
 705 out:
 706         list_del(&m->list);
 707         mempool_free(m, pool->mapping_pool);
 708 }
 709
 710 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 711 {
 712         struct thin_c *tc = m->tc;
 713
 714         bio_io_error(m->bio);
 715         cell_defer_no_holder(tc, m->cell);
 716         cell_defer_no_holder(tc, m->cell2);
 717         mempool_free(m, tc->pool->mapping_pool);
 718 }
 719
 720 static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 721 {
 722         struct thin_c *tc = m->tc;
 723
 724         inc_all_io_entry(tc->pool, m->bio);
 725         cell_defer_no_holder(tc, m->cell);
 726         cell_defer_no_holder(tc, m->cell2);
 727
 728         if (m->pass_discard)
 729                 if (m->definitely_not_shared)
 730                         remap_and_issue(tc, m->bio, m->data_block);
 731                 else {
 732                         bool used = false;
 733                         if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
 734                                 bio_endio(m->bio, 0);
 735                         else
 736                                 remap_and_issue(tc, m->bio, m->data_block);
 737                 }
 738         else
 739                 bio_endio(m->bio, 0);
 740
 741         mempool_free(m, tc->pool->mapping_pool);
 742 }
 743
 744 static void process_prepared_discard(struct dm_thin_new_mapping *m)
 745 {
 746         int r;
 747         struct thin_c *tc = m->tc;
 748
 749         r = dm_thin_remove_block(tc->td, m->virt_block);
 750         if (r)
 751                 DMERR_LIMIT("dm_thin_remove_block() failed");
 752
 753         process_prepared_discard_passdown(m);
 754 }
 755
 756 static void process_prepared(struct pool *pool, struct list_head *head,
 757                              process_mapping_fn *fn)
 758 {
 759         unsigned long flags;
 760         struct list_head maps;
 761         struct dm_thin_new_mapping *m, *tmp;
 762
 763         INIT_LIST_HEAD(&maps);
 764         spin_lock_irqsave(&pool->lock, flags);
 765         list_splice_init(head, &maps);
 766         spin_unlock_irqrestore(&pool->lock, flags);
 767
 768         list_for_each_entry_safe(m, tmp, &maps, list)
 769                 (*fn)(m);
 770 }
 771
 772 /*
 773  * Deferred bio jobs.
 774  */
 775 static int io_overlaps_block(struct pool *pool, struct bio *bio)
 776 {
 777         return bio->bi_iter.bi_size ==
 778                 (pool->sectors_per_block << SECTOR_SHIFT);
 779 }
 780
 781 static int io_overwrites_block(struct pool *pool, struct bio *bio)
 782 {
 783         return (bio_data_dir(bio) == WRITE) &&
 784                 io_overlaps_block(pool, bio);
 785 }
 786
 787 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
 788                                bio_end_io_t *fn)
 789 {
 790         *save = bio->bi_end_io;
 791         bio->bi_end_io = fn;
 792 }
 793
 794 static int ensure_next_mapping(struct pool *pool)
 795 {
 796         if (pool->next_mapping)
 797                 return 0;
 798
 799         pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
 800
 801         return pool->next_mapping ? 0 : -ENOMEM;
 802 }
 803
 804 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
 805 {
 806         struct dm_thin_new_mapping *m = pool->next_mapping;
 807
 808         BUG_ON(!pool->next_mapping);
 809
 810         memset(m, 0, sizeof(struct dm_thin_new_mapping));
 811         INIT_LIST_HEAD(&m->list);
 812         m->bio = NULL;
 813
 814         pool->next_mapping = NULL;
 815
 816         return m;
 817 }
 818
 819 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 820                           struct dm_dev *origin, dm_block_t data_origin,
 821                           dm_block_t data_dest,
 822                           struct dm_bio_prison_cell *cell, struct bio *bio)
 823 {
 824         int r;
 825         struct pool *pool = tc->pool;
 826         struct dm_thin_new_mapping *m = get_next_mapping(pool);
 827
 828         m->tc = tc;
 829         m->virt_block = virt_block;
 830         m->data_block = data_dest;
 831         m->cell = cell;
 832
 833         if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
 834                 m->quiesced = true;
 835
 836         /*
 837          * IO to pool_dev remaps to the pool target's data_dev.
 838          *
 839          * If the whole block of data is being overwritten, we can issue the
 840          * bio immediately. Otherwise we use kcopyd to clone the data first.
 841          */
 842         if (io_overwrites_block(pool, bio)) {
 843                 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 844
 845                 h->overwrite_mapping = m;
 846                 m->bio = bio;
 847                 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 848                 inc_all_io_entry(pool, bio);
 849                 remap_and_issue(tc, bio, data_dest);
 850         } else {
 851                 struct dm_io_region from, to;
 852
 853                 from.bdev = origin->bdev;
 854                 from.sector = data_origin * pool->sectors_per_block;
 855                 from.count = pool->sectors_per_block;
 856
 857                 to.bdev = tc->pool_dev->bdev;
 858                 to.sector = data_dest * pool->sectors_per_block;
 859                 to.count = pool->sectors_per_block;
 860
 861                 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
 862                                    0, copy_complete, m);
 863                 if (r < 0) {
 864                         mempool_free(m, pool->mapping_pool);
 865                         DMERR_LIMIT("dm_kcopyd_copy() failed");
 866                         cell_error(pool, cell);
 867                 }
 868         }
 869 }
 870
 871 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
 872                                    dm_block_t data_origin, dm_block_t data_dest,
 873                                    struct dm_bio_prison_cell *cell, struct bio *bio)
 874 {
 875         schedule_copy(tc, virt_block, tc->pool_dev,
 876                       data_origin, data_dest, cell, bio);
 877 }
 878
 879 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
 880                                    dm_block_t data_dest,
 881                                    struct dm_bio_prison_cell *cell, struct bio *bio)
 882 {
 883         schedule_copy(tc, virt_block, tc->origin_dev,
 884                       virt_block, data_dest, cell, bio);
 885 }
 886
 887 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 888                           dm_block_t data_block, struct dm_bio_prison_cell *cell,
 889                           struct bio *bio)
 890 {
 891         struct pool *pool = tc->pool;
 892         struct dm_thin_new_mapping *m = get_next_mapping(pool);
 893
 894         m->quiesced = true;
 895         m->prepared = false;
 896         m->tc = tc;
 897         m->virt_block = virt_block;
 898         m->data_block = data_block;
 899         m->cell = cell;
 900
 901         /*
 902          * If the whole block of data is being overwritten or we are not
 903          * zeroing pre-existing data, we can issue the bio immediately.
 904          * Otherwise we use kcopyd to zero the data first.
 905          */
 906         if (!pool->pf.zero_new_blocks)
 907                 process_prepared_mapping(m);
 908
 909         else if (io_overwrites_block(pool, bio)) {
 910                 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 911
 912                 h->overwrite_mapping = m;
 913                 m->bio = bio;
 914                 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 915                 inc_all_io_entry(pool, bio);
 916                 remap_and_issue(tc, bio, data_block);
 917         } else {
 918                 int r;
 919                 struct dm_io_region to;
 920
 921                 to.bdev = tc->pool_dev->bdev;
 922                 to.sector = data_block * pool->sectors_per_block;
 923                 to.count = pool->sectors_per_block;
 924
 925                 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
 926                 if (r < 0) {
 927                         mempool_free(m, pool->mapping_pool);
 928                         DMERR_LIMIT("dm_kcopyd_zero() failed");
 929                         cell_error(pool, cell);
 930                 }
 931         }
 932 }
 933
 934 /*
 935  * A non-zero return indicates read_only or fail_io mode.
 936  * Many callers don't care about the return value.
 937  */
 938 static int commit(struct pool *pool)
 939 {
 940         int r;
 941
 942         if (get_pool_mode(pool) >= PM_READ_ONLY)
 943                 return -EINVAL;
 944
 945         r = dm_pool_commit_metadata(pool->pmd);
 946         if (r)
 947                 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
 948
 949         return r;
 950 }
 951
 952 static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
 953 {
 954         unsigned long flags;
 955
 956         if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
 957                 DMWARN("%s: reached low water mark for data device: sending event.",
 958                        dm_device_name(pool->pool_md));
 959                 spin_lock_irqsave(&pool->lock, flags);
 960                 pool->low_water_triggered = true;
 961                 spin_unlock_irqrestore(&pool->lock, flags);
 962                 dm_table_event(pool->ti->table);
 963         }
 964 }
 965
 966 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
 967
 968 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 969 {
 970         int r;
 971         dm_block_t free_blocks;
 972         struct pool *pool = tc->pool;
 973
 974         if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
 975                 return -EINVAL;
 976
 977         r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 978         if (r) {
 979                 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
 980                 return r;
 981         }
 982
 983         check_low_water_mark(pool, free_blocks);
 984
 985         if (!free_blocks) {
 986                 /*
 987                  * Try to commit to see if that will free up some
 988                  * more space.
 989                  */
 990                 r = commit(pool);
 991                 if (r)
 992                         return r;
 993
 994                 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 995                 if (r) {
 996                         metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
 997                         return r;
 998                 }
 999
1000                 if (!free_blocks) {
1001                         set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1002                         return -ENOSPC;
1003                 }
1004         }
1005
1006         r = dm_pool_alloc_data_block(pool->pmd, result);
1007         if (r) {
1008                 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
1009                 return r;
1010         }
1011
1012         return 0;
1013 }
1014
1015 /*
1016  * If we have run out of space, queue bios until the device is
1017  * resumed, presumably after having been reloaded with more space.
1018  */
1019 static void retry_on_resume(struct bio *bio)
1020 {
1021         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1022         struct thin_c *tc = h->tc;
1023         unsigned long flags;
1024
1025         spin_lock_irqsave(&tc->lock, flags);
1026         bio_list_add(&tc->retry_on_resume_list, bio);
1027         spin_unlock_irqrestore(&tc->lock, flags);
1028 }
1029
1030 static bool should_error_unserviceable_bio(struct pool *pool)
1031 {
1032         enum pool_mode m = get_pool_mode(pool);
1033
1034         switch (m) {
1035         case PM_WRITE:
1036                 /* Shouldn't get here */
1037                 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1038                 return true;
1039
1040         case PM_OUT_OF_DATA_SPACE:
1041                 return pool->pf.error_if_no_space;
1042
1043         case PM_READ_ONLY:
1044         case PM_FAIL:
1045                 return true;
1046         default:
1047                 /* Shouldn't get here */
1048                 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1049                 return true;
1050         }
1051 }
1052
1053 static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1054 {
1055         if (should_error_unserviceable_bio(pool))
1056                 bio_io_error(bio);
1057         else
1058                 retry_on_resume(bio);
1059 }
1060
1061 static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1062 {
1063         struct bio *bio;
1064         struct bio_list bios;
1065
1066         if (should_error_unserviceable_bio(pool)) {
1067                 cell_error(pool, cell);
1068                 return;
1069         }
1070
1071         bio_list_init(&bios);
1072         cell_release(pool, cell, &bios);
1073
1074         if (should_error_unserviceable_bio(pool))
1075                 while ((bio = bio_list_pop(&bios)))
1076                         bio_io_error(bio);
1077         else
1078                 while ((bio = bio_list_pop(&bios)))
1079                         retry_on_resume(bio);
1080 }
1081
1082 static void process_discard(struct thin_c *tc, struct bio *bio)
1083 {
1084         int r;
1085         unsigned long flags;
1086         struct pool *pool = tc->pool;
1087         struct dm_bio_prison_cell *cell, *cell2;
1088         struct dm_cell_key key, key2;
1089         dm_block_t block = get_bio_block(tc, bio);
1090         struct dm_thin_lookup_result lookup_result;
1091         struct dm_thin_new_mapping *m;
1092
1093         build_virtual_key(tc->td, block, &key);
1094         if (bio_detain(tc->pool, &key, bio, &cell))
1095                 return;
1096
1097         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1098         switch (r) {
1099         case 0:
1100                 /*
1101                  * Check nobody is fiddling with this pool block.  This can
1102                  * happen if someone's in the process of breaking sharing
1103                  * on this block.
1104                  */
1105                 build_data_key(tc->td, lookup_result.block, &key2);
1106                 if (bio_detain(tc->pool, &key2, bio, &cell2)) {
1107                         cell_defer_no_holder(tc, cell);
1108                         break;
1109                 }
1110
1111                 if (io_overlaps_block(pool, bio)) {
1112                         /*
1113                          * IO may still be going to the destination block.  We must
1114                          * quiesce before we can do the removal.
1115                          */
1116                         m = get_next_mapping(pool);
1117                         m->tc = tc;
1118                         m->pass_discard = pool->pf.discard_passdown;
1119                         m->definitely_not_shared = !lookup_result.shared;
1120                         m->virt_block = block;
1121                         m->data_block = lookup_result.block;
1122                         m->cell = cell;
1123                         m->cell2 = cell2;
1124                         m->bio = bio;
1125
1126                         if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
1127                                 spin_lock_irqsave(&pool->lock, flags);
1128                                 list_add_tail(&m->list, &pool->prepared_discards);
1129                                 spin_unlock_irqrestore(&pool->lock, flags);
1130                                 wake_worker(pool);
1131                         }
1132                 } else {
1133                         inc_all_io_entry(pool, bio);
1134                         cell_defer_no_holder(tc, cell);
1135                         cell_defer_no_holder(tc, cell2);
1136
1137                         /*
1138                          * The DM core makes sure that the discard doesn't span
1139                          * a block boundary.  So we submit the discard of a
1140                          * partial block appropriately.
1141                          */
1142                         if ((!lookup_result.shared) && pool->pf.discard_passdown)
1143                                 remap_and_issue(tc, bio, lookup_result.block);
1144                         else
1145                                 bio_endio(bio, 0);
1146                 }
1147                 break;
1148
1149         case -ENODATA:
1150                 /*
1151                  * It isn't provisioned, just forget it.
1152                  */
1153                 cell_defer_no_holder(tc, cell);
1154                 bio_endio(bio, 0);
1155                 break;
1156
1157         default:
1158                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1159                             __func__, r);
1160                 cell_defer_no_holder(tc, cell);
1161                 bio_io_error(bio);
1162                 break;
1163         }
1164 }
1165
1166 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1167                           struct dm_cell_key *key,
1168                           struct dm_thin_lookup_result *lookup_result,
1169                           struct dm_bio_prison_cell *cell)
1170 {
1171         int r;
1172         dm_block_t data_block;
1173         struct pool *pool = tc->pool;
1174
1175         r = alloc_data_block(tc, &data_block);
1176         switch (r) {
1177         case 0:
1178                 schedule_internal_copy(tc, block, lookup_result->block,
1179                                        data_block, cell, bio);
1180                 break;
1181
1182         case -ENOSPC:
1183                 retry_bios_on_resume(pool, cell);
1184                 break;
1185
1186         default:
1187                 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1188                             __func__, r);
1189                 cell_error(pool, cell);
1190                 break;
1191         }
1192 }
1193
1194 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1195                                dm_block_t block,
1196                                struct dm_thin_lookup_result *lookup_result)
1197 {
1198         struct dm_bio_prison_cell *cell;
1199         struct pool *pool = tc->pool;
1200         struct dm_cell_key key;
1201
1202         /*
1203          * If cell is already occupied, then sharing is already in the process
1204          * of being broken so we have nothing further to do here.
1205          */
1206         build_data_key(tc->td, lookup_result->block, &key);
1207         if (bio_detain(pool, &key, bio, &cell))
1208                 return;
1209
1210         if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
1211                 break_sharing(tc, bio, block, &key, lookup_result, cell);
1212         else {
1213                 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1214
1215                 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1216                 inc_all_io_entry(pool, bio);
1217                 cell_defer_no_holder(tc, cell);
1218
1219                 remap_and_issue(tc, bio, lookup_result->block);
1220         }
1221 }
1222
1223 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1224                             struct dm_bio_prison_cell *cell)
1225 {
1226         int r;
1227         dm_block_t data_block;
1228         struct pool *pool = tc->pool;
1229
1230         /*
1231          * Remap empty bios (flushes) immediately, without provisioning.
1232          */
1233         if (!bio->bi_iter.bi_size) {
1234                 inc_all_io_entry(pool, bio);
1235                 cell_defer_no_holder(tc, cell);
1236
1237                 remap_and_issue(tc, bio, 0);
1238                 return;
1239         }
1240
1241         /*
1242          * Fill read bios with zeroes and complete them immediately.
1243          */
1244         if (bio_data_dir(bio) == READ) {
1245                 zero_fill_bio(bio);
1246                 cell_defer_no_holder(tc, cell);
1247                 bio_endio(bio, 0);
1248                 return;
1249         }
1250
1251         r = alloc_data_block(tc, &data_block);
1252         switch (r) {
1253         case 0:
1254                 if (tc->origin_dev)
1255                         schedule_external_copy(tc, block, data_block, cell, bio);
1256                 else
1257                         schedule_zero(tc, block, data_block, cell, bio);
1258                 break;
1259
1260         case -ENOSPC:
1261                 retry_bios_on_resume(pool, cell);
1262                 break;
1263
1264         default:
1265                 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1266                             __func__, r);
1267                 cell_error(pool, cell);
1268                 break;
1269         }
1270 }
1271
1272 static void process_bio(struct thin_c *tc, struct bio *bio)
1273 {
1274         int r;
1275         struct pool *pool = tc->pool;
1276         dm_block_t block = get_bio_block(tc, bio);
1277         struct dm_bio_prison_cell *cell;
1278         struct dm_cell_key key;
1279         struct dm_thin_lookup_result lookup_result;
1280
1281         /*
1282          * If cell is already occupied, then the block is already
1283          * being provisioned so we have nothing further to do here.
1284          */
1285         build_virtual_key(tc->td, block, &key);
1286         if (bio_detain(pool, &key, bio, &cell))
1287                 return;
1288
1289         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1290         switch (r) {
1291         case 0:
1292                 if (lookup_result.shared) {
1293                         process_shared_bio(tc, bio, block, &lookup_result);
1294                         cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
1295                 } else {
1296                         inc_all_io_entry(pool, bio);
1297                         cell_defer_no_holder(tc, cell);
1298
1299                         remap_and_issue(tc, bio, lookup_result.block);
1300                 }
1301                 break;
1302
1303         case -ENODATA:
1304                 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1305                         inc_all_io_entry(pool, bio);
1306                         cell_defer_no_holder(tc, cell);
1307
1308                         remap_to_origin_and_issue(tc, bio);
1309                 } else
1310                         provision_block(tc, bio, block, cell);
1311                 break;
1312
1313         default:
1314                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1315                             __func__, r);
1316                 cell_defer_no_holder(tc, cell);
1317                 bio_io_error(bio);
1318                 break;
1319         }
1320 }
1321
1322 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1323 {
1324         int r;
1325         int rw = bio_data_dir(bio);
1326         dm_block_t block = get_bio_block(tc, bio);
1327         struct dm_thin_lookup_result lookup_result;
1328
1329         r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1330         switch (r) {
1331         case 0:
1332                 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
1333                         handle_unserviceable_bio(tc->pool, bio);
1334                 else {
1335                         inc_all_io_entry(tc->pool, bio);
1336                         remap_and_issue(tc, bio, lookup_result.block);
1337                 }
1338                 break;
1339
1340         case -ENODATA:
1341                 if (rw != READ) {
1342                         handle_unserviceable_bio(tc->pool, bio);
1343                         break;
1344                 }
1345
1346                 if (tc->origin_dev) {
1347                         inc_all_io_entry(tc->pool, bio);
1348                         remap_to_origin_and_issue(tc, bio);
1349                         break;
1350                 }
1351
1352                 zero_fill_bio(bio);
1353                 bio_endio(bio, 0);
1354                 break;
1355
1356         default:
1357                 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1358                             __func__, r);
1359                 bio_io_error(bio);
1360                 break;
1361         }
1362 }
1363
1364 static void process_bio_success(struct thin_c *tc, struct bio *bio)
1365 {
1366         bio_endio(bio, 0);
1367 }
1368
1369 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1370 {
1371         bio_io_error(bio);
1372 }
1373
1374 /*
1375  * FIXME: should we also commit due to size of transaction, measured in
1376  * metadata blocks?
1377  */
1378 static int need_commit_due_to_time(struct pool *pool)
1379 {
1380         return jiffies < pool->last_commit_jiffies ||
1381                jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1382 }
1383
1384 #define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
1385 #define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
1386
1387 static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
1388 {
1389         struct rb_node **rbp, *parent;
1390         struct dm_thin_endio_hook *pbd;
1391         sector_t bi_sector = bio->bi_iter.bi_sector;
1392
1393         rbp = &tc->sort_bio_list.rb_node;
1394         parent = NULL;
1395         while (*rbp) {
1396                 parent = *rbp;
1397                 pbd = thin_pbd(parent);
1398
1399                 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
1400                         rbp = &(*rbp)->rb_left;
1401                 else
1402                         rbp = &(*rbp)->rb_right;
1403         }
1404
1405         pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1406         rb_link_node(&pbd->rb_node, parent, rbp);
1407         rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
1408 }
1409
1410 static void __extract_sorted_bios(struct thin_c *tc)
1411 {
1412         struct rb_node *node;
1413         struct dm_thin_endio_hook *pbd;
1414         struct bio *bio;
1415
1416         for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
1417                 pbd = thin_pbd(node);
1418                 bio = thin_bio(pbd);
1419
1420                 bio_list_add(&tc->deferred_bio_list, bio);
1421                 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
1422         }
1423
1424         WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
1425 }
1426
1427 static void __sort_thin_deferred_bios(struct thin_c *tc)
1428 {
1429         struct bio *bio;
1430         struct bio_list bios;
1431
1432         bio_list_init(&bios);
1433         bio_list_merge(&bios, &tc->deferred_bio_list);
1434         bio_list_init(&tc->deferred_bio_list);
1435
1436         /* Sort deferred_bio_list using rb-tree */
1437         while ((bio = bio_list_pop(&bios)))
1438                 __thin_bio_rb_add(tc, bio);
1439
1440         /*
1441          * Transfer the sorted bios in sort_bio_list back to
1442          * deferred_bio_list to allow lockless submission of
1443          * all bios.
1444          */
1445         __extract_sorted_bios(tc);
1446 }
1447
1448 static void process_thin_deferred_bios(struct thin_c *tc)
1449 {
1450         struct pool *pool = tc->pool;
1451         unsigned long flags;
1452         struct bio *bio;
1453         struct bio_list bios;
1454         struct blk_plug plug;
1455
1456         if (tc->requeue_mode) {
1457                 requeue_bio_list(tc, &tc->deferred_bio_list);
1458                 return;
1459         }
1460
1461         bio_list_init(&bios);
1462
1463         spin_lock_irqsave(&tc->lock, flags);
1464
1465         if (bio_list_empty(&tc->deferred_bio_list)) {
1466                 spin_unlock_irqrestore(&tc->lock, flags);
1467                 return;
1468         }
1469
1470         __sort_thin_deferred_bios(tc);
1471
1472         bio_list_merge(&bios, &tc->deferred_bio_list);
1473         bio_list_init(&tc->deferred_bio_list);
1474
1475         spin_unlock_irqrestore(&tc->lock, flags);
1476
1477         blk_start_plug(&plug);
1478         while ((bio = bio_list_pop(&bios))) {
1479                 /*
1480                  * If we've got no free new_mapping structs, and processing
1481                  * this bio might require one, we pause until there are some
1482                  * prepared mappings to process.
1483                  */
1484                 if (ensure_next_mapping(pool)) {
1485                         spin_lock_irqsave(&tc->lock, flags);
1486                         bio_list_add(&tc->deferred_bio_list, bio);
1487                         bio_list_merge(&tc->deferred_bio_list, &bios);
1488                         spin_unlock_irqrestore(&tc->lock, flags);
1489                         break;
1490                 }
1491
1492                 if (bio->bi_rw & REQ_DISCARD)
1493                         pool->process_discard(tc, bio);
1494                 else
1495                         pool->process_bio(tc, bio);
1496         }
1497         blk_finish_plug(&plug);
1498 }
1499
1500 static void thin_get(struct thin_c *tc);
1501 static void thin_put(struct thin_c *tc);
1502
1503 /*
1504  * We can't hold rcu_read_lock() around code that can block.  So we
1505  * find a thin with the rcu lock held; bump a refcount; then drop
1506  * the lock.
1507  */
1508 static struct thin_c *get_first_thin(struct pool *pool)
1509 {
1510         struct thin_c *tc = NULL;
1511
1512         rcu_read_lock();
1513         if (!list_empty(&pool->active_thins)) {
1514                 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
1515                 thin_get(tc);
1516         }
1517         rcu_read_unlock();
1518
1519         return tc;
1520 }
1521
1522 static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
1523 {
1524         struct thin_c *old_tc = tc;
1525
1526         rcu_read_lock();
1527         list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
1528                 thin_get(tc);
1529                 thin_put(old_tc);
1530                 rcu_read_unlock();
1531                 return tc;
1532         }
1533         thin_put(old_tc);
1534         rcu_read_unlock();
1535
1536         return NULL;
1537 }
1538
1539 static void process_deferred_bios(struct pool *pool)
1540 {
1541         unsigned long flags;
1542         struct bio *bio;
1543         struct bio_list bios;
1544         struct thin_c *tc;
1545
1546         tc = get_first_thin(pool);
1547         while (tc) {
1548                 process_thin_deferred_bios(tc);
1549                 tc = get_next_thin(pool, tc);
1550         }
1551
1552         /*
1553          * If there are any deferred flush bios, we must commit
1554          * the metadata before issuing them.
1555          */
1556         bio_list_init(&bios);
1557         spin_lock_irqsave(&pool->lock, flags);
1558         bio_list_merge(&bios, &pool->deferred_flush_bios);
1559         bio_list_init(&pool->deferred_flush_bios);
1560         spin_unlock_irqrestore(&pool->lock, flags);
1561
1562         if (bio_list_empty(&bios) &&
1563             !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
1564                 return;
1565
1566         if (commit(pool)) {
1567                 while ((bio = bio_list_pop(&bios)))
1568                         bio_io_error(bio);
1569                 return;
1570         }
1571         pool->last_commit_jiffies = jiffies;
1572
1573         while ((bio = bio_list_pop(&bios)))
1574                 generic_make_request(bio);
1575 }
1576
1577 static void do_worker(struct work_struct *ws)
1578 {
1579         struct pool *pool = container_of(ws, struct pool, worker);
1580
1581         process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1582         process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1583         process_deferred_bios(pool);
1584 }
1585
1586 /*
1587  * We want to commit periodically so that not too much
1588  * unwritten data builds up.
1589  */
1590 static void do_waker(struct work_struct *ws)
1591 {
1592         struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1593         wake_worker(pool);
1594         queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1595 }
1596
1597 /*
1598  * We're holding onto IO to allow userland time to react.  After the
1599  * timeout either the pool will have been resized (and thus back in
1600  * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
1601  */
1602 static void do_no_space_timeout(struct work_struct *ws)
1603 {
1604         struct pool *pool = container_of(to_delayed_work(ws), struct pool,
1605                                          no_space_timeout);
1606
1607         if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
1608                 set_pool_mode(pool, PM_READ_ONLY);
1609 }
1610
1611 /*----------------------------------------------------------------*/
1612
1613 struct noflush_work {
1614         struct work_struct worker;
1615         struct thin_c *tc;
1616
1617         atomic_t complete;
1618         wait_queue_head_t wait;
1619 };
1620
1621 static void complete_noflush_work(struct noflush_work *w)
1622 {
1623         atomic_set(&w->complete, 1);
1624         wake_up(&w->wait);
1625 }
1626
1627 static void do_noflush_start(struct work_struct *ws)
1628 {
1629         struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1630         w->tc->requeue_mode = true;
1631         requeue_io(w->tc);
1632         complete_noflush_work(w);
1633 }
1634
1635 static void do_noflush_stop(struct work_struct *ws)
1636 {
1637         struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1638         w->tc->requeue_mode = false;
1639         complete_noflush_work(w);
1640 }
1641
1642 static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1643 {
1644         struct noflush_work w;
1645
1646         INIT_WORK_ONSTACK(&w.worker, fn);
1647         w.tc = tc;
1648         atomic_set(&w.complete, 0);
1649         init_waitqueue_head(&w.wait);
1650
1651         queue_work(tc->pool->wq, &w.worker);
1652
1653         wait_event(w.wait, atomic_read(&w.complete));
1654 }
1655
1656 /*----------------------------------------------------------------*/
1657
1658 static enum pool_mode get_pool_mode(struct pool *pool)
1659 {
1660         return pool->pf.mode;
1661 }
1662
1663 static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1664 {
1665         dm_table_event(pool->ti->table);
1666         DMINFO("%s: switching pool to %s mode",
1667                dm_device_name(pool->pool_md), new_mode);
1668 }
1669
1670 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1671 {
1672         struct pool_c *pt = pool->ti->private;
1673         bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1674         enum pool_mode old_mode = get_pool_mode(pool);
1675         unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
1676
1677         /*
1678          * Never allow the pool to transition to PM_WRITE mode if user
1679          * intervention is required to verify metadata and data consistency.
1680          */
1681         if (new_mode == PM_WRITE && needs_check) {
1682                 DMERR("%s: unable to switch pool to write mode until repaired.",
1683                       dm_device_name(pool->pool_md));
1684                 if (old_mode != new_mode)
1685                         new_mode = old_mode;
1686                 else
1687                         new_mode = PM_READ_ONLY;
1688         }
1689         /*
1690          * If we were in PM_FAIL mode, rollback of metadata failed.  We're
1691          * not going to recover without a thin_repair.  So we never let the
1692          * pool move out of the old mode.
1693          */
1694         if (old_mode == PM_FAIL)
1695                 new_mode = old_mode;
1696
1697         switch (new_mode) {
1698         case PM_FAIL:
1699                 if (old_mode != new_mode)
1700                         notify_of_pool_mode_change(pool, "failure");
1701                 dm_pool_metadata_read_only(pool->pmd);
1702                 pool->process_bio = process_bio_fail;
1703                 pool->process_discard = process_bio_fail;
1704                 pool->process_prepared_mapping = process_prepared_mapping_fail;
1705                 pool->process_prepared_discard = process_prepared_discard_fail;
1706
1707                 error_retry_list(pool);
1708                 break;
1709
1710         case PM_READ_ONLY:
1711                 if (old_mode != new_mode)
1712                         notify_of_pool_mode_change(pool, "read-only");
1713                 dm_pool_metadata_read_only(pool->pmd);
1714                 pool->process_bio = process_bio_read_only;
1715                 pool->process_discard = process_bio_success;
1716                 pool->process_prepared_mapping = process_prepared_mapping_fail;
1717                 pool->process_prepared_discard = process_prepared_discard_passdown;
1718
1719                 error_retry_list(pool);
1720                 break;
1721
1722         case PM_OUT_OF_DATA_SPACE:
1723                 /*
1724                  * Ideally we'd never hit this state; the low water mark
1725                  * would trigger userland to extend the pool before we
1726                  * completely run out of data space.  However, many small
1727                  * IOs to unprovisioned space can consume data space at an
1728                  * alarming rate.  Adjust your low water mark if you're
1729                  * frequently seeing this mode.
1730                  */
1731                 if (old_mode != new_mode)
1732                         notify_of_pool_mode_change(pool, "out-of-data-space");
1733                 pool->process_bio = process_bio_read_only;
1734                 pool->process_discard = process_discard;
1735                 pool->process_prepared_mapping = process_prepared_mapping;
1736                 pool->process_prepared_discard = process_prepared_discard_passdown;
1737
1738                 if (!pool->pf.error_if_no_space && no_space_timeout)
1739                         queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
1740                 break;
1741
1742         case PM_WRITE:
1743                 if (old_mode != new_mode)
1744                         notify_of_pool_mode_change(pool, "write");
1745                 dm_pool_metadata_read_write(pool->pmd);
1746                 pool->process_bio = process_bio;
1747                 pool->process_discard = process_discard;
1748                 pool->process_prepared_mapping = process_prepared_mapping;
1749                 pool->process_prepared_discard = process_prepared_discard;
1750                 break;
1751         }
1752
1753         pool->pf.mode = new_mode;
1754         /*
1755          * The pool mode may have changed, sync it so bind_control_target()
1756          * doesn't cause an unexpected mode transition on resume.
1757          */
1758         pt->adjusted_pf.mode = new_mode;
1759 }
1760
1761 static void abort_transaction(struct pool *pool)
1762 {
1763         const char *dev_name = dm_device_name(pool->pool_md);
1764
1765         DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1766         if (dm_pool_abort_metadata(pool->pmd)) {
1767                 DMERR("%s: failed to abort metadata transaction", dev_name);
1768                 set_pool_mode(pool, PM_FAIL);
1769         }
1770
1771         if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1772                 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1773                 set_pool_mode(pool, PM_FAIL);
1774         }
1775 }
1776
1777 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1778 {
1779         DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1780                     dm_device_name(pool->pool_md), op, r);
1781
1782         abort_transaction(pool);
1783         set_pool_mode(pool, PM_READ_ONLY);
1784 }
1785
1786 /*----------------------------------------------------------------*/
1787
1788 /*
1789  * Mapping functions.
1790  */
1791
1792 /*
1793  * Called only while mapping a thin bio to hand it over to the workqueue.
1794  */
1795 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1796 {
1797         unsigned long flags;
1798         struct pool *pool = tc->pool;
1799
1800         spin_lock_irqsave(&tc->lock, flags);
1801         bio_list_add(&tc->deferred_bio_list, bio);
1802         spin_unlock_irqrestore(&tc->lock, flags);
1803
1804         wake_worker(pool);
1805 }
1806
1807 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1808 {
1809         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1810
1811         h->tc = tc;
1812         h->shared_read_entry = NULL;
1813         h->all_io_entry = NULL;
1814         h->overwrite_mapping = NULL;
1815 }
1816
1817 /*
1818  * Non-blocking function called from the thin target's map function.
1819  */
1820 static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1821 {
1822         int r;
1823         struct thin_c *tc = ti->private;
1824         dm_block_t block = get_bio_block(tc, bio);
1825         struct dm_thin_device *td = tc->td;
1826         struct dm_thin_lookup_result result;
1827         struct dm_bio_prison_cell cell1, cell2;
1828         struct dm_bio_prison_cell *cell_result;
1829         struct dm_cell_key key;
1830
1831         thin_hook_bio(tc, bio);
1832
1833         if (tc->requeue_mode) {
1834                 bio_endio(bio, DM_ENDIO_REQUEUE);
1835                 return DM_MAPIO_SUBMITTED;
1836         }
1837
1838         if (get_pool_mode(tc->pool) == PM_FAIL) {
1839                 bio_io_error(bio);
1840                 return DM_MAPIO_SUBMITTED;
1841         }
1842
1843         if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1844                 thin_defer_bio(tc, bio);
1845                 return DM_MAPIO_SUBMITTED;
1846         }
1847
1848         r = dm_thin_find_block(td, block, 0, &result);
1849
1850         /*
1851          * Note that we defer readahead too.
1852          */
1853         switch (r) {
1854         case 0:
1855                 if (unlikely(result.shared)) {
1856                         /*
1857                          * We have a race condition here between the
1858                          * result.shared value returned by the lookup and
1859                          * snapshot creation, which may cause new
1860                          * sharing.
1861                          *
1862                          * To avoid this always quiesce the origin before
1863                          * taking the snap.  You want to do this anyway to
1864                          * ensure a consistent application view
1865                          * (i.e. lockfs).
1866                          *
1867                          * More distant ancestors are irrelevant. The
1868                          * shared flag will be set in their case.
1869                          */
1870                         thin_defer_bio(tc, bio);
1871                         return DM_MAPIO_SUBMITTED;
1872                 }
1873
1874                 build_virtual_key(tc->td, block, &key);
1875                 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
1876                         return DM_MAPIO_SUBMITTED;
1877
1878                 build_data_key(tc->td, result.block, &key);
1879                 if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
1880                         cell_defer_no_holder_no_free(tc, &cell1);
1881                         return DM_MAPIO_SUBMITTED;
1882                 }
1883
1884                 inc_all_io_entry(tc->pool, bio);
1885                 cell_defer_no_holder_no_free(tc, &cell2);
1886                 cell_defer_no_holder_no_free(tc, &cell1);
1887
1888                 remap(tc, bio, result.block);
1889                 return DM_MAPIO_REMAPPED;
1890
1891         case -ENODATA:
1892                 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1893                         /*
1894                          * This block isn't provisioned, and we have no way
1895                          * of doing so.
1896                          */
1897                         handle_unserviceable_bio(tc->pool, bio);
1898                         return DM_MAPIO_SUBMITTED;
1899                 }
1900                 /* fall through */
1901
1902         case -EWOULDBLOCK:
1903                 /*
1904                  * In future, the failed dm_thin_find_block above could
1905                  * provide the hint to load the metadata into cache.
1906                  */
1907                 thin_defer_bio(tc, bio);
1908                 return DM_MAPIO_SUBMITTED;
1909
1910         default:
1911                 /*
1912                  * Must always call bio_io_error on failure.
1913                  * dm_thin_find_block can fail with -EINVAL if the
1914                  * pool is switched to fail-io mode.
1915                  */
1916                 bio_io_error(bio);
1917                 return DM_MAPIO_SUBMITTED;
1918         }
1919 }
1920
1921 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1922 {
1923         struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1924         struct request_queue *q;
1925
1926         if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
1927                 return 1;
1928
1929         q = bdev_get_queue(pt->data_dev->bdev);
1930         return bdi_congested(&q->backing_dev_info, bdi_bits);
1931 }
1932
1933 static void requeue_bios(struct pool *pool)
1934 {
1935         unsigned long flags;
1936         struct thin_c *tc;
1937
1938         rcu_read_lock();
1939         list_for_each_entry_rcu(tc, &pool->active_thins, list) {
1940                 spin_lock_irqsave(&tc->lock, flags);
1941                 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
1942                 bio_list_init(&tc->retry_on_resume_list);
1943                 spin_unlock_irqrestore(&tc->lock, flags);
1944         }
1945         rcu_read_unlock();
1946 }
1947
1948 /*----------------------------------------------------------------
1949  * Binding of control targets to a pool object
1950  *--------------------------------------------------------------*/
1951 static bool data_dev_supports_discard(struct pool_c *pt)
1952 {
1953         struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1954
1955         return q && blk_queue_discard(q);
1956 }
1957
1958 static bool is_factor(sector_t block_size, uint32_t n)
1959 {
1960         return !sector_div(block_size, n);
1961 }
1962
1963 /*
1964  * If discard_passdown was enabled verify that the data device
1965  * supports discards.  Disable discard_passdown if not.
1966  */
1967 static void disable_passdown_if_not_supported(struct pool_c *pt)
1968 {
1969         struct pool *pool = pt->pool;
1970         struct block_device *data_bdev = pt->data_dev->bdev;
1971         struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1972         sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1973         const char *reason = NULL;
1974         char buf[BDEVNAME_SIZE];
1975
1976         if (!pt->adjusted_pf.discard_passdown)
1977                 return;
1978
1979         if (!data_dev_supports_discard(pt))
1980                 reason = "discard unsupported";
1981
1982         else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1983                 reason = "max discard sectors smaller than a block";
1984
1985         else if (data_limits->discard_granularity > block_size)
1986                 reason = "discard granularity larger than a block";
1987
1988         else if (!is_factor(block_size, data_limits->discard_granularity))
1989                 reason = "discard granularity not a factor of block size";
1990
1991         if (reason) {
1992                 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1993                 pt->adjusted_pf.discard_passdown = false;
1994         }
1995 }
1996
1997 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1998 {
1999         struct pool_c *pt = ti->private;
2000
2001         /*
2002          * We want to make sure that a pool in PM_FAIL mode is never upgraded.
2003          */
2004         enum pool_mode old_mode = get_pool_mode(pool);
2005         enum pool_mode new_mode = pt->adjusted_pf.mode;
2006
2007         /*
2008          * Don't change the pool's mode until set_pool_mode() below.
2009          * Otherwise the pool's process_* function pointers may
2010          * not match the desired pool mode.
2011          */
2012         pt->adjusted_pf.mode = old_mode;
2013
2014         pool->ti = ti;
2015         pool->pf = pt->adjusted_pf;
2016         pool->low_water_blocks = pt->low_water_blocks;
2017
2018         set_pool_mode(pool, new_mode);
2019
2020         return 0;
2021 }
2022
2023 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2024 {
2025         if (pool->ti == ti)
2026                 pool->ti = NULL;
2027 }
2028
2029 /*----------------------------------------------------------------
2030  * Pool creation
2031  *--------------------------------------------------------------*/
2032 /* Initialize pool features. */
2033 static void pool_features_init(struct pool_features *pf)
2034 {
2035         pf->mode = PM_WRITE;
2036         pf->zero_new_blocks = true;
2037         pf->discard_enabled = true;
2038         pf->discard_passdown = true;
2039         pf->error_if_no_space = false;
2040 }
2041
2042 static void __pool_destroy(struct pool *pool)
2043 {
2044         __pool_table_remove(pool);
2045
2046         if (dm_pool_metadata_close(pool->pmd) < 0)
2047                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2048
2049         dm_bio_prison_destroy(pool->prison);
2050         dm_kcopyd_client_destroy(pool->copier);
2051
2052         if (pool->wq)
2053                 destroy_workqueue(pool->wq);
2054
2055         if (pool->next_mapping)
2056                 mempool_free(pool->next_mapping, pool->mapping_pool);
2057         mempool_destroy(pool->mapping_pool);
2058         dm_deferred_set_destroy(pool->shared_read_ds);
2059         dm_deferred_set_destroy(pool->all_io_ds);
2060         kfree(pool);
2061 }
2062
2063 static struct kmem_cache *_new_mapping_cache;
2064
2065 static struct pool *pool_create(struct mapped_device *pool_md,
2066                                 struct block_device *metadata_dev,
2067                                 unsigned long block_size,
2068                                 int read_only, char **error)
2069 {
2070         int r;
2071         void *err_p;
2072         struct pool *pool;
2073         struct dm_pool_metadata *pmd;
2074         bool format_device = read_only ? false : true;
2075
2076         pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
2077         if (IS_ERR(pmd)) {
2078                 *error = "Error creating metadata object";
2079                 return (struct pool *)pmd;
2080         }
2081
2082         pool = kmalloc(sizeof(*pool), GFP_KERNEL);
2083         if (!pool) {
2084                 *error = "Error allocating memory for pool";
2085                 err_p = ERR_PTR(-ENOMEM);
2086                 goto bad_pool;
2087         }
2088
2089         pool->pmd = pmd;
2090         pool->sectors_per_block = block_size;
2091         if (block_size & (block_size - 1))
2092                 pool->sectors_per_block_shift = -1;
2093         else
2094                 pool->sectors_per_block_shift = __ffs(block_size);
2095         pool->low_water_blocks = 0;
2096         pool_features_init(&pool->pf);
2097         pool->prison = dm_bio_prison_create(PRISON_CELLS);
2098         if (!pool->prison) {
2099                 *error = "Error creating pool's bio prison";
2100                 err_p = ERR_PTR(-ENOMEM);
2101                 goto bad_prison;
2102         }
2103
2104         pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2105         if (IS_ERR(pool->copier)) {
2106                 r = PTR_ERR(pool->copier);
2107                 *error = "Error creating pool's kcopyd client";
2108                 err_p = ERR_PTR(r);
2109                 goto bad_kcopyd_client;
2110         }
2111
2112         /*
2113          * Create singlethreaded workqueue that will service all devices
2114          * that use this metadata.
2115          */
2116         pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2117         if (!pool->wq) {
2118                 *error = "Error creating pool's workqueue";
2119                 err_p = ERR_PTR(-ENOMEM);
2120                 goto bad_wq;
2121         }
2122
2123         INIT_WORK(&pool->worker, do_worker);
2124         INIT_DELAYED_WORK(&pool->waker, do_waker);
2125         INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
2126         spin_lock_init(&pool->lock);
2127         bio_list_init(&pool->deferred_flush_bios);
2128         INIT_LIST_HEAD(&pool->prepared_mappings);
2129         INIT_LIST_HEAD(&pool->prepared_discards);
2130         INIT_LIST_HEAD(&pool->active_thins);
2131         pool->low_water_triggered = false;
2132
2133         pool->shared_read_ds = dm_deferred_set_create();
2134         if (!pool->shared_read_ds) {
2135                 *error = "Error creating pool's shared read deferred set";
2136                 err_p = ERR_PTR(-ENOMEM);
2137                 goto bad_shared_read_ds;
2138         }
2139
2140         pool->all_io_ds = dm_deferred_set_create();
2141         if (!pool->all_io_ds) {
2142                 *error = "Error creating pool's all io deferred set";
2143                 err_p = ERR_PTR(-ENOMEM);
2144                 goto bad_all_io_ds;
2145         }
2146
2147         pool->next_mapping = NULL;
2148         pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
2149                                                       _new_mapping_cache);
2150         if (!pool->mapping_pool) {
2151                 *error = "Error creating pool's mapping mempool";
2152                 err_p = ERR_PTR(-ENOMEM);
2153                 goto bad_mapping_pool;
2154         }
2155
2156         pool->ref_count = 1;
2157         pool->last_commit_jiffies = jiffies;
2158         pool->pool_md = pool_md;
2159         pool->md_dev = metadata_dev;
2160         __pool_table_insert(pool);
2161
2162         return pool;
2163
2164 bad_mapping_pool:
2165         dm_deferred_set_destroy(pool->all_io_ds);
2166 bad_all_io_ds:
2167         dm_deferred_set_destroy(pool->shared_read_ds);
2168 bad_shared_read_ds:
2169         destroy_workqueue(pool->wq);
2170 bad_wq:
2171         dm_kcopyd_client_destroy(pool->copier);
2172 bad_kcopyd_client:
2173         dm_bio_prison_destroy(pool->prison);
2174 bad_prison:
2175         kfree(pool);
2176 bad_pool:
2177         if (dm_pool_metadata_close(pmd))
2178                 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2179
2180         return err_p;
2181 }
2182
2183 static void __pool_inc(struct pool *pool)
2184 {
2185         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2186         pool->ref_count++;
2187 }
2188
2189 static void __pool_dec(struct pool *pool)
2190 {
2191         BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2192         BUG_ON(!pool->ref_count);
2193         if (!--pool->ref_count)
2194                 __pool_destroy(pool);
2195 }
2196
2197 static struct pool *__pool_find(struct mapped_device *pool_md,
2198                                 struct block_device *metadata_dev,
2199                                 unsigned long block_size, int read_only,
2200                                 char **error, int *created)
2201 {
2202         struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2203
2204         if (pool) {
2205                 if (pool->pool_md != pool_md) {
2206                         *error = "metadata device already in use by a pool";
2207                         return ERR_PTR(-EBUSY);
2208                 }
2209                 __pool_inc(pool);
2210
2211         } else {
2212                 pool = __pool_table_lookup(pool_md);
2213                 if (pool) {
2214                         if (pool->md_dev != metadata_dev) {
2215                                 *error = "different pool cannot replace a pool";
2216                                 return ERR_PTR(-EINVAL);
2217                         }
2218                         __pool_inc(pool);
2219
2220                 } else {
2221                         pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
2222                         *created = 1;
2223                 }
2224         }
2225
2226         return pool;
2227 }
2228
2229 /*----------------------------------------------------------------
2230  * Pool target methods
2231  *--------------------------------------------------------------*/
2232 static void pool_dtr(struct dm_target *ti)
2233 {
2234         struct pool_c *pt = ti->private;
2235
2236         mutex_lock(&dm_thin_pool_table.mutex);
2237
2238         unbind_control_target(pt->pool, ti);
2239         __pool_dec(pt->pool);
2240         dm_put_device(ti, pt->metadata_dev);
2241         dm_put_device(ti, pt->data_dev);
2242         kfree(pt);
2243
2244         mutex_unlock(&dm_thin_pool_table.mutex);
2245 }
2246
2247 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2248                                struct dm_target *ti)
2249 {
2250         int r;
2251         unsigned argc;
2252         const char *arg_name;
2253
2254         static struct dm_arg _args[] = {
2255                 {0, 4, "Invalid number of pool feature arguments"},
2256         };
2257
2258         /*
2259          * No feature arguments supplied.
2260          */
2261         if (!as->argc)
2262                 return 0;
2263
2264         r = dm_read_arg_group(_args, as, &argc, &ti->error);
2265         if (r)
2266                 return -EINVAL;
2267
2268         while (argc && !r) {
2269                 arg_name = dm_shift_arg(as);
2270                 argc--;
2271
2272                 if (!strcasecmp(arg_name, "skip_block_zeroing"))
2273                         pf->zero_new_blocks = false;
2274
2275                 else if (!strcasecmp(arg_name, "ignore_discard"))
2276                         pf->discard_enabled = false;
2277
2278                 else if (!strcasecmp(arg_name, "no_discard_passdown"))
2279                         pf->discard_passdown = false;
2280
2281                 else if (!strcasecmp(arg_name, "read_only"))
2282                         pf->mode = PM_READ_ONLY;
2283
2284                 else if (!strcasecmp(arg_name, "error_if_no_space"))
2285                         pf->error_if_no_space = true;
2286
2287                 else {
2288                         ti->error = "Unrecognised pool feature requested";
2289                         r = -EINVAL;
2290                         break;
2291                 }
2292         }
2293
2294         return r;
2295 }
2296
2297 static void metadata_low_callback(void *context)
2298 {
2299         struct pool *pool = context;
2300
2301         DMWARN("%s: reached low water mark for metadata device: sending event.",
2302                dm_device_name(pool->pool_md));
2303
2304         dm_table_event(pool->ti->table);
2305 }
2306
2307 static sector_t get_dev_size(struct block_device *bdev)
2308 {
2309         return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2310 }
2311
2312 static void warn_if_metadata_device_too_big(struct block_device *bdev)
2313 {
2314         sector_t metadata_dev_size = get_dev_size(bdev);
2315         char buffer[BDEVNAME_SIZE];
2316
2317         if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
2318                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2319                        bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
2320 }
2321
2322 static sector_t get_metadata_dev_size(struct block_device *bdev)
2323 {
2324         sector_t metadata_dev_size = get_dev_size(bdev);
2325
2326         if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2327                 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
2328
2329         return metadata_dev_size;
2330 }
2331
2332 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
2333 {
2334         sector_t metadata_dev_size = get_metadata_dev_size(bdev);
2335
2336         sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
2337
2338         return metadata_dev_size;
2339 }
2340
2341 /*
2342  * When a metadata threshold is crossed a dm event is triggered, and
2343  * userland should respond by growing the metadata device.  We could let
2344  * userland set the threshold, like we do with the data threshold, but I'm
2345  * not sure they know enough to do this well.
2346  */
2347 static dm_block_t calc_metadata_threshold(struct pool_c *pt)
2348 {
2349         /*
2350          * 4M is ample for all ops with the possible exception of thin
2351          * device deletion which is harmless if it fails (just retry the
2352          * delete after you've grown the device).
2353          */
2354         dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
2355         return min((dm_block_t)1024ULL /* 4M */, quarter);
2356 }
2357
2358 /*
2359  * thin-pool <metadata dev> <data dev>
2360  *           <data block size (sectors)>
2361  *           <low water mark (blocks)>
2362  *           [<#feature args> [<arg>]*]
2363  *
2364  * Optional feature arguments are:
2365  *           skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
2366  *           ignore_discard: disable discard
2367  *           no_discard_passdown: don't pass discards down to the data device
2368  *           read_only: Don't allow any changes to be made to the pool metadata.
2369  *           error_if_no_space: error IOs, instead of queueing, if no space.
2370  */
2371 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2372 {
2373         int r, pool_created = 0;
2374         struct pool_c *pt;
2375         struct pool *pool;
2376         struct pool_features pf;
2377         struct dm_arg_set as;
2378         struct dm_dev *data_dev;
2379         unsigned long block_size;
2380         dm_block_t low_water_blocks;
2381         struct dm_dev *metadata_dev;
2382         fmode_t metadata_mode;
2383
2384         /*
2385          * FIXME Remove validation from scope of lock.
2386          */
2387         mutex_lock(&dm_thin_pool_table.mutex);
2388
2389         if (argc < 4) {
2390                 ti->error = "Invalid argument count";
2391                 r = -EINVAL;
2392                 goto out_unlock;
2393         }
2394
2395         as.argc = argc;
2396         as.argv = argv;
2397
2398         /*
2399          * Set default pool features.
2400          */
2401         pool_features_init(&pf);
2402
2403         dm_consume_args(&as, 4);
2404         r = parse_pool_features(&as, &pf, ti);
2405         if (r)
2406                 goto out_unlock;
2407
2408         metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
2409         r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
2410         if (r) {
2411                 ti->error = "Error opening metadata block device";
2412                 goto out_unlock;
2413         }
2414         warn_if_metadata_device_too_big(metadata_dev->bdev);
2415
2416         r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2417         if (r) {
2418                 ti->error = "Error getting data device";
2419                 goto out_metadata;
2420         }
2421
2422         if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2423             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2424             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2425             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2426                 ti->error = "Invalid block size";
2427                 r = -EINVAL;
2428                 goto out;
2429         }
2430
2431         if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2432                 ti->error = "Invalid low water mark";
2433                 r = -EINVAL;
2434                 goto out;
2435         }
2436
2437         pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2438         if (!pt) {
2439                 r = -ENOMEM;
2440                 goto out;
2441         }
2442
2443         pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
2444                            block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
2445         if (IS_ERR(pool)) {
2446                 r = PTR_ERR(pool);
2447                 goto out_free_pt;
2448         }
2449
2450         /*
2451          * 'pool_created' reflects whether this is the first table load.
2452          * Top level discard support is not allowed to be changed after
2453          * initial load.  This would require a pool reload to trigger thin
2454          * device changes.
2455          */
2456         if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2457                 ti->error = "Discard support cannot be disabled once enabled";
2458                 r = -EINVAL;
2459                 goto out_flags_changed;
2460         }
2461
2462         pt->pool = pool;
2463         pt->ti = ti;
2464         pt->metadata_dev = metadata_dev;
2465         pt->data_dev = data_dev;
2466         pt->low_water_blocks = low_water_blocks;
2467         pt->adjusted_pf = pt->requested_pf = pf;
2468         ti->num_flush_bios = 1;
2469
2470         /*
2471          * Only need to enable discards if the pool should pass
2472          * them down to the data device.  The thin device's discard
2473          * processing will cause mappings to be removed from the btree.
2474          */
2475         ti->discard_zeroes_data_unsupported = true;
2476         if (pf.discard_enabled && pf.discard_passdown) {
2477                 ti->num_discard_bios = 1;
2478
2479                 /*
2480                  * Setting 'discards_supported' circumvents the normal
2481                  * stacking of discard limits (this keeps the pool and
2482                  * thin devices' discard limits consistent).
2483                  */
2484                 ti->discards_supported = true;
2485         }
2486         ti->private = pt;
2487
2488         r = dm_pool_register_metadata_threshold(pt->pool->pmd,
2489                                                 calc_metadata_threshold(pt),
2490                                                 metadata_low_callback,
2491                                                 pool);
2492         if (r)
2493                 goto out_free_pt;
2494
2495         pt->callbacks.congested_fn = pool_is_congested;
2496         dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2497
2498         mutex_unlock(&dm_thin_pool_table.mutex);
2499
2500         return 0;
2501
2502 out_flags_changed:
2503         __pool_dec(pool);
2504 out_free_pt:
2505         kfree(pt);
2506 out:
2507         dm_put_device(ti, data_dev);
2508 out_metadata:
2509         dm_put_device(ti, metadata_dev);
2510 out_unlock:
2511         mutex_unlock(&dm_thin_pool_table.mutex);
2512
2513         return r;
2514 }
2515
2516 static int pool_map(struct dm_target *ti, struct bio *bio)
2517 {
2518         int r;
2519         struct pool_c *pt = ti->private;
2520         struct pool *pool = pt->pool;
2521         unsigned long flags;
2522
2523         /*
2524          * As this is a singleton target, ti->begin is always zero.
2525          */
2526         spin_lock_irqsave(&pool->lock, flags);
2527         bio->bi_bdev = pt->data_dev->bdev;
2528         r = DM_MAPIO_REMAPPED;
2529         spin_unlock_irqrestore(&pool->lock, flags);
2530
2531         return r;
2532 }
2533
2534 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2535 {
2536         int r;
2537         struct pool_c *pt = ti->private;
2538         struct pool *pool = pt->pool;
2539         sector_t data_size = ti->len;
2540         dm_block_t sb_data_size;
2541
2542         *need_commit = false;
2543
2544         (void) sector_div(data_size, pool->sectors_per_block);
2545
2546         r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2547         if (r) {
2548                 DMERR("%s: failed to retrieve data device size",
2549                       dm_device_name(pool->pool_md));
2550                 return r;
2551         }
2552
2553         if (data_size < sb_data_size) {
2554                 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
2555                       dm_device_name(pool->pool_md),
2556                       (unsigned long long)data_size, sb_data_size);
2557                 return -EINVAL;
2558
2559         } else if (data_size > sb_data_size) {
2560                 if (dm_pool_metadata_needs_check(pool->pmd)) {
2561                         DMERR("%s: unable to grow the data device until repaired.",
2562                               dm_device_name(pool->pool_md));
2563                         return 0;
2564                 }
2565
2566                 if (sb_data_size)
2567                         DMINFO("%s: growing the data device from %llu to %llu blocks",
2568                                dm_device_name(pool->pool_md),
2569                                sb_data_size, (unsigned long long)data_size);
2570                 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2571                 if (r) {
2572                         metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
2573                         return r;
2574                 }
2575
2576                 *need_commit = true;
2577         }
2578
2579         return 0;
2580 }
2581
2582 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2583 {
2584         int r;
2585         struct pool_c *pt = ti->private;
2586         struct pool *pool = pt->pool;
2587         dm_block_t metadata_dev_size, sb_metadata_dev_size;
2588
2589         *need_commit = false;
2590
2591         metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
2592
2593         r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
2594         if (r) {
2595                 DMERR("%s: failed to retrieve metadata device size",
2596                       dm_device_name(pool->pool_md));
2597                 return r;
2598         }
2599
2600         if (metadata_dev_size < sb_metadata_dev_size) {
2601                 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
2602                       dm_device_name(pool->pool_md),
2603                       metadata_dev_size, sb_metadata_dev_size);
2604                 return -EINVAL;
2605
2606         } else if (metadata_dev_size > sb_metadata_dev_size) {
2607                 if (dm_pool_metadata_needs_check(pool->pmd)) {
2608                         DMERR("%s: unable to grow the metadata device until repaired.",
2609                               dm_device_name(pool->pool_md));
2610                         return 0;
2611                 }
2612
2613                 warn_if_metadata_device_too_big(pool->md_dev);
2614                 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2615                        dm_device_name(pool->pool_md),
2616                        sb_metadata_dev_size, metadata_dev_size);
2617                 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2618                 if (r) {
2619                         metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
2620                         return r;
2621                 }
2622
2623                 *need_commit = true;
2624         }
2625
2626         return 0;
2627 }
2628
2629 /*
2630  * Retrieves the number of blocks of the data device from
2631  * the superblock and compares it to the actual device size,
2632  * thus resizing the data device in case it has grown.
2633  *
2634  * This both copes with opening preallocated data devices in the ctr
2635  * being followed by a resume
2636  * -and-
2637  * calling the resume method individually after userspace has
2638  * grown the data device in reaction to a table event.
2639  */
2640 static int pool_preresume(struct dm_target *ti)
2641 {
2642         int r;
2643         bool need_commit1, need_commit2;
2644         struct pool_c *pt = ti->private;
2645         struct pool *pool = pt->pool;
2646
2647         /*
2648          * Take control of the pool object.
2649          */
2650         r = bind_control_target(pool, ti);
2651         if (r)
2652                 return r;
2653
2654         r = maybe_resize_data_dev(ti, &need_commit1);
2655         if (r)
2656                 return r;
2657
2658         r = maybe_resize_metadata_dev(ti, &need_commit2);
2659         if (r)
2660                 return r;
2661
2662         if (need_commit1 || need_commit2)
2663                 (void) commit(pool);
2664
2665         return 0;
2666 }
2667
2668 static void pool_resume(struct dm_target *ti)
2669 {
2670         struct pool_c *pt = ti->private;
2671         struct pool *pool = pt->pool;
2672         unsigned long flags;
2673
2674         spin_lock_irqsave(&pool->lock, flags);
2675         pool->low_water_triggered = false;
2676         spin_unlock_irqrestore(&pool->lock, flags);
2677         requeue_bios(pool);
2678
2679         do_waker(&pool->waker.work);
2680 }
2681
2682 static void pool_postsuspend(struct dm_target *ti)
2683 {
2684         struct pool_c *pt = ti->private;
2685         struct pool *pool = pt->pool;
2686
2687         cancel_delayed_work(&pool->waker);
2688         cancel_delayed_work(&pool->no_space_timeout);
2689         flush_workqueue(pool->wq);
2690         (void) commit(pool);
2691 }
2692
2693 static int check_arg_count(unsigned argc, unsigned args_required)
2694 {
2695         if (argc != args_required) {
2696                 DMWARN("Message received with %u arguments instead of %u.",
2697                        argc, args_required);
2698                 return -EINVAL;
2699         }
2700
2701         return 0;
2702 }
2703
2704 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2705 {
2706         if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2707             *dev_id <= MAX_DEV_ID)
2708                 return 0;
2709
2710         if (warning)
2711                 DMWARN("Message received with invalid device id: %s", arg);
2712
2713         return -EINVAL;
2714 }
2715
2716 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2717 {
2718         dm_thin_id dev_id;
2719         int r;
2720
2721         r = check_arg_count(argc, 2);
2722         if (r)
2723                 return r;
2724
2725         r = read_dev_id(argv[1], &dev_id, 1);
2726         if (r)
2727                 return r;
2728
2729         r = dm_pool_create_thin(pool->pmd, dev_id);
2730         if (r) {
2731                 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2732                        argv[1]);
2733                 return r;
2734         }
2735
2736         return 0;
2737 }
2738
2739 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2740 {
2741         dm_thin_id dev_id;
2742         dm_thin_id origin_dev_id;
2743         int r;
2744
2745         r = check_arg_count(argc, 3);
2746         if (r)
2747                 return r;
2748
2749         r = read_dev_id(argv[1], &dev_id, 1);
2750         if (r)
2751                 return r;
2752
2753         r = read_dev_id(argv[2], &origin_dev_id, 1);
2754         if (r)
2755                 return r;
2756
2757         r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2758         if (r) {
2759                 DMWARN("Creation of new snapshot %s of device %s failed.",
2760                        argv[1], argv[2]);
2761                 return r;
2762         }
2763
2764         return 0;
2765 }
2766
2767 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2768 {
2769         dm_thin_id dev_id;
2770         int r;
2771
2772         r = check_arg_count(argc, 2);
2773         if (r)
2774                 return r;
2775
2776         r = read_dev_id(argv[1], &dev_id, 1);
2777         if (r)
2778                 return r;
2779
2780         r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2781         if (r)
2782                 DMWARN("Deletion of thin device %s failed.", argv[1]);
2783
2784         return r;
2785 }
2786
2787 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2788 {
2789         dm_thin_id old_id, new_id;
2790         int r;
2791
2792         r = check_arg_count(argc, 3);
2793         if (r)
2794                 return r;
2795
2796         if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2797                 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2798                 return -EINVAL;
2799         }
2800
2801         if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2802                 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2803                 return -EINVAL;
2804         }
2805
2806         r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2807         if (r) {
2808                 DMWARN("Failed to change transaction id from %s to %s.",
2809                        argv[1], argv[2]);
2810                 return r;
2811         }
2812
2813         return 0;
2814 }
2815
2816 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2817 {
2818         int r;
2819
2820         r = check_arg_count(argc, 1);
2821         if (r)
2822                 return r;
2823
2824         (void) commit(pool);
2825
2826         r = dm_pool_reserve_metadata_snap(pool->pmd);
2827         if (r)
2828                 DMWARN("reserve_metadata_snap message failed.");
2829
2830         return r;
2831 }
2832
2833 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2834 {
2835         int r;
2836
2837         r = check_arg_count(argc, 1);
2838         if (r)
2839                 return r;
2840
2841         r = dm_pool_release_metadata_snap(pool->pmd);
2842         if (r)
2843                 DMWARN("release_metadata_snap message failed.");
2844
2845         return r;
2846 }
2847
2848 /*
2849  * Messages supported:
2850  *   create_thin        <dev_id>
2851  *   create_snap        <dev_id> <origin_id>
2852  *   delete             <dev_id>
2853  *   trim               <dev_id> <new_size_in_sectors>
2854  *   set_transaction_id <current_trans_id> <new_trans_id>
2855  *   reserve_metadata_snap
2856  *   release_metadata_snap
2857  */
2858 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2859 {
2860         int r = -EINVAL;
2861         struct pool_c *pt = ti->private;
2862         struct pool *pool = pt->pool;
2863
2864         if (!strcasecmp(argv[0], "create_thin"))
2865                 r = process_create_thin_mesg(argc, argv, pool);
2866
2867         else if (!strcasecmp(argv[0], "create_snap"))
2868                 r = process_create_snap_mesg(argc, argv, pool);
2869
2870         else if (!strcasecmp(argv[0], "delete"))
2871                 r = process_delete_mesg(argc, argv, pool);
2872
2873         else if (!strcasecmp(argv[0], "set_transaction_id"))
2874                 r = process_set_transaction_id_mesg(argc, argv, pool);
2875
2876         else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2877                 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2878
2879         else if (!strcasecmp(argv[0], "release_metadata_snap"))
2880                 r = process_release_metadata_snap_mesg(argc, argv, pool);
2881
2882         else
2883                 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2884
2885         if (!r)
2886                 (void) commit(pool);
2887
2888         return r;
2889 }
2890
2891 static void emit_flags(struct pool_features *pf, char *result,
2892                        unsigned sz, unsigned maxlen)
2893 {
2894         unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2895                 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
2896                 pf->error_if_no_space;
2897         DMEMIT("%u ", count);
2898
2899         if (!pf->zero_new_blocks)
2900                 DMEMIT("skip_block_zeroing ");
2901
2902         if (!pf->discard_enabled)
2903                 DMEMIT("ignore_discard ");
2904
2905         if (!pf->discard_passdown)
2906                 DMEMIT("no_discard_passdown ");
2907
2908         if (pf->mode == PM_READ_ONLY)
2909                 DMEMIT("read_only ");
2910
2911         if (pf->error_if_no_space)
2912                 DMEMIT("error_if_no_space ");
2913 }
2914
2915 /*
2916  * Status line is:
2917  *    <transaction id> <used metadata sectors>/<total metadata sectors>
2918  *    <used data sectors>/<total data sectors> <held metadata root>
2919  */
2920 static void pool_status(struct dm_target *ti, status_type_t type,
2921                         unsigned status_flags, char *result, unsigned maxlen)
2922 {
2923         int r;
2924         unsigned sz = 0;
2925         uint64_t transaction_id;
2926         dm_block_t nr_free_blocks_data;
2927         dm_block_t nr_free_blocks_metadata;
2928         dm_block_t nr_blocks_data;
2929         dm_block_t nr_blocks_metadata;
2930         dm_block_t held_root;
2931         char buf[BDEVNAME_SIZE];
2932         char buf2[BDEVNAME_SIZE];
2933         struct pool_c *pt = ti->private;
2934         struct pool *pool = pt->pool;
2935
2936         switch (type) {
2937         case STATUSTYPE_INFO:
2938                 if (get_pool_mode(pool) == PM_FAIL) {
2939                         DMEMIT("Fail");
2940                         break;
2941                 }
2942
2943                 /* Commit to ensure statistics aren't out-of-date */
2944                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2945                         (void) commit(pool);
2946
2947                 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
2948                 if (r) {
2949                         DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
2950                               dm_device_name(pool->pool_md), r);
2951                         goto err;
2952                 }
2953
2954                 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
2955                 if (r) {
2956                         DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
2957                               dm_device_name(pool->pool_md), r);
2958                         goto err;
2959                 }
2960
2961                 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2962                 if (r) {
2963                         DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
2964                               dm_device_name(pool->pool_md), r);
2965                         goto err;
2966                 }
2967
2968                 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
2969                 if (r) {
2970                         DMERR("%s: dm_pool_get_free_block_count returned %d",
2971                               dm_device_name(pool->pool_md), r);
2972                         goto err;
2973                 }
2974
2975                 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2976                 if (r) {
2977                         DMERR("%s: dm_pool_get_data_dev_size returned %d",
2978                               dm_device_name(pool->pool_md), r);
2979                         goto err;
2980                 }
2981
2982                 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2983                 if (r) {
2984                         DMERR("%s: dm_pool_get_metadata_snap returned %d",
2985                               dm_device_name(pool->pool_md), r);
2986                         goto err;
2987                 }
2988
2989                 DMEMIT("%llu %llu/%llu %llu/%llu ",
2990                        (unsigned long long)transaction_id,
2991                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2992                        (unsigned long long)nr_blocks_metadata,
2993                        (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2994                        (unsigned long long)nr_blocks_data);
2995
2996                 if (held_root)
2997                         DMEMIT("%llu ", held_root);
2998                 else
2999                         DMEMIT("- ");
3000
3001                 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
3002                         DMEMIT("out_of_data_space ");
3003                 else if (pool->pf.mode == PM_READ_ONLY)
3004                         DMEMIT("ro ");
3005                 else
3006                         DMEMIT("rw ");
3007
3008                 if (!pool->pf.discard_enabled)
3009                         DMEMIT("ignore_discard ");
3010                 else if (pool->pf.discard_passdown)
3011                         DMEMIT("discard_passdown ");
3012                 else
3013                         DMEMIT("no_discard_passdown ");
3014
3015                 if (pool->pf.error_if_no_space)
3016                         DMEMIT("error_if_no_space ");
3017                 else
3018                         DMEMIT("queue_if_no_space ");
3019
3020                 break;
3021
3022         case STATUSTYPE_TABLE:
3023                 DMEMIT("%s %s %lu %llu ",
3024                        format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
3025                        format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
3026                        (unsigned long)pool->sectors_per_block,
3027                        (unsigned long long)pt->low_water_blocks);
3028                 emit_flags(&pt->requested_pf, result, sz, maxlen);
3029                 break;
3030         }
3031         return;
3032
3033 err:
3034         DMEMIT("Error");
3035 }
3036
3037 static int pool_iterate_devices(struct dm_target *ti,
3038                                 iterate_devices_callout_fn fn, void *data)
3039 {
3040         struct pool_c *pt = ti->private;
3041
3042         return fn(ti, pt->data_dev, 0, ti->len, data);
3043 }
3044
3045 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3046                       struct bio_vec *biovec, int max_size)
3047 {
3048         struct pool_c *pt = ti->private;
3049         struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
3050
3051         if (!q->merge_bvec_fn)
3052                 return max_size;
3053
3054         bvm->bi_bdev = pt->data_dev->bdev;
3055
3056         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3057 }
3058
3059 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
3060 {
3061         struct pool *pool = pt->pool;
3062         struct queue_limits *data_limits;
3063
3064         limits->max_discard_sectors = pool->sectors_per_block;
3065
3066         /*
3067          * discard_granularity is just a hint, and not enforced.
3068          */
3069         if (pt->adjusted_pf.discard_passdown) {
3070                 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
3071                 limits->discard_granularity = data_limits->discard_granularity;
3072         } else
3073                 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
3074 }
3075
3076 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3077 {
3078         struct pool_c *pt = ti->private;
3079         struct pool *pool = pt->pool;
3080         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3081
3082         /*
3083          * If the system-determined stacked limits are compatible with the
3084          * pool's blocksize (io_opt is a factor) do not override them.
3085          */
3086         if (io_opt_sectors < pool->sectors_per_block ||
3087             do_div(io_opt_sectors, pool->sectors_per_block)) {
3088                 blk_limits_io_min(limits, 0);
3089                 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3090         }
3091
3092         /*
3093          * pt->adjusted_pf is a staging area for the actual features to use.
3094          * They get transferred to the live pool in bind_control_target()
3095          * called from pool_preresume().
3096          */
3097         if (!pt->adjusted_pf.discard_enabled) {
3098                 /*
3099                  * Must explicitly disallow stacking discard limits otherwise the
3100                  * block layer will stack them if pool's data device has support.
3101                  * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
3102                  * user to see that, so make sure to set all discard limits to 0.
3103                  */
3104                 limits->discard_granularity = 0;
3105                 return;
3106         }
3107
3108         disable_passdown_if_not_supported(pt);
3109
3110         set_discard_limits(pt, limits);
3111 }
3112
3113 static struct target_type pool_target = {
3114         .name = "thin-pool",
3115         .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3116                     DM_TARGET_IMMUTABLE,
3117         .version = {1, 12, 0},
3118         .module = THIS_MODULE,
3119         .ctr = pool_ctr,
3120         .dtr = pool_dtr,
3121         .map = pool_map,
3122         .postsuspend = pool_postsuspend,
3123         .preresume = pool_preresume,
3124         .resume = pool_resume,
3125         .message = pool_message,
3126         .status = pool_status,
3127         .merge = pool_merge,
3128         .iterate_devices = pool_iterate_devices,
3129         .io_hints = pool_io_hints,
3130 };
3131
3132 /*----------------------------------------------------------------
3133  * Thin target methods
3134  *--------------------------------------------------------------*/
3135 static void thin_get(struct thin_c *tc)
3136 {
3137         atomic_inc(&tc->refcount);
3138 }
3139
3140 static void thin_put(struct thin_c *tc)
3141 {
3142         if (atomic_dec_and_test(&tc->refcount))
3143                 complete(&tc->can_destroy);
3144 }
3145
3146 static void thin_dtr(struct dm_target *ti)
3147 {
3148         struct thin_c *tc = ti->private;
3149         unsigned long flags;
3150
3151         thin_put(tc);
3152         wait_for_completion(&tc->can_destroy);
3153
3154         spin_lock_irqsave(&tc->pool->lock, flags);
3155         list_del_rcu(&tc->list);
3156         spin_unlock_irqrestore(&tc->pool->lock, flags);
3157         synchronize_rcu();
3158
3159         mutex_lock(&dm_thin_pool_table.mutex);
3160
3161         __pool_dec(tc->pool);
3162         dm_pool_close_thin_device(tc->td);
3163         dm_put_device(ti, tc->pool_dev);
3164         if (tc->origin_dev)
3165                 dm_put_device(ti, tc->origin_dev);
3166         kfree(tc);
3167
3168         mutex_unlock(&dm_thin_pool_table.mutex);
3169 }
3170
3171 /*
3172  * Thin target parameters:
3173  *
3174  * <pool_dev> <dev_id> [origin_dev]
3175  *
3176  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
3177  * dev_id: the internal device identifier
3178  * origin_dev: a device external to the pool that should act as the origin
3179  *
3180  * If the pool device has discards disabled, they get disabled for the thin
3181  * device as well.
3182  */
3183 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3184 {
3185         int r;
3186         struct thin_c *tc;
3187         struct dm_dev *pool_dev, *origin_dev;
3188         struct mapped_device *pool_md;
3189         unsigned long flags;
3190
3191         mutex_lock(&dm_thin_pool_table.mutex);
3192
3193         if (argc != 2 && argc != 3) {
3194                 ti->error = "Invalid argument count";
3195                 r = -EINVAL;
3196                 goto out_unlock;
3197         }
3198
3199         tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
3200         if (!tc) {
3201                 ti->error = "Out of memory";
3202                 r = -ENOMEM;
3203                 goto out_unlock;
3204         }
3205         spin_lock_init(&tc->lock);
3206         bio_list_init(&tc->deferred_bio_list);
3207         bio_list_init(&tc->retry_on_resume_list);
3208         tc->sort_bio_list = RB_ROOT;
3209
3210         if (argc == 3) {
3211                 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
3212                 if (r) {
3213                         ti->error = "Error opening origin device";
3214                         goto bad_origin_dev;
3215                 }
3216                 tc->origin_dev = origin_dev;
3217         }
3218
3219         r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
3220         if (r) {
3221                 ti->error = "Error opening pool device";
3222                 goto bad_pool_dev;
3223         }
3224         tc->pool_dev = pool_dev;
3225
3226         if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
3227                 ti->error = "Invalid device id";
3228                 r = -EINVAL;
3229                 goto bad_common;
3230         }
3231
3232         pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
3233         if (!pool_md) {
3234                 ti->error = "Couldn't get pool mapped device";
3235                 r = -EINVAL;
3236                 goto bad_common;
3237         }
3238
3239         tc->pool = __pool_table_lookup(pool_md);
3240         if (!tc->pool) {
3241                 ti->error = "Couldn't find pool object";
3242                 r = -EINVAL;
3243                 goto bad_pool_lookup;
3244         }
3245         __pool_inc(tc->pool);
3246
3247         if (get_pool_mode(tc->pool) == PM_FAIL) {
3248                 ti->error = "Couldn't open thin device, Pool is in fail mode";
3249                 r = -EINVAL;
3250                 goto bad_thin_open;
3251         }
3252
3253         r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3254         if (r) {
3255                 ti->error = "Couldn't open thin internal device";
3256                 goto bad_thin_open;
3257         }
3258
3259         r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3260         if (r)
3261                 goto bad_target_max_io_len;
3262
3263         ti->num_flush_bios = 1;
3264         ti->flush_supported = true;
3265         ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
3266
3267         /* In case the pool supports discards, pass them on. */
3268         ti->discard_zeroes_data_unsupported = true;
3269         if (tc->pool->pf.discard_enabled) {
3270                 ti->discards_supported = true;
3271                 ti->num_discard_bios = 1;
3272                 /* Discard bios must be split on a block boundary */
3273                 ti->split_discard_bios = true;
3274         }
3275
3276         dm_put(pool_md);
3277
3278         mutex_unlock(&dm_thin_pool_table.mutex);
3279
3280         atomic_set(&tc->refcount, 1);
3281         init_completion(&tc->can_destroy);
3282
3283         spin_lock_irqsave(&tc->pool->lock, flags);
3284         list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
3285         spin_unlock_irqrestore(&tc->pool->lock, flags);
3286         /*
3287          * This synchronize_rcu() call is needed here otherwise we risk a
3288          * wake_worker() call finding no bios to process (because the newly
3289          * added tc isn't yet visible).  So this reduces latency since we
3290          * aren't then dependent on the periodic commit to wake_worker().
3291          */
3292         synchronize_rcu();
3293
3294         return 0;
3295
3296 bad_target_max_io_len:
3297         dm_pool_close_thin_device(tc->td);
3298 bad_thin_open:
3299         __pool_dec(tc->pool);
3300 bad_pool_lookup:
3301         dm_put(pool_md);
3302 bad_common:
3303         dm_put_device(ti, tc->pool_dev);
3304 bad_pool_dev:
3305         if (tc->origin_dev)
3306                 dm_put_device(ti, tc->origin_dev);
3307 bad_origin_dev:
3308         kfree(tc);
3309 out_unlock:
3310         mutex_unlock(&dm_thin_pool_table.mutex);
3311
3312         return r;
3313 }
3314
3315 static int thin_map(struct dm_target *ti, struct bio *bio)
3316 {
3317         bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
3318
3319         return thin_bio_map(ti, bio);
3320 }
3321
3322 static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
3323 {
3324         unsigned long flags;
3325         struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
3326         struct list_head work;
3327         struct dm_thin_new_mapping *m, *tmp;
3328         struct pool *pool = h->tc->pool;
3329
3330         if (h->shared_read_entry) {
3331                 INIT_LIST_HEAD(&work);
3332                 dm_deferred_entry_dec(h->shared_read_entry, &work);
3333
3334                 spin_lock_irqsave(&pool->lock, flags);
3335                 list_for_each_entry_safe(m, tmp, &work, list) {
3336                         list_del(&m->list);
3337                         m->quiesced = true;
3338                         __maybe_add_mapping(m);
3339                 }
3340                 spin_unlock_irqrestore(&pool->lock, flags);
3341         }
3342
3343         if (h->all_io_entry) {
3344                 INIT_LIST_HEAD(&work);
3345                 dm_deferred_entry_dec(h->all_io_entry, &work);
3346                 if (!list_empty(&work)) {
3347                         spin_lock_irqsave(&pool->lock, flags);
3348                         list_for_each_entry_safe(m, tmp, &work, list)
3349                                 list_add_tail(&m->list, &pool->prepared_discards);
3350                         spin_unlock_irqrestore(&pool->lock, flags);
3351                         wake_worker(pool);
3352                 }
3353         }
3354
3355         return 0;
3356 }
3357
3358 static void thin_presuspend(struct dm_target *ti)
3359 {
3360         struct thin_c *tc = ti->private;
3361
3362         if (dm_noflush_suspending(ti))
3363                 noflush_work(tc, do_noflush_start);
3364 }
3365
3366 static void thin_postsuspend(struct dm_target *ti)
3367 {
3368         struct thin_c *tc = ti->private;
3369
3370         /*
3371          * The dm_noflush_suspending flag has been cleared by now, so
3372          * unfortunately we must always run this.
3373          */
3374         noflush_work(tc, do_noflush_stop);
3375 }
3376
3377 /*
3378  * <nr mapped sectors> <highest mapped sector>
3379  */
3380 static void thin_status(struct dm_target *ti, status_type_t type,
3381                         unsigned status_flags, char *result, unsigned maxlen)
3382 {
3383         int r;
3384         ssize_t sz = 0;
3385         dm_block_t mapped, highest;
3386         char buf[BDEVNAME_SIZE];
3387         struct thin_c *tc = ti->private;
3388
3389         if (get_pool_mode(tc->pool) == PM_FAIL) {
3390                 DMEMIT("Fail");
3391                 return;
3392         }
3393
3394         if (!tc->td)
3395                 DMEMIT("-");
3396         else {
3397                 switch (type) {
3398                 case STATUSTYPE_INFO:
3399                         r = dm_thin_get_mapped_count(tc->td, &mapped);
3400                         if (r) {
3401                                 DMERR("dm_thin_get_mapped_count returned %d", r);
3402                                 goto err;
3403                         }
3404
3405                         r = dm_thin_get_highest_mapped_block(tc->td, &highest);
3406                         if (r < 0) {
3407                                 DMERR("dm_thin_get_highest_mapped_block returned %d", r);
3408                                 goto err;
3409                         }
3410
3411                         DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
3412                         if (r)
3413                                 DMEMIT("%llu", ((highest + 1) *
3414                                                 tc->pool->sectors_per_block) - 1);
3415                         else
3416                                 DMEMIT("-");
3417                         break;
3418
3419                 case STATUSTYPE_TABLE:
3420                         DMEMIT("%s %lu",
3421                                format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
3422                                (unsigned long) tc->dev_id);
3423                         if (tc->origin_dev)
3424                                 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
3425                         break;
3426                 }
3427         }
3428
3429         return;
3430
3431 err:
3432         DMEMIT("Error");
3433 }
3434
3435 static int thin_iterate_devices(struct dm_target *ti,
3436                                 iterate_devices_callout_fn fn, void *data)
3437 {
3438         sector_t blocks;
3439         struct thin_c *tc = ti->private;
3440         struct pool *pool = tc->pool;
3441
3442         /*
3443          * We can't call dm_pool_get_data_dev_size() since that blocks.  So
3444          * we follow a more convoluted path through to the pool's target.
3445          */
3446         if (!pool->ti)
3447                 return 0;       /* nothing is bound */
3448
3449         blocks = pool->ti->len;
3450         (void) sector_div(blocks, pool->sectors_per_block);
3451         if (blocks)
3452                 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
3453
3454         return 0;
3455 }
3456
3457 static struct target_type thin_target = {
3458         .name = "thin",
3459         .version = {1, 12, 0},
3460         .module = THIS_MODULE,
3461         .ctr = thin_ctr,
3462         .dtr = thin_dtr,
3463         .map = thin_map,
3464         .end_io = thin_endio,
3465         .presuspend = thin_presuspend,
3466         .postsuspend = thin_postsuspend,
3467         .status = thin_status,
3468         .iterate_devices = thin_iterate_devices,
3469 };
3470
3471 /*----------------------------------------------------------------*/
3472
3473 static int __init dm_thin_init(void)
3474 {
3475         int r;
3476
3477         pool_table_init();
3478
3479         r = dm_register_target(&thin_target);
3480         if (r)
3481                 return r;
3482
3483         r = dm_register_target(&pool_target);
3484         if (r)
3485                 goto bad_pool_target;
3486
3487         r = -ENOMEM;
3488
3489         _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
3490         if (!_new_mapping_cache)
3491                 goto bad_new_mapping_cache;
3492
3493         return 0;
3494
3495 bad_new_mapping_cache:
3496         dm_unregister_target(&pool_target);
3497 bad_pool_target:
3498         dm_unregister_target(&thin_target);
3499
3500         return r;
3501 }
3502
3503 static void dm_thin_exit(void)
3504 {
3505         dm_unregister_target(&thin_target);
3506         dm_unregister_target(&pool_target);
3507
3508         kmem_cache_destroy(_new_mapping_cache);
3509 }
3510
3511 module_init(dm_thin_init);
3512 module_exit(dm_thin_exit);
3513
3514 module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
3515 MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
3516
3517 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
3518 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3519 MODULE_LICENSE("GPL");