drivers/md/dm-cache-target.c
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/init.h>
15 #include <linux/mempool.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 #include <linux/vmalloc.h>
19
20 #define DM_MSG_PREFIX "cache"
21
22 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
23         "A percentage of time allocated for copying to and/or from cache");
24
25 /*----------------------------------------------------------------*/
26
27 /*
28  * Glossary:
29  *
30  * oblock: index of an origin block
31  * cblock: index of a cache block
32  * promotion: movement of a block from origin to cache
33  * demotion: movement of a block from cache to origin
34  * migration: movement of a block between the origin and cache device,
35  *            either direction
36  */
37
38 /*----------------------------------------------------------------*/
39
40 static size_t bitset_size_in_bytes(unsigned nr_entries)
41 {
42         return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
43 }
44
45 static unsigned long *alloc_bitset(unsigned nr_entries)
46 {
47         size_t s = bitset_size_in_bytes(nr_entries);
48         return vzalloc(s);
49 }
50
51 static void clear_bitset(void *bitset, unsigned nr_entries)
52 {
53         size_t s = bitset_size_in_bytes(nr_entries);
54         memset(bitset, 0, s);
55 }
56
57 static void free_bitset(unsigned long *bits)
58 {
59         vfree(bits);
60 }
61
62 /*----------------------------------------------------------------*/
63
64 #define PRISON_CELLS 1024
65 #define MIGRATION_POOL_SIZE 128
66 #define COMMIT_PERIOD HZ
67 #define MIGRATION_COUNT_WINDOW 10
68
69 /*
70  * The block size of the device holding cache data must be
71  * between 32KB and 1GB.
72  */
73 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
74 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
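/* With SECTOR_SHIFT == 9 these work out to 64 and 2097152 sectors respectively. */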
75
76 /*
77  * FIXME: the cache is read/write for the time being.
78  */
79 enum cache_mode {
80         CM_WRITE,               /* metadata may be changed */
81         CM_READ_ONLY,           /* metadata may not be changed */
82 };
83
84 struct cache_features {
85         enum cache_mode mode;
86         bool write_through:1;
87 };
88
89 struct cache_stats {
90         atomic_t read_hit;
91         atomic_t read_miss;
92         atomic_t write_hit;
93         atomic_t write_miss;
94         atomic_t demotion;
95         atomic_t promotion;
96         atomic_t copies_avoided;
97         atomic_t cache_cell_clash;
98         atomic_t commit_count;
99         atomic_t discard_count;
100 };
101
102 struct cache {
103         struct dm_target *ti;
104         struct dm_target_callbacks callbacks;
105
106         struct dm_cache_metadata *cmd;
107
108         /*
109          * Metadata is written to this device.
110          */
111         struct dm_dev *metadata_dev;
112
113         /*
114          * The slower of the two data devices.  Typically a spindle.
115          */
116         struct dm_dev *origin_dev;
117
118         /*
119          * The faster of the two data devices.  Typically an SSD.
120          */
121         struct dm_dev *cache_dev;
122
123         /*
124          * Size of the origin device in _complete_ blocks and native sectors.
125          */
126         dm_oblock_t origin_blocks;
127         sector_t origin_sectors;
128
129         /*
130          * Size of the cache device in blocks.
131          */
132         dm_cblock_t cache_size;
133
134         /*
135          * Fields for converting from sectors to blocks.
136          */
137         uint32_t sectors_per_block;
138         int sectors_per_block_shift;
139
140         spinlock_t lock;
141         struct bio_list deferred_bios;
142         struct bio_list deferred_flush_bios;
143         struct bio_list deferred_writethrough_bios;
144         struct list_head quiesced_migrations;
145         struct list_head completed_migrations;
146         struct list_head need_commit_migrations;
147         sector_t migration_threshold;
148         wait_queue_head_t migration_wait;
149         atomic_t nr_migrations;
150
151         /*
152          * cache_size entries, dirty if set
153          */
154         dm_cblock_t nr_dirty;
155         unsigned long *dirty_bitset;
156
157         /*
158          * origin_blocks entries, discarded if set.
159          */
160         dm_dblock_t discard_nr_blocks;
161         unsigned long *discard_bitset;
162         uint32_t discard_block_size; /* a power of 2 times sectors per block */
163
164         /*
165          * Rather than reconstructing the table line for status output, we just
166          * save it and regurgitate it.
167          */
168         unsigned nr_ctr_args;
169         const char **ctr_args;
170
171         struct dm_kcopyd_client *copier;
172         struct workqueue_struct *wq;
173         struct work_struct worker;
174
175         struct delayed_work waker;
176         unsigned long last_commit_jiffies;
177
178         struct dm_bio_prison *prison;
179         struct dm_deferred_set *all_io_ds;
180
181         mempool_t *migration_pool;
182         struct dm_cache_migration *next_migration;
183
184         struct dm_cache_policy *policy;
185         unsigned policy_nr_args;
186
187         bool need_tick_bio:1;
188         bool sized:1;
189         bool quiescing:1;
190         bool commit_requested:1;
191         bool loaded_mappings:1;
192         bool loaded_discards:1;
193
194         /*
195          * Cache features such as write-through.
196          */
197         struct cache_features features;
198
199         struct cache_stats stats;
200 };
201
202 struct per_bio_data {
203         bool tick:1;
204         unsigned req_nr:2;
205         struct dm_deferred_entry *all_io_entry;
206
207         /*
208          * writethrough fields.  These MUST remain at the end of this
209          * structure and the 'cache' member must be the first as it
210          * is used to determine the offset of the writethrough fields.
211          */
212         struct cache *cache;
213         dm_cblock_t cblock;
214         bio_end_io_t *saved_bi_end_io;
215         struct dm_bio_details bio_details;
216 };
217
218 struct dm_cache_migration {
219         struct list_head list;
220         struct cache *cache;
221
222         unsigned long start_jiffies;
223         dm_oblock_t old_oblock;
224         dm_oblock_t new_oblock;
225         dm_cblock_t cblock;
226
227         bool err:1;
228         bool writeback:1;
229         bool demote:1;
230         bool promote:1;
231
232         struct dm_bio_prison_cell *old_ocell;
233         struct dm_bio_prison_cell *new_ocell;
234 };
235
236 /*
237  * Processing a bio in the worker thread may require these memory
238  * allocations.  We prealloc to avoid deadlocks (the same worker thread
239  * frees them back to the mempool).
240  */
241 struct prealloc {
242         struct dm_cache_migration *mg;
243         struct dm_bio_prison_cell *cell1;
244         struct dm_bio_prison_cell *cell2;
245 };
246
247 static void wake_worker(struct cache *cache)
248 {
249         queue_work(cache->wq, &cache->worker);
250 }
251
252 /*----------------------------------------------------------------*/
253
254 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
255 {
256         /* FIXME: change to use a local slab. */
257         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
258 }
259
260 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
261 {
262         dm_bio_prison_free_cell(cache->prison, cell);
263 }
264
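/*
 * Top up the prealloc struct using GFP_NOWAIT allocations, so this can
 * fail.  On failure anything already allocated is left in place, ready
 * for a retry or for prealloc_free_structs().
 */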
265 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
266 {
267         if (!p->mg) {
268                 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
269                 if (!p->mg)
270                         return -ENOMEM;
271         }
272
273         if (!p->cell1) {
274                 p->cell1 = alloc_prison_cell(cache);
275                 if (!p->cell1)
276                         return -ENOMEM;
277         }
278
279         if (!p->cell2) {
280                 p->cell2 = alloc_prison_cell(cache);
281                 if (!p->cell2)
282                         return -ENOMEM;
283         }
284
285         return 0;
286 }
287
288 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
289 {
290         if (p->cell2)
291                 free_prison_cell(cache, p->cell2);
292
293         if (p->cell1)
294                 free_prison_cell(cache, p->cell1);
295
296         if (p->mg)
297                 mempool_free(p->mg, cache->migration_pool);
298 }
299
300 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
301 {
302         struct dm_cache_migration *mg = p->mg;
303
304         BUG_ON(!mg);
305         p->mg = NULL;
306
307         return mg;
308 }
309
310 /*
311  * You must have a cell within the prealloc struct to return.  If not, this
312  * function will BUG() rather than return NULL.
313  */
314 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
315 {
316         struct dm_bio_prison_cell *r = NULL;
317
318         if (p->cell1) {
319                 r = p->cell1;
320                 p->cell1 = NULL;
321
322         } else if (p->cell2) {
323                 r = p->cell2;
324                 p->cell2 = NULL;
325         } else
326                 BUG();
327
328         return r;
329 }
330
331 /*
332  * You can't have more than two cells in a prealloc struct.  BUG() will be
333  * called if you try to overfill.
334  */
335 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
336 {
337         if (!p->cell2)
338                 p->cell2 = cell;
339
340         else if (!p->cell1)
341                 p->cell1 = cell;
342
343         else
344                 BUG();
345 }
346
347 /*----------------------------------------------------------------*/
348
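/*
 * Cells are keyed purely by origin block, so every bio or migration that
 * touches the same oblock contends for the same cell.
 */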
349 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
350 {
351         key->virtual = 0;
352         key->dev = 0;
353         key->block = from_oblock(oblock);
354 }
355
356 /*
357  * The caller hands in a preallocated cell, and a free function for it.
358  * The cell will be freed if there's an error, or if it wasn't used because
359  * a cell with that key already exists.
360  */
361 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
362
363 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
364                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
365                       cell_free_fn free_fn, void *free_context,
366                       struct dm_bio_prison_cell **cell_result)
367 {
368         int r;
369         struct dm_cell_key key;
370
371         build_key(oblock, &key);
372         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
373         if (r)
374                 free_fn(free_context, cell_prealloc);
375
376         return r;
377 }
378
379 static int get_cell(struct cache *cache,
380                     dm_oblock_t oblock,
381                     struct prealloc *structs,
382                     struct dm_bio_prison_cell **cell_result)
383 {
384         int r;
385         struct dm_cell_key key;
386         struct dm_bio_prison_cell *cell_prealloc;
387
388         cell_prealloc = prealloc_get_cell(structs);
389
390         build_key(oblock, &key);
391         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
392         if (r)
393                 prealloc_put_cell(structs, cell_prealloc);
394
395         return r;
396 }
397
398 /*----------------------------------------------------------------*/
399
400 static bool is_dirty(struct cache *cache, dm_cblock_t b)
401 {
402         return test_bit(from_cblock(b), cache->dirty_bitset);
403 }
404
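/*
 * The dirty bitset and the policy's dirty state are updated together so
 * they stay in sync; when the last dirty block is cleaned we trigger a
 * table event that userspace can wait on.
 */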
405 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
406 {
407         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
408                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
409                 policy_set_dirty(cache->policy, oblock);
410         }
411 }
412
413 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
414 {
415         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
416                 policy_clear_dirty(cache->policy, oblock);
417                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
418                 if (!from_cblock(cache->nr_dirty))
419                         dm_table_event(cache->ti->table);
420         }
421 }
422
423 /*----------------------------------------------------------------*/
424
425 static bool block_size_is_power_of_two(struct cache *cache)
426 {
427         return cache->sectors_per_block_shift >= 0;
428 }
429
430 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
431 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
432 __always_inline
433 #endif
434 static dm_block_t block_div(dm_block_t b, uint32_t n)
435 {
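        /*
         * do_div() divides b by n in place (b becomes the quotient) and
         * evaluates to the remainder, which is ignored here.
         */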
436         do_div(b, n);
437
438         return b;
439 }
440
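/*
 * Convert an origin block index to a discard block index.  The discard
 * block size is a multiple of the cache/origin block size (in sectors),
 * so dividing it by sectors_per_block gives the number of blocks covered
 * by each discard bitset entry.
 */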
441 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
442 {
443         uint32_t discard_blocks = cache->discard_block_size;
444         dm_block_t b = from_oblock(oblock);
445
446         if (!block_size_is_power_of_two(cache))
447                 discard_blocks = discard_blocks / cache->sectors_per_block;
448         else
449                 discard_blocks >>= cache->sectors_per_block_shift;
450
451         b = block_div(b, discard_blocks);
452
453         return to_dblock(b);
454 }
455
456 static void set_discard(struct cache *cache, dm_dblock_t b)
457 {
458         unsigned long flags;
459
460         atomic_inc(&cache->stats.discard_count);
461
462         spin_lock_irqsave(&cache->lock, flags);
463         set_bit(from_dblock(b), cache->discard_bitset);
464         spin_unlock_irqrestore(&cache->lock, flags);
465 }
466
467 static void clear_discard(struct cache *cache, dm_dblock_t b)
468 {
469         unsigned long flags;
470
471         spin_lock_irqsave(&cache->lock, flags);
472         clear_bit(from_dblock(b), cache->discard_bitset);
473         spin_unlock_irqrestore(&cache->lock, flags);
474 }
475
476 static bool is_discarded(struct cache *cache, dm_dblock_t b)
477 {
478         int r;
479         unsigned long flags;
480
481         spin_lock_irqsave(&cache->lock, flags);
482         r = test_bit(from_dblock(b), cache->discard_bitset);
483         spin_unlock_irqrestore(&cache->lock, flags);
484
485         return r;
486 }
487
488 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
489 {
490         int r;
491         unsigned long flags;
492
493         spin_lock_irqsave(&cache->lock, flags);
494         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
495                      cache->discard_bitset);
496         spin_unlock_irqrestore(&cache->lock, flags);
497
498         return r;
499 }
500
501 /*----------------------------------------------------------------*/
502
503 static void load_stats(struct cache *cache)
504 {
505         struct dm_cache_statistics stats;
506
507         dm_cache_metadata_get_stats(cache->cmd, &stats);
508         atomic_set(&cache->stats.read_hit, stats.read_hits);
509         atomic_set(&cache->stats.read_miss, stats.read_misses);
510         atomic_set(&cache->stats.write_hit, stats.write_hits);
511         atomic_set(&cache->stats.write_miss, stats.write_misses);
512 }
513
514 static void save_stats(struct cache *cache)
515 {
516         struct dm_cache_statistics stats;
517
518         stats.read_hits = atomic_read(&cache->stats.read_hit);
519         stats.read_misses = atomic_read(&cache->stats.read_miss);
520         stats.write_hits = atomic_read(&cache->stats.write_hit);
521         stats.write_misses = atomic_read(&cache->stats.write_miss);
522
523         dm_cache_metadata_set_stats(cache->cmd, &stats);
524 }
525
526 /*----------------------------------------------------------------
527  * Per bio data
528  *--------------------------------------------------------------*/
529
530 /*
531  * If using writeback, leave out struct per_bio_data's writethrough fields.
532  */
533 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
534 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
535
536 static size_t get_per_bio_data_size(struct cache *cache)
537 {
538         return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
539 }
540
541 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
542 {
543         struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
544         BUG_ON(!pb);
545         return pb;
546 }
547
548 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
549 {
550         struct per_bio_data *pb = get_per_bio_data(bio, data_size);
551
552         pb->tick = false;
553         pb->req_nr = dm_bio_get_target_bio_nr(bio);
554         pb->all_io_entry = NULL;
555
556         return pb;
557 }
558
559 /*----------------------------------------------------------------
560  * Remapping
561  *--------------------------------------------------------------*/
562 static void remap_to_origin(struct cache *cache, struct bio *bio)
563 {
564         bio->bi_bdev = cache->origin_dev->bdev;
565 }
566
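/*
 * Redirect a bio to the cache device: the cache block supplies the block
 * offset and the bio keeps its offset within the block.  For example, with
 * 128 sector blocks, a bio at origin sector 1000 mapped to cblock 5 lands
 * on cache sector 5 * 128 + (1000 % 128) = 744.
 */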
567 static void remap_to_cache(struct cache *cache, struct bio *bio,
568                            dm_cblock_t cblock)
569 {
570         sector_t bi_sector = bio->bi_sector;
571
572         bio->bi_bdev = cache->cache_dev->bdev;
573         if (!block_size_is_power_of_two(cache))
574                 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
575                                 sector_div(bi_sector, cache->sectors_per_block);
576         else
577                 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
578                                 (bi_sector & (cache->sectors_per_block - 1));
579 }
580
581 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
582 {
583         unsigned long flags;
584         size_t pb_data_size = get_per_bio_data_size(cache);
585         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
586
587         spin_lock_irqsave(&cache->lock, flags);
588         if (cache->need_tick_bio &&
589             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
590                 pb->tick = true;
591                 cache->need_tick_bio = false;
592         }
593         spin_unlock_irqrestore(&cache->lock, flags);
594 }
595
596 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
597                                   dm_oblock_t oblock)
598 {
599         check_if_tick_bio_needed(cache, bio);
600         remap_to_origin(cache, bio);
601         if (bio_data_dir(bio) == WRITE)
602                 clear_discard(cache, oblock_to_dblock(cache, oblock));
603 }
604
605 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
606                                  dm_oblock_t oblock, dm_cblock_t cblock)
607 {
608         remap_to_cache(cache, bio, cblock);
609         if (bio_data_dir(bio) == WRITE) {
610                 set_dirty(cache, oblock, cblock);
611                 clear_discard(cache, oblock_to_dblock(cache, oblock));
612         }
613 }
614
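/*
 * Which origin block does this bio start in?
 */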
615 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
616 {
617         sector_t block_nr = bio->bi_sector;
618
619         if (!block_size_is_power_of_two(cache))
620                 (void) sector_div(block_nr, cache->sectors_per_block);
621         else
622                 block_nr >>= cache->sectors_per_block_shift;
623
624         return to_oblock(block_nr);
625 }
626
627 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
628 {
629         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
630 }
631
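/*
 * Bios that trigger a commit (FLUSH/FUA) are batched up and issued by the
 * worker after the next metadata commit; everything else is submitted
 * immediately.
 */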
632 static void issue(struct cache *cache, struct bio *bio)
633 {
634         unsigned long flags;
635
636         if (!bio_triggers_commit(cache, bio)) {
637                 generic_make_request(bio);
638                 return;
639         }
640
641         /*
642          * Batch together any bios that trigger commits and then issue a
643          * single commit for them in do_worker().
644          */
645         spin_lock_irqsave(&cache->lock, flags);
646         cache->commit_requested = true;
647         bio_list_add(&cache->deferred_flush_bios, bio);
648         spin_unlock_irqrestore(&cache->lock, flags);
649 }
650
651 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
652 {
653         unsigned long flags;
654
655         spin_lock_irqsave(&cache->lock, flags);
656         bio_list_add(&cache->deferred_writethrough_bios, bio);
657         spin_unlock_irqrestore(&cache->lock, flags);
658
659         wake_worker(cache);
660 }
661
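/*
 * The write to the origin has completed; restore the bio and resubmit it,
 * this time aimed at the cache device, via the worker thread.
 */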
662 static void writethrough_endio(struct bio *bio, int err)
663 {
664         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
665         bio->bi_end_io = pb->saved_bi_end_io;
666
667         if (err) {
668                 bio_endio(bio, err);
669                 return;
670         }
671
672         dm_bio_restore(&pb->bio_details, bio);
673         remap_to_cache(pb->cache, bio, pb->cblock);
674
675         /*
676          * We can't issue this bio directly, since we're in interrupt
677          * context.  So it gets put on a bio list for processing by the
678          * worker thread.
679          */
680         defer_writethrough_bio(pb->cache, bio);
681 }
682
683 /*
684  * When running in writethrough mode we need to send writes to clean blocks
685  * to both the cache and origin devices.  In future we'd like to clone the
686  * bio and send the copies in parallel, but for now we issue them in
687  * series as this is easier.
688  */
689 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
690                                        dm_oblock_t oblock, dm_cblock_t cblock)
691 {
692         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
693
694         pb->cache = cache;
695         pb->cblock = cblock;
696         pb->saved_bi_end_io = bio->bi_end_io;
697         dm_bio_record(&pb->bio_details, bio);
698         bio->bi_end_io = writethrough_endio;
699
700         remap_to_origin_clear_discard(pb->cache, bio, oblock);
701 }
702
703 /*----------------------------------------------------------------
704  * Migration processing
705  *
706  * Migration covers moving data from the origin device to the cache, or
707  * vice versa.
708  *--------------------------------------------------------------*/
709 static void free_migration(struct dm_cache_migration *mg)
710 {
711         mempool_free(mg, mg->cache->migration_pool);
712 }
713
714 static void inc_nr_migrations(struct cache *cache)
715 {
716         atomic_inc(&cache->nr_migrations);
717 }
718
719 static void dec_nr_migrations(struct cache *cache)
720 {
721         atomic_dec(&cache->nr_migrations);
722
723         /*
724          * Wake the worker in case we're suspending the target.
725          */
726         wake_up(&cache->migration_wait);
727 }
728
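/*
 * Release a cell, moving the bios it holds onto the deferred list (the
 * holder bio is only included if 'holder' is true), then return the cell
 * to the prison.
 */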
729 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
730                          bool holder)
731 {
732         (holder ? dm_cell_release : dm_cell_release_no_holder)
733                 (cache->prison, cell, &cache->deferred_bios);
734         free_prison_cell(cache, cell);
735 }
736
737 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
738                        bool holder)
739 {
740         unsigned long flags;
741
742         spin_lock_irqsave(&cache->lock, flags);
743         __cell_defer(cache, cell, holder);
744         spin_unlock_irqrestore(&cache->lock, flags);
745
746         wake_worker(cache);
747 }
748
749 static void cleanup_migration(struct dm_cache_migration *mg)
750 {
751         dec_nr_migrations(mg->cache);
752         free_migration(mg);
753 }
754
755 static void migration_failure(struct dm_cache_migration *mg)
756 {
757         struct cache *cache = mg->cache;
758
759         if (mg->writeback) {
760                 DMWARN_LIMIT("writeback failed; couldn't copy block");
761                 set_dirty(cache, mg->old_oblock, mg->cblock);
762                 cell_defer(cache, mg->old_ocell, false);
763
764         } else if (mg->demote) {
765                 DMWARN_LIMIT("demotion failed; couldn't copy block");
766                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
767
768                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
769                 if (mg->promote)
770                         cell_defer(cache, mg->new_ocell, 1);
771         } else {
772                 DMWARN_LIMIT("promotion failed; couldn't copy block");
773                 policy_remove_mapping(cache->policy, mg->new_oblock);
774                 cell_defer(cache, mg->new_ocell, 1);
775         }
776
777         cleanup_migration(mg);
778 }
779
780 static void migration_success_pre_commit(struct dm_cache_migration *mg)
781 {
782         unsigned long flags;
783         struct cache *cache = mg->cache;
784
785         if (mg->writeback) {
786                 cell_defer(cache, mg->old_ocell, false);
787                 clear_dirty(cache, mg->old_oblock, mg->cblock);
788                 cleanup_migration(mg);
789                 return;
790
791         } else if (mg->demote) {
792                 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
793                         DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
794                         policy_force_mapping(cache->policy, mg->new_oblock,
795                                              mg->old_oblock);
796                         if (mg->promote)
797                                 cell_defer(cache, mg->new_ocell, true);
798                         cleanup_migration(mg);
799                         return;
800                 }
801         } else {
802                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
803                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
804                         policy_remove_mapping(cache->policy, mg->new_oblock);
805                         cleanup_migration(mg);
806                         return;
807                 }
808         }
809
810         spin_lock_irqsave(&cache->lock, flags);
811         list_add_tail(&mg->list, &cache->need_commit_migrations);
812         cache->commit_requested = true;
813         spin_unlock_irqrestore(&cache->lock, flags);
814 }
815
816 static void migration_success_post_commit(struct dm_cache_migration *mg)
817 {
818         unsigned long flags;
819         struct cache *cache = mg->cache;
820
821         if (mg->writeback) {
822                 DMWARN("writeback unexpectedly triggered commit");
823                 return;
824
825         } else if (mg->demote) {
826                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
827
828                 if (mg->promote) {
829                         mg->demote = false;
830
831                         spin_lock_irqsave(&cache->lock, flags);
832                         list_add_tail(&mg->list, &cache->quiesced_migrations);
833                         spin_unlock_irqrestore(&cache->lock, flags);
834
835                 } else
836                         cleanup_migration(mg);
837
838         } else {
839                 cell_defer(cache, mg->new_ocell, true);
840                 clear_dirty(cache, mg->new_oblock, mg->cblock);
841                 cleanup_migration(mg);
842         }
843 }
844
845 static void copy_complete(int read_err, unsigned long write_err, void *context)
846 {
847         unsigned long flags;
848         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
849         struct cache *cache = mg->cache;
850
851         if (read_err || write_err)
852                 mg->err = true;
853
854         spin_lock_irqsave(&cache->lock, flags);
855         list_add_tail(&mg->list, &cache->completed_migrations);
856         spin_unlock_irqrestore(&cache->lock, flags);
857
858         wake_worker(cache);
859 }
860
861 static void issue_copy_real(struct dm_cache_migration *mg)
862 {
863         int r;
864         struct dm_io_region o_region, c_region;
865         struct cache *cache = mg->cache;
866
867         o_region.bdev = cache->origin_dev->bdev;
868         o_region.count = cache->sectors_per_block;
869
870         c_region.bdev = cache->cache_dev->bdev;
871         c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
872         c_region.count = cache->sectors_per_block;
873
874         if (mg->writeback || mg->demote) {
875                 /* demote */
876                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
877                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
878         } else {
879                 /* promote */
880                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
881                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
882         }
883
884         if (r < 0)
885                 migration_failure(mg);
886 }
887
888 static void avoid_copy(struct dm_cache_migration *mg)
889 {
890         atomic_inc(&mg->cache->stats.copies_avoided);
891         migration_success_pre_commit(mg);
892 }
893
894 static void issue_copy(struct dm_cache_migration *mg)
895 {
896         bool avoid;
897         struct cache *cache = mg->cache;
898
899         if (mg->writeback || mg->demote)
900                 avoid = !is_dirty(cache, mg->cblock) ||
901                         is_discarded_oblock(cache, mg->old_oblock);
902         else
903                 avoid = is_discarded_oblock(cache, mg->new_oblock);
904
905         avoid ? avoid_copy(mg) : issue_copy_real(mg);
906 }
907
908 static void complete_migration(struct dm_cache_migration *mg)
909 {
910         if (mg->err)
911                 migration_failure(mg);
912         else
913                 migration_success_pre_commit(mg);
914 }
915
916 static void process_migrations(struct cache *cache, struct list_head *head,
917                                void (*fn)(struct dm_cache_migration *))
918 {
919         unsigned long flags;
920         struct list_head list;
921         struct dm_cache_migration *mg, *tmp;
922
923         INIT_LIST_HEAD(&list);
924         spin_lock_irqsave(&cache->lock, flags);
925         list_splice_init(head, &list);
926         spin_unlock_irqrestore(&cache->lock, flags);
927
928         list_for_each_entry_safe(mg, tmp, &list, list)
929                 fn(mg);
930 }
931
932 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
933 {
934         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
935 }
936
937 static void queue_quiesced_migration(struct dm_cache_migration *mg)
938 {
939         unsigned long flags;
940         struct cache *cache = mg->cache;
941
942         spin_lock_irqsave(&cache->lock, flags);
943         __queue_quiesced_migration(mg);
944         spin_unlock_irqrestore(&cache->lock, flags);
945
946         wake_worker(cache);
947 }
948
949 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
950 {
951         unsigned long flags;
952         struct dm_cache_migration *mg, *tmp;
953
954         spin_lock_irqsave(&cache->lock, flags);
955         list_for_each_entry_safe(mg, tmp, work, list)
956                 __queue_quiesced_migration(mg);
957         spin_unlock_irqrestore(&cache->lock, flags);
958
959         wake_worker(cache);
960 }
961
962 static void check_for_quiesced_migrations(struct cache *cache,
963                                           struct per_bio_data *pb)
964 {
965         struct list_head work;
966
967         if (!pb->all_io_entry)
968                 return;
969
970         INIT_LIST_HEAD(&work);
971         if (pb->all_io_entry)
972                 dm_deferred_entry_dec(pb->all_io_entry, &work);
973
974         if (!list_empty(&work))
975                 queue_quiesced_migrations(cache, &work);
976 }
977
978 static void quiesce_migration(struct dm_cache_migration *mg)
979 {
980         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
981                 queue_quiesced_migration(mg);
982 }
983
984 static void promote(struct cache *cache, struct prealloc *structs,
985                     dm_oblock_t oblock, dm_cblock_t cblock,
986                     struct dm_bio_prison_cell *cell)
987 {
988         struct dm_cache_migration *mg = prealloc_get_migration(structs);
989
990         mg->err = false;
991         mg->writeback = false;
992         mg->demote = false;
993         mg->promote = true;
994         mg->cache = cache;
995         mg->new_oblock = oblock;
996         mg->cblock = cblock;
997         mg->old_ocell = NULL;
998         mg->new_ocell = cell;
999         mg->start_jiffies = jiffies;
1000
1001         inc_nr_migrations(cache);
1002         quiesce_migration(mg);
1003 }
1004
1005 static void writeback(struct cache *cache, struct prealloc *structs,
1006                       dm_oblock_t oblock, dm_cblock_t cblock,
1007                       struct dm_bio_prison_cell *cell)
1008 {
1009         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1010
1011         mg->err = false;
1012         mg->writeback = true;
1013         mg->demote = false;
1014         mg->promote = false;
1015         mg->cache = cache;
1016         mg->old_oblock = oblock;
1017         mg->cblock = cblock;
1018         mg->old_ocell = cell;
1019         mg->new_ocell = NULL;
1020         mg->start_jiffies = jiffies;
1021
1022         inc_nr_migrations(cache);
1023         quiesce_migration(mg);
1024 }
1025
1026 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1027                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1028                                 dm_cblock_t cblock,
1029                                 struct dm_bio_prison_cell *old_ocell,
1030                                 struct dm_bio_prison_cell *new_ocell)
1031 {
1032         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1033
1034         mg->err = false;
1035         mg->writeback = false;
1036         mg->demote = true;
1037         mg->promote = true;
1038         mg->cache = cache;
1039         mg->old_oblock = old_oblock;
1040         mg->new_oblock = new_oblock;
1041         mg->cblock = cblock;
1042         mg->old_ocell = old_ocell;
1043         mg->new_ocell = new_ocell;
1044         mg->start_jiffies = jiffies;
1045
1046         inc_nr_migrations(cache);
1047         quiesce_migration(mg);
1048 }
1049
1050 /*----------------------------------------------------------------
1051  * bio processing
1052  *--------------------------------------------------------------*/
1053 static void defer_bio(struct cache *cache, struct bio *bio)
1054 {
1055         unsigned long flags;
1056
1057         spin_lock_irqsave(&cache->lock, flags);
1058         bio_list_add(&cache->deferred_bios, bio);
1059         spin_unlock_irqrestore(&cache->lock, flags);
1060
1061         wake_worker(cache);
1062 }
1063
1064 static void process_flush_bio(struct cache *cache, struct bio *bio)
1065 {
1066         size_t pb_data_size = get_per_bio_data_size(cache);
1067         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1068
1069         BUG_ON(bio->bi_size);
1070         if (!pb->req_nr)
1071                 remap_to_origin(cache, bio);
1072         else
1073                 remap_to_cache(cache, bio, 0);
1074
1075         issue(cache, bio);
1076 }
1077
1078 /*
1079  * People generally discard large parts of a device, e.g. the whole device
1080  * when formatting.  Splitting these large discards up into cache-block-
1081  * sized I/Os and then quiescing (always necessary for discard) takes too
1082  * long.
1083  *
1084  * We keep it simple, and allow any size of discard to come in, and just
1085  * mark off blocks on the discard bitset.  No passdown occurs!
1086  *
1087  * To implement passdown we need to change the bio_prison such that a cell
1088  * can have a key that spans many blocks.
1089  */
1090 static void process_discard_bio(struct cache *cache, struct bio *bio)
1091 {
1092         dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1093                                                   cache->discard_block_size);
1094         dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1095         dm_block_t b;
1096
1097         end_block = block_div(end_block, cache->discard_block_size);
1098
1099         for (b = start_block; b < end_block; b++)
1100                 set_discard(cache, to_dblock(b));
1101
1102         bio_endio(bio, 0);
1103 }
1104
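/*
 * Would starting another migration keep the volume of data currently being
 * copied (in sectors) under the user-settable migration_threshold?
 */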
1105 static bool spare_migration_bandwidth(struct cache *cache)
1106 {
1107         sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1108                 cache->sectors_per_block;
1109         return current_volume < cache->migration_threshold;
1110 }
1111
1112 static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1113                                dm_cblock_t cblock)
1114 {
1115         return bio_data_dir(bio) == WRITE &&
1116                 cache->features.write_through && !is_dirty(cache, cblock);
1117 }
1118
1119 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1120 {
1121         atomic_inc(bio_data_dir(bio) == READ ?
1122                    &cache->stats.read_hit : &cache->stats.write_hit);
1123 }
1124
1125 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1126 {
1127         atomic_inc(bio_data_dir(bio) == READ ?
1128                    &cache->stats.read_miss : &cache->stats.write_miss);
1129 }
1130
1131 static void process_bio(struct cache *cache, struct prealloc *structs,
1132                         struct bio *bio)
1133 {
1134         int r;
1135         bool release_cell = true;
1136         dm_oblock_t block = get_bio_block(cache, bio);
1137         struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1138         struct policy_result lookup_result;
1139         size_t pb_data_size = get_per_bio_data_size(cache);
1140         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1141         bool discarded_block = is_discarded_oblock(cache, block);
1142         bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1143
1144         /*
1145          * Check to see if that block is currently migrating.
1146          */
1147         cell_prealloc = prealloc_get_cell(structs);
1148         r = bio_detain(cache, block, bio, cell_prealloc,
1149                        (cell_free_fn) prealloc_put_cell,
1150                        structs, &new_ocell);
1151         if (r > 0)
1152                 return;
1153
1154         r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1155                        bio, &lookup_result);
1156
1157         if (r == -EWOULDBLOCK)
1158                 /* migration has been denied */
1159                 lookup_result.op = POLICY_MISS;
1160
1161         switch (lookup_result.op) {
1162         case POLICY_HIT:
1163                 inc_hit_counter(cache, bio);
1164                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1165
1166                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
1167                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1168                 else
1169                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1170
1171                 issue(cache, bio);
1172                 break;
1173
1174         case POLICY_MISS:
1175                 inc_miss_counter(cache, bio);
1176                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1177                 remap_to_origin_clear_discard(cache, bio, block);
1178                 issue(cache, bio);
1179                 break;
1180
1181         case POLICY_NEW:
1182                 atomic_inc(&cache->stats.promotion);
1183                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1184                 release_cell = false;
1185                 break;
1186
1187         case POLICY_REPLACE:
1188                 cell_prealloc = prealloc_get_cell(structs);
1189                 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1190                                (cell_free_fn) prealloc_put_cell,
1191                                structs, &old_ocell);
1192                 if (r > 0) {
1193                         /*
1194                          * We have to be careful to avoid lock inversion of
1195                          * the cells.  So we back off, and wait for the
1196                          * old_ocell to become free.
1197                          */
1198                         policy_force_mapping(cache->policy, block,
1199                                              lookup_result.old_oblock);
1200                         atomic_inc(&cache->stats.cache_cell_clash);
1201                         break;
1202                 }
1203                 atomic_inc(&cache->stats.demotion);
1204                 atomic_inc(&cache->stats.promotion);
1205
1206                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1207                                     block, lookup_result.cblock,
1208                                     old_ocell, new_ocell);
1209                 release_cell = false;
1210                 break;
1211
1212         default:
1213                 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1214                             (unsigned) lookup_result.op);
1215                 bio_io_error(bio);
1216         }
1217
1218         if (release_cell)
1219                 cell_defer(cache, new_ocell, false);
1220 }
1221
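/*
 * A commit is due once COMMIT_PERIOD has elapsed; the first comparison
 * copes with jiffies wrapping around last_commit_jiffies.
 */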
1222 static int need_commit_due_to_time(struct cache *cache)
1223 {
1224         return jiffies < cache->last_commit_jiffies ||
1225                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1226 }
1227
1228 static int commit_if_needed(struct cache *cache)
1229 {
1230         if (dm_cache_changed_this_transaction(cache->cmd) &&
1231             (cache->commit_requested || need_commit_due_to_time(cache))) {
1232                 atomic_inc(&cache->stats.commit_count);
1233                 cache->last_commit_jiffies = jiffies;
1234                 cache->commit_requested = false;
1235                 return dm_cache_commit(cache->cmd, false);
1236         }
1237
1238         return 0;
1239 }
1240
1241 static void process_deferred_bios(struct cache *cache)
1242 {
1243         unsigned long flags;
1244         struct bio_list bios;
1245         struct bio *bio;
1246         struct prealloc structs;
1247
1248         memset(&structs, 0, sizeof(structs));
1249         bio_list_init(&bios);
1250
1251         spin_lock_irqsave(&cache->lock, flags);
1252         bio_list_merge(&bios, &cache->deferred_bios);
1253         bio_list_init(&cache->deferred_bios);
1254         spin_unlock_irqrestore(&cache->lock, flags);
1255
1256         while (!bio_list_empty(&bios)) {
1257                 /*
1258                  * If we've got no free migration structs, and processing
1259                  * this bio might require one, we pause until there are some
1260                  * prepared mappings to process.
1261                  */
1262                 if (prealloc_data_structs(cache, &structs)) {
1263                         spin_lock_irqsave(&cache->lock, flags);
1264                         bio_list_merge(&cache->deferred_bios, &bios);
1265                         spin_unlock_irqrestore(&cache->lock, flags);
1266                         break;
1267                 }
1268
1269                 bio = bio_list_pop(&bios);
1270
1271                 if (bio->bi_rw & REQ_FLUSH)
1272                         process_flush_bio(cache, bio);
1273                 else if (bio->bi_rw & REQ_DISCARD)
1274                         process_discard_bio(cache, bio);
1275                 else
1276                         process_bio(cache, &structs, bio);
1277         }
1278
1279         prealloc_free_structs(cache, &structs);
1280 }
1281
1282 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1283 {
1284         unsigned long flags;
1285         struct bio_list bios;
1286         struct bio *bio;
1287
1288         bio_list_init(&bios);
1289
1290         spin_lock_irqsave(&cache->lock, flags);
1291         bio_list_merge(&bios, &cache->deferred_flush_bios);
1292         bio_list_init(&cache->deferred_flush_bios);
1293         spin_unlock_irqrestore(&cache->lock, flags);
1294
1295         while ((bio = bio_list_pop(&bios)))
1296                 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1297 }
1298
1299 static void process_deferred_writethrough_bios(struct cache *cache)
1300 {
1301         unsigned long flags;
1302         struct bio_list bios;
1303         struct bio *bio;
1304
1305         bio_list_init(&bios);
1306
1307         spin_lock_irqsave(&cache->lock, flags);
1308         bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1309         bio_list_init(&cache->deferred_writethrough_bios);
1310         spin_unlock_irqrestore(&cache->lock, flags);
1311
1312         while ((bio = bio_list_pop(&bios)))
1313                 generic_make_request(bio);
1314 }
1315
1316 static void writeback_some_dirty_blocks(struct cache *cache)
1317 {
1318         int r = 0;
1319         dm_oblock_t oblock;
1320         dm_cblock_t cblock;
1321         struct prealloc structs;
1322         struct dm_bio_prison_cell *old_ocell;
1323
1324         memset(&structs, 0, sizeof(structs));
1325
1326         while (spare_migration_bandwidth(cache)) {
1327                 if (prealloc_data_structs(cache, &structs))
1328                         break;
1329
1330                 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1331                 if (r)
1332                         break;
1333
1334                 r = get_cell(cache, oblock, &structs, &old_ocell);
1335                 if (r) {
1336                         policy_set_dirty(cache->policy, oblock);
1337                         break;
1338                 }
1339
1340                 writeback(cache, &structs, oblock, cblock, old_ocell);
1341         }
1342
1343         prealloc_free_structs(cache, &structs);
1344 }
1345
1346 /*----------------------------------------------------------------
1347  * Main worker loop
1348  *--------------------------------------------------------------*/
1349 static void start_quiescing(struct cache *cache)
1350 {
1351         unsigned long flags;
1352
1353         spin_lock_irqsave(&cache->lock, flags);
1354         cache->quiescing = 1;
1355         spin_unlock_irqrestore(&cache->lock, flags);
1356 }
1357
1358 static void stop_quiescing(struct cache *cache)
1359 {
1360         unsigned long flags;
1361
1362         spin_lock_irqsave(&cache->lock, flags);
1363         cache->quiescing = 0;
1364         spin_unlock_irqrestore(&cache->lock, flags);
1365 }
1366
1367 static bool is_quiescing(struct cache *cache)
1368 {
1369         int r;
1370         unsigned long flags;
1371
1372         spin_lock_irqsave(&cache->lock, flags);
1373         r = cache->quiescing;
1374         spin_unlock_irqrestore(&cache->lock, flags);
1375
1376         return r;
1377 }
1378
1379 static void wait_for_migrations(struct cache *cache)
1380 {
1381         wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1382 }
1383
1384 static void stop_worker(struct cache *cache)
1385 {
1386         cancel_delayed_work(&cache->waker);
1387         flush_workqueue(cache->wq);
1388 }
1389
1390 static void requeue_deferred_io(struct cache *cache)
1391 {
1392         struct bio *bio;
1393         struct bio_list bios;
1394
1395         bio_list_init(&bios);
1396         bio_list_merge(&bios, &cache->deferred_bios);
1397         bio_list_init(&cache->deferred_bios);
1398
1399         while ((bio = bio_list_pop(&bios)))
1400                 bio_endio(bio, DM_ENDIO_REQUEUE);
1401 }
1402
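/*
 * While quiescing, only outstanding migrations are drained; deferred bios
 * are left untouched until quiescing stops.
 */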
1403 static int more_work(struct cache *cache)
1404 {
1405         if (is_quiescing(cache))
1406                 return !list_empty(&cache->quiesced_migrations) ||
1407                         !list_empty(&cache->completed_migrations) ||
1408                         !list_empty(&cache->need_commit_migrations);
1409         else
1410                 return !bio_list_empty(&cache->deferred_bios) ||
1411                         !bio_list_empty(&cache->deferred_flush_bios) ||
1412                         !bio_list_empty(&cache->deferred_writethrough_bios) ||
1413                         !list_empty(&cache->quiesced_migrations) ||
1414                         !list_empty(&cache->completed_migrations) ||
1415                         !list_empty(&cache->need_commit_migrations);
1416 }
1417
1418 static void do_worker(struct work_struct *ws)
1419 {
1420         struct cache *cache = container_of(ws, struct cache, worker);
1421
1422         do {
1423                 if (!is_quiescing(cache))
1424                         process_deferred_bios(cache);
1425
1426                 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1427                 process_migrations(cache, &cache->completed_migrations, complete_migration);
1428
1429                 writeback_some_dirty_blocks(cache);
1430
1431                 process_deferred_writethrough_bios(cache);
1432
1433                 if (commit_if_needed(cache)) {
1434                         process_deferred_flush_bios(cache, false);
1435
1436                         /*
1437                          * FIXME: rollback metadata or just go into a
1438                          * failure mode and error everything
1439                          */
1440                 } else {
1441                         process_deferred_flush_bios(cache, true);
1442                         process_migrations(cache, &cache->need_commit_migrations,
1443                                            migration_success_post_commit);
1444                 }
1445         } while (more_work(cache));
1446 }
1447
1448 /*
1449  * We want to commit periodically so that not too much
1450  * unwritten metadata builds up.
1451  */
1452 static void do_waker(struct work_struct *ws)
1453 {
1454         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1455         policy_tick(cache->policy);
1456         wake_worker(cache);
1457         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1458 }
1459
1460 /*----------------------------------------------------------------*/
1461
1462 static int is_congested(struct dm_dev *dev, int bdi_bits)
1463 {
1464         struct request_queue *q = bdev_get_queue(dev->bdev);
1465         return bdi_congested(&q->backing_dev_info, bdi_bits);
1466 }
1467
1468 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1469 {
1470         struct cache *cache = container_of(cb, struct cache, callbacks);
1471
1472         return is_congested(cache->origin_dev, bdi_bits) ||
1473                 is_congested(cache->cache_dev, bdi_bits);
1474 }
1475
1476 /*----------------------------------------------------------------
1477  * Target methods
1478  *--------------------------------------------------------------*/
1479
1480 /*
1481  * This function gets called on the error paths of the constructor, so we
1482  * have to cope with a partially initialised struct.
1483  */
1484 static void destroy(struct cache *cache)
1485 {
1486         unsigned i;
1487
1488         if (cache->next_migration)
1489                 mempool_free(cache->next_migration, cache->migration_pool);
1490
1491         if (cache->migration_pool)
1492                 mempool_destroy(cache->migration_pool);
1493
1494         if (cache->all_io_ds)
1495                 dm_deferred_set_destroy(cache->all_io_ds);
1496
1497         if (cache->prison)
1498                 dm_bio_prison_destroy(cache->prison);
1499
1500         if (cache->wq)
1501                 destroy_workqueue(cache->wq);
1502
1503         if (cache->dirty_bitset)
1504                 free_bitset(cache->dirty_bitset);
1505
1506         if (cache->discard_bitset)
1507                 free_bitset(cache->discard_bitset);
1508
1509         if (cache->copier)
1510                 dm_kcopyd_client_destroy(cache->copier);
1511
1512         if (cache->cmd)
1513                 dm_cache_metadata_close(cache->cmd);
1514
1515         if (cache->metadata_dev)
1516                 dm_put_device(cache->ti, cache->metadata_dev);
1517
1518         if (cache->origin_dev)
1519                 dm_put_device(cache->ti, cache->origin_dev);
1520
1521         if (cache->cache_dev)
1522                 dm_put_device(cache->ti, cache->cache_dev);
1523
1524         if (cache->policy)
1525                 dm_cache_policy_destroy(cache->policy);
1526
1527         for (i = 0; i < cache->nr_ctr_args ; i++)
1528                 kfree(cache->ctr_args[i]);
1529         kfree(cache->ctr_args);
1530
1531         kfree(cache);
1532 }
1533
1534 static void cache_dtr(struct dm_target *ti)
1535 {
1536         struct cache *cache = ti->private;
1537
1538         destroy(cache);
1539 }
1540
1541 static sector_t get_dev_size(struct dm_dev *dev)
1542 {
1543         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1544 }
1545
1546 /*----------------------------------------------------------------*/
1547
1548 /*
1549  * Construct a cache device mapping.
1550  *
1551  * cache <metadata dev> <cache dev> <origin dev> <block size>
1552  *       <#feature args> [<feature arg>]*
1553  *       <policy> <#policy args> [<policy arg>]*
1554  *
1555  * metadata dev    : fast device holding the persistent metadata
1556  * cache dev       : fast device holding cached data blocks
1557  * origin dev      : slow device holding original data blocks
1558  * block size      : cache unit size in sectors
1559  *
1560  * #feature args   : number of feature arguments passed
1561  * feature args    : writethrough.  (The default is writeback.)
1562  *
1563  * policy          : the replacement policy to use
1564  * #policy args    : an even number of policy arguments corresponding
1565  *                   to key/value pairs passed to the policy
1566  * policy args     : key/value pairs passed to the policy
1567  *                   E.g. 'sequential_threshold 1024'
1568  *                   See cache-policies.txt for details.
1569  *
1570  * Optional feature arguments are:
1571  *   writethrough  : write through caching that prohibits cache block
1572  *                   content from being different from origin block content.
1573  *                   Without this argument, the default behaviour is to write
1574  *                   back cache block contents later for performance reasons,
1575  *                   so they may differ from the corresponding origin blocks.
1576  */
1577 struct cache_args {
1578         struct dm_target *ti;
1579
1580         struct dm_dev *metadata_dev;
1581
1582         struct dm_dev *cache_dev;
1583         sector_t cache_sectors;
1584
1585         struct dm_dev *origin_dev;
1586         sector_t origin_sectors;
1587
1588         uint32_t block_size;
1589
1590         const char *policy_name;
1591         int policy_argc;
1592         const char **policy_argv;
1593
1594         struct cache_features features;
1595 };
1596
1597 static void destroy_cache_args(struct cache_args *ca)
1598 {
1599         if (ca->metadata_dev)
1600                 dm_put_device(ca->ti, ca->metadata_dev);
1601
1602         if (ca->cache_dev)
1603                 dm_put_device(ca->ti, ca->cache_dev);
1604
1605         if (ca->origin_dev)
1606                 dm_put_device(ca->ti, ca->origin_dev);
1607
1608         kfree(ca);
1609 }
1610
1611 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1612 {
1613         if (!as->argc) {
1614                 *error = "Insufficient args";
1615                 return false;
1616         }
1617
1618         return true;
1619 }
1620
1621 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1622                               char **error)
1623 {
1624         int r;
1625         sector_t metadata_dev_size;
1626         char b[BDEVNAME_SIZE];
1627
1628         if (!at_least_one_arg(as, error))
1629                 return -EINVAL;
1630
1631         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1632                           &ca->metadata_dev);
1633         if (r) {
1634                 *error = "Error opening metadata device";
1635                 return r;
1636         }
1637
1638         metadata_dev_size = get_dev_size(ca->metadata_dev);
1639         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1640                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1641                        bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS);
1642
1643         return 0;
1644 }
1645
1646 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1647                            char **error)
1648 {
1649         int r;
1650
1651         if (!at_least_one_arg(as, error))
1652                 return -EINVAL;
1653
1654         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1655                           &ca->cache_dev);
1656         if (r) {
1657                 *error = "Error opening cache device";
1658                 return r;
1659         }
1660         ca->cache_sectors = get_dev_size(ca->cache_dev);
1661
1662         return 0;
1663 }
1664
1665 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1666                             char **error)
1667 {
1668         int r;
1669
1670         if (!at_least_one_arg(as, error))
1671                 return -EINVAL;
1672
1673         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1674                           &ca->origin_dev);
1675         if (r) {
1676                 *error = "Error opening origin device";
1677                 return r;
1678         }
1679
1680         ca->origin_sectors = get_dev_size(ca->origin_dev);
1681         if (ca->ti->len > ca->origin_sectors) {
1682                 *error = "Device size larger than cached device";
1683                 return -EINVAL;
1684         }
1685
1686         return 0;
1687 }
1688
1689 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1690                             char **error)
1691 {
1692         unsigned long block_size;
1693
1694         if (!at_least_one_arg(as, error))
1695                 return -EINVAL;
1696
1697         if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
1698             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1699             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1700             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1701                 *error = "Invalid data block size";
1702                 return -EINVAL;
1703         }
1704
1705         if (block_size > ca->cache_sectors) {
1706                 *error = "Data block size is larger than the cache device";
1707                 return -EINVAL;
1708         }
1709
1710         ca->block_size = block_size;
1711
1712         return 0;
1713 }
1714
1715 static void init_features(struct cache_features *cf)
1716 {
1717         cf->mode = CM_WRITE;
1718         cf->write_through = false;
1719 }
1720
1721 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1722                           char **error)
1723 {
1724         static struct dm_arg _args[] = {
1725                 {0, 1, "Invalid number of cache feature arguments"},
1726         };
1727
1728         int r;
1729         unsigned argc;
1730         const char *arg;
1731         struct cache_features *cf = &ca->features;
1732
1733         init_features(cf);
1734
1735         r = dm_read_arg_group(_args, as, &argc, error);
1736         if (r)
1737                 return -EINVAL;
1738
1739         while (argc--) {
1740                 arg = dm_shift_arg(as);
1741
1742                 if (!strcasecmp(arg, "writeback"))
1743                         cf->write_through = false;
1744
1745                 else if (!strcasecmp(arg, "writethrough"))
1746                         cf->write_through = true;
1747
1748                 else {
1749                         *error = "Unrecognised cache feature requested";
1750                         return -EINVAL;
1751                 }
1752         }
1753
1754         return 0;
1755 }
1756
1757 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1758                         char **error)
1759 {
1760         static struct dm_arg _args[] = {
1761                 {0, 1024, "Invalid number of policy arguments"},
1762         };
1763
1764         int r;
1765
1766         if (!at_least_one_arg(as, error))
1767                 return -EINVAL;
1768
1769         ca->policy_name = dm_shift_arg(as);
1770
1771         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1772         if (r)
1773                 return -EINVAL;
1774
1775         ca->policy_argv = (const char **)as->argv;
1776         dm_consume_args(as, ca->policy_argc);
1777
1778         return 0;
1779 }
1780
1781 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1782                             char **error)
1783 {
1784         int r;
1785         struct dm_arg_set as;
1786
1787         as.argc = argc;
1788         as.argv = argv;
1789
1790         r = parse_metadata_dev(ca, &as, error);
1791         if (r)
1792                 return r;
1793
1794         r = parse_cache_dev(ca, &as, error);
1795         if (r)
1796                 return r;
1797
1798         r = parse_origin_dev(ca, &as, error);
1799         if (r)
1800                 return r;
1801
1802         r = parse_block_size(ca, &as, error);
1803         if (r)
1804                 return r;
1805
1806         r = parse_features(ca, &as, error);
1807         if (r)
1808                 return r;
1809
1810         r = parse_policy(ca, &as, error);
1811         if (r)
1812                 return r;
1813
1814         return 0;
1815 }
1816
1817 /*----------------------------------------------------------------*/
1818
1819 static struct kmem_cache *migration_cache;
1820
1821 #define NOT_CORE_OPTION 1
1822
1823 static int process_config_option(struct cache *cache, const char *key, const char *value)
1824 {
1825         unsigned long tmp;
1826
1827         if (!strcasecmp(key, "migration_threshold")) {
1828                 if (kstrtoul(value, 10, &tmp))
1829                         return -EINVAL;
1830
1831                 cache->migration_threshold = tmp;
1832                 return 0;
1833         }
1834
1835         return NOT_CORE_OPTION;
1836 }
1837
1838 static int set_config_value(struct cache *cache, const char *key, const char *value)
1839 {
1840         int r = process_config_option(cache, key, value);
1841
1842         if (r == NOT_CORE_OPTION)
1843                 r = policy_set_config_value(cache->policy, key, value);
1844
1845         if (r)
1846                 DMWARN("bad config value for %s: %s", key, value);
1847
1848         return r;
1849 }
1850
1851 static int set_config_values(struct cache *cache, int argc, const char **argv)
1852 {
1853         int r = 0;
1854
1855         if (argc & 1) {
1856                 DMWARN("Odd number of policy arguments given; they should be <key> <value> pairs.");
1857                 return -EINVAL;
1858         }
1859
1860         while (argc) {
1861                 r = set_config_value(cache, argv[0], argv[1]);
1862                 if (r)
1863                         break;
1864
1865                 argc -= 2;
1866                 argv += 2;
1867         }
1868
1869         return r;
1870 }
1871
1872 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1873                                char **error)
1874 {
1875         cache->policy = dm_cache_policy_create(ca->policy_name,
1876                                                cache->cache_size,
1877                                                cache->origin_sectors,
1878                                                cache->sectors_per_block);
1879         if (!cache->policy) {
1880                 *error = "Error creating cache's policy";
1881                 return -ENOMEM;
1882         }
1883
1884         return 0;
1885 }
1886
1887 /*
1888  * We want the discard block size to be a power of two, at least as large
1889  * as the cache block size, and to give no more than 2^14 discard blocks
1890  * across the origin.
1891  */
1892 #define MAX_DISCARD_BLOCKS (1 << 14)
1893
1894 static bool too_many_discard_blocks(sector_t discard_block_size,
1895                                     sector_t origin_size)
1896 {
1897         (void) sector_div(origin_size, discard_block_size);
1898
1899         return origin_size > MAX_DISCARD_BLOCKS;
1900 }
1901
1902 static sector_t calculate_discard_block_size(sector_t cache_block_size,
1903                                              sector_t origin_size)
1904 {
1905         sector_t discard_block_size;
1906
1907         discard_block_size = roundup_pow_of_two(cache_block_size);
1908
1909         if (origin_size)
1910                 while (too_many_discard_blocks(discard_block_size, origin_size))
1911                         discard_block_size *= 2;
1912
1913         return discard_block_size;
1914 }
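/*
 * Worked example: with 192 sector (96KB) cache blocks and a 1TB
 * (2^31 sector) origin, roundup_pow_of_two(192) gives 256 sectors,
 * which would mean 2^31 / 2^8 = 2^23 discard blocks.  Doubling until
 * no more than 2^14 blocks remain settles on 2^17 sectors (64MB).
 */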
1915
1916 #define DEFAULT_MIGRATION_THRESHOLD 2048
1917
1918 static int cache_create(struct cache_args *ca, struct cache **result)
1919 {
1920         int r = 0;
1921         char **error = &ca->ti->error;
1922         struct cache *cache;
1923         struct dm_target *ti = ca->ti;
1924         dm_block_t origin_blocks;
1925         struct dm_cache_metadata *cmd;
1926         bool may_format = ca->features.mode == CM_WRITE;
1927
1928         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1929         if (!cache)
1930                 return -ENOMEM;
1931
1932         cache->ti = ca->ti;
1933         ti->private = cache;
1934         ti->num_flush_bios = 2;
1935         ti->flush_supported = true;
1936
1937         ti->num_discard_bios = 1;
1938         ti->discards_supported = true;
1939         ti->discard_zeroes_data_unsupported = true;
1940
1941         cache->features = ca->features;
1942         ti->per_bio_data_size = get_per_bio_data_size(cache);
1943
1944         cache->callbacks.congested_fn = cache_is_congested;
1945         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1946
1947         cache->metadata_dev = ca->metadata_dev;
1948         cache->origin_dev = ca->origin_dev;
1949         cache->cache_dev = ca->cache_dev;
1950
1951         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1952
1953         /* FIXME: factor out this whole section */
1954         origin_blocks = cache->origin_sectors = ca->origin_sectors;
1955         origin_blocks = block_div(origin_blocks, ca->block_size);
1956         cache->origin_blocks = to_oblock(origin_blocks);
1957
1958         cache->sectors_per_block = ca->block_size;
1959         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1960                 r = -EINVAL;
1961                 goto bad;
1962         }
1963
1964         if (ca->block_size & (ca->block_size - 1)) {
1965                 dm_block_t cache_size = ca->cache_sectors;
1966
1967                 cache->sectors_per_block_shift = -1;
1968                 cache_size = block_div(cache_size, ca->block_size);
1969                 cache->cache_size = to_cblock(cache_size);
1970         } else {
1971                 cache->sectors_per_block_shift = __ffs(ca->block_size);
1972                 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1973         }
1974
1975         r = create_cache_policy(cache, ca, error);
1976         if (r)
1977                 goto bad;
1978
1979         cache->policy_nr_args = ca->policy_argc;
1980         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1981
1982         r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
1983         if (r) {
1984                 *error = "Error setting cache policy's config values";
1985                 goto bad;
1986         }
1987
1988         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1989                                      ca->block_size, may_format,
1990                                      dm_cache_policy_get_hint_size(cache->policy));
1991         if (IS_ERR(cmd)) {
1992                 *error = "Error creating metadata object";
1993                 r = PTR_ERR(cmd);
1994                 goto bad;
1995         }
1996         cache->cmd = cmd;
1997
1998         spin_lock_init(&cache->lock);
1999         bio_list_init(&cache->deferred_bios);
2000         bio_list_init(&cache->deferred_flush_bios);
2001         bio_list_init(&cache->deferred_writethrough_bios);
2002         INIT_LIST_HEAD(&cache->quiesced_migrations);
2003         INIT_LIST_HEAD(&cache->completed_migrations);
2004         INIT_LIST_HEAD(&cache->need_commit_migrations);
2005         atomic_set(&cache->nr_migrations, 0);
2006         init_waitqueue_head(&cache->migration_wait);
2007
2008         r = -ENOMEM;
2009         cache->nr_dirty = 0;
2010         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2011         if (!cache->dirty_bitset) {
2012                 *error = "could not allocate dirty bitset";
2013                 goto bad;
2014         }
2015         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2016
2017         cache->discard_block_size =
2018                 calculate_discard_block_size(cache->sectors_per_block,
2019                                              cache->origin_sectors);
2020         cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
2021         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2022         if (!cache->discard_bitset) {
2023                 *error = "could not allocate discard bitset";
2024                 goto bad;
2025         }
2026         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2027
2028         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2029         if (IS_ERR(cache->copier)) {
2030                 *error = "could not create kcopyd client";
2031                 r = PTR_ERR(cache->copier);
2032                 goto bad;
2033         }
2034
2035         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2036         if (!cache->wq) {
2037                 *error = "could not create workqueue for metadata object";
2038                 goto bad;
2039         }
2040         INIT_WORK(&cache->worker, do_worker);
2041         INIT_DELAYED_WORK(&cache->waker, do_waker);
2042         cache->last_commit_jiffies = jiffies;
2043
2044         cache->prison = dm_bio_prison_create(PRISON_CELLS);
2045         if (!cache->prison) {
2046                 *error = "could not create bio prison";
2047                 goto bad;
2048         }
2049
2050         cache->all_io_ds = dm_deferred_set_create();
2051         if (!cache->all_io_ds) {
2052                 *error = "could not create all_io deferred set";
2053                 goto bad;
2054         }
2055
2056         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2057                                                          migration_cache);
2058         if (!cache->migration_pool) {
2059                 *error = "Error creating cache's migration mempool";
2060                 goto bad;
2061         }
2062
2063         cache->next_migration = NULL;
2064
2065         cache->need_tick_bio = true;
2066         cache->sized = false;
2067         cache->quiescing = false;
2068         cache->commit_requested = false;
2069         cache->loaded_mappings = false;
2070         cache->loaded_discards = false;
2071
2072         load_stats(cache);
2073
2074         atomic_set(&cache->stats.demotion, 0);
2075         atomic_set(&cache->stats.promotion, 0);
2076         atomic_set(&cache->stats.copies_avoided, 0);
2077         atomic_set(&cache->stats.cache_cell_clash, 0);
2078         atomic_set(&cache->stats.commit_count, 0);
2079         atomic_set(&cache->stats.discard_count, 0);
2080
2081         *result = cache;
2082         return 0;
2083
2084 bad:
2085         destroy(cache);
2086         return r;
2087 }
2088
2089 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2090 {
2091         unsigned i;
2092         const char **copy;
2093
2094         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2095         if (!copy)
2096                 return -ENOMEM;
2097         for (i = 0; i < argc; i++) {
2098                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2099                 if (!copy[i]) {
2100                         while (i--)
2101                                 kfree(copy[i]);
2102                         kfree(copy);
2103                         return -ENOMEM;
2104                 }
2105         }
2106
2107         cache->nr_ctr_args = argc;
2108         cache->ctr_args = copy;
2109
2110         return 0;
2111 }
2112
2113 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2114 {
2115         int r = -EINVAL;
2116         struct cache_args *ca;
2117         struct cache *cache = NULL;
2118
2119         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2120         if (!ca) {
2121                 ti->error = "Error allocating memory for cache";
2122                 return -ENOMEM;
2123         }
2124         ca->ti = ti;
2125
2126         r = parse_cache_args(ca, argc, argv, &ti->error);
2127         if (r)
2128                 goto out;
2129
2130         r = cache_create(ca, &cache);
2131         if (r)
2132                 goto out;
2133
2134         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2135         if (r) {
2136                 destroy(cache);
2137                 goto out;
2138         }
2139
2140         ti->private = cache;
2141
2142 out:
2143         destroy_cache_args(ca);
2144         return r;
2145 }
2146
2147 static int cache_map(struct dm_target *ti, struct bio *bio)
2148 {
2149         struct cache *cache = ti->private;
2150
2151         int r;
2152         dm_oblock_t block = get_bio_block(cache, bio);
2153         size_t pb_data_size = get_per_bio_data_size(cache);
2154         bool can_migrate = false;
2155         bool discarded_block;
2156         struct dm_bio_prison_cell *cell;
2157         struct policy_result lookup_result;
2158         struct per_bio_data *pb;
2159
2160         if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2161                 /*
2162                  * This can only occur if the io goes to a partial block at
2163                  * the end of the origin device.  We don't cache these.
2164                  * Just remap to the origin and carry on.
2165                  */
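                /*
                 * E.g. a 1000 sector origin with 128 sector blocks has
                 * origin_blocks == 7; a bio to sectors 896-999 computes
                 * block 7 here and is remapped straight to the origin.
                 */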
2166                 remap_to_origin_clear_discard(cache, bio, block);
2167                 return DM_MAPIO_REMAPPED;
2168         }
2169
2170         pb = init_per_bio_data(bio, pb_data_size);
2171
2172         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2173                 defer_bio(cache, bio);
2174                 return DM_MAPIO_SUBMITTED;
2175         }
2176
2177         /*
2178          * Check to see if that block is currently migrating.
2179          */
2180         cell = alloc_prison_cell(cache);
2181         if (!cell) {
2182                 defer_bio(cache, bio);
2183                 return DM_MAPIO_SUBMITTED;
2184         }
2185
2186         r = bio_detain(cache, block, bio, cell,
2187                        (cell_free_fn) free_prison_cell,
2188                        cache, &cell);
2189         if (r) {
2190                 if (r < 0)
2191                         defer_bio(cache, bio);
2192
2193                 return DM_MAPIO_SUBMITTED;
2194         }
2195
2196         discarded_block = is_discarded_oblock(cache, block);
2197
2198         r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2199                        bio, &lookup_result);
2200         if (r == -EWOULDBLOCK) {
2201                 cell_defer(cache, cell, true);
2202                 return DM_MAPIO_SUBMITTED;
2203
2204         } else if (r) {
2205                 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2206                 bio_io_error(bio);
2207                 return DM_MAPIO_SUBMITTED;
2208         }
2209
2210         switch (lookup_result.op) {
2211         case POLICY_HIT:
2212                 inc_hit_counter(cache, bio);
2213                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2214
2215                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
2216                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2217                 else
2218                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2219
2220                 cell_defer(cache, cell, false);
2221                 break;
2222
2223         case POLICY_MISS:
2224                 inc_miss_counter(cache, bio);
2225                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2226
2227                 if (pb->req_nr != 0) {
2228                         /*
2229                          * This is a duplicate writethrough io that is no
2230                          * longer needed because the block has been demoted.
2231                          */
2232                         bio_endio(bio, 0);
2233                         cell_defer(cache, cell, false);
2234                         return DM_MAPIO_SUBMITTED;
2235                 } else {
2236                         remap_to_origin_clear_discard(cache, bio, block);
2237                         cell_defer(cache, cell, false);
2238                 }
2239                 break;
2240
2241         default:
2242                 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2243                             (unsigned) lookup_result.op);
2244                 bio_io_error(bio);
2245                 return DM_MAPIO_SUBMITTED;
2246         }
2247
2248         return DM_MAPIO_REMAPPED;
2249 }
2250
2251 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2252 {
2253         struct cache *cache = ti->private;
2254         unsigned long flags;
2255         size_t pb_data_size = get_per_bio_data_size(cache);
2256         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2257
2258         if (pb->tick) {
2259                 policy_tick(cache->policy);
2260
2261                 spin_lock_irqsave(&cache->lock, flags);
2262                 cache->need_tick_bio = true;
2263                 spin_unlock_irqrestore(&cache->lock, flags);
2264         }
2265
2266         check_for_quiesced_migrations(cache, pb);
2267
2268         return 0;
2269 }
2270
2271 static int write_dirty_bitset(struct cache *cache)
2272 {
2273         unsigned i, r;
2274
2275         for (i = 0; i < from_cblock(cache->cache_size); i++) {
2276                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2277                                        is_dirty(cache, to_cblock(i)));
2278                 if (r)
2279                         return r;
2280         }
2281
2282         return 0;
2283 }
2284
2285 static int write_discard_bitset(struct cache *cache)
2286 {
2287         unsigned i, r;
2288
2289         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2290                                            cache->discard_nr_blocks);
2291         if (r) {
2292                 DMERR("could not resize on-disk discard bitset");
2293                 return r;
2294         }
2295
2296         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2297                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2298                                          is_discarded(cache, to_dblock(i)));
2299                 if (r)
2300                         return r;
2301         }
2302
2303         return 0;
2304 }
2305
2306 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2307                      uint32_t hint)
2308 {
2309         struct cache *cache = context;
2310         return dm_cache_save_hint(cache->cmd, cblock, hint);
2311 }
2312
2313 static int write_hints(struct cache *cache)
2314 {
2315         int r;
2316
2317         r = dm_cache_begin_hints(cache->cmd, cache->policy);
2318         if (r) {
2319                 DMERR("dm_cache_begin_hints failed");
2320                 return r;
2321         }
2322
2323         r = policy_walk_mappings(cache->policy, save_hint, cache);
2324         if (r)
2325                 DMERR("policy_walk_mappings failed");
2326
2327         return r;
2328 }
2329
2330 /*
2331  * returns true on success
2332  */
2333 static bool sync_metadata(struct cache *cache)
2334 {
2335         int r1, r2, r3, r4;
2336
2337         r1 = write_dirty_bitset(cache);
2338         if (r1)
2339                 DMERR("could not write dirty bitset");
2340
2341         r2 = write_discard_bitset(cache);
2342         if (r2)
2343                 DMERR("could not write discard bitset");
2344
2345         save_stats(cache);
2346
2347         r3 = write_hints(cache);
2348         if (r3)
2349                 DMERR("could not write hints");
2350
2351         /*
2352          * If writing the above metadata failed, we still commit, but don't
2353          * set the clean shutdown flag.  This will effectively force every
2354          * dirty bit to be set on reload.
2355          */
2356         r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2357         if (r4)
2358                 DMERR("could not write cache metadata.  Data loss may occur.");
2359
2360         return !r1 && !r2 && !r3 && !r4;
2361 }
2362
2363 static void cache_postsuspend(struct dm_target *ti)
2364 {
2365         struct cache *cache = ti->private;
2366
2367         start_quiescing(cache);
2368         wait_for_migrations(cache);
2369         stop_worker(cache);
2370         requeue_deferred_io(cache);
2371         stop_quiescing(cache);
2372
2373         (void) sync_metadata(cache);
2374 }
2375
2376 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2377                         bool dirty, uint32_t hint, bool hint_valid)
2378 {
2379         int r;
2380         struct cache *cache = context;
2381
2382         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2383         if (r)
2384                 return r;
2385
2386         if (dirty)
2387                 set_dirty(cache, oblock, cblock);
2388         else
2389                 clear_dirty(cache, oblock, cblock);
2390
2391         return 0;
2392 }
2393
2394 static int load_discard(void *context, sector_t discard_block_size,
2395                         dm_dblock_t dblock, bool discard)
2396 {
2397         struct cache *cache = context;
2398
2399         /* FIXME: handle mis-matched block size */
2400
2401         if (discard)
2402                 set_discard(cache, dblock);
2403         else
2404                 clear_discard(cache, dblock);
2405
2406         return 0;
2407 }
2408
2409 static int cache_preresume(struct dm_target *ti)
2410 {
2411         int r = 0;
2412         struct cache *cache = ti->private;
2413         sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2414         (void) sector_div(actual_cache_size, cache->sectors_per_block);
2415
2416         /*
2417          * Check to see if the cache has resized.
2418          */
2419         if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2420                 cache->cache_size = to_cblock(actual_cache_size);
2421
2422                 r = dm_cache_resize(cache->cmd, cache->cache_size);
2423                 if (r) {
2424                         DMERR("could not resize cache metadata");
2425                         return r;
2426                 }
2427
2428                 cache->sized = true;
2429         }
2430
2431         if (!cache->loaded_mappings) {
2432                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
2433                                            load_mapping, cache);
2434                 if (r) {
2435                         DMERR("could not load cache mappings");
2436                         return r;
2437                 }
2438
2439                 cache->loaded_mappings = true;
2440         }
2441
2442         if (!cache->loaded_discards) {
2443                 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2444                 if (r) {
2445                         DMERR("could not load origin discards");
2446                         return r;
2447                 }
2448
2449                 cache->loaded_discards = true;
2450         }
2451
2452         return r;
2453 }
2454
2455 static void cache_resume(struct dm_target *ti)
2456 {
2457         struct cache *cache = ti->private;
2458
2459         cache->need_tick_bio = true;
2460         do_waker(&cache->waker.work);
2461 }
2462
2463 /*
2464  * Status format:
2465  *
2466  * <#used metadata blocks>/<#total metadata blocks>
2467  * <#read hits> <#read misses> <#write hits> <#write misses>
2468  * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2469  * <#features> <features>*
2470  * <#core args> <core args>*
2471  * <#policy args> <policy args>*
2472  */
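/*
 * A hypothetical INFO line (field values invented purely to show the
 * layout) might look like:
 *
 *   23/4096 1769 4333 2871 46 0 7 1580 12 1 writethrough 2 migration_threshold 2048 <policy args>
 *
 * i.e. 23 of 4096 metadata blocks used, then read hits, read misses,
 * write hits, write misses, demotions, promotions, blocks in cache,
 * dirty blocks, the feature args, the core args and the policy args.
 */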
2473 static void cache_status(struct dm_target *ti, status_type_t type,
2474                          unsigned status_flags, char *result, unsigned maxlen)
2475 {
2476         int r = 0;
2477         unsigned i;
2478         ssize_t sz = 0;
2479         dm_block_t nr_free_blocks_metadata = 0;
2480         dm_block_t nr_blocks_metadata = 0;
2481         char buf[BDEVNAME_SIZE];
2482         struct cache *cache = ti->private;
2483         dm_cblock_t residency;
2484
2485         switch (type) {
2486         case STATUSTYPE_INFO:
2487                 /* Commit to ensure statistics aren't out-of-date */
2488                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2489                         r = dm_cache_commit(cache->cmd, false);
2490                         if (r)
2491                                 DMERR("could not commit metadata for accurate status");
2492                 }
2493
2494                 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2495                                                            &nr_free_blocks_metadata);
2496                 if (r) {
2497                         DMERR("could not get metadata free block count");
2498                         goto err;
2499                 }
2500
2501                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2502                 if (r) {
2503                         DMERR("could not get metadata device size");
2504                         goto err;
2505                 }
2506
2507                 residency = policy_residency(cache->policy);
2508
2509                 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2510                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2511                        (unsigned long long)nr_blocks_metadata,
2512                        (unsigned) atomic_read(&cache->stats.read_hit),
2513                        (unsigned) atomic_read(&cache->stats.read_miss),
2514                        (unsigned) atomic_read(&cache->stats.write_hit),
2515                        (unsigned) atomic_read(&cache->stats.write_miss),
2516                        (unsigned) atomic_read(&cache->stats.demotion),
2517                        (unsigned) atomic_read(&cache->stats.promotion),
2518                        (unsigned long long) from_cblock(residency),
2519                        cache->nr_dirty);
2520
2521                 if (cache->features.write_through)
2522                         DMEMIT("1 writethrough ");
2523                 else
2524                         DMEMIT("0 ");
2525
2526                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2527                 if (sz < maxlen) {
2528                         r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2529                         if (r)
2530                                 DMERR("policy_emit_config_values returned %d", r);
2531                 }
2532
2533                 break;
2534
2535         case STATUSTYPE_TABLE:
2536                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2537                 DMEMIT("%s ", buf);
2538                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2539                 DMEMIT("%s ", buf);
2540                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2541                 DMEMIT("%s", buf);
2542
2543                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2544                         DMEMIT(" %s", cache->ctr_args[i]);
2545                 if (cache->nr_ctr_args)
2546                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2547         }
2548
2549         return;
2550
2551 err:
2552         DMEMIT("Error");
2553 }
2554
2555 /*
2556  * Supports <key> <value>.
2557  *
2558  * The key migration_threshold is supported by the cache target core.
2559  */
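/*
 * For example, "dmsetup message <mapped device> 0 migration_threshold 4096"
 * arrives here as argv = { "migration_threshold", "4096" }; core keys are
 * handled by process_config_option(), anything else is passed to the policy.
 */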
2560 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2561 {
2562         struct cache *cache = ti->private;
2563
2564         if (argc != 2)
2565                 return -EINVAL;
2566
2567         return set_config_value(cache, argv[0], argv[1]);
2568 }
2569
2570 static int cache_iterate_devices(struct dm_target *ti,
2571                                  iterate_devices_callout_fn fn, void *data)
2572 {
2573         int r = 0;
2574         struct cache *cache = ti->private;
2575
2576         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2577         if (!r)
2578                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2579
2580         return r;
2581 }
2582
2583 /*
2584  * We assume I/O is going to the origin (which is the volume
2585  * more likely to have restrictions e.g. by being striped).
2586  * (Looking up the exact location of the data would be expensive
2587  * and could always be out of date by the time the bio is submitted.)
2588  */
2589 static int cache_bvec_merge(struct dm_target *ti,
2590                             struct bvec_merge_data *bvm,
2591                             struct bio_vec *biovec, int max_size)
2592 {
2593         struct cache *cache = ti->private;
2594         struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2595
2596         if (!q->merge_bvec_fn)
2597                 return max_size;
2598
2599         bvm->bi_bdev = cache->origin_dev->bdev;
2600         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2601 }
2602
2603 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2604 {
2605         /*
2606          * FIXME: these limits may be incompatible with the cache device
2607          */
2608         limits->max_discard_sectors = cache->discard_block_size * 1024;
2609         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2610 }
2611
2612 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2613 {
2614         struct cache *cache = ti->private;
2615         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2616
2617         /*
2618          * If the system-determined stacked limits are compatible with the
2619          * cache's blocksize (io_opt is a factor) do not override them.
2620          */
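        /*
         * E.g. with 128 sector (64KB) cache blocks, a stacked io_opt of
         * 512 sectors (256KB) is left alone, whereas 448 sectors (224KB)
         * is not a multiple of the block size and gets overridden.
         */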
2621         if (io_opt_sectors < cache->sectors_per_block ||
2622             do_div(io_opt_sectors, cache->sectors_per_block)) {
2623                 blk_limits_io_min(limits, 0);
2624                 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2625         }
2626         set_discard_limits(cache, limits);
2627 }
2628
2629 /*----------------------------------------------------------------*/
2630
2631 static struct target_type cache_target = {
2632         .name = "cache",
2633         .version = {1, 1, 1},
2634         .module = THIS_MODULE,
2635         .ctr = cache_ctr,
2636         .dtr = cache_dtr,
2637         .map = cache_map,
2638         .end_io = cache_end_io,
2639         .postsuspend = cache_postsuspend,
2640         .preresume = cache_preresume,
2641         .resume = cache_resume,
2642         .status = cache_status,
2643         .message = cache_message,
2644         .iterate_devices = cache_iterate_devices,
2645         .merge = cache_bvec_merge,
2646         .io_hints = cache_io_hints,
2647 };
2648
2649 static int __init dm_cache_init(void)
2650 {
2651         int r;
2652
2653         r = dm_register_target(&cache_target);
2654         if (r) {
2655                 DMERR("cache target registration failed: %d", r);
2656                 return r;
2657         }
2658
2659         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2660         if (!migration_cache) {
2661                 dm_unregister_target(&cache_target);
2662                 return -ENOMEM;
2663         }
2664
2665         return 0;
2666 }
2667
2668 static void __exit dm_cache_exit(void)
2669 {
2670         dm_unregister_target(&cache_target);
2671         kmem_cache_destroy(migration_cache);
2672 }
2673
2674 module_init(dm_cache_init);
2675 module_exit(dm_cache_exit);
2676
2677 MODULE_DESCRIPTION(DM_NAME " cache target");
2678 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2679 MODULE_LICENSE("GPL");