drivers/md/dm-cache-target.c
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/init.h>
15 #include <linux/mempool.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 #include <linux/vmalloc.h>
19
20 #define DM_MSG_PREFIX "cache"
21
22 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
23         "A percentage of time allocated for copying to and/or from cache");
24
25 /*----------------------------------------------------------------*/
26
27 /*
28  * Glossary:
29  *
30  * oblock: index of an origin block
31  * cblock: index of a cache block
32  * promotion: movement of a block from origin to cache
33  * demotion: movement of a block from cache to origin
34  * migration: movement of a block between the origin and cache device,
35  *            either direction
36  */
37
38 /*----------------------------------------------------------------*/
39
40 static size_t bitset_size_in_bytes(unsigned nr_entries)
41 {
42         return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
43 }
44
45 static unsigned long *alloc_bitset(unsigned nr_entries)
46 {
47         size_t s = bitset_size_in_bytes(nr_entries);
48         return vzalloc(s);
49 }
50
51 static void clear_bitset(void *bitset, unsigned nr_entries)
52 {
53         size_t s = bitset_size_in_bytes(nr_entries);
54         memset(bitset, 0, s);
55 }
56
57 static void free_bitset(unsigned long *bits)
58 {
59         vfree(bits);
60 }
61
62 /*----------------------------------------------------------------*/
63
64 #define PRISON_CELLS 1024
65 #define MIGRATION_POOL_SIZE 128
66 #define COMMIT_PERIOD HZ
67 #define MIGRATION_COUNT_WINDOW 10
68
69 /*
70  * The block size of the device holding cache data must be
71  * between 32KB and 1GB.
72  */
73 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
74 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
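/* With SECTOR_SHIFT == 9 these work out to 64 and 2097152 sectors respectively. */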
75
76 /*
77  * FIXME: the cache is read/write for the time being.
78  */
79 enum cache_mode {
80         CM_WRITE,               /* metadata may be changed */
81         CM_READ_ONLY,           /* metadata may not be changed */
82 };
83
84 struct cache_features {
85         enum cache_mode mode;
86         bool write_through:1;
87 };
88
89 struct cache_stats {
90         atomic_t read_hit;
91         atomic_t read_miss;
92         atomic_t write_hit;
93         atomic_t write_miss;
94         atomic_t demotion;
95         atomic_t promotion;
96         atomic_t copies_avoided;
97         atomic_t cache_cell_clash;
98         atomic_t commit_count;
99         atomic_t discard_count;
100 };
101
102 struct cache {
103         struct dm_target *ti;
104         struct dm_target_callbacks callbacks;
105
106         struct dm_cache_metadata *cmd;
107
108         /*
109          * Metadata is written to this device.
110          */
111         struct dm_dev *metadata_dev;
112
113         /*
114          * The slower of the two data devices.  Typically a spindle.
115          */
116         struct dm_dev *origin_dev;
117
118         /*
119          * The faster of the two data devices.  Typically an SSD.
120          */
121         struct dm_dev *cache_dev;
122
123         /*
124          * Size of the origin device in _complete_ blocks and native sectors.
125          */
126         dm_oblock_t origin_blocks;
127         sector_t origin_sectors;
128
129         /*
130          * Size of the cache device in blocks.
131          */
132         dm_cblock_t cache_size;
133
134         /*
135          * Fields for converting from sectors to blocks.
136          */
137         uint32_t sectors_per_block;
138         int sectors_per_block_shift;
139
140         spinlock_t lock;
141         struct bio_list deferred_bios;
142         struct bio_list deferred_flush_bios;
143         struct bio_list deferred_writethrough_bios;
144         struct list_head quiesced_migrations;
145         struct list_head completed_migrations;
146         struct list_head need_commit_migrations;
147         sector_t migration_threshold;
148         wait_queue_head_t migration_wait;
149         atomic_t nr_migrations;
150
151         /*
152          * cache_size entries, dirty if set
153          */
154         dm_cblock_t nr_dirty;
155         unsigned long *dirty_bitset;
156
157         /*
158          * origin_blocks entries, discarded if set.
159          */
160         dm_dblock_t discard_nr_blocks;
161         unsigned long *discard_bitset;
162         uint32_t discard_block_size; /* a power of 2 times sectors per block */
163
164         /*
165          * Rather than reconstructing the table line for status output, we just
166          * save it and regurgitate it.
167          */
168         unsigned nr_ctr_args;
169         const char **ctr_args;
170
171         struct dm_kcopyd_client *copier;
172         struct workqueue_struct *wq;
173         struct work_struct worker;
174
175         struct delayed_work waker;
176         unsigned long last_commit_jiffies;
177
178         struct dm_bio_prison *prison;
179         struct dm_deferred_set *all_io_ds;
180
181         mempool_t *migration_pool;
182         struct dm_cache_migration *next_migration;
183
184         struct dm_cache_policy *policy;
185         unsigned policy_nr_args;
186
187         bool need_tick_bio:1;
188         bool sized:1;
189         bool quiescing:1;
190         bool commit_requested:1;
191         bool loaded_mappings:1;
192         bool loaded_discards:1;
193
194         /*
195          * Cache features such as write-through.
196          */
197         struct cache_features features;
198
199         struct cache_stats stats;
200 };
201
202 struct per_bio_data {
203         bool tick:1;
204         unsigned req_nr:2;
205         struct dm_deferred_entry *all_io_entry;
206
207         /*
208          * writethrough fields.  These MUST remain at the end of this
209          * structure and the 'cache' member must be the first as it
210          * is used to determine the offset of the writethrough fields.
211          */
212         struct cache *cache;
213         dm_cblock_t cblock;
214         bio_end_io_t *saved_bi_end_io;
215         struct dm_bio_details bio_details;
216 };
217
218 struct dm_cache_migration {
219         struct list_head list;
220         struct cache *cache;
221
222         unsigned long start_jiffies;
223         dm_oblock_t old_oblock;
224         dm_oblock_t new_oblock;
225         dm_cblock_t cblock;
226
227         bool err:1;
228         bool writeback:1;
229         bool demote:1;
230         bool promote:1;
231
232         struct dm_bio_prison_cell *old_ocell;
233         struct dm_bio_prison_cell *new_ocell;
234 };
235
236 /*
237  * Processing a bio in the worker thread may require these memory
238  * allocations.  We prealloc to avoid deadlocks (the same worker thread
239  * frees them back to the mempool).
240  */
241 struct prealloc {
242         struct dm_cache_migration *mg;
243         struct dm_bio_prison_cell *cell1;
244         struct dm_bio_prison_cell *cell2;
245 };
246
247 static void wake_worker(struct cache *cache)
248 {
249         queue_work(cache->wq, &cache->worker);
250 }
251
252 /*----------------------------------------------------------------*/
253
254 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
255 {
256         /* FIXME: change to use a local slab. */
257         return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
258 }
259
260 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
261 {
262         dm_bio_prison_free_cell(cache->prison, cell);
263 }
264
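/*
 * Top up the prealloc struct using GFP_NOWAIT allocations, so this can
 * fail.  On failure anything already allocated is left in place, ready
 * for a retry or for prealloc_free_structs().
 */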
265 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
266 {
267         if (!p->mg) {
268                 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
269                 if (!p->mg)
270                         return -ENOMEM;
271         }
272
273         if (!p->cell1) {
274                 p->cell1 = alloc_prison_cell(cache);
275                 if (!p->cell1)
276                         return -ENOMEM;
277         }
278
279         if (!p->cell2) {
280                 p->cell2 = alloc_prison_cell(cache);
281                 if (!p->cell2)
282                         return -ENOMEM;
283         }
284
285         return 0;
286 }
287
288 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
289 {
290         if (p->cell2)
291                 free_prison_cell(cache, p->cell2);
292
293         if (p->cell1)
294                 free_prison_cell(cache, p->cell1);
295
296         if (p->mg)
297                 mempool_free(p->mg, cache->migration_pool);
298 }
299
300 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
301 {
302         struct dm_cache_migration *mg = p->mg;
303
304         BUG_ON(!mg);
305         p->mg = NULL;
306
307         return mg;
308 }
309
310 /*
311  * You must have a cell within the prealloc struct to return.  If not, this
312  * function will BUG() rather than return NULL.
313  */
314 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
315 {
316         struct dm_bio_prison_cell *r = NULL;
317
318         if (p->cell1) {
319                 r = p->cell1;
320                 p->cell1 = NULL;
321
322         } else if (p->cell2) {
323                 r = p->cell2;
324                 p->cell2 = NULL;
325         } else
326                 BUG();
327
328         return r;
329 }
330
331 /*
332  * You can't have more than two cells in a prealloc struct.  BUG() will be
333  * called if you try to overfill.
334  */
335 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
336 {
337         if (!p->cell2)
338                 p->cell2 = cell;
339
340         else if (!p->cell1)
341                 p->cell1 = cell;
342
343         else
344                 BUG();
345 }
346
347 /*----------------------------------------------------------------*/
348
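/*
 * Cells are keyed purely by origin block, so every bio or migration that
 * touches the same oblock contends for the same cell.
 */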
349 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
350 {
351         key->virtual = 0;
352         key->dev = 0;
353         key->block = from_oblock(oblock);
354 }
355
356 /*
357  * The caller hands in a preallocated cell, and a free function for it.
358  * The cell will be freed if there's an error, or if it wasn't used because
359  * a cell with that key already exists.
360  */
361 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
362
363 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
364                       struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
365                       cell_free_fn free_fn, void *free_context,
366                       struct dm_bio_prison_cell **cell_result)
367 {
368         int r;
369         struct dm_cell_key key;
370
371         build_key(oblock, &key);
372         r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
373         if (r)
374                 free_fn(free_context, cell_prealloc);
375
376         return r;
377 }
378
379 static int get_cell(struct cache *cache,
380                     dm_oblock_t oblock,
381                     struct prealloc *structs,
382                     struct dm_bio_prison_cell **cell_result)
383 {
384         int r;
385         struct dm_cell_key key;
386         struct dm_bio_prison_cell *cell_prealloc;
387
388         cell_prealloc = prealloc_get_cell(structs);
389
390         build_key(oblock, &key);
391         r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
392         if (r)
393                 prealloc_put_cell(structs, cell_prealloc);
394
395         return r;
396 }
397
398 /*----------------------------------------------------------------*/
399
400 static bool is_dirty(struct cache *cache, dm_cblock_t b)
401 {
402         return test_bit(from_cblock(b), cache->dirty_bitset);
403 }
404
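/*
 * The dirty bitset and the policy's dirty state are updated together so
 * they stay in sync; when the last dirty block is cleaned we trigger a
 * table event that userspace can wait on.
 */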
405 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
406 {
407         if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
408                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
409                 policy_set_dirty(cache->policy, oblock);
410         }
411 }
412
413 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
414 {
415         if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
416                 policy_clear_dirty(cache->policy, oblock);
417                 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
418                 if (!from_cblock(cache->nr_dirty))
419                         dm_table_event(cache->ti->table);
420         }
421 }
422
423 /*----------------------------------------------------------------*/
424
425 static bool block_size_is_power_of_two(struct cache *cache)
426 {
427         return cache->sectors_per_block_shift >= 0;
428 }
429
430 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
431 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
432 __always_inline
433 #endif
434 static dm_block_t block_div(dm_block_t b, uint32_t n)
435 {
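        /*
         * do_div() divides b by n in place (b becomes the quotient) and
         * evaluates to the remainder, which is ignored here.
         */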
436         do_div(b, n);
437
438         return b;
439 }
440
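/*
 * Convert an origin block index to a discard block index.  The discard
 * block size is a multiple of the cache/origin block size (in sectors),
 * so dividing it by sectors_per_block gives the number of blocks covered
 * by each discard bitset entry.
 */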
441 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
442 {
443         uint32_t discard_blocks = cache->discard_block_size;
444         dm_block_t b = from_oblock(oblock);
445
446         if (!block_size_is_power_of_two(cache))
447                 discard_blocks = discard_blocks / cache->sectors_per_block;
448         else
449                 discard_blocks >>= cache->sectors_per_block_shift;
450
451         b = block_div(b, discard_blocks);
452
453         return to_dblock(b);
454 }
455
456 static void set_discard(struct cache *cache, dm_dblock_t b)
457 {
458         unsigned long flags;
459
460         atomic_inc(&cache->stats.discard_count);
461
462         spin_lock_irqsave(&cache->lock, flags);
463         set_bit(from_dblock(b), cache->discard_bitset);
464         spin_unlock_irqrestore(&cache->lock, flags);
465 }
466
467 static void clear_discard(struct cache *cache, dm_dblock_t b)
468 {
469         unsigned long flags;
470
471         spin_lock_irqsave(&cache->lock, flags);
472         clear_bit(from_dblock(b), cache->discard_bitset);
473         spin_unlock_irqrestore(&cache->lock, flags);
474 }
475
476 static bool is_discarded(struct cache *cache, dm_dblock_t b)
477 {
478         int r;
479         unsigned long flags;
480
481         spin_lock_irqsave(&cache->lock, flags);
482         r = test_bit(from_dblock(b), cache->discard_bitset);
483         spin_unlock_irqrestore(&cache->lock, flags);
484
485         return r;
486 }
487
488 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
489 {
490         int r;
491         unsigned long flags;
492
493         spin_lock_irqsave(&cache->lock, flags);
494         r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
495                      cache->discard_bitset);
496         spin_unlock_irqrestore(&cache->lock, flags);
497
498         return r;
499 }
500
501 /*----------------------------------------------------------------*/
502
503 static void load_stats(struct cache *cache)
504 {
505         struct dm_cache_statistics stats;
506
507         dm_cache_metadata_get_stats(cache->cmd, &stats);
508         atomic_set(&cache->stats.read_hit, stats.read_hits);
509         atomic_set(&cache->stats.read_miss, stats.read_misses);
510         atomic_set(&cache->stats.write_hit, stats.write_hits);
511         atomic_set(&cache->stats.write_miss, stats.write_misses);
512 }
513
514 static void save_stats(struct cache *cache)
515 {
516         struct dm_cache_statistics stats;
517
518         stats.read_hits = atomic_read(&cache->stats.read_hit);
519         stats.read_misses = atomic_read(&cache->stats.read_miss);
520         stats.write_hits = atomic_read(&cache->stats.write_hit);
521         stats.write_misses = atomic_read(&cache->stats.write_miss);
522
523         dm_cache_metadata_set_stats(cache->cmd, &stats);
524 }
525
526 /*----------------------------------------------------------------
527  * Per bio data
528  *--------------------------------------------------------------*/
529
530 /*
531  * If using writeback, leave out struct per_bio_data's writethrough fields.
532  */
533 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
534 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
535
536 static size_t get_per_bio_data_size(struct cache *cache)
537 {
538         return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
539 }
540
541 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
542 {
543         struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
544         BUG_ON(!pb);
545         return pb;
546 }
547
548 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
549 {
550         struct per_bio_data *pb = get_per_bio_data(bio, data_size);
551
552         pb->tick = false;
553         pb->req_nr = dm_bio_get_target_bio_nr(bio);
554         pb->all_io_entry = NULL;
555
556         return pb;
557 }
558
559 /*----------------------------------------------------------------
560  * Remapping
561  *--------------------------------------------------------------*/
562 static void remap_to_origin(struct cache *cache, struct bio *bio)
563 {
564         bio->bi_bdev = cache->origin_dev->bdev;
565 }
566
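/*
 * Redirect a bio to the cache device: the cache block supplies the block
 * offset and the bio keeps its offset within the block.  For example, with
 * 128 sector blocks, a bio at origin sector 1000 mapped to cblock 5 lands
 * on cache sector 5 * 128 + (1000 % 128) = 744.
 */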
567 static void remap_to_cache(struct cache *cache, struct bio *bio,
568                            dm_cblock_t cblock)
569 {
570         sector_t bi_sector = bio->bi_sector;
571
572         bio->bi_bdev = cache->cache_dev->bdev;
573         if (!block_size_is_power_of_two(cache))
574                 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
575                                 sector_div(bi_sector, cache->sectors_per_block);
576         else
577                 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
578                                 (bi_sector & (cache->sectors_per_block - 1));
579 }
580
581 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
582 {
583         unsigned long flags;
584         size_t pb_data_size = get_per_bio_data_size(cache);
585         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
586
587         spin_lock_irqsave(&cache->lock, flags);
588         if (cache->need_tick_bio &&
589             !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
590                 pb->tick = true;
591                 cache->need_tick_bio = false;
592         }
593         spin_unlock_irqrestore(&cache->lock, flags);
594 }
595
596 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
597                                   dm_oblock_t oblock)
598 {
599         check_if_tick_bio_needed(cache, bio);
600         remap_to_origin(cache, bio);
601         if (bio_data_dir(bio) == WRITE)
602                 clear_discard(cache, oblock_to_dblock(cache, oblock));
603 }
604
605 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
606                                  dm_oblock_t oblock, dm_cblock_t cblock)
607 {
608         remap_to_cache(cache, bio, cblock);
609         if (bio_data_dir(bio) == WRITE) {
610                 set_dirty(cache, oblock, cblock);
611                 clear_discard(cache, oblock_to_dblock(cache, oblock));
612         }
613 }
614
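/*
 * Which origin block does this bio start in?
 */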
615 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
616 {
617         sector_t block_nr = bio->bi_sector;
618
619         if (!block_size_is_power_of_two(cache))
620                 (void) sector_div(block_nr, cache->sectors_per_block);
621         else
622                 block_nr >>= cache->sectors_per_block_shift;
623
624         return to_oblock(block_nr);
625 }
626
627 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
628 {
629         return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
630 }
631
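/*
 * Bios that trigger a commit (FLUSH/FUA) are batched up and issued by the
 * worker after the next metadata commit; everything else is submitted
 * immediately.
 */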
632 static void issue(struct cache *cache, struct bio *bio)
633 {
634         unsigned long flags;
635
636         if (!bio_triggers_commit(cache, bio)) {
637                 generic_make_request(bio);
638                 return;
639         }
640
641         /*
642          * Batch together any bios that trigger commits and then issue a
643          * single commit for them in do_worker().
644          */
645         spin_lock_irqsave(&cache->lock, flags);
646         cache->commit_requested = true;
647         bio_list_add(&cache->deferred_flush_bios, bio);
648         spin_unlock_irqrestore(&cache->lock, flags);
649 }
650
651 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
652 {
653         unsigned long flags;
654
655         spin_lock_irqsave(&cache->lock, flags);
656         bio_list_add(&cache->deferred_writethrough_bios, bio);
657         spin_unlock_irqrestore(&cache->lock, flags);
658
659         wake_worker(cache);
660 }
661
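/*
 * The write to the origin has completed; restore the bio and resubmit it,
 * this time aimed at the cache device, via the worker thread.
 */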
662 static void writethrough_endio(struct bio *bio, int err)
663 {
664         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
665         bio->bi_end_io = pb->saved_bi_end_io;
666
667         if (err) {
668                 bio_endio(bio, err);
669                 return;
670         }
671
672         dm_bio_restore(&pb->bio_details, bio);
673         remap_to_cache(pb->cache, bio, pb->cblock);
674
675         /*
676          * We can't issue this bio directly, since we're in interrupt
677          * context.  So it gets put on a bio list for processing by the
678          * worker thread.
679          */
680         defer_writethrough_bio(pb->cache, bio);
681 }
682
683 /*
684  * When running in writethrough mode we need to send writes to clean blocks
685  * to both the cache and origin devices.  In future we'd like to clone the
686  * bio and send the copies in parallel, but for now we issue them in
687  * series as this is easier.
688  */
689 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
690                                        dm_oblock_t oblock, dm_cblock_t cblock)
691 {
692         struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
693
694         pb->cache = cache;
695         pb->cblock = cblock;
696         pb->saved_bi_end_io = bio->bi_end_io;
697         dm_bio_record(&pb->bio_details, bio);
698         bio->bi_end_io = writethrough_endio;
699
700         remap_to_origin_clear_discard(pb->cache, bio, oblock);
701 }
702
703 /*----------------------------------------------------------------
704  * Migration processing
705  *
706  * Migration covers moving data from the origin device to the cache, or
707  * vice versa.
708  *--------------------------------------------------------------*/
709 static void free_migration(struct dm_cache_migration *mg)
710 {
711         mempool_free(mg, mg->cache->migration_pool);
712 }
713
714 static void inc_nr_migrations(struct cache *cache)
715 {
716         atomic_inc(&cache->nr_migrations);
717 }
718
719 static void dec_nr_migrations(struct cache *cache)
720 {
721         atomic_dec(&cache->nr_migrations);
722
723         /*
724          * Wake the worker in case we're suspending the target.
725          */
726         wake_up(&cache->migration_wait);
727 }
728
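/*
 * Release a cell, moving the bios it holds onto the deferred list (the
 * holder bio is only included if 'holder' is true), then return the cell
 * to the prison.
 */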
729 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
730                          bool holder)
731 {
732         (holder ? dm_cell_release : dm_cell_release_no_holder)
733                 (cache->prison, cell, &cache->deferred_bios);
734         free_prison_cell(cache, cell);
735 }
736
737 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
738                        bool holder)
739 {
740         unsigned long flags;
741
742         spin_lock_irqsave(&cache->lock, flags);
743         __cell_defer(cache, cell, holder);
744         spin_unlock_irqrestore(&cache->lock, flags);
745
746         wake_worker(cache);
747 }
748
749 static void cleanup_migration(struct dm_cache_migration *mg)
750 {
751         dec_nr_migrations(mg->cache);
752         free_migration(mg);
753 }
754
755 static void migration_failure(struct dm_cache_migration *mg)
756 {
757         struct cache *cache = mg->cache;
758
759         if (mg->writeback) {
760                 DMWARN_LIMIT("writeback failed; couldn't copy block");
761                 set_dirty(cache, mg->old_oblock, mg->cblock);
762                 cell_defer(cache, mg->old_ocell, false);
763
764         } else if (mg->demote) {
765                 DMWARN_LIMIT("demotion failed; couldn't copy block");
766                 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
767
768                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
769                 if (mg->promote)
770                         cell_defer(cache, mg->new_ocell, 1);
771         } else {
772                 DMWARN_LIMIT("promotion failed; couldn't copy block");
773                 policy_remove_mapping(cache->policy, mg->new_oblock);
774                 cell_defer(cache, mg->new_ocell, 1);
775         }
776
777         cleanup_migration(mg);
778 }
779
780 static void migration_success_pre_commit(struct dm_cache_migration *mg)
781 {
782         unsigned long flags;
783         struct cache *cache = mg->cache;
784
785         if (mg->writeback) {
786                 cell_defer(cache, mg->old_ocell, false);
787                 clear_dirty(cache, mg->old_oblock, mg->cblock);
788                 cleanup_migration(mg);
789                 return;
790
791         } else if (mg->demote) {
792                 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
793                         DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
794                         policy_force_mapping(cache->policy, mg->new_oblock,
795                                              mg->old_oblock);
796                         if (mg->promote)
797                                 cell_defer(cache, mg->new_ocell, true);
798                         cleanup_migration(mg);
799                         return;
800                 }
801         } else {
802                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
803                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
804                         policy_remove_mapping(cache->policy, mg->new_oblock);
805                         cleanup_migration(mg);
806                         return;
807                 }
808         }
809
810         spin_lock_irqsave(&cache->lock, flags);
811         list_add_tail(&mg->list, &cache->need_commit_migrations);
812         cache->commit_requested = true;
813         spin_unlock_irqrestore(&cache->lock, flags);
814 }
815
816 static void migration_success_post_commit(struct dm_cache_migration *mg)
817 {
818         unsigned long flags;
819         struct cache *cache = mg->cache;
820
821         if (mg->writeback) {
822                 DMWARN("writeback unexpectedly triggered commit");
823                 return;
824
825         } else if (mg->demote) {
826                 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
827
828                 if (mg->promote) {
829                         mg->demote = false;
830
831                         spin_lock_irqsave(&cache->lock, flags);
832                         list_add_tail(&mg->list, &cache->quiesced_migrations);
833                         spin_unlock_irqrestore(&cache->lock, flags);
834
835                 } else
836                         cleanup_migration(mg);
837
838         } else {
839                 cell_defer(cache, mg->new_ocell, true);
840                 clear_dirty(cache, mg->new_oblock, mg->cblock);
841                 cleanup_migration(mg);
842         }
843 }
844
845 static void copy_complete(int read_err, unsigned long write_err, void *context)
846 {
847         unsigned long flags;
848         struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
849         struct cache *cache = mg->cache;
850
851         if (read_err || write_err)
852                 mg->err = true;
853
854         spin_lock_irqsave(&cache->lock, flags);
855         list_add_tail(&mg->list, &cache->completed_migrations);
856         spin_unlock_irqrestore(&cache->lock, flags);
857
858         wake_worker(cache);
859 }
860
861 static void issue_copy_real(struct dm_cache_migration *mg)
862 {
863         int r;
864         struct dm_io_region o_region, c_region;
865         struct cache *cache = mg->cache;
866
867         o_region.bdev = cache->origin_dev->bdev;
868         o_region.count = cache->sectors_per_block;
869
870         c_region.bdev = cache->cache_dev->bdev;
871         c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
872         c_region.count = cache->sectors_per_block;
873
874         if (mg->writeback || mg->demote) {
875                 /* demote */
876                 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
877                 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
878         } else {
879                 /* promote */
880                 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
881                 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
882         }
883
884         if (r < 0)
885                 migration_failure(mg);
886 }
887
888 static void avoid_copy(struct dm_cache_migration *mg)
889 {
890         atomic_inc(&mg->cache->stats.copies_avoided);
891         migration_success_pre_commit(mg);
892 }
893
894 static void issue_copy(struct dm_cache_migration *mg)
895 {
896         bool avoid;
897         struct cache *cache = mg->cache;
898
899         if (mg->writeback || mg->demote)
900                 avoid = !is_dirty(cache, mg->cblock) ||
901                         is_discarded_oblock(cache, mg->old_oblock);
902         else
903                 avoid = is_discarded_oblock(cache, mg->new_oblock);
904
905         avoid ? avoid_copy(mg) : issue_copy_real(mg);
906 }
907
908 static void complete_migration(struct dm_cache_migration *mg)
909 {
910         if (mg->err)
911                 migration_failure(mg);
912         else
913                 migration_success_pre_commit(mg);
914 }
915
916 static void process_migrations(struct cache *cache, struct list_head *head,
917                                void (*fn)(struct dm_cache_migration *))
918 {
919         unsigned long flags;
920         struct list_head list;
921         struct dm_cache_migration *mg, *tmp;
922
923         INIT_LIST_HEAD(&list);
924         spin_lock_irqsave(&cache->lock, flags);
925         list_splice_init(head, &list);
926         spin_unlock_irqrestore(&cache->lock, flags);
927
928         list_for_each_entry_safe(mg, tmp, &list, list)
929                 fn(mg);
930 }
931
932 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
933 {
934         list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
935 }
936
937 static void queue_quiesced_migration(struct dm_cache_migration *mg)
938 {
939         unsigned long flags;
940         struct cache *cache = mg->cache;
941
942         spin_lock_irqsave(&cache->lock, flags);
943         __queue_quiesced_migration(mg);
944         spin_unlock_irqrestore(&cache->lock, flags);
945
946         wake_worker(cache);
947 }
948
949 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
950 {
951         unsigned long flags;
952         struct dm_cache_migration *mg, *tmp;
953
954         spin_lock_irqsave(&cache->lock, flags);
955         list_for_each_entry_safe(mg, tmp, work, list)
956                 __queue_quiesced_migration(mg);
957         spin_unlock_irqrestore(&cache->lock, flags);
958
959         wake_worker(cache);
960 }
961
962 static void check_for_quiesced_migrations(struct cache *cache,
963                                           struct per_bio_data *pb)
964 {
965         struct list_head work;
966
967         if (!pb->all_io_entry)
968                 return;
969
970         INIT_LIST_HEAD(&work);
971         if (pb->all_io_entry)
972                 dm_deferred_entry_dec(pb->all_io_entry, &work);
973
974         if (!list_empty(&work))
975                 queue_quiesced_migrations(cache, &work);
976 }
977
978 static void quiesce_migration(struct dm_cache_migration *mg)
979 {
980         if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
981                 queue_quiesced_migration(mg);
982 }
983
984 static void promote(struct cache *cache, struct prealloc *structs,
985                     dm_oblock_t oblock, dm_cblock_t cblock,
986                     struct dm_bio_prison_cell *cell)
987 {
988         struct dm_cache_migration *mg = prealloc_get_migration(structs);
989
990         mg->err = false;
991         mg->writeback = false;
992         mg->demote = false;
993         mg->promote = true;
994         mg->cache = cache;
995         mg->new_oblock = oblock;
996         mg->cblock = cblock;
997         mg->old_ocell = NULL;
998         mg->new_ocell = cell;
999         mg->start_jiffies = jiffies;
1000
1001         inc_nr_migrations(cache);
1002         quiesce_migration(mg);
1003 }
1004
1005 static void writeback(struct cache *cache, struct prealloc *structs,
1006                       dm_oblock_t oblock, dm_cblock_t cblock,
1007                       struct dm_bio_prison_cell *cell)
1008 {
1009         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1010
1011         mg->err = false;
1012         mg->writeback = true;
1013         mg->demote = false;
1014         mg->promote = false;
1015         mg->cache = cache;
1016         mg->old_oblock = oblock;
1017         mg->cblock = cblock;
1018         mg->old_ocell = cell;
1019         mg->new_ocell = NULL;
1020         mg->start_jiffies = jiffies;
1021
1022         inc_nr_migrations(cache);
1023         quiesce_migration(mg);
1024 }
1025
1026 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1027                                 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1028                                 dm_cblock_t cblock,
1029                                 struct dm_bio_prison_cell *old_ocell,
1030                                 struct dm_bio_prison_cell *new_ocell)
1031 {
1032         struct dm_cache_migration *mg = prealloc_get_migration(structs);
1033
1034         mg->err = false;
1035         mg->writeback = false;
1036         mg->demote = true;
1037         mg->promote = true;
1038         mg->cache = cache;
1039         mg->old_oblock = old_oblock;
1040         mg->new_oblock = new_oblock;
1041         mg->cblock = cblock;
1042         mg->old_ocell = old_ocell;
1043         mg->new_ocell = new_ocell;
1044         mg->start_jiffies = jiffies;
1045
1046         inc_nr_migrations(cache);
1047         quiesce_migration(mg);
1048 }
1049
1050 /*----------------------------------------------------------------
1051  * bio processing
1052  *--------------------------------------------------------------*/
1053 static void defer_bio(struct cache *cache, struct bio *bio)
1054 {
1055         unsigned long flags;
1056
1057         spin_lock_irqsave(&cache->lock, flags);
1058         bio_list_add(&cache->deferred_bios, bio);
1059         spin_unlock_irqrestore(&cache->lock, flags);
1060
1061         wake_worker(cache);
1062 }
1063
1064 static void process_flush_bio(struct cache *cache, struct bio *bio)
1065 {
1066         size_t pb_data_size = get_per_bio_data_size(cache);
1067         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1068
1069         BUG_ON(bio->bi_size);
1070         if (!pb->req_nr)
1071                 remap_to_origin(cache, bio);
1072         else
1073                 remap_to_cache(cache, bio, 0);
1074
1075         issue(cache, bio);
1076 }
1077
1078 /*
1079  * People generally discard large parts of a device, e.g. the whole device
1080  * when formatting.  Splitting these large discards up into cache-block-
1081  * sized I/Os and then quiescing (always necessary for discard) takes too
1082  * long.
1083  *
1084  * We keep it simple, and allow any size of discard to come in, and just
1085  * mark off blocks on the discard bitset.  No passdown occurs!
1086  *
1087  * To implement passdown we need to change the bio_prison such that a cell
1088  * can have a key that spans many blocks.
1089  */
1090 static void process_discard_bio(struct cache *cache, struct bio *bio)
1091 {
1092         dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1093                                                   cache->discard_block_size);
1094         dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1095         dm_block_t b;
1096
1097         end_block = block_div(end_block, cache->discard_block_size);
1098
1099         for (b = start_block; b < end_block; b++)
1100                 set_discard(cache, to_dblock(b));
1101
1102         bio_endio(bio, 0);
1103 }
1104
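/*
 * Would starting another migration keep the volume of data currently being
 * copied (in sectors) under the user-settable migration_threshold?
 */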
1105 static bool spare_migration_bandwidth(struct cache *cache)
1106 {
1107         sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1108                 cache->sectors_per_block;
1109         return current_volume < cache->migration_threshold;
1110 }
1111
1112 static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1113                                dm_cblock_t cblock)
1114 {
1115         return bio_data_dir(bio) == WRITE &&
1116                 cache->features.write_through && !is_dirty(cache, cblock);
1117 }
1118
1119 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1120 {
1121         atomic_inc(bio_data_dir(bio) == READ ?
1122                    &cache->stats.read_hit : &cache->stats.write_hit);
1123 }
1124
1125 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1126 {
1127         atomic_inc(bio_data_dir(bio) == READ ?
1128                    &cache->stats.read_miss : &cache->stats.write_miss);
1129 }
1130
1131 static void process_bio(struct cache *cache, struct prealloc *structs,
1132                         struct bio *bio)
1133 {
1134         int r;
1135         bool release_cell = true;
1136         dm_oblock_t block = get_bio_block(cache, bio);
1137         struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1138         struct policy_result lookup_result;
1139         size_t pb_data_size = get_per_bio_data_size(cache);
1140         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1141         bool discarded_block = is_discarded_oblock(cache, block);
1142         bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1143
1144         /*
1145          * Check to see if that block is currently migrating.
1146          */
1147         cell_prealloc = prealloc_get_cell(structs);
1148         r = bio_detain(cache, block, bio, cell_prealloc,
1149                        (cell_free_fn) prealloc_put_cell,
1150                        structs, &new_ocell);
1151         if (r > 0)
1152                 return;
1153
1154         r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1155                        bio, &lookup_result);
1156
1157         if (r == -EWOULDBLOCK)
1158                 /* migration has been denied */
1159                 lookup_result.op = POLICY_MISS;
1160
1161         switch (lookup_result.op) {
1162         case POLICY_HIT:
1163                 inc_hit_counter(cache, bio);
1164                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1165
1166                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
1167                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1168                 else
1169                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1170
1171                 issue(cache, bio);
1172                 break;
1173
1174         case POLICY_MISS:
1175                 inc_miss_counter(cache, bio);
1176                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1177                 remap_to_origin_clear_discard(cache, bio, block);
1178                 issue(cache, bio);
1179                 break;
1180
1181         case POLICY_NEW:
1182                 atomic_inc(&cache->stats.promotion);
1183                 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1184                 release_cell = false;
1185                 break;
1186
1187         case POLICY_REPLACE:
1188                 cell_prealloc = prealloc_get_cell(structs);
1189                 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1190                                (cell_free_fn) prealloc_put_cell,
1191                                structs, &old_ocell);
1192                 if (r > 0) {
1193                         /*
1194                          * We have to be careful to avoid lock inversion of
1195                          * the cells.  So we back off, and wait for the
1196                          * old_ocell to become free.
1197                          */
1198                         policy_force_mapping(cache->policy, block,
1199                                              lookup_result.old_oblock);
1200                         atomic_inc(&cache->stats.cache_cell_clash);
1201                         break;
1202                 }
1203                 atomic_inc(&cache->stats.demotion);
1204                 atomic_inc(&cache->stats.promotion);
1205
1206                 demote_then_promote(cache, structs, lookup_result.old_oblock,
1207                                     block, lookup_result.cblock,
1208                                     old_ocell, new_ocell);
1209                 release_cell = false;
1210                 break;
1211
1212         default:
1213                 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1214                             (unsigned) lookup_result.op);
1215                 bio_io_error(bio);
1216         }
1217
1218         if (release_cell)
1219                 cell_defer(cache, new_ocell, false);
1220 }
1221
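/*
 * A commit is due once COMMIT_PERIOD has elapsed; the first comparison
 * copes with jiffies wrapping around last_commit_jiffies.
 */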
1222 static int need_commit_due_to_time(struct cache *cache)
1223 {
1224         return jiffies < cache->last_commit_jiffies ||
1225                jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1226 }
1227
1228 static int commit_if_needed(struct cache *cache)
1229 {
1230         if (dm_cache_changed_this_transaction(cache->cmd) &&
1231             (cache->commit_requested || need_commit_due_to_time(cache))) {
1232                 atomic_inc(&cache->stats.commit_count);
1233                 cache->last_commit_jiffies = jiffies;
1234                 cache->commit_requested = false;
1235                 return dm_cache_commit(cache->cmd, false);
1236         }
1237
1238         return 0;
1239 }
1240
1241 static void process_deferred_bios(struct cache *cache)
1242 {
1243         unsigned long flags;
1244         struct bio_list bios;
1245         struct bio *bio;
1246         struct prealloc structs;
1247
1248         memset(&structs, 0, sizeof(structs));
1249         bio_list_init(&bios);
1250
1251         spin_lock_irqsave(&cache->lock, flags);
1252         bio_list_merge(&bios, &cache->deferred_bios);
1253         bio_list_init(&cache->deferred_bios);
1254         spin_unlock_irqrestore(&cache->lock, flags);
1255
1256         while (!bio_list_empty(&bios)) {
1257                 /*
1258                  * If we've got no free migration structs, and processing
1259                  * this bio might require one, we pause until there are some
1260                  * prepared mappings to process.
1261                  */
1262                 if (prealloc_data_structs(cache, &structs)) {
1263                         spin_lock_irqsave(&cache->lock, flags);
1264                         bio_list_merge(&cache->deferred_bios, &bios);
1265                         spin_unlock_irqrestore(&cache->lock, flags);
1266                         break;
1267                 }
1268
1269                 bio = bio_list_pop(&bios);
1270
1271                 if (bio->bi_rw & REQ_FLUSH)
1272                         process_flush_bio(cache, bio);
1273                 else if (bio->bi_rw & REQ_DISCARD)
1274                         process_discard_bio(cache, bio);
1275                 else
1276                         process_bio(cache, &structs, bio);
1277         }
1278
1279         prealloc_free_structs(cache, &structs);
1280 }
1281
1282 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1283 {
1284         unsigned long flags;
1285         struct bio_list bios;
1286         struct bio *bio;
1287
1288         bio_list_init(&bios);
1289
1290         spin_lock_irqsave(&cache->lock, flags);
1291         bio_list_merge(&bios, &cache->deferred_flush_bios);
1292         bio_list_init(&cache->deferred_flush_bios);
1293         spin_unlock_irqrestore(&cache->lock, flags);
1294
1295         while ((bio = bio_list_pop(&bios)))
1296                 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1297 }
1298
1299 static void process_deferred_writethrough_bios(struct cache *cache)
1300 {
1301         unsigned long flags;
1302         struct bio_list bios;
1303         struct bio *bio;
1304
1305         bio_list_init(&bios);
1306
1307         spin_lock_irqsave(&cache->lock, flags);
1308         bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1309         bio_list_init(&cache->deferred_writethrough_bios);
1310         spin_unlock_irqrestore(&cache->lock, flags);
1311
1312         while ((bio = bio_list_pop(&bios)))
1313                 generic_make_request(bio);
1314 }
1315
1316 static void writeback_some_dirty_blocks(struct cache *cache)
1317 {
1318         int r = 0;
1319         dm_oblock_t oblock;
1320         dm_cblock_t cblock;
1321         struct prealloc structs;
1322         struct dm_bio_prison_cell *old_ocell;
1323
1324         memset(&structs, 0, sizeof(structs));
1325
1326         while (spare_migration_bandwidth(cache)) {
1327                 if (prealloc_data_structs(cache, &structs))
1328                         break;
1329
1330                 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1331                 if (r)
1332                         break;
1333
1334                 r = get_cell(cache, oblock, &structs, &old_ocell);
1335                 if (r) {
1336                         policy_set_dirty(cache->policy, oblock);
1337                         break;
1338                 }
1339
1340                 writeback(cache, &structs, oblock, cblock, old_ocell);
1341         }
1342
1343         prealloc_free_structs(cache, &structs);
1344 }
1345
1346 /*----------------------------------------------------------------
1347  * Main worker loop
1348  *--------------------------------------------------------------*/
1349 static void start_quiescing(struct cache *cache)
1350 {
1351         unsigned long flags;
1352
1353         spin_lock_irqsave(&cache->lock, flags);
1354         cache->quiescing = 1;
1355         spin_unlock_irqrestore(&cache->lock, flags);
1356 }
1357
1358 static void stop_quiescing(struct cache *cache)
1359 {
1360         unsigned long flags;
1361
1362         spin_lock_irqsave(&cache->lock, flags);
1363         cache->quiescing = 0;
1364         spin_unlock_irqrestore(&cache->lock, flags);
1365 }
1366
1367 static bool is_quiescing(struct cache *cache)
1368 {
1369         int r;
1370         unsigned long flags;
1371
1372         spin_lock_irqsave(&cache->lock, flags);
1373         r = cache->quiescing;
1374         spin_unlock_irqrestore(&cache->lock, flags);
1375
1376         return r;
1377 }
1378
1379 static void wait_for_migrations(struct cache *cache)
1380 {
1381         wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1382 }
1383
1384 static void stop_worker(struct cache *cache)
1385 {
1386         cancel_delayed_work(&cache->waker);
1387         flush_workqueue(cache->wq);
1388 }
1389
1390 static void requeue_deferred_io(struct cache *cache)
1391 {
1392         struct bio *bio;
1393         struct bio_list bios;
1394
1395         bio_list_init(&bios);
1396         bio_list_merge(&bios, &cache->deferred_bios);
1397         bio_list_init(&cache->deferred_bios);
1398
1399         while ((bio = bio_list_pop(&bios)))
1400                 bio_endio(bio, DM_ENDIO_REQUEUE);
1401 }
1402
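/*
 * While quiescing, only outstanding migrations are drained; deferred bios
 * are left untouched until quiescing stops.
 */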
1403 static int more_work(struct cache *cache)
1404 {
1405         if (is_quiescing(cache))
1406                 return !list_empty(&cache->quiesced_migrations) ||
1407                         !list_empty(&cache->completed_migrations) ||
1408                         !list_empty(&cache->need_commit_migrations);
1409         else
1410                 return !bio_list_empty(&cache->deferred_bios) ||
1411                         !bio_list_empty(&cache->deferred_flush_bios) ||
1412                         !bio_list_empty(&cache->deferred_writethrough_bios) ||
1413                         !list_empty(&cache->quiesced_migrations) ||
1414                         !list_empty(&cache->completed_migrations) ||
1415                         !list_empty(&cache->need_commit_migrations);
1416 }
1417
1418 static void do_worker(struct work_struct *ws)
1419 {
1420         struct cache *cache = container_of(ws, struct cache, worker);
1421
1422         do {
1423                 if (!is_quiescing(cache))
1424                         process_deferred_bios(cache);
1425
1426                 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1427                 process_migrations(cache, &cache->completed_migrations, complete_migration);
1428
1429                 writeback_some_dirty_blocks(cache);
1430
1431                 process_deferred_writethrough_bios(cache);
1432
1433                 if (commit_if_needed(cache)) {
1434                         process_deferred_flush_bios(cache, false);
1435
1436                         /*
1437                          * FIXME: rollback metadata or just go into a
1438                          * failure mode and error everything
1439                          */
1440                 } else {
1441                         process_deferred_flush_bios(cache, true);
1442                         process_migrations(cache, &cache->need_commit_migrations,
1443                                            migration_success_post_commit);
1444                 }
1445         } while (more_work(cache));
1446 }
1447
1448 /*
1449  * We want to commit periodically so that not too much
1450  * unwritten metadata builds up.
1451  */
1452 static void do_waker(struct work_struct *ws)
1453 {
1454         struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1455         policy_tick(cache->policy);
1456         wake_worker(cache);
1457         queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1458 }
1459
1460 /*----------------------------------------------------------------*/
1461
1462 static int is_congested(struct dm_dev *dev, int bdi_bits)
1463 {
1464         struct request_queue *q = bdev_get_queue(dev->bdev);
1465         return bdi_congested(&q->backing_dev_info, bdi_bits);
1466 }
1467
1468 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1469 {
1470         struct cache *cache = container_of(cb, struct cache, callbacks);
1471
1472         return is_congested(cache->origin_dev, bdi_bits) ||
1473                 is_congested(cache->cache_dev, bdi_bits);
1474 }
1475
1476 /*----------------------------------------------------------------
1477  * Target methods
1478  *--------------------------------------------------------------*/
1479
1480 /*
1481  * This function gets called on the error paths of the constructor, so we
1482  * have to cope with a partially initialised struct.
1483  */
1484 static void destroy(struct cache *cache)
1485 {
1486         unsigned i;
1487
1488         if (cache->next_migration)
1489                 mempool_free(cache->next_migration, cache->migration_pool);
1490
1491         if (cache->migration_pool)
1492                 mempool_destroy(cache->migration_pool);
1493
1494         if (cache->all_io_ds)
1495                 dm_deferred_set_destroy(cache->all_io_ds);
1496
1497         if (cache->prison)
1498                 dm_bio_prison_destroy(cache->prison);
1499
1500         if (cache->wq)
1501                 destroy_workqueue(cache->wq);
1502
1503         if (cache->dirty_bitset)
1504                 free_bitset(cache->dirty_bitset);
1505
1506         if (cache->discard_bitset)
1507                 free_bitset(cache->discard_bitset);
1508
1509         if (cache->copier)
1510                 dm_kcopyd_client_destroy(cache->copier);
1511
1512         if (cache->cmd)
1513                 dm_cache_metadata_close(cache->cmd);
1514
1515         if (cache->metadata_dev)
1516                 dm_put_device(cache->ti, cache->metadata_dev);
1517
1518         if (cache->origin_dev)
1519                 dm_put_device(cache->ti, cache->origin_dev);
1520
1521         if (cache->cache_dev)
1522                 dm_put_device(cache->ti, cache->cache_dev);
1523
1524         if (cache->policy)
1525                 dm_cache_policy_destroy(cache->policy);
1526
1527         for (i = 0; i < cache->nr_ctr_args ; i++)
1528                 kfree(cache->ctr_args[i]);
1529         kfree(cache->ctr_args);
1530
1531         kfree(cache);
1532 }
1533
1534 static void cache_dtr(struct dm_target *ti)
1535 {
1536         struct cache *cache = ti->private;
1537
1538         destroy(cache);
1539 }
1540
1541 static sector_t get_dev_size(struct dm_dev *dev)
1542 {
1543         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1544 }
1545
1546 /*----------------------------------------------------------------*/
1547
1548 /*
1549  * Construct a cache device mapping.
1550  *
1551  * cache <metadata dev> <cache dev> <origin dev> <block size>
1552  *       <#feature args> [<feature arg>]*
1553  *       <policy> <#policy args> [<policy arg>]*
1554  *
1555  * metadata dev    : fast device holding the persistent metadata
1556  * cache dev       : fast device holding cached data blocks
1557  * origin dev      : slow device holding original data blocks
1558  * block size      : cache unit size in sectors
1559  *
1560  * #feature args   : number of feature arguments passed
1561  * feature args    : writethrough.  (The default is writeback.)
1562  *
1563  * policy          : the replacement policy to use
1564  * #policy args    : an even number of policy arguments corresponding
1565  *                   to key/value pairs passed to the policy
1566  * policy args     : key/value pairs passed to the policy
1567  *                   E.g. 'sequential_threshold 1024'
1568  *                   See cache-policies.txt for details.
1569  *
1570  * Optional feature arguments are:
1571  *   writethrough  : write through caching that prohibits cache block
1572  *                   content from being different from origin block content.
1573  *                   Without this argument, the default behaviour is to write
1574  *                   back cache block contents later for performance reasons,
1575  *                   so they may differ from the corresponding origin blocks.
1576  */
1577 struct cache_args {
1578         struct dm_target *ti;
1579
1580         struct dm_dev *metadata_dev;
1581
1582         struct dm_dev *cache_dev;
1583         sector_t cache_sectors;
1584
1585         struct dm_dev *origin_dev;
1586         sector_t origin_sectors;
1587
1588         uint32_t block_size;
1589
1590         const char *policy_name;
1591         int policy_argc;
1592         const char **policy_argv;
1593
1594         struct cache_features features;
1595 };
1596
1597 static void destroy_cache_args(struct cache_args *ca)
1598 {
1599         if (ca->metadata_dev)
1600                 dm_put_device(ca->ti, ca->metadata_dev);
1601
1602         if (ca->cache_dev)
1603                 dm_put_device(ca->ti, ca->cache_dev);
1604
1605         if (ca->origin_dev)
1606                 dm_put_device(ca->ti, ca->origin_dev);
1607
1608         kfree(ca);
1609 }
1610
1611 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1612 {
1613         if (!as->argc) {
1614                 *error = "Insufficient args";
1615                 return false;
1616         }
1617
1618         return true;
1619 }
1620
1621 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1622                               char **error)
1623 {
1624         int r;
1625         sector_t metadata_dev_size;
1626         char b[BDEVNAME_SIZE];
1627
1628         if (!at_least_one_arg(as, error))
1629                 return -EINVAL;
1630
1631         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1632                           &ca->metadata_dev);
1633         if (r) {
1634                 *error = "Error opening metadata device";
1635                 return r;
1636         }
1637
1638         metadata_dev_size = get_dev_size(ca->metadata_dev);
1639         if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1640                 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1641                        bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS);
1642
1643         return 0;
1644 }
1645
1646 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1647                            char **error)
1648 {
1649         int r;
1650
1651         if (!at_least_one_arg(as, error))
1652                 return -EINVAL;
1653
1654         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1655                           &ca->cache_dev);
1656         if (r) {
1657                 *error = "Error opening cache device";
1658                 return r;
1659         }
1660         ca->cache_sectors = get_dev_size(ca->cache_dev);
1661
1662         return 0;
1663 }
1664
1665 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1666                             char **error)
1667 {
1668         int r;
1669
1670         if (!at_least_one_arg(as, error))
1671                 return -EINVAL;
1672
1673         r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1674                           &ca->origin_dev);
1675         if (r) {
1676                 *error = "Error opening origin device";
1677                 return r;
1678         }
1679
1680         ca->origin_sectors = get_dev_size(ca->origin_dev);
1681         if (ca->ti->len > ca->origin_sectors) {
1682                 *error = "Device size larger than cached device";
1683                 return -EINVAL;
1684         }
1685
1686         return 0;
1687 }
1688
1689 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1690                             char **error)
1691 {
1692         unsigned long block_size;
1693
1694         if (!at_least_one_arg(as, error))
1695                 return -EINVAL;
1696
1697         if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
1698             block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1699             block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1700             block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1701                 *error = "Invalid data block size";
1702                 return -EINVAL;
1703         }
1704
1705         if (block_size > ca->cache_sectors) {
1706                 *error = "Data block size is larger than the cache device";
1707                 return -EINVAL;
1708         }
1709
1710         ca->block_size = block_size;
1711
1712         return 0;
1713 }
1714
1715 static void init_features(struct cache_features *cf)
1716 {
1717         cf->mode = CM_WRITE;
1718         cf->write_through = false;
1719 }
1720
1721 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1722                           char **error)
1723 {
1724         static struct dm_arg _args[] = {
1725                 {0, 1, "Invalid number of cache feature arguments"},
1726         };
1727
1728         int r;
1729         unsigned argc;
1730         const char *arg;
1731         struct cache_features *cf = &ca->features;
1732
1733         init_features(cf);
1734
1735         r = dm_read_arg_group(_args, as, &argc, error);
1736         if (r)
1737                 return -EINVAL;
1738
1739         while (argc--) {
1740                 arg = dm_shift_arg(as);
1741
1742                 if (!strcasecmp(arg, "writeback"))
1743                         cf->write_through = false;
1744
1745                 else if (!strcasecmp(arg, "writethrough"))
1746                         cf->write_through = true;
1747
1748                 else {
1749                         *error = "Unrecognised cache feature requested";
1750                         return -EINVAL;
1751                 }
1752         }
1753
1754         return 0;
1755 }
1756
1757 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1758                         char **error)
1759 {
1760         static struct dm_arg _args[] = {
1761                 {0, 1024, "Invalid number of policy arguments"},
1762         };
1763
1764         int r;
1765
1766         if (!at_least_one_arg(as, error))
1767                 return -EINVAL;
1768
1769         ca->policy_name = dm_shift_arg(as);
1770
1771         r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1772         if (r)
1773                 return -EINVAL;
1774
1775         ca->policy_argv = (const char **)as->argv;
1776         dm_consume_args(as, ca->policy_argc);
1777
1778         return 0;
1779 }
1780
1781 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1782                             char **error)
1783 {
1784         int r;
1785         struct dm_arg_set as;
1786
1787         as.argc = argc;
1788         as.argv = argv;
1789
1790         r = parse_metadata_dev(ca, &as, error);
1791         if (r)
1792                 return r;
1793
1794         r = parse_cache_dev(ca, &as, error);
1795         if (r)
1796                 return r;
1797
1798         r = parse_origin_dev(ca, &as, error);
1799         if (r)
1800                 return r;
1801
1802         r = parse_block_size(ca, &as, error);
1803         if (r)
1804                 return r;
1805
1806         r = parse_features(ca, &as, error);
1807         if (r)
1808                 return r;
1809
1810         r = parse_policy(ca, &as, error);
1811         if (r)
1812                 return r;
1813
1814         return 0;
1815 }
1816
1817 /*----------------------------------------------------------------*/
1818
1819 static struct kmem_cache *migration_cache;
1820
1821 #define NOT_CORE_OPTION 1
1822
1823 static int process_config_option(struct cache *cache, const char *key, const char *value)
1824 {
1825         unsigned long tmp;
1826
1827         if (!strcasecmp(key, "migration_threshold")) {
1828                 if (kstrtoul(value, 10, &tmp))
1829                         return -EINVAL;
1830
1831                 cache->migration_threshold = tmp;
1832                 return 0;
1833         }
1834
1835         return NOT_CORE_OPTION;
1836 }
1837
1838 static int set_config_value(struct cache *cache, const char *key, const char *value)
1839 {
1840         int r = process_config_option(cache, key, value);
1841
1842         if (r == NOT_CORE_OPTION)
1843                 r = policy_set_config_value(cache->policy, key, value);
1844
1845         if (r)
1846                 DMWARN("bad config value for %s: %s", key, value);
1847
1848         return r;
1849 }
1850
1851 static int set_config_values(struct cache *cache, int argc, const char **argv)
1852 {
1853         int r = 0;
1854
1855         if (argc & 1) {
1856                 DMWARN("Odd number of policy arguments given; they should be <key> <value> pairs.");
1857                 return -EINVAL;
1858         }
1859
1860         while (argc) {
1861                 r = set_config_value(cache, argv[0], argv[1]);
1862                 if (r)
1863                         break;
1864
1865                 argc -= 2;
1866                 argv += 2;
1867         }
1868
1869         return r;
1870 }
1871
1872 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1873                                char **error)
1874 {
1875         cache->policy = dm_cache_policy_create(ca->policy_name,
1876                                                cache->cache_size,
1877                                                cache->origin_sectors,
1878                                                cache->sectors_per_block);
1879         if (!cache->policy) {
1880                 *error = "Error creating cache's policy";
1881                 return -ENOMEM;
1882         }
1883
1884         return 0;
1885 }
1886
1887 /*
1888  * We want the discard block size to be a power of two, at least as large
1889  * as the cache block size, and to give no more than 2^14 discard blocks
1890  * across the origin.
1891  */
1892 #define MAX_DISCARD_BLOCKS (1 << 14)
1893
1894 static bool too_many_discard_blocks(sector_t discard_block_size,
1895                                     sector_t origin_size)
1896 {
1897         (void) sector_div(origin_size, discard_block_size);
1898
1899         return origin_size > MAX_DISCARD_BLOCKS;
1900 }
1901
1902 static sector_t calculate_discard_block_size(sector_t cache_block_size,
1903                                              sector_t origin_size)
1904 {
1905         sector_t discard_block_size;
1906
1907         discard_block_size = roundup_pow_of_two(cache_block_size);
1908
1909         if (origin_size)
1910                 while (too_many_discard_blocks(discard_block_size, origin_size))
1911                         discard_block_size *= 2;
1912
1913         return discard_block_size;
1914 }
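/*
 * Worked example: with 192 sector (96KB) cache blocks and a 1TB
 * (2^31 sector) origin, roundup_pow_of_two(192) gives 256 sectors,
 * which would mean 2^31 / 2^8 = 2^23 discard blocks.  Doubling until
 * no more than 2^14 blocks remain settles on 2^17 sectors (64MB).
 */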
1915
1916 #define DEFAULT_MIGRATION_THRESHOLD 2048
1917
1918 static int cache_create(struct cache_args *ca, struct cache **result)
1919 {
1920         int r = 0;
1921         char **error = &ca->ti->error;
1922         struct cache *cache;
1923         struct dm_target *ti = ca->ti;
1924         dm_block_t origin_blocks;
1925         struct dm_cache_metadata *cmd;
1926         bool may_format = ca->features.mode == CM_WRITE;
1927
1928         cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1929         if (!cache)
1930                 return -ENOMEM;
1931
1932         cache->ti = ca->ti;
1933         ti->private = cache;
1934         ti->num_flush_bios = 2;
1935         ti->flush_supported = true;
1936
1937         ti->num_discard_bios = 1;
1938         ti->discards_supported = true;
1939         ti->discard_zeroes_data_unsupported = true;
1940
1941         cache->features = ca->features;
1942         ti->per_bio_data_size = get_per_bio_data_size(cache);
1943
1944         cache->callbacks.congested_fn = cache_is_congested;
1945         dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1946
1947         cache->metadata_dev = ca->metadata_dev;
1948         cache->origin_dev = ca->origin_dev;
1949         cache->cache_dev = ca->cache_dev;
1950
1951         ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1952
1953         /* FIXME: factor out this whole section */
1954         origin_blocks = cache->origin_sectors = ca->origin_sectors;
1955         origin_blocks = block_div(origin_blocks, ca->block_size);
1956         cache->origin_blocks = to_oblock(origin_blocks);
1957
1958         cache->sectors_per_block = ca->block_size;
1959         if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1960                 r = -EINVAL;
1961                 goto bad;
1962         }
1963
1964         if (ca->block_size & (ca->block_size - 1)) {
1965                 dm_block_t cache_size = ca->cache_sectors;
1966
1967                 cache->sectors_per_block_shift = -1;
1968                 cache_size = block_div(cache_size, ca->block_size);
1969                 cache->cache_size = to_cblock(cache_size);
1970         } else {
1971                 cache->sectors_per_block_shift = __ffs(ca->block_size);
1972                 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1973         }
1974
1975         r = create_cache_policy(cache, ca, error);
1976         if (r)
1977                 goto bad;
1978
1979         cache->policy_nr_args = ca->policy_argc;
1980         cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1981
1982         r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
1983         if (r) {
1984                 *error = "Error setting cache policy's config values";
1985                 goto bad;
1986         }
1987
1988         cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1989                                      ca->block_size, may_format,
1990                                      dm_cache_policy_get_hint_size(cache->policy));
1991         if (IS_ERR(cmd)) {
1992                 *error = "Error creating metadata object";
1993                 r = PTR_ERR(cmd);
1994                 goto bad;
1995         }
1996         cache->cmd = cmd;
1997
1998         spin_lock_init(&cache->lock);
1999         bio_list_init(&cache->deferred_bios);
2000         bio_list_init(&cache->deferred_flush_bios);
2001         bio_list_init(&cache->deferred_writethrough_bios);
2002         INIT_LIST_HEAD(&cache->quiesced_migrations);
2003         INIT_LIST_HEAD(&cache->completed_migrations);
2004         INIT_LIST_HEAD(&cache->need_commit_migrations);
2005         atomic_set(&cache->nr_migrations, 0);
2006         init_waitqueue_head(&cache->migration_wait);
2007
2008         r = -ENOMEM;
2009         cache->nr_dirty = 0;
2010         cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2011         if (!cache->dirty_bitset) {
2012                 *error = "could not allocate dirty bitset";
2013                 goto bad;
2014         }
2015         clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2016
2017         cache->discard_block_size =
2018                 calculate_discard_block_size(cache->sectors_per_block,
2019                                              cache->origin_sectors);
2020         cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
2021         cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2022         if (!cache->discard_bitset) {
2023                 *error = "could not allocate discard bitset";
2024                 goto bad;
2025         }
2026         clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2027
2028         cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2029         if (IS_ERR(cache->copier)) {
2030                 *error = "could not create kcopyd client";
2031                 r = PTR_ERR(cache->copier);
2032                 goto bad;
2033         }
2034
2035         cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2036         if (!cache->wq) {
2037                 *error = "could not create workqueue for metadata object";
2038                 goto bad;
2039         }
2040         INIT_WORK(&cache->worker, do_worker);
2041         INIT_DELAYED_WORK(&cache->waker, do_waker);
2042         cache->last_commit_jiffies = jiffies;
2043
2044         cache->prison = dm_bio_prison_create(PRISON_CELLS);
2045         if (!cache->prison) {
2046                 *error = "could not create bio prison";
2047                 goto bad;
2048         }
2049
2050         cache->all_io_ds = dm_deferred_set_create();
2051         if (!cache->all_io_ds) {
2052                 *error = "could not create all_io deferred set";
2053                 goto bad;
2054         }
2055
2056         cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2057                                                          migration_cache);
2058         if (!cache->migration_pool) {
2059                 *error = "Error creating cache's migration mempool";
2060                 goto bad;
2061         }
2062
2063         cache->next_migration = NULL;
2064
2065         cache->need_tick_bio = true;
2066         cache->sized = false;
2067         cache->quiescing = false;
2068         cache->commit_requested = false;
2069         cache->loaded_mappings = false;
2070         cache->loaded_discards = false;
2071
2072         load_stats(cache);
2073
2074         atomic_set(&cache->stats.demotion, 0);
2075         atomic_set(&cache->stats.promotion, 0);
2076         atomic_set(&cache->stats.copies_avoided, 0);
2077         atomic_set(&cache->stats.cache_cell_clash, 0);
2078         atomic_set(&cache->stats.commit_count, 0);
2079         atomic_set(&cache->stats.discard_count, 0);
2080
2081         *result = cache;
2082         return 0;
2083
2084 bad:
2085         destroy(cache);
2086         return r;
2087 }
2088
2089 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2090 {
2091         unsigned i;
2092         const char **copy;
2093
2094         copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2095         if (!copy)
2096                 return -ENOMEM;
2097         for (i = 0; i < argc; i++) {
2098                 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2099                 if (!copy[i]) {
2100                         while (i--)
2101                                 kfree(copy[i]);
2102                         kfree(copy);
2103                         return -ENOMEM;
2104                 }
2105         }
2106
2107         cache->nr_ctr_args = argc;
2108         cache->ctr_args = copy;
2109
2110         return 0;
2111 }
2112
2113 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2114 {
2115         int r = -EINVAL;
2116         struct cache_args *ca;
2117         struct cache *cache = NULL;
2118
2119         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2120         if (!ca) {
2121                 ti->error = "Error allocating memory for cache";
2122                 return -ENOMEM;
2123         }
2124         ca->ti = ti;
2125
2126         r = parse_cache_args(ca, argc, argv, &ti->error);
2127         if (r)
2128                 goto out;
2129
2130         r = cache_create(ca, &cache);
2131         if (r)
2132                 goto out;
2133
2134         r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2135         if (r) {
2136                 destroy(cache);
2137                 goto out;
2138         }
2139
2140         ti->private = cache;
2141
2142 out:
2143         destroy_cache_args(ca);
2144         return r;
2145 }
2146
2147 static int cache_map(struct dm_target *ti, struct bio *bio)
2148 {
2149         struct cache *cache = ti->private;
2150
2151         int r;
2152         dm_oblock_t block = get_bio_block(cache, bio);
2153         size_t pb_data_size = get_per_bio_data_size(cache);
2154         bool can_migrate = false;
2155         bool discarded_block;
2156         struct dm_bio_prison_cell *cell;
2157         struct policy_result lookup_result;
2158         struct per_bio_data *pb;
2159
2160         if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2161                 /*
2162                  * This can only occur if the io goes to a partial block at
2163                  * the end of the origin device.  We don't cache these.
2164                  * Just remap to the origin and carry on.
2165                  */
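                /*
                 * E.g. a 1000 sector origin with 128 sector blocks has
                 * origin_blocks == 7; a bio to sectors 896-999 computes
                 * block 7 here and is remapped straight to the origin.
                 */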
2166                 remap_to_origin_clear_discard(cache, bio, block);
2167                 return DM_MAPIO_REMAPPED;
2168         }
2169
2170         pb = init_per_bio_data(bio, pb_data_size);
2171
2172         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2173                 defer_bio(cache, bio);
2174                 return DM_MAPIO_SUBMITTED;
2175         }
2176
2177         /*
2178          * Check to see if that block is currently migrating.
2179          */
2180         cell = alloc_prison_cell(cache);
2181         if (!cell) {
2182                 defer_bio(cache, bio);
2183                 return DM_MAPIO_SUBMITTED;
2184         }
2185
2186         r = bio_detain(cache, block, bio, cell,
2187                        (cell_free_fn) free_prison_cell,
2188                        cache, &cell);
2189         if (r) {
2190                 if (r < 0)
2191                         defer_bio(cache, bio);
2192
2193                 return DM_MAPIO_SUBMITTED;
2194         }
2195
2196         discarded_block = is_discarded_oblock(cache, block);
2197
2198         r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2199                        bio, &lookup_result);
2200         if (r == -EWOULDBLOCK) {
2201                 cell_defer(cache, cell, true);
2202                 return DM_MAPIO_SUBMITTED;
2203
2204         } else if (r) {
2205                 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2206                 bio_io_error(bio);
2207                 return DM_MAPIO_SUBMITTED;
2208         }
2209
2210         switch (lookup_result.op) {
2211         case POLICY_HIT:
2212                 inc_hit_counter(cache, bio);
2213                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2214
2215                 if (is_writethrough_io(cache, bio, lookup_result.cblock))
2216                         remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2217                 else
2218                         remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2219
2220                 cell_defer(cache, cell, false);
2221                 break;
2222
2223         case POLICY_MISS:
2224                 inc_miss_counter(cache, bio);
2225                 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2226
2227                 if (pb->req_nr != 0) {
2228                         /*
2229                          * This is a duplicate writethrough io that is no
2230                          * longer needed because the block has been demoted.
2231                          */
2232                         bio_endio(bio, 0);
2233                         cell_defer(cache, cell, false);
2234                         return DM_MAPIO_SUBMITTED;
2235                 } else {
2236                         remap_to_origin_clear_discard(cache, bio, block);
2237                         cell_defer(cache, cell, false);
2238                 }
2239                 break;
2240
2241         default:
2242                 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2243                             (unsigned) lookup_result.op);
2244                 bio_io_error(bio);
2245                 return DM_MAPIO_SUBMITTED;
2246         }
2247
2248         return DM_MAPIO_REMAPPED;
2249 }
2250
2251 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2252 {
2253         struct cache *cache = ti->private;
2254         unsigned long flags;
2255         size_t pb_data_size = get_per_bio_data_size(cache);
2256         struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2257
2258         if (pb->tick) {
2259                 policy_tick(cache->policy);
2260
2261                 spin_lock_irqsave(&cache->lock, flags);
2262                 cache->need_tick_bio = true;
2263                 spin_unlock_irqrestore(&cache->lock, flags);
2264         }
2265
2266         check_for_quiesced_migrations(cache, pb);
2267
2268         return 0;
2269 }
2270
2271 static int write_dirty_bitset(struct cache *cache)
2272 {
2273         unsigned i, r;
2274
2275         for (i = 0; i < from_cblock(cache->cache_size); i++) {
2276                 r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2277                                        is_dirty(cache, to_cblock(i)));
2278                 if (r)
2279                         return r;
2280         }
2281
2282         return 0;
2283 }
2284
2285 static int write_discard_bitset(struct cache *cache)
2286 {
2287         unsigned i, r;
2288
2289         r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2290                                            cache->discard_nr_blocks);
2291         if (r) {
2292                 DMERR("could not resize on-disk discard bitset");
2293                 return r;
2294         }
2295
2296         for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2297                 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2298                                          is_discarded(cache, to_dblock(i)));
2299                 if (r)
2300                         return r;
2301         }
2302
2303         return 0;
2304 }
2305
2306 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2307                      uint32_t hint)
2308 {
2309         struct cache *cache = context;
2310         return dm_cache_save_hint(cache->cmd, cblock, hint);
2311 }
2312
2313 static int write_hints(struct cache *cache)
2314 {
2315         int r;
2316
2317         r = dm_cache_begin_hints(cache->cmd, cache->policy);
2318         if (r) {
2319                 DMERR("dm_cache_begin_hints failed");
2320                 return r;
2321         }
2322
2323         r = policy_walk_mappings(cache->policy, save_hint, cache);
2324         if (r)
2325                 DMERR("policy_walk_mappings failed");
2326
2327         return r;
2328 }
2329
2330 /*
2331  * returns true on success
2332  */
2333 static bool sync_metadata(struct cache *cache)
2334 {
2335         int r1, r2, r3, r4;
2336
2337         r1 = write_dirty_bitset(cache);
2338         if (r1)
2339                 DMERR("could not write dirty bitset");
2340
2341         r2 = write_discard_bitset(cache);
2342         if (r2)
2343                 DMERR("could not write discard bitset");
2344
2345         save_stats(cache);
2346
2347         r3 = write_hints(cache);
2348         if (r3)
2349                 DMERR("could not write hints");
2350
2351         /*
2352          * If writing the above metadata failed, we still commit, but don't
2353          * set the clean shutdown flag.  This will effectively force every
2354          * dirty bit to be set on reload.
2355          */
2356         r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2357         if (r4)
2358                 DMERR("could not write cache metadata.  Data loss may occur.");
2359
2360         return !r1 && !r2 && !r3 && !r4;
2361 }
2362
2363 static void cache_postsuspend(struct dm_target *ti)
2364 {
2365         struct cache *cache = ti->private;
2366
2367         start_quiescing(cache);
2368         wait_for_migrations(cache);
2369         stop_worker(cache);
2370         requeue_deferred_io(cache);
2371         stop_quiescing(cache);
2372
2373         (void) sync_metadata(cache);
2374 }
2375
2376 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2377                         bool dirty, uint32_t hint, bool hint_valid)
2378 {
2379         int r;
2380         struct cache *cache = context;
2381
2382         r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2383         if (r)
2384                 return r;
2385
2386         if (dirty)
2387                 set_dirty(cache, oblock, cblock);
2388         else
2389                 clear_dirty(cache, oblock, cblock);
2390
2391         return 0;
2392 }
2393
2394 static int load_discard(void *context, sector_t discard_block_size,
2395                         dm_dblock_t dblock, bool discard)
2396 {
2397         struct cache *cache = context;
2398
2399         /* FIXME: handle mis-matched block size */
2400
2401         if (discard)
2402                 set_discard(cache, dblock);
2403         else
2404                 clear_discard(cache, dblock);
2405
2406         return 0;
2407 }
2408
2409 static int cache_preresume(struct dm_target *ti)
2410 {
2411         int r = 0;
2412         struct cache *cache = ti->private;
2413         sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2414         (void) sector_div(actual_cache_size, cache->sectors_per_block);
2415
2416         /*
2417          * Check to see if the cache has resized.
2418          */
2419         if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2420                 cache->cache_size = to_cblock(actual_cache_size);
2421
2422                 r = dm_cache_resize(cache->cmd, cache->cache_size);
2423                 if (r) {
2424                         DMERR("could not resize cache metadata");
2425                         return r;
2426                 }
2427
2428                 cache->sized = true;
2429         }
2430
2431         if (!cache->loaded_mappings) {
2432                 r = dm_cache_load_mappings(cache->cmd, cache->policy,
2433                                            load_mapping, cache);
2434                 if (r) {
2435                         DMERR("could not load cache mappings");
2436                         return r;
2437                 }
2438
2439                 cache->loaded_mappings = true;
2440         }
2441
2442         if (!cache->loaded_discards) {
2443                 r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2444                 if (r) {
2445                         DMERR("could not load origin discards");
2446                         return r;
2447                 }
2448
2449                 cache->loaded_discards = true;
2450         }
2451
2452         return r;
2453 }
2454
2455 static void cache_resume(struct dm_target *ti)
2456 {
2457         struct cache *cache = ti->private;
2458
2459         cache->need_tick_bio = true;
2460         do_waker(&cache->waker.work);
2461 }
2462
2463 /*
2464  * Status format:
2465  *
2466  * <#used metadata blocks>/<#total metadata blocks>
2467  * <#read hits> <#read misses> <#write hits> <#write misses>
2468  * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2469  * <#features> <features>*
2470  * <#core args> <core args>*
2471  * <#policy args> <policy args>*
2472  */
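/*
 * A hypothetical INFO line (field values invented purely to show the
 * layout) might look like:
 *
 *   23/4096 1769 4333 2871 46 0 7 1580 12 1 writethrough 2 migration_threshold 2048 <policy args>
 *
 * i.e. 23 of 4096 metadata blocks used, then read hits, read misses,
 * write hits, write misses, demotions, promotions, blocks in cache,
 * dirty blocks, the feature args, the core args and the policy args.
 */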
2473 static void cache_status(struct dm_target *ti, status_type_t type,
2474                          unsigned status_flags, char *result, unsigned maxlen)
2475 {
2476         int r = 0;
2477         unsigned i;
2478         ssize_t sz = 0;
2479         dm_block_t nr_free_blocks_metadata = 0;
2480         dm_block_t nr_blocks_metadata = 0;
2481         char buf[BDEVNAME_SIZE];
2482         struct cache *cache = ti->private;
2483         dm_cblock_t residency;
2484
2485         switch (type) {
2486         case STATUSTYPE_INFO:
2487                 /* Commit to ensure statistics aren't out-of-date */
2488                 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2489                         r = dm_cache_commit(cache->cmd, false);
2490                         if (r)
2491                                 DMERR("could not commit metadata for accurate status");
2492                 }
2493
2494                 r = dm_cache_get_free_metadata_block_count(cache->cmd,
2495                                                            &nr_free_blocks_metadata);
2496                 if (r) {
2497                         DMERR("could not get metadata free block count");
2498                         goto err;
2499                 }
2500
2501                 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2502                 if (r) {
2503                         DMERR("could not get metadata device size");
2504                         goto err;
2505                 }
2506
2507                 residency = policy_residency(cache->policy);
2508
2509                 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2510                        (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2511                        (unsigned long long)nr_blocks_metadata,
2512                        (unsigned) atomic_read(&cache->stats.read_hit),
2513                        (unsigned) atomic_read(&cache->stats.read_miss),
2514                        (unsigned) atomic_read(&cache->stats.write_hit),
2515                        (unsigned) atomic_read(&cache->stats.write_miss),
2516                        (unsigned) atomic_read(&cache->stats.demotion),
2517                        (unsigned) atomic_read(&cache->stats.promotion),
2518                        (unsigned long long) from_cblock(residency),
2519                        cache->nr_dirty);
2520
2521                 if (cache->features.write_through)
2522                         DMEMIT("1 writethrough ");
2523                 else
2524                         DMEMIT("0 ");
2525
2526                 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2527                 if (sz < maxlen) {
2528                         r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2529                         if (r)
2530                                 DMERR("policy_emit_config_values returned %d", r);
2531                 }
2532
2533                 break;
2534
2535         case STATUSTYPE_TABLE:
2536                 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2537                 DMEMIT("%s ", buf);
2538                 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2539                 DMEMIT("%s ", buf);
2540                 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2541                 DMEMIT("%s", buf);
2542
2543                 for (i = 0; i < cache->nr_ctr_args - 1; i++)
2544                         DMEMIT(" %s", cache->ctr_args[i]);
2545                 if (cache->nr_ctr_args)
2546                         DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2547         }
2548
2549         return;
2550
2551 err:
2552         DMEMIT("Error");
2553 }
2554
2555 /*
2556  * Supports <key> <value>.
2557  *
2558  * The key migration_threshold is supported by the cache target core.
2559  */
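/*
 * For example, "dmsetup message <mapped device> 0 migration_threshold 4096"
 * arrives here as argv = { "migration_threshold", "4096" }; core keys are
 * handled by process_config_option(), anything else is passed to the policy.
 */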
2560 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2561 {
2562         struct cache *cache = ti->private;
2563
2564         if (argc != 2)
2565                 return -EINVAL;
2566
2567         return set_config_value(cache, argv[0], argv[1]);
2568 }
2569
2570 static int cache_iterate_devices(struct dm_target *ti,
2571                                  iterate_devices_callout_fn fn, void *data)
2572 {
2573         int r = 0;
2574         struct cache *cache = ti->private;
2575
2576         r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2577         if (!r)
2578                 r = fn(ti, cache->origin_dev, 0, ti->len, data);
2579
2580         return r;
2581 }
2582
2583 /*
2584  * We assume I/O is going to the origin (which is the volume
2585  * more likely to have restrictions e.g. by being striped).
2586  * (Looking up the exact location of the data would be expensive
2587  * and could always be out of date by the time the bio is submitted.)
2588  */
2589 static int cache_bvec_merge(struct dm_target *ti,
2590                             struct bvec_merge_data *bvm,
2591                             struct bio_vec *biovec, int max_size)
2592 {
2593         struct cache *cache = ti->private;
2594         struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2595
2596         if (!q->merge_bvec_fn)
2597                 return max_size;
2598
2599         bvm->bi_bdev = cache->origin_dev->bdev;
2600         return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2601 }
2602
2603 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2604 {
2605         /*
2606          * FIXME: these limits may be incompatible with the cache device
2607          */
2608         limits->max_discard_sectors = cache->discard_block_size * 1024;
2609         limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2610 }
2611
2612 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2613 {
2614         struct cache *cache = ti->private;
2615         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2616
2617         /*
2618          * If the system-determined stacked limits are compatible with the
2619          * cache's blocksize (io_opt is a factor) do not override them.
2620          */
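        /*
         * E.g. with 128 sector (64KB) cache blocks, a stacked io_opt of
         * 512 sectors (256KB) is left alone, whereas 448 sectors (224KB)
         * is not a multiple of the block size and gets overridden.
         */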
2621         if (io_opt_sectors < cache->sectors_per_block ||
2622             do_div(io_opt_sectors, cache->sectors_per_block)) {
2623                 blk_limits_io_min(limits, 0);
2624                 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2625         }
2626         set_discard_limits(cache, limits);
2627 }
2628
2629 /*----------------------------------------------------------------*/
2630
2631 static struct target_type cache_target = {
2632         .name = "cache",
2633         .version = {1, 1, 1},
2634         .module = THIS_MODULE,
2635         .ctr = cache_ctr,
2636         .dtr = cache_dtr,
2637         .map = cache_map,
2638         .end_io = cache_end_io,
2639         .postsuspend = cache_postsuspend,
2640         .preresume = cache_preresume,
2641         .resume = cache_resume,
2642         .status = cache_status,
2643         .message = cache_message,
2644         .iterate_devices = cache_iterate_devices,
2645         .merge = cache_bvec_merge,
2646         .io_hints = cache_io_hints,
2647 };
2648
2649 static int __init dm_cache_init(void)
2650 {
2651         int r;
2652
2653         r = dm_register_target(&cache_target);
2654         if (r) {
2655                 DMERR("cache target registration failed: %d", r);
2656                 return r;
2657         }
2658
2659         migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2660         if (!migration_cache) {
2661                 dm_unregister_target(&cache_target);
2662                 return -ENOMEM;
2663         }
2664
2665         return 0;
2666 }
2667
2668 static void __exit dm_cache_exit(void)
2669 {
2670         dm_unregister_target(&cache_target);
2671         kmem_cache_destroy(migration_cache);
2672 }
2673
2674 module_init(dm_cache_init);
2675 module_exit(dm_cache_exit);
2676
2677 MODULE_DESCRIPTION(DM_NAME " cache target");
2678 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2679 MODULE_LICENSE("GPL");