/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"
/*
 * metadata/data are stored on disk in 4k units (blocks), regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)
/*
 * reclaim runs every 1/4 of the disk size or every 10G of reclaimable space,
 * whichever is smaller. This prevents recovery from having to scan a very
 * long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
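/*
 * As a worked example (based on the clamp in r5l_load_log() below): a 16G
 * journal device gives 16G >> 2 == 4G, which is below the 10G cap, so
 * reclaim kicks in once about 4G of log space is reclaimable.
 */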
/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available so we don't run too tight.
 */
#define R5L_POOL_SIZE	4
/*
 * r5c journal modes of the array: write-back or write-through.
 * write-through mode behaves identically to the existing log-only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};
/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed to the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */
	sector_t device_size;		/* log device size, rounded down to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim runs if free space reaches
					 * this size */
	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */
	sector_t next_checkpoint;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet
					 * written to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. If it's 0, reclaim space
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e.,
					 * reclaim doesn't wait for a specific
					 * io_unit to switch to
					 * IO_UNIT_STRIPE_END state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	enum r5c_journal_mode r5c_journal_mode;
};
/*
 * an IO range starts from a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. The io_unit is written to the log disk with a normal write; since we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no need to write the io_unit with FLUSH/FUA.
 */
	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */
/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio has started writing to the log;
				 * not accepting new bios */
	IO_UNIT_IO_END = 2,	/* io_unit bio has finished writing to the log */
	IO_UNIT_STRIPE_END = 3,	/* stripe data has finished writing to raid */
};
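/*
 * An io_unit only ever moves forward through these states;
 * __r5l_set_io_unit_state() below warns if a transition would go backwards.
 */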
bool r5c_is_writeback(struct r5l_log *log)
{
	return (log != NULL &&
		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
}
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}
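/*
 * An illustrative example: with device_size == 1024 sectors,
 * r5l_ring_add(log, 1020, 8) yields sector 4 after the wrap.
 */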
static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}
static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);
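	/*
	 * The strict '>' keeps at least one block unused, so a completely
	 * full log is never mistaken for an empty one (log_start can never
	 * catch up with last_checkpoint from behind).
	 */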
	return log->device_size > used_size + size;
}
static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}
static void
r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
			      struct bio_list *return_bi)
{
	struct bio *wbi, *wbi2;

	wbi = dev->written;
	dev->written = NULL;
	while (wbi && wbi->bi_iter.bi_sector <
	       dev->sector + STRIPE_SECTORS) {
		wbi2 = r5_next_bio(wbi, dev->sector);
		if (!raid5_dec_bi_active_stripes(wbi)) {
			md_write_end(conf->mddev);
			bio_list_add(return_bi, wbi);
		}
		wbi = wbi2;
	}
}
void r5c_handle_cached_data_endio(struct r5conf *conf,
	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
	int i;

	for (i = sh->disks; i--; ) {
		if (sh->dev[i].written) {
			set_bit(R5_UPTODATE, &sh->dev[i].flags);
			r5c_return_dev_pending_writes(conf, &sh->dev[i],
						      return_bi);
			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
					STRIPE_SECTORS,
					!test_bit(STRIPE_DEGRADED, &sh->state),
					0);
		}
	}
}
/*
 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 * This function should only be called in write-back mode.
 */
static void r5c_make_stripe_write_out(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5l_log *log = conf->log;

	BUG_ON(!r5c_is_writeback(log));

	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(STRIPE_R5C_CACHING, &sh->state);

	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		atomic_inc(&conf->preread_active_stripes);

	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
		atomic_dec(&conf->r5c_cached_partial_stripes);
	}

	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
		atomic_dec(&conf->r5c_cached_full_stripes);
	}
}
static void r5c_handle_data_cached(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			set_bit(R5_InJournal, &sh->dev[i].flags);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
		}
	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
}
/*
 * this journal write must contain full parity,
 * it may also contain some data pages
 */
static void r5c_handle_parity_cached(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (test_bit(R5_InJournal, &sh->dev[i].flags))
			set_bit(R5_Wantwrite, &sh->dev[i].flags);
}
/*
 * Set proper flags after writing (or flushing) data and/or parity to the
 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 */
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
	struct r5l_log *log = sh->raid_conf->log;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
		/*
		 * Set R5_InJournal for parity dev[pd_idx]. This means
		 * all data AND parity are in the journal. For RAID 6, it is
		 * NOT necessary to set the flag for dev[qd_idx], as the
		 * two parities are written out together.
		 */
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		r5c_handle_data_cached(sh);
	} else {
		r5c_handle_parity_cached(sh);
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	}
}
static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		r5c_finish_cache_stripe(sh);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}
static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&io->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(io);
	}
}
static void r5l_move_to_end_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;
		list_move_tail(&io->log_sibling, &log->io_end_ios);
	}
}
static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;
	unsigned long flags;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	bio_put(bio);
	mempool_free(io->meta_page, log->meta_pool);

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	if (log->need_cache_flush)
		r5l_move_to_end_ios(log);
	else
		r5l_log_run_stripes(log);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	if (log->need_cache_flush)
		md_wakeup_thread(log->rdev->mddev->thread);
}
static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	unsigned long flags;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	submit_bio(io->current_bio);
}
static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);

	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return bio;
}
static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

	/*
	 * If we filled up the log device, start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a multiple
	 * of BLOCK_SECTORS.
	 */
	if (log->log_start == 0)
		io->need_split_bio = true;

	io->log_end = log->log_start;
}
static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;

	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;
	memset(io, 0, sizeof(*io));

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;

	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
	block = page_address(io->meta_page);
	clear_page(block);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq++;

	io->current_bio = r5l_bio_alloc(log);
	io->current_bio->bi_end_io = r5l_log_endio;
	io->current_bio->bi_private = io;
	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);

	r5_reserve_log_entry(log, io);

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	if (log->current_io &&
	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);

	if (!log->current_io) {
		log->current_io = r5l_new_meta(log);
		if (!log->current_io)
			return -ENOMEM;
	}

	return 0;
}
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}
static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

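	/*
	 * When the previous entry reached the end of the log ring, a new bio
	 * is needed for the wrapped portion. bio_chain() below defers the
	 * previous bio's completion to the new one, so r5l_log_endio() still
	 * fires only once for the whole io_unit.
	 */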
	if (io->need_split_bio) {
		struct bio *prev = io->current_bio;

		io->current_bio = r5l_bio_alloc(log);
		bio_chain(io->current_bio, prev);

		submit_bio(prev);
	}

	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
		BUG();

	r5_reserve_log_entry(log, io);
}
static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			  int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	int ret;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	ret = r5l_get_meta(log, meta_size);
	if (ret)
		return ret;

	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
		    test_bit(R5_InJournal, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}
	if (parity_pages == 2) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else if (parity_pages == 1) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	} else /* Just writing data, not parity, in caching phase */
		BUG_ON(parity_pages != 0);

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;

	return 0;
}
static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);

/*
 * This runs in raid5d, and reclaim could in turn wait for raid5d (when it
 * flushes data from the log to the raid disks), so we must not wait for
 * reclaim here.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int reserve, i, ret = 0;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe has been written to the log; start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
		    test_bit(R5_InJournal, &sh->dev[i].flags))
			continue;

		write_disks++;
		/* checksum is already calculated in last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	/*
	 * The stripe must enter state machine again to finish the write, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);
	mutex_lock(&log->io_mutex);
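	/*
	 * Reserve one page for the meta block plus one page per written disk,
	 * converted to 512-byte sectors (PAGE_SHIFT - 9).
	 */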
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	if (!r5l_has_free_space(log, reserve)) {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	} else {
		ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}
void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}
int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (!log)
		return -ENODEV;
	/*
	 * we flush the log disk cache first, then write stripe data to the
	 * raid disks. So if this bio is finished, the log disk cache is
	 * already flushed. Recovery guarantees we can recover the bio from
	 * the log disk, so we don't need to flush again.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bio_endio(bio);
		return 0;
	}
	bio->bi_opf &= ~REQ_PREFLUSH;
	return -EAGAIN;
}
/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}
static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
	return r5l_ring_distance(log, log->last_checkpoint,
				 log->next_checkpoint);
}
static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
	struct stripe_head *sh;

	assert_spin_locked(&log->io_list_lock);

	if (!list_empty(&log->no_mem_stripes)) {
		sh = list_first_entry(&log->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}
static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;
	bool found = false;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_STRIPE_END)
			break;

		log->next_checkpoint = io->log_start;
		log->next_cp_seq = io->seq;

		list_del(&io->log_sibling);
		mempool_free(io, log->io_pool);
		r5l_run_no_mem_stripe(log);
		found = true;
	}

	return found;
}
static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space)
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}
void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}
static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
					   flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}
/*
 * Start dispatching IO to the raid disks.
 * A log consists of io_units (each led by its meta block). There is one
 * situation we want to avoid: a broken meta in the middle of the log would
 * prevent recovery from finding the meta at the head of the log. So if an
 * operation requires the meta at the head to be persistent in the log, we
 * must make sure the meta before it is persistent in the log too. A case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe's data/parity must be persistent in the log before
 * we do the write to the raid disks.
 *
 * The solution is to strictly maintain io_unit list order. We only write
 * the stripes of an io_unit to the raid disks once it and every io_unit
 * before it have their data/parity in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
	submit_bio(&log->flush_bio);
}
static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
					      sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discard we must make sure
	 * the superblock is updated to the new log tail. Updating the
	 * superblock (either by calling md_update_sb() directly or depending
	 * on the md thread) must hold the reconfig mutex. On the other hand,
	 * raid5_quiesce is called with the reconfig_mutex held. The first
	 * step of raid5_quiesce() is waiting for all IO to finish, hence
	 * waiting for the reclaim thread, while the reclaim thread is calling
	 * this function and waiting for the reconfig mutex. So there is a
	 * deadlock. We work around this issue with a trylock.
	 * FIXME: we could miss a discard if we can't take the reconfig mutex.
	 */
	set_mask_bits(&mddev->flags, 0,
		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);
	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}
static void r5l_do_reclaim(struct r5l_log *log)
{
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
	sector_t reclaimable;
	sector_t next_checkpoint;
	u64 next_cp_seq;

	spin_lock_irq(&log->io_list_lock);
	/*
	 * Move the proper io_units to the reclaim list. We must not change
	 * the order: reclaimable and unreclaimable io_units can be mixed in
	 * the list, and we must not reuse the space of an unreclaimable
	 * io_unit.
	 */
	while (1) {
		reclaimable = r5l_reclaimable_space(log);
		if (reclaimable >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->flushing_ios) &&
		     list_empty(&log->finished_ios)))
			break;
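		/*
		 * Kick raid5d to flush more stripes to the raid disks, then
		 * wait for further io_units to reach IO_UNIT_STRIPE_END.
		 */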
		md_wakeup_thread(log->rdev->mddev->thread);
		wait_event_lock_irq(log->iounit_wait,
				    r5l_reclaimable_space(log) > reclaimable,
				    log->io_list_lock);
	}

	next_checkpoint = log->next_checkpoint;
	next_cp_seq = log->next_cp_seq;
	spin_unlock_irq(&log->io_list_lock);
	BUG_ON(reclaimable < 0);
	if (reclaimable == 0)
		return;

	/*
	 * write_super will flush the cache of each raid disk. We must write
	 * super here, because the log area might be reused soon and we don't
	 * want to confuse recovery.
	 */
	r5l_write_super_and_discard_space(log, next_checkpoint);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = next_checkpoint;
	log->last_cp_seq = next_cp_seq;
	mutex_unlock(&log->io_mutex);

	r5l_run_no_space_stripes(log);
}
static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;
	r5l_do_reclaim(log);
}
static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long target;
	unsigned long new = (unsigned long)space; /* overflow in theory */

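	/*
	 * Lock-free, monotonic update: only ever raise reclaim_target, and
	 * retry the cmpxchg() if another caller changed it underneath us.
	 */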
	do {
		target = log->reclaim_target;
		if (new < target)
			return;
	} while (cmpxchg(&log->reclaim_target, target, new) != target);
	md_wakeup_thread(log->reclaim_thread);
}
void r5l_quiesce(struct r5l_log *log, int state)
{
	struct mddev *mddev;

	if (!log || state == 2)
		return;
	if (state == 0) {
		/*
		 * This is a special case for hotadd. In suspend, the array
		 * has no journal. In resume, the journal is initialized as
		 * well as the reclaim thread.
		 */
		if (log->reclaim_thread)
			return;
		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
					log->rdev->mddev, "reclaim");
	} else if (state == 1) {
		/* make sure r5l_write_super_and_discard_space exits */
		mddev = log->rdev->mddev;
		wake_up(&mddev->sb_wait);
		r5l_wake_reclaim(log, -1L);
		md_unregister_thread(&log->reclaim_thread);
		r5l_do_reclaim(log);
	}
}
bool r5l_log_disk_error(struct r5conf *conf)
{
	struct r5l_log *log;
	bool ret;

	/* don't allow write if journal disk is missing */
	rcu_read_lock();
	log = rcu_dereference(conf->log);
	if (!log)
		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	else
		ret = test_bit(Faulty, &log->rdev->flags);
	rcu_read_unlock();
	return ret;
}
struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
};
static int r5l_read_meta_block(struct r5l_log *log,
			       struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
			  false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}
static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx,
					 sector_t stripe_sect,
					 int *offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *sh;
	struct r5l_payload_data_parity *payload;
	int disk_index;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
	while (1) {
		sector_t log_offset = r5l_ring_add(log, ctx->pos,
						   ctx->meta_total_blocks);

		payload = page_address(ctx->meta_page) + *offset;
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			raid5_compute_sector(conf,
					     le64_to_cpu(payload->location), 0,
					     &disk_index, sh);

			sync_page_io(log->rdev, log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
		} else {
			disk_index = sh->pd_idx;
			sync_page_io(log->rdev, log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

			if (sh->qd_idx >= 0) {
				disk_index = sh->qd_idx;
				sync_page_io(log->rdev,
					     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
					     PAGE_SIZE, sh->dev[disk_index].page,
					     REQ_OP_READ, 0, false);
				sh->dev[disk_index].log_checksum =
					le32_to_cpu(payload->checksum[1]);
				set_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags);
			}
		}

		ctx->meta_total_blocks += le32_to_cpu(payload->size);
		*offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			break;
	}
	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		void *addr;
		u32 checksum;

		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		addr = kmap_atomic(sh->dev[disk_index].page);
		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
		kunmap_atomic(addr);
		if (checksum != sh->dev[disk_index].log_checksum)
			goto error;
	}
	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		struct md_rdev *rdev, *rrdev;

		if (!test_and_clear_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev) {
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rdev, rdev->mddev);
			rcu_read_lock();
		}
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev) {
			atomic_inc(&rrdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rrdev, rrdev->mddev);
			rcu_read_lock();
		}
		rcu_read_unlock();
	}
	raid5_release_stripe(sh);
	return 0;
error:
	for (disk_index = 0; disk_index < sh->disks; disk_index++)
		sh->dev[disk_index].flags = 0;
	raid5_release_stripe(sh);
	return -EINVAL;
}
static int r5l_recovery_flush_one_meta(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_data_parity *payload;
	struct r5l_meta_block *mb;
	int offset;
	sector_t stripe_sector;

	mb = page_address(ctx->meta_page);
	offset = sizeof(struct r5l_meta_block);

	while (offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + offset;
		stripe_sector = raid5_compute_sector(conf,
				le64_to_cpu(payload->location), 0, &dd, NULL);
		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
						  &offset))
			return -EINVAL;
	}

	return 0;
}
/* copy data/parity from log to raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
				   struct r5l_recovery_ctx *ctx)
{
	while (1) {
		if (r5l_read_meta_block(log, ctx))
			return;
		if (r5l_recovery_flush_one_meta(log, ctx))
			return;
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}
}
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;
	u32 crc;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;
	mb = page_address(page);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
			  WRITE_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}
static int r5l_recovery_log(struct r5l_log *log)
{
	struct r5l_recovery_ctx ctx;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	if (!ctx.meta_page)
		return -ENOMEM;

	r5l_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);
	/*
	 * we did a recovery. Now ctx.pos points to an invalid meta block. New
	 * log will start here. But we can't let the superblock point to the
	 * last valid meta block. The log might look like:
	 * | meta 1| meta 2| meta 3|
	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
	 * superblock points to meta 1 and we write a new valid meta 2n, and
	 * a crash happens again, the new recovery will start from meta 1.
	 * Since meta 2n is valid now, recovery will think meta 3 is valid,
	 * which is wrong.
	 * The solution is to create a new meta in meta2 with its seq == meta
	 * 1's seq + 10 and let the superblock point to meta2. That way
	 * recovery will not treat meta 3 as a valid meta, because its seq
	 * doesn't match.
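	 *
	 * A concrete example with illustrative numbers: metas 1..3 carried
	 * seqs 10, 11, 12. Recovery stops at the broken meta 2, so its slot
	 * is rewritten with seq 20 (10 + 10) and the superblock now points
	 * there. The next recovery run starts at seq 20, expects seq 21
	 * next, and rejects the stale meta 3 (seq 12).
	 */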
	if (ctx.seq > log->last_cp_seq) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
		log->last_checkpoint = ctx.pos;
		log->next_checkpoint = ctx.pos;
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}
	return 0;
}
static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
/*
 * Try to handle the write operation in caching phase. This function should
 * only be called in write-back mode.
 *
 * If all outstanding writes can be handled in caching phase, returns 0
 * If the write requires the write-out phase, calls
 * r5c_make_stripe_write_out() and returns -EAGAIN
 */
int r5c_try_caching_write(struct r5conf *conf,
			  struct stripe_head *sh,
			  struct stripe_head_state *s,
			  int disks)
{
	struct r5l_log *log = conf->log;
	int i;
	struct r5dev *dev;
	int to_cache = 0;

	BUG_ON(!r5c_is_writeback(log));

	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/*
		 * There are two different scenarios here:
		 *  1. The stripe has some data cached, and it is sent to
		 *     the write-out phase for reclaim
		 *  2. The stripe is clean, and this is the first write
		 *
		 * For 1, return -EAGAIN, so we continue with
		 * handle_stripe_dirtying().
		 *
		 * For 2, set STRIPE_R5C_CACHING and continue with caching
		 * write.
		 */

		/* case 1: anything in journal or anything written */
		if (s->injournal > 0 || s->written > 0)
			return -EAGAIN;
		/* case 2 */
		set_bit(STRIPE_R5C_CACHING, &sh->state);
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		/* if non-overwrite, use writing-out phase */
		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
		    !test_bit(R5_InJournal, &dev->flags)) {
			r5c_make_stripe_write_out(sh);
			return -EAGAIN;
		}
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->towrite) {
			set_bit(R5_Wantwrite, &dev->flags);
			set_bit(R5_Wantdrain, &dev->flags);
			set_bit(R5_LOCKED, &dev->flags);
			to_cache++;
		}
	}

	if (to_cache) {
		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		/*
		 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
		 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
		 * r5c_handle_data_cached()
		 */
		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	}

	return 0;
}
/*
 * free the extra pages (orig_page) we allocated for prexor
 */
void r5c_release_extra_page(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (sh->dev[i].page != sh->dev[i].orig_page) {
			struct page *p = sh->dev[i].orig_page;

			sh->dev[i].orig_page = sh->dev[i].page;
			put_page(p);
		}
}
/*
 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
 * stripe is committed to RAID disks.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf,
				 struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	int i;
	int do_wakeup = 0;

	if (!conf->log ||
	    !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
		return;

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	for (i = sh->disks; i--; ) {
		clear_bit(R5_InJournal, &sh->dev[i].flags);
		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
			do_wakeup = 1;
	}

	/*
	 * analyse_stripe() runs before r5c_finish_stripe_write_out().
	 * We updated R5_InJournal, so we also update s->injournal.
	 */
	s->injournal = 0;

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);

	if (do_wakeup)
		wake_up(&conf->wait_for_overlap);
}
static int
r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
	       struct stripe_head_state *s)
{
	int pages = 0, reserve, i, ret = 0;

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
		pages++;
	}
	WARN_ON(pages == 0);
	/*
	 * The stripe must enter state machine again to call endio, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);
	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + pages) << (PAGE_SHIFT - 9);
	if (!r5l_has_free_space(log, reserve)) {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	} else {
		ret = r5l_log_stripe(log, sh, pages, 0);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
		/*
		 * Make sure the super points to the correct address. The log
		 * might get data very soon. If the super doesn't have the
		 * correct log tail address, recovery can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;
	log->next_checkpoint = cp;

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;
	/*
	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
	 * raid_disks r5l_payload_data_parity entries.
	 *
	 * The write journal and cache do not work for very big arrays
	 * (raid_disks > 203).
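	 *
	 * (A rough sanity check of that bound, assuming the md_p.h layouts:
	 * each r5l_payload_data_parity is 16 bytes plus a 4-byte checksum
	 * per disk, so 203 disks need about 32 + 203 * 20 = 4092 bytes,
	 * which just fits in one 4K page, while 204 disks would not.)
	 */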
	if (sizeof(struct r5l_meta_block) +
	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
	     conf->raid_disks) > PAGE_SIZE) {
		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
		       mdname(conf->mddev), conf->raid_disks);
		return -EINVAL;
	}
	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);
	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
	if (!log->io_pool)
		goto io_pool;

	log->bs = bioset_create(R5L_POOL_SIZE, 0);
	if (!log->bs)
		goto io_bs;

	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
	if (!log->meta_pool)
		goto out_mempool;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_mem_stripes);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;

	if (r5l_load_log(log))
		goto error;

	rcu_assign_pointer(conf->log, log);
	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return 0;

error:
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	mempool_destroy(log->meta_pool);
out_mempool:
	bioset_free(log->bs);
io_bs:
	mempool_destroy(log->io_pool);
io_pool:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}
void r5l_exit_log(struct r5l_log *log)
{
	md_unregister_thread(&log->reclaim_thread);
	mempool_destroy(log->meta_pool);
	bioset_free(log->bs);
	mempool_destroy(log->io_pool);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}