md/r5cache: caching phase of r5cache
drivers/md/raid5-cache.c
1 /*
2  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  */
14 #include <linux/kernel.h>
15 #include <linux/wait.h>
16 #include <linux/blkdev.h>
17 #include <linux/slab.h>
18 #include <linux/raid/md_p.h>
19 #include <linux/crc32c.h>
20 #include <linux/random.h>
21 #include "md.h"
22 #include "raid5.h"
23 #include "bitmap.h"
24
25 /*
26  * metadata/data are stored on disk in 4k-sized units (blocks) regardless of
27  * the underlying hardware sector size. This only works with PAGE_SIZE == 4096.
28  */
29 #define BLOCK_SECTORS (8)
30
31 /*
32  * reclaim runs every 1/4 of the disk size or 10G of reclaimable space,
33  * whichever is smaller. This prevents recovery from scanning a very long log.
34  */
35 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
36 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
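/*
 * Worked example (illustrative only): RECLAIM_MAX_FREE_SPACE is in 512-byte
 * sectors, so 10 * 1024 * 1024 * 2 sectors == 10GiB. In r5l_load_log() below
 * the threshold is max_free_space = min(device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT,
 * RECLAIM_MAX_FREE_SPACE): a 16GiB journal gets a 4GiB threshold, while a
 * 1TiB journal is capped at 10GiB.
 */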
37
38 /*
39  * We only need 2 bios per I/O unit to make progress, but ensure we
40  * have a few more available to not get too tight.
41  */
42 #define R5L_POOL_SIZE   4
43
44 /*
45  * r5c journal modes of the array: write-back or write-through.
46  * write-through mode has behavior identical to the existing log-only
47  * implementation.
48  */
49 enum r5c_journal_mode {
50         R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
51         R5C_JOURNAL_MODE_WRITE_BACK = 1,
52 };
53
54 /*
55  * raid5 cache state machine
56  *
57  * With the RAID cache, each stripe works in two phases:
58  *      - caching phase
59  *      - writing-out phase
60  *
61  * These two phases are controlled by bit STRIPE_R5C_CACHING:
62  *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
63  *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
64  *
65  * When there is no journal, or the journal is in write-through mode,
66  * the stripe is always in writing-out phase.
67  *
68  * For write-back journal, the stripe is sent to caching phase on write
69  * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
70  * the write-out phase by clearing STRIPE_R5C_CACHING.
71  *
72  * Stripes in caching phase do not write the raid disks. Instead, all
73  * writes are committed from the log device. Therefore, a stripe in
74  * caching phase handles writes as:
75  *      - write to log device
76  *      - return IO
77  *
78  * Stripes in writing-out phase handle writes as:
79  *      - calculate parity
80  *      - write pending data and parity to journal
81  *      - write data and parity to raid disks
82  *      - return IO for pending writes
83  */
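/*
 * Illustrative sketch (not part of the driver): the current phase of a stripe
 * can be read directly from the STRIPE_R5C_CACHING bit, e.g.
 *
 *	bool caching = test_bit(STRIPE_R5C_CACHING, &sh->state);
 *
 * r5c_try_caching_write() and r5c_make_stripe_write_out() below implement the
 * transitions between the two phases in write-back mode.
 */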
84
85 struct r5l_log {
86         struct md_rdev *rdev;
87
88         u32 uuid_checksum;
89
90         sector_t device_size;           /* log device size, rounded to
91                                          * BLOCK_SECTORS */
92         sector_t max_free_space;        /* reclaim runs if free space reaches
93                                          * this size */
94
95         sector_t last_checkpoint;       /* log tail. where recovery scan
96                                          * starts from */
97         u64 last_cp_seq;                /* log tail sequence */
98
99         sector_t log_start;             /* log head. where new data appends */
100         u64 seq;                        /* log head sequence */
101
102         sector_t next_checkpoint;
103         u64 next_cp_seq;
104
105         struct mutex io_mutex;
106         struct r5l_io_unit *current_io; /* current io_unit accepting new data */
107
108         spinlock_t io_list_lock;
109         struct list_head running_ios;   /* io_units which are still running,
110                                          * and have not yet been completely
111                                          * written to the log */
112         struct list_head io_end_ios;    /* io_units which have been completely
113                                          * written to the log but not yet written
114                                          * to the RAID */
115         struct list_head flushing_ios;  /* io_units which are waiting for log
116                                          * cache flush */
117         struct list_head finished_ios;  /* io_units which settle down in log disk */
118         struct bio flush_bio;
119
120         struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
121
122         struct kmem_cache *io_kc;
123         mempool_t *io_pool;
124         struct bio_set *bs;
125         mempool_t *meta_pool;
126
127         struct md_thread *reclaim_thread;
128         unsigned long reclaim_target;   /* amount of space that needs to be
129                                          * reclaimed.  if it's 0, reclaim spaces
130                                          * used by io_units which are in
131                                          * IO_UNIT_STRIPE_END state (e.g., reclaim
132                                          * doesn't wait for a specific io_unit
133                                          * to switch to IO_UNIT_STRIPE_END
134                                          * state) */
135         wait_queue_head_t iounit_wait;
136
137         struct list_head no_space_stripes; /* pending stripes, log has no space */
138         spinlock_t no_space_stripes_lock;
139
140         bool need_cache_flush;
141
142         /* for r5c_cache */
143         enum r5c_journal_mode r5c_journal_mode;
144 };
145
146 /*
147  * An IO range starts at a meta data block and ends at the next meta data
148  * block. The io_unit's meta data block tracks the data/parity that follows
149  * it. The io_unit is written to the log disk with a normal write; as we
150  * always flush the log disk first and only then start moving data to the
151  * raid disks, there is no requirement to write the io_unit with FLUSH/FUA.
152  */
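/*
 * Rough on-disk layout implied by the description above (each cell is a 4k
 * block; the number of data blocks per io_unit is illustrative only):
 *
 *	+--------+------+------+--------+--------+------+--------+----
 *	| meta 0 | data | data | parity | meta 1 | data | parity | ...
 *	+--------+------+------+--------+--------+------+--------+----
 *	^ io_unit 0 starts here           ^ io_unit 1 starts here
 *
 * Each meta block describes the data/parity blocks that follow it, up to the
 * next meta block.
 */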
153 struct r5l_io_unit {
154         struct r5l_log *log;
155
156         struct page *meta_page; /* store meta block */
157         int meta_offset;        /* current offset in meta_page */
158
159         struct bio *current_bio;/* current_bio accepting new data */
160
161         atomic_t pending_stripe;/* how many stripes not flushed to raid */
162         u64 seq;                /* seq number of the metablock */
163         sector_t log_start;     /* where the io_unit starts */
164         sector_t log_end;       /* where the io_unit ends */
165         struct list_head log_sibling; /* log->running_ios */
166         struct list_head stripe_list; /* stripes added to the io_unit */
167
168         int state;
169         bool need_split_bio;
170 };
171
172 /* r5l_io_unit state */
173 enum r5l_io_unit_state {
174         IO_UNIT_RUNNING = 0,    /* accepting new IO */
175         IO_UNIT_IO_START = 1,   /* io_unit bio started writing to the log,
176                                  * don't accept new bio */
177         IO_UNIT_IO_END = 2,     /* io_unit bio finished writing to the log */
178         IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to the raid */
179 };
180
181 bool r5c_is_writeback(struct r5l_log *log)
182 {
183         return (log != NULL &&
184                 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
185 }
186
187 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
188 {
189         start += inc;
190         if (start >= log->device_size)
191                 start = start - log->device_size;
192         return start;
193 }
194
195 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
196                                   sector_t end)
197 {
198         if (end >= start)
199                 return end - start;
200         else
201                 return end + log->device_size - start;
202 }
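/*
 * Worked example for the ring helpers above (illustrative only): with an
 * assumed log->device_size of 1024 sectors, r5l_ring_add(log, 1016, 16)
 * wraps around to sector 8, and r5l_ring_distance(log, 1016, 8) is 16.
 */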
203
204 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
205 {
206         sector_t used_size;
207
208         used_size = r5l_ring_distance(log, log->last_checkpoint,
209                                         log->log_start);
210
211         return log->device_size > used_size + size;
212 }
213
214 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
215                                     enum r5l_io_unit_state state)
216 {
217         if (WARN_ON(io->state >= state))
218                 return;
219         io->state = state;
220 }
221
222 static void
223 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
224                               struct bio_list *return_bi)
225 {
226         struct bio *wbi, *wbi2;
227
228         wbi = dev->written;
229         dev->written = NULL;
230         while (wbi && wbi->bi_iter.bi_sector <
231                dev->sector + STRIPE_SECTORS) {
232                 wbi2 = r5_next_bio(wbi, dev->sector);
233                 if (!raid5_dec_bi_active_stripes(wbi)) {
234                         md_write_end(conf->mddev);
235                         bio_list_add(return_bi, wbi);
236                 }
237                 wbi = wbi2;
238         }
239 }
240
241 void r5c_handle_cached_data_endio(struct r5conf *conf,
242           struct stripe_head *sh, int disks, struct bio_list *return_bi)
243 {
244         int i;
245
246         for (i = sh->disks; i--; ) {
247                 if (sh->dev[i].written) {
248                         set_bit(R5_UPTODATE, &sh->dev[i].flags);
249                         r5c_return_dev_pending_writes(conf, &sh->dev[i],
250                                                       return_bi);
251                         bitmap_endwrite(conf->mddev->bitmap, sh->sector,
252                                         STRIPE_SECTORS,
253                                         !test_bit(STRIPE_DEGRADED, &sh->state),
254                                         0);
255                 }
256         }
257 }
258
259 /*
260  * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
261  * This function should only be called in write-back mode.
262  */
263 static void r5c_make_stripe_write_out(struct stripe_head *sh)
264 {
265         struct r5conf *conf = sh->raid_conf;
266         struct r5l_log *log = conf->log;
267
268         BUG_ON(!r5c_is_writeback(log));
269
270         WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
271         clear_bit(STRIPE_R5C_CACHING, &sh->state);
272
273         if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
274                 atomic_inc(&conf->preread_active_stripes);
275
276         if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
277                 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
278                 atomic_dec(&conf->r5c_cached_partial_stripes);
279         }
280
281         if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
282                 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
283                 atomic_dec(&conf->r5c_cached_full_stripes);
284         }
285 }
286
287 static void r5c_handle_data_cached(struct stripe_head *sh)
288 {
289         int i;
290
291         for (i = sh->disks; i--; )
292                 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
293                         set_bit(R5_InJournal, &sh->dev[i].flags);
294                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
295                 }
296         clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
297 }
298
299 /*
300  * This journal write must contain the full parity;
301  * it may also contain some data pages.
302  */
303 static void r5c_handle_parity_cached(struct stripe_head *sh)
304 {
305         int i;
306
307         for (i = sh->disks; i--; )
308                 if (test_bit(R5_InJournal, &sh->dev[i].flags))
309                         set_bit(R5_Wantwrite, &sh->dev[i].flags);
310 }
311
312 /*
313  * Setting proper flags after writing (or flushing) data and/or parity to the
314  * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
315  */
316 static void r5c_finish_cache_stripe(struct stripe_head *sh)
317 {
318         struct r5l_log *log = sh->raid_conf->log;
319
320         if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
321                 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
322                 /*
323                  * Set R5_InJournal for parity dev[pd_idx]. This means
324                  * all data AND parity are in the journal. For RAID 6, it is
325                  * NOT necessary to set the flag for dev[qd_idx], as the
326                  * two parities are written out together.
327                  */
328                 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
329         } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
330                 r5c_handle_data_cached(sh);
331         } else {
332                 r5c_handle_parity_cached(sh);
333                 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
334         }
335 }
336
337 static void r5l_io_run_stripes(struct r5l_io_unit *io)
338 {
339         struct stripe_head *sh, *next;
340
341         list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
342                 list_del_init(&sh->log_list);
343
344                 r5c_finish_cache_stripe(sh);
345
346                 set_bit(STRIPE_HANDLE, &sh->state);
347                 raid5_release_stripe(sh);
348         }
349 }
350
351 static void r5l_log_run_stripes(struct r5l_log *log)
352 {
353         struct r5l_io_unit *io, *next;
354
355         assert_spin_locked(&log->io_list_lock);
356
357         list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
358                 /* don't change list order */
359                 if (io->state < IO_UNIT_IO_END)
360                         break;
361
362                 list_move_tail(&io->log_sibling, &log->finished_ios);
363                 r5l_io_run_stripes(io);
364         }
365 }
366
367 static void r5l_move_to_end_ios(struct r5l_log *log)
368 {
369         struct r5l_io_unit *io, *next;
370
371         assert_spin_locked(&log->io_list_lock);
372
373         list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
374                 /* don't change list order */
375                 if (io->state < IO_UNIT_IO_END)
376                         break;
377                 list_move_tail(&io->log_sibling, &log->io_end_ios);
378         }
379 }
380
381 static void r5l_log_endio(struct bio *bio)
382 {
383         struct r5l_io_unit *io = bio->bi_private;
384         struct r5l_log *log = io->log;
385         unsigned long flags;
386
387         if (bio->bi_error)
388                 md_error(log->rdev->mddev, log->rdev);
389
390         bio_put(bio);
391         mempool_free(io->meta_page, log->meta_pool);
392
393         spin_lock_irqsave(&log->io_list_lock, flags);
394         __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
395         if (log->need_cache_flush)
396                 r5l_move_to_end_ios(log);
397         else
398                 r5l_log_run_stripes(log);
399         spin_unlock_irqrestore(&log->io_list_lock, flags);
400
401         if (log->need_cache_flush)
402                 md_wakeup_thread(log->rdev->mddev->thread);
403 }
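/*
 * Note on the two paths above: when the log device needs explicit cache
 * flushes (need_cache_flush), finished io_units are only moved to io_end_ios
 * here, and their stripes are completed later from r5l_log_flush_endio()
 * after r5l_flush_stripe_to_raid() has flushed the log device cache;
 * otherwise the stripes can be run immediately.
 */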
404
405 static void r5l_submit_current_io(struct r5l_log *log)
406 {
407         struct r5l_io_unit *io = log->current_io;
408         struct r5l_meta_block *block;
409         unsigned long flags;
410         u32 crc;
411
412         if (!io)
413                 return;
414
415         block = page_address(io->meta_page);
416         block->meta_size = cpu_to_le32(io->meta_offset);
417         crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
418         block->checksum = cpu_to_le32(crc);
419
420         log->current_io = NULL;
421         spin_lock_irqsave(&log->io_list_lock, flags);
422         __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
423         spin_unlock_irqrestore(&log->io_list_lock, flags);
424
425         submit_bio(io->current_bio);
426 }
427
428 static struct bio *r5l_bio_alloc(struct r5l_log *log)
429 {
430         struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
431
432         bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
433         bio->bi_bdev = log->rdev->bdev;
434         bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
435
436         return bio;
437 }
438
439 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
440 {
441         log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
442
443         /*
444          * If we filled up the log device, start from the beginning again,
445          * which will require a new bio.
446          *
447          * Note: for this to work properly the log size needs to be a multiple
448          * of BLOCK_SECTORS.
449          */
450         if (log->log_start == 0)
451                 io->need_split_bio = true;
452
453         io->log_end = log->log_start;
454 }
455
456 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
457 {
458         struct r5l_io_unit *io;
459         struct r5l_meta_block *block;
460
461         io = mempool_alloc(log->io_pool, GFP_ATOMIC);
462         if (!io)
463                 return NULL;
464         memset(io, 0, sizeof(*io));
465
466         io->log = log;
467         INIT_LIST_HEAD(&io->log_sibling);
468         INIT_LIST_HEAD(&io->stripe_list);
469         io->state = IO_UNIT_RUNNING;
470
471         io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
472         block = page_address(io->meta_page);
473         clear_page(block);
474         block->magic = cpu_to_le32(R5LOG_MAGIC);
475         block->version = R5LOG_VERSION;
476         block->seq = cpu_to_le64(log->seq);
477         block->position = cpu_to_le64(log->log_start);
478
479         io->log_start = log->log_start;
480         io->meta_offset = sizeof(struct r5l_meta_block);
481         io->seq = log->seq++;
482
483         io->current_bio = r5l_bio_alloc(log);
484         io->current_bio->bi_end_io = r5l_log_endio;
485         io->current_bio->bi_private = io;
486         bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
487
488         r5_reserve_log_entry(log, io);
489
490         spin_lock_irq(&log->io_list_lock);
491         list_add_tail(&io->log_sibling, &log->running_ios);
492         spin_unlock_irq(&log->io_list_lock);
493
494         return io;
495 }
496
497 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
498 {
499         if (log->current_io &&
500             log->current_io->meta_offset + payload_size > PAGE_SIZE)
501                 r5l_submit_current_io(log);
502
503         if (!log->current_io) {
504                 log->current_io = r5l_new_meta(log);
505                 if (!log->current_io)
506                         return -ENOMEM;
507         }
508
509         return 0;
510 }
511
512 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
513                                     sector_t location,
514                                     u32 checksum1, u32 checksum2,
515                                     bool checksum2_valid)
516 {
517         struct r5l_io_unit *io = log->current_io;
518         struct r5l_payload_data_parity *payload;
519
520         payload = page_address(io->meta_page) + io->meta_offset;
521         payload->header.type = cpu_to_le16(type);
522         payload->header.flags = cpu_to_le16(0);
523         payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
524                                     (PAGE_SHIFT - 9));
525         payload->location = cpu_to_le64(location);
526         payload->checksum[0] = cpu_to_le32(checksum1);
527         if (checksum2_valid)
528                 payload->checksum[1] = cpu_to_le32(checksum2);
529
530         io->meta_offset += sizeof(struct r5l_payload_data_parity) +
531                 sizeof(__le32) * (1 + !!checksum2_valid);
532 }
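/*
 * Worked example for r5l_append_payload_meta() (assuming PAGE_SHIFT == 12):
 * a data payload records 1 << (12 - 9) == 8 sectors (one 4k page) and
 * advances meta_offset by sizeof(struct r5l_payload_data_parity) +
 * sizeof(__le32); a RAID6 parity payload with checksum2_valid set records
 * 16 sectors and carries two checksums.
 */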
533
534 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
535 {
536         struct r5l_io_unit *io = log->current_io;
537
538         if (io->need_split_bio) {
539                 struct bio *prev = io->current_bio;
540
541                 io->current_bio = r5l_bio_alloc(log);
542                 bio_chain(io->current_bio, prev);
543
544                 submit_bio(prev);
545         }
546
547         if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
548                 BUG();
549
550         r5_reserve_log_entry(log, io);
551 }
552
553 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
554                            int data_pages, int parity_pages)
555 {
556         int i;
557         int meta_size;
558         int ret;
559         struct r5l_io_unit *io;
560
561         meta_size =
562                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
563                  * data_pages) +
564                 sizeof(struct r5l_payload_data_parity) +
565                 sizeof(__le32) * parity_pages;
566
567         ret = r5l_get_meta(log, meta_size);
568         if (ret)
569                 return ret;
570
571         io = log->current_io;
572
573         for (i = 0; i < sh->disks; i++) {
574                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
575                     test_bit(R5_InJournal, &sh->dev[i].flags))
576                         continue;
577                 if (i == sh->pd_idx || i == sh->qd_idx)
578                         continue;
579                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
580                                         raid5_compute_blocknr(sh, i, 0),
581                                         sh->dev[i].log_checksum, 0, false);
582                 r5l_append_payload_page(log, sh->dev[i].page);
583         }
584
585         if (parity_pages == 2) {
586                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
587                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
588                                         sh->dev[sh->qd_idx].log_checksum, true);
589                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
590                 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
591         } else if (parity_pages == 1) {
592                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
593                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
594                                         0, false);
595                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
596         } else  /* Just writing data, not parity, in caching phase */
597                 BUG_ON(parity_pages != 0);
598
599         list_add_tail(&sh->log_list, &io->stripe_list);
600         atomic_inc(&io->pending_stripe);
601         sh->log_io = io;
602
603         return 0;
604 }
605
606 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
607 /*
608  * This runs in raid5d, where reclaim could wait for raid5d too (when it flushes
609  * data from the log to the raid disks), so we shouldn't wait for reclaim here.
610  */
611 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
612 {
613         int write_disks = 0;
614         int data_pages, parity_pages;
615         int reserve;
616         int i;
617         int ret = 0;
618
619         if (!log)
620                 return -EAGAIN;
621         /* Don't support stripe batch */
622         if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
623             test_bit(STRIPE_SYNCING, &sh->state)) {
624                 /* the stripe is written to log, we start writing it to raid */
625                 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
626                 return -EAGAIN;
627         }
628
629         WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
630
631         for (i = 0; i < sh->disks; i++) {
632                 void *addr;
633
634                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
635                     test_bit(R5_InJournal, &sh->dev[i].flags))
636                         continue;
637
638                 write_disks++;
639                 /* checksum is already calculated in last run */
640                 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
641                         continue;
642                 addr = kmap_atomic(sh->dev[i].page);
643                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
644                                                     addr, PAGE_SIZE);
645                 kunmap_atomic(addr);
646         }
647         parity_pages = 1 + !!(sh->qd_idx >= 0);
648         data_pages = write_disks - parity_pages;
649
650         set_bit(STRIPE_LOG_TRAPPED, &sh->state);
651         /*
652          * The stripe must enter the state machine again to finish the write, so
653          * don't delay.
654          */
655         clear_bit(STRIPE_DELAYED, &sh->state);
656         atomic_inc(&sh->count);
657
658         mutex_lock(&log->io_mutex);
659         /* meta + data */
660         reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
661         if (!r5l_has_free_space(log, reserve)) {
662                 spin_lock(&log->no_space_stripes_lock);
663                 list_add_tail(&sh->log_list, &log->no_space_stripes);
664                 spin_unlock(&log->no_space_stripes_lock);
665
666                 r5l_wake_reclaim(log, reserve);
667         } else {
668                 ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
669                 if (ret) {
670                         spin_lock_irq(&log->io_list_lock);
671                         list_add_tail(&sh->log_list, &log->no_mem_stripes);
672                         spin_unlock_irq(&log->io_list_lock);
673                 }
674         }
675
676         mutex_unlock(&log->io_mutex);
677         return 0;
678 }
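/*
 * Space accounting sketch for r5l_write_stripe() above (illustrative only):
 * on a 4+1 RAID5 array a full-stripe write has write_disks == 5, so the
 * reservation is (1 + 5) << (PAGE_SHIFT - 9) == 48 sectors with 4k pages,
 * i.e. one meta block plus five data/parity blocks.
 */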
679
680 void r5l_write_stripe_run(struct r5l_log *log)
681 {
682         if (!log)
683                 return;
684         mutex_lock(&log->io_mutex);
685         r5l_submit_current_io(log);
686         mutex_unlock(&log->io_mutex);
687 }
688
689 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
690 {
691         if (!log)
692                 return -ENODEV;
693         /*
694          * we flush the log disk cache first, then write stripe data to the raid
695          * disks. So if the bio is finished, the log disk cache is flushed
696          * already. Recovery guarantees we can recover the bio from the log disk,
697          * so we don't need to flush again.
698          */
699         if (bio->bi_iter.bi_size == 0) {
700                 bio_endio(bio);
701                 return 0;
702         }
703         bio->bi_opf &= ~REQ_PREFLUSH;
704         return -EAGAIN;
705 }
706
707 /* This will run after log space is reclaimed */
708 static void r5l_run_no_space_stripes(struct r5l_log *log)
709 {
710         struct stripe_head *sh;
711
712         spin_lock(&log->no_space_stripes_lock);
713         while (!list_empty(&log->no_space_stripes)) {
714                 sh = list_first_entry(&log->no_space_stripes,
715                                       struct stripe_head, log_list);
716                 list_del_init(&sh->log_list);
717                 set_bit(STRIPE_HANDLE, &sh->state);
718                 raid5_release_stripe(sh);
719         }
720         spin_unlock(&log->no_space_stripes_lock);
721 }
722
723 static sector_t r5l_reclaimable_space(struct r5l_log *log)
724 {
725         return r5l_ring_distance(log, log->last_checkpoint,
726                                  log->next_checkpoint);
727 }
728
729 static void r5l_run_no_mem_stripe(struct r5l_log *log)
730 {
731         struct stripe_head *sh;
732
733         assert_spin_locked(&log->io_list_lock);
734
735         if (!list_empty(&log->no_mem_stripes)) {
736                 sh = list_first_entry(&log->no_mem_stripes,
737                                       struct stripe_head, log_list);
738                 list_del_init(&sh->log_list);
739                 set_bit(STRIPE_HANDLE, &sh->state);
740                 raid5_release_stripe(sh);
741         }
742 }
743
744 static bool r5l_complete_finished_ios(struct r5l_log *log)
745 {
746         struct r5l_io_unit *io, *next;
747         bool found = false;
748
749         assert_spin_locked(&log->io_list_lock);
750
751         list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
752                 /* don't change list order */
753                 if (io->state < IO_UNIT_STRIPE_END)
754                         break;
755
756                 log->next_checkpoint = io->log_start;
757                 log->next_cp_seq = io->seq;
758
759                 list_del(&io->log_sibling);
760                 mempool_free(io, log->io_pool);
761                 r5l_run_no_mem_stripe(log);
762
763                 found = true;
764         }
765
766         return found;
767 }
768
769 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
770 {
771         struct r5l_log *log = io->log;
772         unsigned long flags;
773
774         spin_lock_irqsave(&log->io_list_lock, flags);
775         __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
776
777         if (!r5l_complete_finished_ios(log)) {
778                 spin_unlock_irqrestore(&log->io_list_lock, flags);
779                 return;
780         }
781
782         if (r5l_reclaimable_space(log) > log->max_free_space)
783                 r5l_wake_reclaim(log, 0);
784
785         spin_unlock_irqrestore(&log->io_list_lock, flags);
786         wake_up(&log->iounit_wait);
787 }
788
789 void r5l_stripe_write_finished(struct stripe_head *sh)
790 {
791         struct r5l_io_unit *io;
792
793         io = sh->log_io;
794         sh->log_io = NULL;
795
796         if (io && atomic_dec_and_test(&io->pending_stripe))
797                 __r5l_stripe_write_finished(io);
798 }
799
800 static void r5l_log_flush_endio(struct bio *bio)
801 {
802         struct r5l_log *log = container_of(bio, struct r5l_log,
803                 flush_bio);
804         unsigned long flags;
805         struct r5l_io_unit *io;
806
807         if (bio->bi_error)
808                 md_error(log->rdev->mddev, log->rdev);
809
810         spin_lock_irqsave(&log->io_list_lock, flags);
811         list_for_each_entry(io, &log->flushing_ios, log_sibling)
812                 r5l_io_run_stripes(io);
813         list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
814         spin_unlock_irqrestore(&log->io_list_lock, flags);
815 }
816
817 /*
818  * Starting dispatch IO to raid.
819  * The io_units (meta blocks) make up the log. There is one situation we
820  * want to avoid: a broken meta block in the middle of the log means recovery
821  * can't find the meta blocks at the head of the log. If an operation needs
822  * the meta at the head to be persistent in the log, we must make sure the
823  * meta blocks before it are persistent in the log too. One case is:
824  *
825  * stripe data/parity is in the log, and we start writing the stripe to the
826  * raid disks. The stripe data/parity must be persistent in the log before
827  * we do the write to the raid disks. The solution is to strictly maintain
828  * io_unit list order: we only write the stripes of an io_unit to the raid
829  * disks once it is the first io_unit whose data/parity is in the log.
830  */
831 void r5l_flush_stripe_to_raid(struct r5l_log *log)
832 {
833         bool do_flush;
834
835         if (!log || !log->need_cache_flush)
836                 return;
837
838         spin_lock_irq(&log->io_list_lock);
839         /* flush bio is running */
840         if (!list_empty(&log->flushing_ios)) {
841                 spin_unlock_irq(&log->io_list_lock);
842                 return;
843         }
844         list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
845         do_flush = !list_empty(&log->flushing_ios);
846         spin_unlock_irq(&log->io_list_lock);
847
848         if (!do_flush)
849                 return;
850         bio_reset(&log->flush_bio);
851         log->flush_bio.bi_bdev = log->rdev->bdev;
852         log->flush_bio.bi_end_io = r5l_log_flush_endio;
853         bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
854         submit_bio(&log->flush_bio);
855 }
856
857 static void r5l_write_super(struct r5l_log *log, sector_t cp);
858 static void r5l_write_super_and_discard_space(struct r5l_log *log,
859         sector_t end)
860 {
861         struct block_device *bdev = log->rdev->bdev;
862         struct mddev *mddev;
863
864         r5l_write_super(log, end);
865
866         if (!blk_queue_discard(bdev_get_queue(bdev)))
867                 return;
868
869         mddev = log->rdev->mddev;
870         /*
871          * Discard could zero data, so before discard we must make sure the
872          * superblock is updated to the new log tail. Updating the superblock
873          * (either calling md_update_sb() directly or depending on the md
874          * thread) must hold the reconfig mutex. On the other hand,
875          * raid5_quiesce() is called with reconfig_mutex held, and its first
876          * step is to wait for all IO to finish, hence for the reclaim thread,
877          * while the reclaim thread is calling this function and waiting for
878          * the reconfig mutex: a deadlock. We work around this with a trylock.
879          * FIXME: we could miss discard if we can't take reconfig mutex
880          */
881         set_mask_bits(&mddev->flags, 0,
882                 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
883         if (!mddev_trylock(mddev))
884                 return;
885         md_update_sb(mddev, 1);
886         mddev_unlock(mddev);
887
888         /* discard IO error really doesn't matter, ignore it */
889         if (log->last_checkpoint < end) {
890                 blkdev_issue_discard(bdev,
891                                 log->last_checkpoint + log->rdev->data_offset,
892                                 end - log->last_checkpoint, GFP_NOIO, 0);
893         } else {
894                 blkdev_issue_discard(bdev,
895                                 log->last_checkpoint + log->rdev->data_offset,
896                                 log->device_size - log->last_checkpoint,
897                                 GFP_NOIO, 0);
898                 blkdev_issue_discard(bdev, log->rdev->data_offset, end,
899                                 GFP_NOIO, 0);
900         }
901 }
902
903 static void r5l_do_reclaim(struct r5l_log *log)
904 {
905         sector_t reclaim_target = xchg(&log->reclaim_target, 0);
906         sector_t reclaimable;
907         sector_t next_checkpoint;
908         u64 next_cp_seq;
909
910         spin_lock_irq(&log->io_list_lock);
911         /*
912          * move the proper io_units to the reclaim list. We should not change the
913          * order. Reclaimable and unreclaimable io_units can be mixed in the list;
914          * we shouldn't reuse the space of an unreclaimable io_unit.
915          */
916         while (1) {
917                 reclaimable = r5l_reclaimable_space(log);
918                 if (reclaimable >= reclaim_target ||
919                     (list_empty(&log->running_ios) &&
920                      list_empty(&log->io_end_ios) &&
921                      list_empty(&log->flushing_ios) &&
922                      list_empty(&log->finished_ios)))
923                         break;
924
925                 md_wakeup_thread(log->rdev->mddev->thread);
926                 wait_event_lock_irq(log->iounit_wait,
927                                     r5l_reclaimable_space(log) > reclaimable,
928                                     log->io_list_lock);
929         }
930
931         next_checkpoint = log->next_checkpoint;
932         next_cp_seq = log->next_cp_seq;
933         spin_unlock_irq(&log->io_list_lock);
934
935         BUG_ON(reclaimable < 0);
936         if (reclaimable == 0)
937                 return;
938
939         /*
940          * write_super will flush cache of each raid disk. We must write super
941          * here, because the log area might be reused soon and we don't want to
942          * confuse recovery
943          */
944         r5l_write_super_and_discard_space(log, next_checkpoint);
945
946         mutex_lock(&log->io_mutex);
947         log->last_checkpoint = next_checkpoint;
948         log->last_cp_seq = next_cp_seq;
949         mutex_unlock(&log->io_mutex);
950
951         r5l_run_no_space_stripes(log);
952 }
953
954 static void r5l_reclaim_thread(struct md_thread *thread)
955 {
956         struct mddev *mddev = thread->mddev;
957         struct r5conf *conf = mddev->private;
958         struct r5l_log *log = conf->log;
959
960         if (!log)
961                 return;
962         r5l_do_reclaim(log);
963 }
964
965 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
966 {
967         unsigned long target;
968         unsigned long new = (unsigned long)space; /* overflow in theory */
969
970         do {
971                 target = log->reclaim_target;
972                 if (new < target)
973                         return;
974         } while (cmpxchg(&log->reclaim_target, target, new) != target);
975         md_wakeup_thread(log->reclaim_thread);
976 }
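/*
 * The cmpxchg() loop in r5l_wake_reclaim() is a lock-free "raise to at least
 * 'space'" update: concurrent callers may race, but reclaim_target only ever
 * grows until r5l_do_reclaim() xchg()s it back to 0.
 */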
977
978 void r5l_quiesce(struct r5l_log *log, int state)
979 {
980         struct mddev *mddev;
981         if (!log || state == 2)
982                 return;
983         if (state == 0) {
984                 /*
985                  * This is a special case for hotadd. During suspend, the array
986                  * has no journal. During resume, the journal is initialized as
987                  * well as the reclaim thread.
988                  */
989                 if (log->reclaim_thread)
990                         return;
991                 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
992                                         log->rdev->mddev, "reclaim");
993         } else if (state == 1) {
994                 /* make sure r5l_write_super_and_discard_space exits */
995                 mddev = log->rdev->mddev;
996                 wake_up(&mddev->sb_wait);
997                 r5l_wake_reclaim(log, -1L);
998                 md_unregister_thread(&log->reclaim_thread);
999                 r5l_do_reclaim(log);
1000         }
1001 }
1002
1003 bool r5l_log_disk_error(struct r5conf *conf)
1004 {
1005         struct r5l_log *log;
1006         bool ret;
1007         /* don't allow write if journal disk is missing */
1008         rcu_read_lock();
1009         log = rcu_dereference(conf->log);
1010
1011         if (!log)
1012                 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1013         else
1014                 ret = test_bit(Faulty, &log->rdev->flags);
1015         rcu_read_unlock();
1016         return ret;
1017 }
1018
1019 struct r5l_recovery_ctx {
1020         struct page *meta_page;         /* current meta */
1021         sector_t meta_total_blocks;     /* total size of current meta and data */
1022         sector_t pos;                   /* recovery position */
1023         u64 seq;                        /* recovery position seq */
1024 };
1025
1026 static int r5l_read_meta_block(struct r5l_log *log,
1027                                struct r5l_recovery_ctx *ctx)
1028 {
1029         struct page *page = ctx->meta_page;
1030         struct r5l_meta_block *mb;
1031         u32 crc, stored_crc;
1032
1033         if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
1034                           false))
1035                 return -EIO;
1036
1037         mb = page_address(page);
1038         stored_crc = le32_to_cpu(mb->checksum);
1039         mb->checksum = 0;
1040
1041         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1042             le64_to_cpu(mb->seq) != ctx->seq ||
1043             mb->version != R5LOG_VERSION ||
1044             le64_to_cpu(mb->position) != ctx->pos)
1045                 return -EINVAL;
1046
1047         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1048         if (stored_crc != crc)
1049                 return -EINVAL;
1050
1051         if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
1052                 return -EINVAL;
1053
1054         ctx->meta_total_blocks = BLOCK_SECTORS;
1055
1056         return 0;
1057 }
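/*
 * Note on the checksum check above: mb->checksum is zeroed before the CRC is
 * recomputed over the whole page, matching how the block was checksummed at
 * write time (the field was still zero when r5l_submit_current_io() and
 * r5l_log_write_empty_meta_block() computed the CRC).
 */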
1058
1059 static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
1060                                          struct r5l_recovery_ctx *ctx,
1061                                          sector_t stripe_sect,
1062                                          int *offset)
1063 {
1064         struct r5conf *conf = log->rdev->mddev->private;
1065         struct stripe_head *sh;
1066         struct r5l_payload_data_parity *payload;
1067         int disk_index;
1068
1069         sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
1070         while (1) {
1071                 sector_t log_offset = r5l_ring_add(log, ctx->pos,
1072                                 ctx->meta_total_blocks);
1073                 payload = page_address(ctx->meta_page) + *offset;
1074
1075                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
1076                         raid5_compute_sector(conf,
1077                                              le64_to_cpu(payload->location), 0,
1078                                              &disk_index, sh);
1079
1080                         sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1081                                      sh->dev[disk_index].page, REQ_OP_READ, 0,
1082                                      false);
1083                         sh->dev[disk_index].log_checksum =
1084                                 le32_to_cpu(payload->checksum[0]);
1085                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1086                 } else {
1087                         disk_index = sh->pd_idx;
1088                         sync_page_io(log->rdev, log_offset, PAGE_SIZE,
1089                                      sh->dev[disk_index].page, REQ_OP_READ, 0,
1090                                      false);
1091                         sh->dev[disk_index].log_checksum =
1092                                 le32_to_cpu(payload->checksum[0]);
1093                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1094
1095                         if (sh->qd_idx >= 0) {
1096                                 disk_index = sh->qd_idx;
1097                                 sync_page_io(log->rdev,
1098                                              r5l_ring_add(log, log_offset, BLOCK_SECTORS),
1099                                              PAGE_SIZE, sh->dev[disk_index].page,
1100                                              REQ_OP_READ, 0, false);
1101                                 sh->dev[disk_index].log_checksum =
1102                                         le32_to_cpu(payload->checksum[1]);
1103                                 set_bit(R5_Wantwrite,
1104                                         &sh->dev[disk_index].flags);
1105                         }
1106                 }
1107
1108                 ctx->meta_total_blocks += le32_to_cpu(payload->size);
1109                 *offset += sizeof(struct r5l_payload_data_parity) +
1110                         sizeof(__le32) *
1111                         (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1112                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
1113                         break;
1114         }
1115
1116         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1117                 void *addr;
1118                 u32 checksum;
1119
1120                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1121                         continue;
1122                 addr = kmap_atomic(sh->dev[disk_index].page);
1123                 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
1124                 kunmap_atomic(addr);
1125                 if (checksum != sh->dev[disk_index].log_checksum)
1126                         goto error;
1127         }
1128
1129         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1130                 struct md_rdev *rdev, *rrdev;
1131
1132                 if (!test_and_clear_bit(R5_Wantwrite,
1133                                         &sh->dev[disk_index].flags))
1134                         continue;
1135
1136                 /* in case device is broken */
1137                 rcu_read_lock();
1138                 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1139                 if (rdev) {
1140                         atomic_inc(&rdev->nr_pending);
1141                         rcu_read_unlock();
1142                         sync_page_io(rdev, stripe_sect, PAGE_SIZE,
1143                                      sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1144                                      false);
1145                         rdev_dec_pending(rdev, rdev->mddev);
1146                         rcu_read_lock();
1147                 }
1148                 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1149                 if (rrdev) {
1150                         atomic_inc(&rrdev->nr_pending);
1151                         rcu_read_unlock();
1152                         sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
1153                                      sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1154                                      false);
1155                         rdev_dec_pending(rrdev, rrdev->mddev);
1156                         rcu_read_lock();
1157                 }
1158                 rcu_read_unlock();
1159         }
1160         raid5_release_stripe(sh);
1161         return 0;
1162
1163 error:
1164         for (disk_index = 0; disk_index < sh->disks; disk_index++)
1165                 sh->dev[disk_index].flags = 0;
1166         raid5_release_stripe(sh);
1167         return -EINVAL;
1168 }
1169
1170 static int r5l_recovery_flush_one_meta(struct r5l_log *log,
1171                                        struct r5l_recovery_ctx *ctx)
1172 {
1173         struct r5conf *conf = log->rdev->mddev->private;
1174         struct r5l_payload_data_parity *payload;
1175         struct r5l_meta_block *mb;
1176         int offset;
1177         sector_t stripe_sector;
1178
1179         mb = page_address(ctx->meta_page);
1180         offset = sizeof(struct r5l_meta_block);
1181
1182         while (offset < le32_to_cpu(mb->meta_size)) {
1183                 int dd;
1184
1185                 payload = (void *)mb + offset;
1186                 stripe_sector = raid5_compute_sector(conf,
1187                                                      le64_to_cpu(payload->location), 0, &dd, NULL);
1188                 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
1189                                                   &offset))
1190                         return -EINVAL;
1191         }
1192         return 0;
1193 }
1194
1195 /* copy data/parity from log to raid disks */
1196 static void r5l_recovery_flush_log(struct r5l_log *log,
1197                                    struct r5l_recovery_ctx *ctx)
1198 {
1199         while (1) {
1200                 if (r5l_read_meta_block(log, ctx))
1201                         return;
1202                 if (r5l_recovery_flush_one_meta(log, ctx))
1203                         return;
1204                 ctx->seq++;
1205                 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1206         }
1207 }
1208
1209 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1210                                           u64 seq)
1211 {
1212         struct page *page;
1213         struct r5l_meta_block *mb;
1214         u32 crc;
1215
1216         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1217         if (!page)
1218                 return -ENOMEM;
1219         mb = page_address(page);
1220         mb->magic = cpu_to_le32(R5LOG_MAGIC);
1221         mb->version = R5LOG_VERSION;
1222         mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1223         mb->seq = cpu_to_le64(seq);
1224         mb->position = cpu_to_le64(pos);
1225         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1226         mb->checksum = cpu_to_le32(crc);
1227
1228         if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1229                           WRITE_FUA, false)) {
1230                 __free_page(page);
1231                 return -EIO;
1232         }
1233         __free_page(page);
1234         return 0;
1235 }
1236
1237 static int r5l_recovery_log(struct r5l_log *log)
1238 {
1239         struct r5l_recovery_ctx ctx;
1240
1241         ctx.pos = log->last_checkpoint;
1242         ctx.seq = log->last_cp_seq;
1243         ctx.meta_page = alloc_page(GFP_KERNEL);
1244         if (!ctx.meta_page)
1245                 return -ENOMEM;
1246
1247         r5l_recovery_flush_log(log, &ctx);
1248         __free_page(ctx.meta_page);
1249
1250         /*
1251          * We did a recovery. Now ctx.pos points to an invalid meta block. The
1252          * new log will start here, but we can't let the superblock point to the
1253          * last valid meta block. The log might look like:
1254          * | meta 1| meta 2| meta 3|
1255          * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
1256          * superblock points to meta 1 and we write a new valid meta 2n, then if
1257          * a crash happens again, the new recovery will start from meta 1. Since
1258          * meta 2n is valid now, recovery will think meta 3 is valid, which is
1259          * wrong. The solution is to create a new meta block in meta 2's slot
1260          * with seq == meta 1's seq + 10 and let the superblock point to meta 2.
1261          * The same recovery then rejects meta 3, because its seq doesn't match.
1262          */
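        /*
         * Concrete example of the rule above (illustrative only): if recovery
         * stopped with ctx.seq == 100, the empty meta block written at ctx.pos
         * gets seq 110 and new log writes continue from log->seq == 111.
         */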
1263         if (ctx.seq > log->last_cp_seq) {
1264                 int ret;
1265
1266                 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1267                 if (ret)
1268                         return ret;
1269                 log->seq = ctx.seq + 11;
1270                 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1271                 r5l_write_super(log, ctx.pos);
1272                 log->last_checkpoint = ctx.pos;
1273                 log->next_checkpoint = ctx.pos;
1274         } else {
1275                 log->log_start = ctx.pos;
1276                 log->seq = ctx.seq;
1277         }
1278         return 0;
1279 }
1280
1281 static void r5l_write_super(struct r5l_log *log, sector_t cp)
1282 {
1283         struct mddev *mddev = log->rdev->mddev;
1284
1285         log->rdev->journal_tail = cp;
1286         set_bit(MD_CHANGE_DEVS, &mddev->flags);
1287 }
1288
1289 /*
1290  * Try to handle a write operation in the caching phase. This function should
1291  * only be called in write-back mode.
1292  *
1293  * If all outstanding writes can be handled in the caching phase, return 0.
1294  * If the writes require the write-out phase, call r5c_make_stripe_write_out()
1295  * and return -EAGAIN.
1296  */
1297 int r5c_try_caching_write(struct r5conf *conf,
1298                           struct stripe_head *sh,
1299                           struct stripe_head_state *s,
1300                           int disks)
1301 {
1302         struct r5l_log *log = conf->log;
1303         int i;
1304         struct r5dev *dev;
1305         int to_cache = 0;
1306
1307         BUG_ON(!r5c_is_writeback(log));
1308
1309         if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
1310                 /*
1311                  * There are two different scenarios here:
1312                  *  1. The stripe has some data cached, and it is sent to
1313                  *     write-out phase for reclaim
1314                  *  2. The stripe is clean, and this is the first write
1315                  *
1316                  * For 1, return -EAGAIN, so we continue with
1317                  * handle_stripe_dirtying().
1318                  *
1319                  * For 2, set STRIPE_R5C_CACHING and continue with caching
1320                  * write.
1321                  */
1322
1323                 /* case 1: anything injournal or anything in written */
1324                 if (s->injournal > 0 || s->written > 0)
1325                         return -EAGAIN;
1326                 /* case 2 */
1327                 set_bit(STRIPE_R5C_CACHING, &sh->state);
1328         }
1329
1330         for (i = disks; i--; ) {
1331                 dev = &sh->dev[i];
1332                 /* if non-overwrite, use writing-out phase */
1333                 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
1334                     !test_bit(R5_InJournal, &dev->flags)) {
1335                         r5c_make_stripe_write_out(sh);
1336                         return -EAGAIN;
1337                 }
1338         }
1339
1340         for (i = disks; i--; ) {
1341                 dev = &sh->dev[i];
1342                 if (dev->towrite) {
1343                         set_bit(R5_Wantwrite, &dev->flags);
1344                         set_bit(R5_Wantdrain, &dev->flags);
1345                         set_bit(R5_LOCKED, &dev->flags);
1346                         to_cache++;
1347                 }
1348         }
1349
1350         if (to_cache) {
1351                 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1352                 /*
1353                  * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
1354                  * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
1355                  * r5c_handle_data_cached()
1356                  */
1357                 set_bit(STRIPE_LOG_TRAPPED, &sh->state);
1358         }
1359
1360         return 0;
1361 }
1362
1363 /*
1364  * free extra pages (orig_page) we allocated for prexor
1365  */
1366 void r5c_release_extra_page(struct stripe_head *sh)
1367 {
1368         int i;
1369
1370         for (i = sh->disks; i--; )
1371                 if (sh->dev[i].page != sh->dev[i].orig_page) {
1372                         struct page *p = sh->dev[i].orig_page;
1373
1374                         sh->dev[i].orig_page = sh->dev[i].page;
1375                         put_page(p);
1376                 }
1377 }
1378
1379 /*
1380  * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
1381  * stripe is committed to RAID disks.
1382  */
1383 void r5c_finish_stripe_write_out(struct r5conf *conf,
1384                                  struct stripe_head *sh,
1385                                  struct stripe_head_state *s)
1386 {
1387         int i;
1388         int do_wakeup = 0;
1389
1390         if (!conf->log ||
1391             !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
1392                 return;
1393
1394         WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
1395         clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
1396
1397         if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1398                 return;
1399
1400         for (i = sh->disks; i--; ) {
1401                 clear_bit(R5_InJournal, &sh->dev[i].flags);
1402                 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1403                         do_wakeup = 1;
1404         }
1405
1406         /*
1407          * analyse_stripe() runs before r5c_finish_stripe_write_out().
1408          * We updated R5_InJournal, so we also update s->injournal.
1409          */
1410         s->injournal = 0;
1411
1412         if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
1413                 if (atomic_dec_and_test(&conf->pending_full_writes))
1414                         md_wakeup_thread(conf->mddev->thread);
1415
1416         if (do_wakeup)
1417                 wake_up(&conf->wait_for_overlap);
1418 }
1419
1420 int
1421 r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
1422                struct stripe_head_state *s)
1423 {
1424         int pages = 0;
1425         int reserve;
1426         int i;
1427         int ret = 0;
1428
1429         BUG_ON(!log);
1430
1431         for (i = 0; i < sh->disks; i++) {
1432                 void *addr;
1433
1434                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
1435                         continue;
1436                 addr = kmap_atomic(sh->dev[i].page);
1437                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
1438                                                     addr, PAGE_SIZE);
1439                 kunmap_atomic(addr);
1440                 pages++;
1441         }
1442         WARN_ON(pages == 0);
1443
1444         /*
1445          * The stripe must enter the state machine again to call endio, so
1446          * don't delay.
1447          */
1448         clear_bit(STRIPE_DELAYED, &sh->state);
1449         atomic_inc(&sh->count);
1450
1451         mutex_lock(&log->io_mutex);
1452         /* reserve one meta block plus the data pages, in 512-byte sectors */
1453         reserve = (1 + pages) << (PAGE_SHIFT - 9);
1454         if (!r5l_has_free_space(log, reserve)) {
1455                 spin_lock(&log->no_space_stripes_lock);
1456                 list_add_tail(&sh->log_list, &log->no_space_stripes);
1457                 spin_unlock(&log->no_space_stripes_lock);
1458
1459                 r5l_wake_reclaim(log, reserve);
1460         } else {
1461                 ret = r5l_log_stripe(log, sh, pages, 0);
1462                 if (ret) {
1463                         spin_lock_irq(&log->io_list_lock);
1464                         list_add_tail(&sh->log_list, &log->no_mem_stripes);
1465                         spin_unlock_irq(&log->io_list_lock);
1466                 }
1467         }
1468
1469         mutex_unlock(&log->io_mutex);
1470         return 0;
1471 }
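/*
 * Editor's sketch (not part of this driver): a userspace model of the two
 * calculations above -- a software CRC32C (Castagnoli polynomial, reflected
 * form 0x82F63B78) updated the way crc32c_le() is called here with
 * log->uuid_checksum as the seed, and the reservation arithmetic that turns
 * one meta block plus `pages` data pages into 512-byte sectors. The seed and
 * page count below are made-up values.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	#define DEMO_PAGE_SIZE	4096u
 *	#define DEMO_PAGE_SHIFT	12
 *
 *	// bitwise CRC32C update, no pre/post inversion (caller supplies seed)
 *	static uint32_t crc32c_sw(uint32_t crc, const void *buf, size_t len)
 *	{
 *		const uint8_t *p = buf;
 *
 *		while (len--) {
 *			crc ^= *p++;
 *			for (int i = 0; i < 8; i++)
 *				crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
 *		}
 *		return crc;
 *	}
 *
 *	int main(void)
 *	{
 *		uint8_t page[DEMO_PAGE_SIZE] = { 0 };
 *		uint32_t seed = 0x12345678;	// stands in for log->uuid_checksum
 *		int pages = 3;			// three dirty data pages
 *
 *		printf("page checksum %08x\n",
 *		       (unsigned)crc32c_sw(seed, page, sizeof(page)));
 *
 *		// meta block + data pages, converted from 4 KiB pages to sectors:
 *		// (1 + 3) << (12 - 9) = 4 * 8 = 32 sectors
 *		int reserve = (1 + pages) << (DEMO_PAGE_SHIFT - 9);
 *		printf("reserve %d sectors\n", reserve);
 *		return 0;
 *	}
 */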
1472
1473
1474 static int r5l_load_log(struct r5l_log *log)
1475 {
1476         struct md_rdev *rdev = log->rdev;
1477         struct page *page;
1478         struct r5l_meta_block *mb;
1479         sector_t cp = log->rdev->journal_tail;
1480         u32 stored_crc, expected_crc;
1481         bool create_super = false;
1482         int ret;
1483
1484         /* Make sure the stored journal tail is valid: in range and aligned to BLOCK_SECTORS */
1485         if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1486                 cp = 0;
1487         page = alloc_page(GFP_KERNEL);
1488         if (!page)
1489                 return -ENOMEM;
1490
1491         if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
1492                 ret = -EIO;
1493                 goto ioerr;
1494         }
1495         mb = page_address(page);
1496
1497         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1498             mb->version != R5LOG_VERSION) {
1499                 create_super = true;
1500                 goto create;
1501         }
1502         stored_crc = le32_to_cpu(mb->checksum);
1503         mb->checksum = 0;
1504         expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1505         if (stored_crc != expected_crc) {
1506                 create_super = true;
1507                 goto create;
1508         }
1509         if (le64_to_cpu(mb->position) != cp) {
1510                 create_super = true;
1511                 goto create;
1512         }
1513 create:
1514         if (create_super) {
1515                 log->last_cp_seq = prandom_u32();
1516                 cp = 0;
1517                 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
1518                 /*
1519                  * Make sure the super block points to the correct address. The log
1520                  * may receive data very soon. If the super block doesn't record the
1521                  * correct log tail address, recovery can't find the log.
1522                  */
1523                 r5l_write_super(log, cp);
1524         } else
1525                 log->last_cp_seq = le64_to_cpu(mb->seq);
1526
1527         log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1528         log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1529         if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1530                 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1531         log->last_checkpoint = cp;
1532         log->next_checkpoint = cp;
1533
1534         __free_page(page);
1535
1536         return r5l_recovery_log(log);
1537 ioerr:
1538         __free_page(page);
1539         return ret;
1540 }
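/*
 * Editor's sketch (not part of this driver): the journal-tail validation and
 * the max_free_space clamp from r5l_load_log() above, reduced to
 * self-contained userspace C. The device size and recorded tail below are
 * made-up values.
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	#define DEMO_BLOCK_SECTORS	8ull		// 4 KiB block, in 512-byte sectors
 *	#define DEMO_RECLAIM_MAX	(10ull * 1024 * 1024 * 2)	// 10 GiB, in sectors
 *
 *	static uint64_t round_down_to(uint64_t v, uint64_t align)
 *	{
 *		return v - (v % align);
 *	}
 *
 *	int main(void)
 *	{
 *		uint64_t rdev_sectors = 1ull << 21;	// 1 GiB log device
 *		uint64_t cp = 12345;			// recorded journal tail
 *
 *		// out-of-range or unaligned tails fall back to sector 0
 *		if (cp >= rdev_sectors || round_down_to(cp, DEMO_BLOCK_SECTORS) != cp)
 *			cp = 0;
 *
 *		uint64_t device_size = round_down_to(rdev_sectors, DEMO_BLOCK_SECTORS);
 *		uint64_t max_free_space = device_size >> 2;	// 1/4 of the device
 *		if (max_free_space > DEMO_RECLAIM_MAX)
 *			max_free_space = DEMO_RECLAIM_MAX;
 *
 *		printf("cp=%llu max_free_space=%llu sectors\n",
 *		       (unsigned long long)cp, (unsigned long long)max_free_space);
 *		return 0;
 *	}
 */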
1541
1542 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1543 {
1544         struct request_queue *q = bdev_get_queue(rdev->bdev);
1545         struct r5l_log *log;
1546
1547         if (PAGE_SIZE != 4096)
1548                 return -EINVAL;
1549
1550         /*
1551          * PAGE_SIZE must be big enough to hold one r5l_meta_block plus
1552          * raid_disks r5l_payload_data_parity structures.
1553          *
1554          * The write journal and cache do not work for very big arrays
1555          * (raid_disks > 203).
1556          */
1557         if (sizeof(struct r5l_meta_block) +
1558             ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
1559              conf->raid_disks) > PAGE_SIZE) {
1560                 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
1561                        mdname(conf->mddev), conf->raid_disks);
1562                 return -EINVAL;
1563         }
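        /*
         * Editor's note on the 203-disk ceiling above: assuming the packed
         * on-disk sizes sizeof(struct r5l_meta_block) == 32 and
         * sizeof(struct r5l_payload_data_parity) == 16 (check raid/md_p.h for
         * the authoritative layout), a 4096-byte meta block has room for
         * (4096 - 32) / (16 + 4) = 203 payload entries, rounding down; the
         * extra 4 bytes per entry is the __le32 page checksum counted in the
         * test above.
         */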
1564
1565         log = kzalloc(sizeof(*log), GFP_KERNEL);
1566         if (!log)
1567                 return -ENOMEM;
1568         log->rdev = rdev;
1569
1570         log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
1571
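        /*
         * crc32c over the array UUID; this value seeds every checksum the log
         * computes over its metadata and data pages (see r5c_cache_data() and
         * r5l_load_log()).
         */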
1572         log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1573                                        sizeof(rdev->mddev->uuid));
1574
1575         mutex_init(&log->io_mutex);
1576
1577         spin_lock_init(&log->io_list_lock);
1578         INIT_LIST_HEAD(&log->running_ios);
1579         INIT_LIST_HEAD(&log->io_end_ios);
1580         INIT_LIST_HEAD(&log->flushing_ios);
1581         INIT_LIST_HEAD(&log->finished_ios);
1582         bio_init(&log->flush_bio);
1583
1584         log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1585         if (!log->io_kc)
1586                 goto io_kc;
1587
1588         log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
1589         if (!log->io_pool)
1590                 goto io_pool;
1591
1592         log->bs = bioset_create(R5L_POOL_SIZE, 0);
1593         if (!log->bs)
1594                 goto io_bs;
1595
1596         log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
1597         if (!log->meta_pool)
1598                 goto out_mempool;
1599
1600         log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1601                                                  log->rdev->mddev, "reclaim");
1602         if (!log->reclaim_thread)
1603                 goto reclaim_thread;
1604         init_waitqueue_head(&log->iounit_wait);
1605
1606         INIT_LIST_HEAD(&log->no_mem_stripes);
1607
1608         INIT_LIST_HEAD(&log->no_space_stripes);
1609         spin_lock_init(&log->no_space_stripes_lock);
1610
1611         log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
1612
1613         if (r5l_load_log(log))
1614                 goto error;
1615
1616         rcu_assign_pointer(conf->log, log);
1617         set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
1618         return 0;
1619
1620 error:
1621         md_unregister_thread(&log->reclaim_thread);
1622 reclaim_thread:
1623         mempool_destroy(log->meta_pool);
1624 out_mempool:
1625         bioset_free(log->bs);
1626 io_bs:
1627         mempool_destroy(log->io_pool);
1628 io_pool:
1629         kmem_cache_destroy(log->io_kc);
1630 io_kc:
1631         kfree(log);
1632         return -EINVAL;
1633 }
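/*
 * Editor's sketch (not part of this driver): the goto-based unwind ladder
 * used by r5l_init_log() above, reduced to a self-contained userspace
 * example. Each label releases only what was successfully acquired before
 * the failing step, in reverse order of acquisition.
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	static int init_demo(void)
 *	{
 *		char *a, *b, *c;
 *
 *		a = malloc(16);
 *		if (!a)
 *			goto err_none;
 *		b = malloc(16);
 *		if (!b)
 *			goto err_a;
 *		c = malloc(16);
 *		if (!c)
 *			goto err_b;
 *
 *		printf("all resources acquired\n");
 *		free(c);
 *		free(b);
 *		free(a);
 *		return 0;
 *
 *		// unwind strictly in reverse order of acquisition
 *	err_b:
 *		free(b);
 *	err_a:
 *		free(a);
 *	err_none:
 *		return -1;
 *	}
 *
 *	int main(void)
 *	{
 *		return init_demo() ? EXIT_FAILURE : EXIT_SUCCESS;
 *	}
 */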
1634
1635 void r5l_exit_log(struct r5l_log *log)
1636 {
1637         md_unregister_thread(&log->reclaim_thread);
1638         mempool_destroy(log->meta_pool);
1639         bioset_free(log->bs);
1640         mempool_destroy(log->io_pool);
1641         kmem_cache_destroy(log->io_kc);
1642         kfree(log);
1643 }