1 /*
2  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  */
14 #include <linux/kernel.h>
15 #include <linux/wait.h>
16 #include <linux/blkdev.h>
17 #include <linux/slab.h>
18 #include <linux/raid/md_p.h>
19 #include <linux/crc32c.h>
20 #include <linux/random.h>
21 #include "md.h"
22 #include "raid5.h"
23
24 /*
25  * metadata/data are stored on disk in 4k units (blocks) regardless of the
26  * underlying hardware sector size. Only works with PAGE_SIZE == 4096.
27  */
28 #define BLOCK_SECTORS (8)
29
30 /*
31  * reclaim runs when reclaimable space reaches 1/4 of the disk size or 10G,
32  * whichever is smaller. This prevents recovery from scanning a very long log.
33  */
34 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
35 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
36
37 struct r5l_log {
38         struct md_rdev *rdev;
39
40         u32 uuid_checksum;
41
42         sector_t device_size;           /* log device size, rounded down to
43                                          * BLOCK_SECTORS */
44         sector_t max_free_space;        /* reclaim runs once reclaimable
45                                          * space reaches this size */
46
47         sector_t last_checkpoint;       /* log tail. where recovery scan
48                                          * starts from */
49         u64 last_cp_seq;                /* log tail sequence */
50
51         sector_t log_start;             /* log head. where new data appends */
52         u64 seq;                        /* log head sequence */
53
54         struct mutex io_mutex;
55         struct r5l_io_unit *current_io; /* current io_unit accepting new data */
56
57         spinlock_t io_list_lock;
58         struct list_head running_ios;   /* io_units which are still running,
59                                          * and have not yet been completely
60                                          * written to the log */
61         struct list_head io_end_ios;    /* io_units which have been completely
62                                          * written to the log but not yet written
63                                          * to the RAID */
64         struct list_head flushing_ios;  /* io_units which are waiting for log
65                                          * cache flush */
66         struct list_head flushed_ios;   /* io_units settled in the log disk (cache flushed) */
67         struct bio flush_bio;
68         struct list_head stripe_end_ios;/* io_units which have been completely
69                                          * written to the RAID but have not yet
70                                          * been considered for updating super */
71
72         struct kmem_cache *io_kc;
73
74         struct md_thread *reclaim_thread;
75         unsigned long reclaim_target;   /* number of sectors that need to be
76                                          * reclaimed.  if it's 0, reclaim the space
77                                          * used by io_units which are in
78                                          * IO_UNIT_STRIPE_END state (i.e. reclaim
79                                          * doesn't wait for a specific io_unit
80                                          * to switch to IO_UNIT_STRIPE_END
81                                          * state) */
82
83         struct list_head no_space_stripes; /* pending stripes, log has no space */
84         spinlock_t no_space_stripes_lock;
85 };
86
87 /*
88  * An IO range starts at a meta data block and ends at the next meta data
89  * block. The io_unit's meta data block tracks the data/parity that follows it
90  * (see the layout sketch below). An io_unit is written to the log disk with
91  * normal writes; since we always flush the log disk before moving data to the
92  * raid disks, there is no need to write the io_unit with FLUSH/FUA.
93  */
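/*
 * A rough sketch of the on-disk layout of one io_unit, as built by
 * r5l_new_meta() and r5l_log_stripe() (illustrative only, not to scale):
 *
 *   +------------+------+------+-----+--------+------+--------+
 *   | meta block | data | data | ... | parity | data | parity | ...
 *   +------------+------+------+-----+--------+------+--------+
 *
 * The meta block and every data/parity page each occupy one 4k block. For
 * every page that follows, the meta block carries an r5l_payload_data_parity
 * entry (type, location, checksums); one io_unit can cover several stripes
 * until the meta page fills up or the io_unit is submitted.
 */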
94 struct r5l_io_unit {
95         struct r5l_log *log;
96
97         struct page *meta_page; /* store meta block */
98         int meta_offset;        /* current offset in meta_page */
99
100         struct bio_list bios;
101         atomic_t pending_io;    /* pending bios not written to log yet */
102         struct bio *current_bio;/* current_bio accepting new data */
103
104         atomic_t pending_stripe;/* how many stripes not flushed to raid */
105         u64 seq;                /* seq number of the metablock */
106         sector_t log_start;     /* where the io_unit starts */
107         sector_t log_end;       /* where the io_unit ends */
108         struct list_head log_sibling; /* log->running_ios */
109         struct list_head stripe_list; /* stripes added to the io_unit */
110
111         int state;
112         wait_queue_head_t wait_state;
113 };
114
115 /* r5l_io_unit state */
116 enum r5l_io_unit_state {
117         IO_UNIT_RUNNING = 0,    /* accepting new IO */
118         IO_UNIT_IO_START = 1,   /* io_unit bios have started writing to the
119                                  * log; no new bios are accepted */
120         IO_UNIT_IO_END = 2,     /* io_unit bios have finished writing to the log */
121         IO_UNIT_STRIPE_END = 3, /* stripe data has finished writing to the raid */
122 };
123
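/*
 * A sketch of how an io_unit moves through the lists in struct r5l_log as its
 * state advances (derived from __r5l_set_io_unit_state(),
 * r5l_flush_stripe_to_raid() and r5l_log_flush_endio(); illustrative only):
 *
 *   IO_UNIT_RUNNING/IO_START  -> log->running_ios
 *   IO_UNIT_IO_END            -> log->io_end_ios
 *   (flush bio submitted)     -> log->flushing_ios
 *   (flush bio completed)     -> log->flushed_ios, stripes released for
 *                                RAID writes
 *   IO_UNIT_STRIPE_END        -> log->stripe_end_ios, space becomes
 *                                reclaimable
 */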
124 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
125 {
126         start += inc;
127         if (start >= log->device_size)
128                 start = start - log->device_size;
129         return start;
130 }
131
132 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
133                                   sector_t end)
134 {
135         if (end >= start)
136                 return end - start;
137         else
138                 return end + log->device_size - start;
139 }
140
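/*
 * Worked example of the ring arithmetic above, assuming a (hypothetical)
 * device_size of 8192 sectors:
 *
 *   r5l_ring_add(log, 8000, 400)       -> 208   (wraps past the device end)
 *   r5l_ring_distance(log, 8000, 208)  -> 400   (end < start, so wrap)
 *   r5l_ring_distance(log, 100, 500)   -> 400   (no wrap)
 */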
141 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
142 {
143         sector_t used_size;
144
145         used_size = r5l_ring_distance(log, log->last_checkpoint,
146                                         log->log_start);
147
148         return log->device_size > used_size + size;
149 }
150
151 static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
152 {
153         struct r5l_io_unit *io;
154         /* We can't handle memory allocation failure so far */
155         gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
156
157         io = kmem_cache_zalloc(log->io_kc, gfp);
158         io->log = log;
159         io->meta_page = alloc_page(gfp | __GFP_ZERO);
160
161         bio_list_init(&io->bios);
162         INIT_LIST_HEAD(&io->log_sibling);
163         INIT_LIST_HEAD(&io->stripe_list);
164         io->state = IO_UNIT_RUNNING;
165         init_waitqueue_head(&io->wait_state);
166         return io;
167 }
168
169 static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
170 {
171         __free_page(io->meta_page);
172         kmem_cache_free(log->io_kc, io);
173 }
174
175 static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
176                                   enum r5l_io_unit_state state)
177 {
178         struct r5l_io_unit *io;
179
180         while (!list_empty(from)) {
181                 io = list_first_entry(from, struct r5l_io_unit, log_sibling);
182                 /* don't change list order */
183                 if (io->state >= state)
184                         list_move_tail(&io->log_sibling, to);
185                 else
186                         break;
187         }
188 }
189
190 /*
191  * We don't want too many io_units to reside in the stripe_end_ios list, which
192  * would waste a lot of memory, so we try to remove some. But we must keep at
193  * least 2 io_units: the superblock must point to a valid meta block, and if
194  * that is the last meta block, recovery can scan less.
195  */
196 static void r5l_compress_stripe_end_list(struct r5l_log *log)
197 {
198         struct r5l_io_unit *first, *last, *io;
199
200         first = list_first_entry(&log->stripe_end_ios,
201                                  struct r5l_io_unit, log_sibling);
202         last = list_last_entry(&log->stripe_end_ios,
203                                struct r5l_io_unit, log_sibling);
204         if (first == last)
205                 return;
206         list_del(&first->log_sibling);
207         list_del(&last->log_sibling);
208         while (!list_empty(&log->stripe_end_ios)) {
209                 io = list_first_entry(&log->stripe_end_ios,
210                                       struct r5l_io_unit, log_sibling);
211                 list_del(&io->log_sibling);
212                 first->log_end = io->log_end;
213                 r5l_free_io_unit(log, io);
214         }
215         list_add_tail(&first->log_sibling, &log->stripe_end_ios);
216         list_add_tail(&last->log_sibling, &log->stripe_end_ios);
217 }
218
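/*
 * Illustrative example of r5l_compress_stripe_end_list(): with five io_units
 * A B C D E on stripe_end_ios, B, C and D are freed, A->log_end is advanced
 * to D->log_end, and the list becomes A E. Only the covered log range and the
 * first/last meta blocks are kept.
 */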
219 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
220 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
221                                     enum r5l_io_unit_state state)
222 {
223         struct r5l_log *log = io->log;
224
225         if (WARN_ON(io->state >= state))
226                 return;
227         io->state = state;
228         if (state == IO_UNIT_IO_END)
229                 r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
230                                       IO_UNIT_IO_END);
231         if (state == IO_UNIT_STRIPE_END) {
232                 struct r5l_io_unit *last;
233                 sector_t reclaimable_space;
234
235                 r5l_move_io_unit_list(&log->flushed_ios, &log->stripe_end_ios,
236                                       IO_UNIT_STRIPE_END);
237
238                 last = list_last_entry(&log->stripe_end_ios,
239                                        struct r5l_io_unit, log_sibling);
240                 reclaimable_space = r5l_ring_distance(log, log->last_checkpoint,
241                                                       last->log_end);
242                 if (reclaimable_space >= log->max_free_space)
243                         r5l_wake_reclaim(log, 0);
244
245                 r5l_compress_stripe_end_list(log);
246         }
247         wake_up(&io->wait_state);
248 }
249
250 static void r5l_set_io_unit_state(struct r5l_io_unit *io,
251                                   enum r5l_io_unit_state state)
252 {
253         struct r5l_log *log = io->log;
254         unsigned long flags;
255
256         spin_lock_irqsave(&log->io_list_lock, flags);
257         __r5l_set_io_unit_state(io, state);
258         spin_unlock_irqrestore(&log->io_list_lock, flags);
259 }
260
261 /* XXX: totally ignores I/O errors */
262 static void r5l_log_endio(struct bio *bio)
263 {
264         struct r5l_io_unit *io = bio->bi_private;
265         struct r5l_log *log = io->log;
266
267         bio_put(bio);
268
269         if (!atomic_dec_and_test(&io->pending_io))
270                 return;
271
272         r5l_set_io_unit_state(io, IO_UNIT_IO_END);
273         md_wakeup_thread(log->rdev->mddev->thread);
274 }
275
276 static void r5l_submit_current_io(struct r5l_log *log)
277 {
278         struct r5l_io_unit *io = log->current_io;
279         struct r5l_meta_block *block;
280         struct bio *bio;
281         u32 crc;
282
283         if (!io)
284                 return;
285
286         block = page_address(io->meta_page);
287         block->meta_size = cpu_to_le32(io->meta_offset);
288         crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
289         block->checksum = cpu_to_le32(crc);
290
291         log->current_io = NULL;
292         r5l_set_io_unit_state(io, IO_UNIT_IO_START);
293
294         while ((bio = bio_list_pop(&io->bios))) {
295                 /* all IO must start from rdev->data_offset */
296                 bio->bi_iter.bi_sector += log->rdev->data_offset;
297                 submit_bio(WRITE, bio);
298         }
299 }
300
301 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
302 {
303         struct r5l_io_unit *io;
304         struct r5l_meta_block *block;
305         struct bio *bio;
306
307         io = r5l_alloc_io_unit(log);
308
309         block = page_address(io->meta_page);
310         block->magic = cpu_to_le32(R5LOG_MAGIC);
311         block->version = R5LOG_VERSION;
312         block->seq = cpu_to_le64(log->seq);
313         block->position = cpu_to_le64(log->log_start);
314
315         io->log_start = log->log_start;
316         io->meta_offset = sizeof(struct r5l_meta_block);
317         io->seq = log->seq;
318
319         bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
320         io->current_bio = bio;
321         bio->bi_rw = WRITE;
322         bio->bi_bdev = log->rdev->bdev;
323         bio->bi_iter.bi_sector = log->log_start;
324         bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
325         bio->bi_end_io = r5l_log_endio;
326         bio->bi_private = io;
327
328         bio_list_add(&io->bios, bio);
329         atomic_inc(&io->pending_io);
330
331         log->seq++;
332         log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
333         io->log_end = log->log_start;
334         /* current bio hit disk end */
335         if (log->log_start == 0)
336                 io->current_bio = NULL;
337
338         spin_lock_irq(&log->io_list_lock);
339         list_add_tail(&io->log_sibling, &log->running_ios);
340         spin_unlock_irq(&log->io_list_lock);
341
342         return io;
343 }
344
345 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
346 {
347         struct r5l_io_unit *io;
348
349         io = log->current_io;
350         if (io && io->meta_offset + payload_size > PAGE_SIZE)
351                 r5l_submit_current_io(log);
352         io = log->current_io;
353         if (io)
354                 return 0;
355
356         log->current_io = r5l_new_meta(log);
357         return 0;
358 }
359
360 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
361                                     sector_t location,
362                                     u32 checksum1, u32 checksum2,
363                                     bool checksum2_valid)
364 {
365         struct r5l_io_unit *io = log->current_io;
366         struct r5l_payload_data_parity *payload;
367
368         payload = page_address(io->meta_page) + io->meta_offset;
369         payload->header.type = cpu_to_le16(type);
370         payload->header.flags = cpu_to_le16(0);
371         payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
372                                     (PAGE_SHIFT - 9));
373         payload->location = cpu_to_le64(location);
374         payload->checksum[0] = cpu_to_le32(checksum1);
375         if (checksum2_valid)
376                 payload->checksum[1] = cpu_to_le32(checksum2);
377
378         io->meta_offset += sizeof(struct r5l_payload_data_parity) +
379                 sizeof(__le32) * (1 + !!checksum2_valid);
380 }
381
382 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
383 {
384         struct r5l_io_unit *io = log->current_io;
385
386 alloc_bio:
387         if (!io->current_bio) {
388                 struct bio *bio;
389
390                 bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
391                 bio->bi_rw = WRITE;
392                 bio->bi_bdev = log->rdev->bdev;
393                 bio->bi_iter.bi_sector = log->log_start;
394                 bio->bi_end_io = r5l_log_endio;
395                 bio->bi_private = io;
396                 bio_list_add(&io->bios, bio);
397                 atomic_inc(&io->pending_io);
398                 io->current_bio = bio;
399         }
400         if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
401                 io->current_bio = NULL;
402                 goto alloc_bio;
403         }
404         log->log_start = r5l_ring_add(log, log->log_start,
405                                       BLOCK_SECTORS);
406         /* current bio hit disk end */
407         if (log->log_start == 0)
408                 io->current_bio = NULL;
409
410         io->log_end = log->log_start;
411 }
412
413 static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
414                            int data_pages, int parity_pages)
415 {
416         int i;
417         int meta_size;
418         struct r5l_io_unit *io;
419
420         meta_size =
421                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
422                  * data_pages) +
423                 sizeof(struct r5l_payload_data_parity) +
424                 sizeof(__le32) * parity_pages;
425
426         r5l_get_meta(log, meta_size);
427         io = log->current_io;
428
429         for (i = 0; i < sh->disks; i++) {
430                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
431                         continue;
432                 if (i == sh->pd_idx || i == sh->qd_idx)
433                         continue;
434                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
435                                         raid5_compute_blocknr(sh, i, 0),
436                                         sh->dev[i].log_checksum, 0, false);
437                 r5l_append_payload_page(log, sh->dev[i].page);
438         }
439
440         if (sh->qd_idx >= 0) {
441                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
442                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
443                                         sh->dev[sh->qd_idx].log_checksum, true);
444                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
445                 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
446         } else {
447                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
448                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
449                                         0, false);
450                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
451         }
452
453         list_add_tail(&sh->log_list, &io->stripe_list);
454         atomic_inc(&io->pending_stripe);
455         sh->log_io = io;
456 }
457
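/*
 * Worked example of the meta_size computation in r5l_log_stripe() and
 * r5l_write_stripe(), assuming a hypothetical RAID6 stripe with 4 data pages
 * to write plus P and Q (parity_pages = 2):
 *
 *   meta_size = 4 * (sizeof(struct r5l_payload_data_parity) + 4)
 *             + sizeof(struct r5l_payload_data_parity) + 2 * 4
 *
 * i.e. one payload descriptor with one checksum per data page, plus a single
 * parity payload carrying both the P and Q checksums. The log space reserved
 * for the stripe is (1 + write_disks) blocks: the meta block plus one block
 * per data/parity page.
 */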
458 /*
459  * Runs in raid5d; reclaim could in turn wait for raid5d (when it flushes
460  * data from the log to the raid disks), so we shouldn't wait for reclaim here.
461  */
462 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
463 {
464         int write_disks = 0;
465         int data_pages, parity_pages;
466         int meta_size;
467         int reserve;
468         int i;
469
470         if (!log)
471                 return -EAGAIN;
472         /* Don't support stripe batch */
473         if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
474             test_bit(STRIPE_SYNCING, &sh->state)) {
475                 /* the stripe is written to log, we start writing it to raid */
476                 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
477                 return -EAGAIN;
478         }
479
480         for (i = 0; i < sh->disks; i++) {
481                 void *addr;
482
483                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
484                         continue;
485                 write_disks++;
486                 /* checksum is already calculated in last run */
487                 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
488                         continue;
489                 addr = kmap_atomic(sh->dev[i].page);
490                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
491                                                     addr, PAGE_SIZE);
492                 kunmap_atomic(addr);
493         }
494         parity_pages = 1 + !!(sh->qd_idx >= 0);
495         data_pages = write_disks - parity_pages;
496
497         meta_size =
498                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
499                  * data_pages) +
500                 sizeof(struct r5l_payload_data_parity) +
501                 sizeof(__le32) * parity_pages;
502         /* Doesn't work with very big raid array */
503         if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
504                 return -EINVAL;
505
506         set_bit(STRIPE_LOG_TRAPPED, &sh->state);
507         atomic_inc(&sh->count);
508
509         mutex_lock(&log->io_mutex);
510         /* meta + data */
511         reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
512         if (r5l_has_free_space(log, reserve))
513                 r5l_log_stripe(log, sh, data_pages, parity_pages);
514         else {
515                 spin_lock(&log->no_space_stripes_lock);
516                 list_add_tail(&sh->log_list, &log->no_space_stripes);
517                 spin_unlock(&log->no_space_stripes_lock);
518
519                 r5l_wake_reclaim(log, reserve);
520         }
521         mutex_unlock(&log->io_mutex);
522
523         return 0;
524 }
525
526 void r5l_write_stripe_run(struct r5l_log *log)
527 {
528         if (!log)
529                 return;
530         mutex_lock(&log->io_mutex);
531         r5l_submit_current_io(log);
532         mutex_unlock(&log->io_mutex);
533 }
534
535 /* This will run after log space is reclaimed */
536 static void r5l_run_no_space_stripes(struct r5l_log *log)
537 {
538         struct stripe_head *sh;
539
540         spin_lock(&log->no_space_stripes_lock);
541         while (!list_empty(&log->no_space_stripes)) {
542                 sh = list_first_entry(&log->no_space_stripes,
543                                       struct stripe_head, log_list);
544                 list_del_init(&sh->log_list);
545                 set_bit(STRIPE_HANDLE, &sh->state);
546                 raid5_release_stripe(sh);
547         }
548         spin_unlock(&log->no_space_stripes_lock);
549 }
550
551 void r5l_stripe_write_finished(struct stripe_head *sh)
552 {
553         struct r5l_io_unit *io;
554
555         /* Don't support stripe batch */
556         io = sh->log_io;
557         if (!io)
558                 return;
559         sh->log_io = NULL;
560
561         if (atomic_dec_and_test(&io->pending_stripe))
562                 r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
563 }
564
565 static void r5l_log_flush_endio(struct bio *bio)
566 {
567         struct r5l_log *log = container_of(bio, struct r5l_log,
568                 flush_bio);
569         unsigned long flags;
570         struct r5l_io_unit *io;
571         struct stripe_head *sh;
572
573         spin_lock_irqsave(&log->io_list_lock, flags);
574         list_for_each_entry(io, &log->flushing_ios, log_sibling) {
575                 while (!list_empty(&io->stripe_list)) {
576                         sh = list_first_entry(&io->stripe_list,
577                                 struct stripe_head, log_list);
578                         list_del_init(&sh->log_list);
579                         set_bit(STRIPE_HANDLE, &sh->state);
580                         raid5_release_stripe(sh);
581                 }
582         }
583         list_splice_tail_init(&log->flushing_ios, &log->flushed_ios);
584         spin_unlock_irqrestore(&log->io_list_lock, flags);
585 }
586
587 /*
588  * Start dispatching IO to the raid disks.
589  * The log consists of io_units (meta blocks plus data/parity). A broken meta
590  * block in the middle of the log prevents recovery from finding the meta at
591  * the head of the log. If an operation requires the meta at the head to be
592  * persistent in the log, the meta before it must be persistent too. A case:
593  *
594  * stripe data/parity is in the log and we start writing the stripe to the
595  * raid disks; the data/parity must be persistent in the log before we write.
596  *
597  * The solution is to strictly maintain io_unit list order: an io_unit's
598  * stripes are only written to the raid disks once it and all earlier
599  * io_units have their data/parity persistent in the log.
600  */
601 void r5l_flush_stripe_to_raid(struct r5l_log *log)
602 {
603         bool do_flush;
604         if (!log)
605                 return;
606
607         spin_lock_irq(&log->io_list_lock);
608         /* flush bio is running */
609         if (!list_empty(&log->flushing_ios)) {
610                 spin_unlock_irq(&log->io_list_lock);
611                 return;
612         }
613         list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
614         do_flush = !list_empty(&log->flushing_ios);
615         spin_unlock_irq(&log->io_list_lock);
616
617         if (!do_flush)
618                 return;
619         bio_reset(&log->flush_bio);
620         log->flush_bio.bi_bdev = log->rdev->bdev;
621         log->flush_bio.bi_end_io = r5l_log_flush_endio;
622         submit_bio(WRITE_FLUSH, &log->flush_bio);
623 }
624
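/*
 * A sketch of the resulting flush state machine, as implemented by
 * r5l_flush_stripe_to_raid() and r5l_log_flush_endio() (illustrative only):
 *
 *   1. raid5d calls r5l_flush_stripe_to_raid(); io_units whose log writes have
 *      completed are moved from io_end_ios to flushing_ios and a single
 *      WRITE_FLUSH bio is submitted to the log device.
 *   2. r5l_log_flush_endio() runs when the flush completes; every stripe of
 *      every flushing io_unit is marked STRIPE_HANDLE and released, and the
 *      io_units move to flushed_ios.
 *   3. Once a stripe finishes its RAID writes, r5l_stripe_write_finished()
 *      eventually drives its io_unit to IO_UNIT_STRIPE_END.
 */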
625 static void r5l_kick_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
626 {
627         md_wakeup_thread(log->rdev->mddev->thread);
628         wait_event(io->wait_state, io->state >= IO_UNIT_STRIPE_END);
629 }
630
631 static void r5l_write_super(struct r5l_log *log, sector_t cp);
632 static void r5l_do_reclaim(struct r5l_log *log)
633 {
634         struct r5l_io_unit *io, *last;
635         LIST_HEAD(list);
636         sector_t free = 0;
637         sector_t reclaim_target = xchg(&log->reclaim_target, 0);
638
639         spin_lock_irq(&log->io_list_lock);
640         /*
641          * Move the appropriate io_units to the reclaim list. We should not
642          * change the order: reclaimable and unreclaimable io_units can be mixed
643          * in the list, and we shouldn't reuse the space of an unreclaimable io_unit.
644          */
645         while (1) {
646                 struct list_head *target_list = NULL;
647
648                 while (!list_empty(&log->stripe_end_ios)) {
649                         io = list_first_entry(&log->stripe_end_ios,
650                                               struct r5l_io_unit, log_sibling);
651                         list_move_tail(&io->log_sibling, &list);
652                         free += r5l_ring_distance(log, io->log_start,
653                                                   io->log_end);
654                 }
655
656                 if (free >= reclaim_target ||
657                     (list_empty(&log->running_ios) &&
658                      list_empty(&log->io_end_ios) &&
659                      list_empty(&log->flushing_ios) &&
660                      list_empty(&log->flushed_ios)))
661                         break;
662
663                 /* The waiting below mostly happens when we shut down the raid */
664                 if (!list_empty(&log->flushed_ios))
665                         target_list = &log->flushed_ios;
666                 else if (!list_empty(&log->flushing_ios))
667                         target_list = &log->flushing_ios;
668                 else if (!list_empty(&log->io_end_ios))
669                         target_list = &log->io_end_ios;
670                 else if (!list_empty(&log->running_ios))
671                         target_list = &log->running_ios;
672
673                 io = list_first_entry(target_list,
674                                       struct r5l_io_unit, log_sibling);
675                 spin_unlock_irq(&log->io_list_lock);
676                 /* nobody else can delete the io, we are safe */
677                 r5l_kick_io_unit(log, io);
678                 spin_lock_irq(&log->io_list_lock);
679         }
680         spin_unlock_irq(&log->io_list_lock);
681
682         if (list_empty(&list))
683                 return;
684
685         /* the super always points to the last valid meta */
686         last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
687         /*
688          * write_super will flush the cache of each raid disk. We must write the
689          * super here, because the log area might be reused soon and we don't
690          * want to confuse recovery.
691          */
692         r5l_write_super(log, last->log_start);
693
694         mutex_lock(&log->io_mutex);
695         log->last_checkpoint = last->log_start;
696         log->last_cp_seq = last->seq;
697         mutex_unlock(&log->io_mutex);
698         r5l_run_no_space_stripes(log);
699
700         while (!list_empty(&list)) {
701                 io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
702                 list_del(&io->log_sibling);
703                 r5l_free_io_unit(log, io);
704         }
705 }
706
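/*
 * Illustrative reclaim outcome (hypothetical numbers): if the log tail
 * (last_checkpoint) sits at sector 1024 and the last reclaimed io_unit's
 * log_start is 4096, r5l_do_reclaim() writes the super to point at 4096,
 * advances last_checkpoint to 4096, and the ring space between 1024 and 4096
 * becomes available again for r5l_write_stripe().
 */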
707 static void r5l_reclaim_thread(struct md_thread *thread)
708 {
709         struct mddev *mddev = thread->mddev;
710         struct r5conf *conf = mddev->private;
711         struct r5l_log *log = conf->log;
712
713         if (!log)
714                 return;
715         r5l_do_reclaim(log);
716 }
717
718 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
719 {
720         unsigned long target;
721         unsigned long new = (unsigned long)space; /* overflow in theory */
722
723         do {
724                 target = log->reclaim_target;
725                 if (new < target)
726                         return;
727         } while (cmpxchg(&log->reclaim_target, target, new) != target);
728         md_wakeup_thread(log->reclaim_thread);
729 }
730
731 struct r5l_recovery_ctx {
732         struct page *meta_page;         /* current meta */
733         sector_t meta_total_blocks;     /* total size of current meta and data */
734         sector_t pos;                   /* recovery position */
735         u64 seq;                        /* recovery position seq */
736 };
737
738 static int r5l_read_meta_block(struct r5l_log *log,
739                                struct r5l_recovery_ctx *ctx)
740 {
741         struct page *page = ctx->meta_page;
742         struct r5l_meta_block *mb;
743         u32 crc, stored_crc;
744
745         if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
746                 return -EIO;
747
748         mb = page_address(page);
749         stored_crc = le32_to_cpu(mb->checksum);
750         mb->checksum = 0;
751
752         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
753             le64_to_cpu(mb->seq) != ctx->seq ||
754             mb->version != R5LOG_VERSION ||
755             le64_to_cpu(mb->position) != ctx->pos)
756                 return -EINVAL;
757
758         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
759         if (stored_crc != crc)
760                 return -EINVAL;
761
762         if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
763                 return -EINVAL;
764
765         ctx->meta_total_blocks = BLOCK_SECTORS;
766
767         return 0;
768 }
769
770 static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
771                                          struct r5l_recovery_ctx *ctx,
772                                          sector_t stripe_sect,
773                                          int *offset, sector_t *log_offset)
774 {
775         struct r5conf *conf = log->rdev->mddev->private;
776         struct stripe_head *sh;
777         struct r5l_payload_data_parity *payload;
778         int disk_index;
779
780         sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
781         while (1) {
782                 payload = page_address(ctx->meta_page) + *offset;
783
784                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
785                         raid5_compute_sector(conf,
786                                              le64_to_cpu(payload->location), 0,
787                                              &disk_index, sh);
788
789                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
790                                      sh->dev[disk_index].page, READ, false);
791                         sh->dev[disk_index].log_checksum =
792                                 le32_to_cpu(payload->checksum[0]);
793                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
794                         ctx->meta_total_blocks += BLOCK_SECTORS;
795                 } else {
796                         disk_index = sh->pd_idx;
797                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
798                                      sh->dev[disk_index].page, READ, false);
799                         sh->dev[disk_index].log_checksum =
800                                 le32_to_cpu(payload->checksum[0]);
801                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
802
803                         if (sh->qd_idx >= 0) {
804                                 disk_index = sh->qd_idx;
805                                 sync_page_io(log->rdev,
806                                              r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
807                                              PAGE_SIZE, sh->dev[disk_index].page,
808                                              READ, false);
809                                 sh->dev[disk_index].log_checksum =
810                                         le32_to_cpu(payload->checksum[1]);
811                                 set_bit(R5_Wantwrite,
812                                         &sh->dev[disk_index].flags);
813                         }
814                         ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
815                 }
816
817                 *log_offset = r5l_ring_add(log, *log_offset,
818                                            le32_to_cpu(payload->size));
819                 *offset += sizeof(struct r5l_payload_data_parity) +
820                         sizeof(__le32) *
821                         (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
822                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
823                         break;
824         }
825
826         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
827                 void *addr;
828                 u32 checksum;
829
830                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
831                         continue;
832                 addr = kmap_atomic(sh->dev[disk_index].page);
833                 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
834                 kunmap_atomic(addr);
835                 if (checksum != sh->dev[disk_index].log_checksum)
836                         goto error;
837         }
838
839         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
840                 struct md_rdev *rdev, *rrdev;
841
842                 if (!test_and_clear_bit(R5_Wantwrite,
843                                         &sh->dev[disk_index].flags))
844                         continue;
845
846                 /* in case device is broken */
847                 rdev = rcu_dereference(conf->disks[disk_index].rdev);
848                 if (rdev)
849                         sync_page_io(rdev, stripe_sect, PAGE_SIZE,
850                                      sh->dev[disk_index].page, WRITE, false);
851                 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
852                 if (rrdev)
853                         sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
854                                      sh->dev[disk_index].page, WRITE, false);
855         }
856         raid5_release_stripe(sh);
857         return 0;
858
859 error:
860         for (disk_index = 0; disk_index < sh->disks; disk_index++)
861                 sh->dev[disk_index].flags = 0;
862         raid5_release_stripe(sh);
863         return -EINVAL;
864 }
865
866 static int r5l_recovery_flush_one_meta(struct r5l_log *log,
867                                        struct r5l_recovery_ctx *ctx)
868 {
869         struct r5conf *conf = log->rdev->mddev->private;
870         struct r5l_payload_data_parity *payload;
871         struct r5l_meta_block *mb;
872         int offset;
873         sector_t log_offset;
874         sector_t stripe_sector;
875
876         mb = page_address(ctx->meta_page);
877         offset = sizeof(struct r5l_meta_block);
878         log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
879
880         while (offset < le32_to_cpu(mb->meta_size)) {
881                 int dd;
882
883                 payload = (void *)mb + offset;
884                 stripe_sector = raid5_compute_sector(conf,
885                                                      le64_to_cpu(payload->location), 0, &dd, NULL);
886                 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
887                                                   &offset, &log_offset))
888                         return -EINVAL;
889         }
890         return 0;
891 }
892
893 /* copy data/parity from log to raid disks */
894 static void r5l_recovery_flush_log(struct r5l_log *log,
895                                    struct r5l_recovery_ctx *ctx)
896 {
897         while (1) {
898                 if (r5l_read_meta_block(log, ctx))
899                         return;
900                 if (r5l_recovery_flush_one_meta(log, ctx))
901                         return;
902                 ctx->seq++;
903                 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
904         }
905 }
906
907 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
908                                           u64 seq)
909 {
910         struct page *page;
911         struct r5l_meta_block *mb;
912         u32 crc;
913
914         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
915         if (!page)
916                 return -ENOMEM;
917         mb = page_address(page);
918         mb->magic = cpu_to_le32(R5LOG_MAGIC);
919         mb->version = R5LOG_VERSION;
920         mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
921         mb->seq = cpu_to_le64(seq);
922         mb->position = cpu_to_le64(pos);
923         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
924         mb->checksum = cpu_to_le32(crc);
925
926         if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
927                 __free_page(page);
928                 return -EIO;
929         }
930         __free_page(page);
931         return 0;
932 }
933
934 static int r5l_recovery_log(struct r5l_log *log)
935 {
936         struct r5l_recovery_ctx ctx;
937
938         ctx.pos = log->last_checkpoint;
939         ctx.seq = log->last_cp_seq;
940         ctx.meta_page = alloc_page(GFP_KERNEL);
941         if (!ctx.meta_page)
942                 return -ENOMEM;
943
944         r5l_recovery_flush_log(log, &ctx);
945         __free_page(ctx.meta_page);
946
947         /*
948          * We did a recovery. Now ctx.pos points to an invalid meta block, and
949          * the new log will start here. But we can't let the superblock point to
950          * the last valid meta block. The log might look like:
951          * | meta 1 | meta 2 | meta 3 |
952          * meta 1 is valid, meta 2 is invalid and meta 3 could be valid. If the
953          * superblock points to meta 1 and we write a new valid meta 2n, a later
954          * crash makes recovery start from meta 1 again; since meta 2n is now
955          * valid, recovery would think meta 3 is valid too, which is wrong. The
956          * solution is to create the new meta at meta 2's position with seq ==
957          * meta 1's seq + 10 and let the superblock point to meta 2. Recovery
958          * then won't treat meta 3 as valid, because its seq doesn't match.
959          */
960         if (ctx.seq > log->last_cp_seq + 1) {
961                 int ret;
962
963                 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
964                 if (ret)
965                         return ret;
966                 log->seq = ctx.seq + 11;
967                 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
968                 r5l_write_super(log, ctx.pos);
969         } else {
970                 log->log_start = ctx.pos;
971                 log->seq = ctx.seq;
972         }
973         return 0;
974 }
975
976 static void r5l_write_super(struct r5l_log *log, sector_t cp)
977 {
978         struct mddev *mddev = log->rdev->mddev;
979
980         log->rdev->journal_tail = cp;
981         set_bit(MD_CHANGE_DEVS, &mddev->flags);
982 }
983
984 static int r5l_load_log(struct r5l_log *log)
985 {
986         struct md_rdev *rdev = log->rdev;
987         struct page *page;
988         struct r5l_meta_block *mb;
989         sector_t cp = log->rdev->journal_tail;
990         u32 stored_crc, expected_crc;
991         bool create_super = false;
992         int ret;
993
994         /* Make sure it's valid */
995         if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
996                 cp = 0;
997         page = alloc_page(GFP_KERNEL);
998         if (!page)
999                 return -ENOMEM;
1000
1001         if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
1002                 ret = -EIO;
1003                 goto ioerr;
1004         }
1005         mb = page_address(page);
1006
1007         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1008             mb->version != R5LOG_VERSION) {
1009                 create_super = true;
1010                 goto create;
1011         }
1012         stored_crc = le32_to_cpu(mb->checksum);
1013         mb->checksum = 0;
1014         expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1015         if (stored_crc != expected_crc) {
1016                 create_super = true;
1017                 goto create;
1018         }
1019         if (le64_to_cpu(mb->position) != cp) {
1020                 create_super = true;
1021                 goto create;
1022         }
1023 create:
1024         if (create_super) {
1025                 log->last_cp_seq = prandom_u32();
1026                 cp = 0;
1027                 /*
1028                  * Make sure the super points to the correct address. The log
1029                  * might have data very soon. If the super doesn't have the
1030                  * correct log tail address, recovery can't find the log.
1031                  */
1032                 r5l_write_super(log, cp);
1033         } else
1034                 log->last_cp_seq = le64_to_cpu(mb->seq);
1035
1036         log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1037         log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1038         if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1039                 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1040         log->last_checkpoint = cp;
1041
1042         __free_page(page);
1043
1044         return r5l_recovery_log(log);
1045 ioerr:
1046         __free_page(page);
1047         return ret;
1048 }
1049
1050 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1051 {
1052         struct r5l_log *log;
1053
1054         if (PAGE_SIZE != 4096)
1055                 return -EINVAL;
1056         log = kzalloc(sizeof(*log), GFP_KERNEL);
1057         if (!log)
1058                 return -ENOMEM;
1059         log->rdev = rdev;
1060
1061         log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1062                                        sizeof(rdev->mddev->uuid));
1063
1064         mutex_init(&log->io_mutex);
1065
1066         spin_lock_init(&log->io_list_lock);
1067         INIT_LIST_HEAD(&log->running_ios);
1068         INIT_LIST_HEAD(&log->io_end_ios);
1069         INIT_LIST_HEAD(&log->stripe_end_ios);
1070         INIT_LIST_HEAD(&log->flushing_ios);
1071         INIT_LIST_HEAD(&log->flushed_ios);
1072         bio_init(&log->flush_bio);
1073
1074         log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1075         if (!log->io_kc)
1076                 goto io_kc;
1077
1078         log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1079                                                  log->rdev->mddev, "reclaim");
1080         if (!log->reclaim_thread)
1081                 goto reclaim_thread;
1082
1083         INIT_LIST_HEAD(&log->no_space_stripes);
1084         spin_lock_init(&log->no_space_stripes_lock);
1085
1086         if (r5l_load_log(log))
1087                 goto error;
1088
1089         conf->log = log;
1090         return 0;
1091 error:
1092         md_unregister_thread(&log->reclaim_thread);
1093 reclaim_thread:
1094         kmem_cache_destroy(log->io_kc);
1095 io_kc:
1096         kfree(log);
1097         return -EINVAL;
1098 }
1099
1100 void r5l_exit_log(struct r5l_log *log)
1101 {
1102         /*
1103          * at this point all stripes are finished, so every io_unit is at least
1104          * in STRIPE_END state
1105          */
1106         r5l_wake_reclaim(log, -1L);
1107         md_unregister_thread(&log->reclaim_thread);
1108         r5l_do_reclaim(log);
1109         /*
1110          * force a super update; r5l_do_reclaim might have updated the super
1111          * and mddev->thread is already stopped
1112          */
1113         md_update_sb(log->rdev->mddev, 1);
1114
1115         kmem_cache_destroy(log->io_kc);
1116         kfree(log);
1117 }