drivers/md/raid5-cache.c
1 /*
2  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  */
14 #include <linux/kernel.h>
15 #include <linux/wait.h>
16 #include <linux/blkdev.h>
17 #include <linux/slab.h>
18 #include <linux/raid/md_p.h>
19 #include <linux/crc32c.h>
20 #include <linux/random.h>
21 #include "md.h"
22 #include "raid5.h"
23
24 /*
25  * metadata/data is stored on disk in 4k units (blocks) regardless of the
26  * underlying hardware sector size. This only works with PAGE_SIZE == 4096
27  */
28 #define BLOCK_SECTORS (8)
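/*
 * One 4k block covers 4096 / 512 = 8 of the 512-byte sectors that md uses as
 * its addressing unit, hence BLOCK_SECTORS == 8.
 */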
29
30 /*
31  * reclaim runs when 1/4 of the disk size or 10G of space, whichever is
32  * smaller, becomes reclaimable. This prevents recovery scanning a very long log
33  */
34 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
35 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
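/*
 * Worked numbers: with 512-byte sectors, 10 * 1024 * 1024 * 2 sectors is
 * exactly 10GiB, and the shift by RECLAIM_MAX_FREE_SPACE_SHIFT (2) divides
 * the device size by 4; r5l_load_log() takes the smaller of the two as
 * log->max_free_space.
 */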
36
37 struct r5l_log {
38         struct md_rdev *rdev;
39
40         u32 uuid_checksum;
41
42         sector_t device_size;           /* log device size, rounded down to
43                                          * a multiple of BLOCK_SECTORS */
44         sector_t max_free_space;        /* reclaim runs once reclaimable
45                                          * space reaches this size */
46
47         sector_t last_checkpoint;       /* log tail; where the recovery scan
48                                          * starts from */
49         u64 last_cp_seq;                /* log tail sequence */
50
51         sector_t log_start;             /* log head. where new data appends */
52         u64 seq;                        /* log head sequence */
53
54         sector_t next_checkpoint;
55         u64 next_cp_seq;
56
57         struct mutex io_mutex;
58         struct r5l_io_unit *current_io; /* current io_unit accepting new data */
59
60         spinlock_t io_list_lock;
61         struct list_head running_ios;   /* io_units which are still running,
62                                          * and have not yet been completely
63                                          * written to the log */
64         struct list_head io_end_ios;    /* io_units which have been completely
65                                          * written to the log but not yet written
66                                          * to the RAID */
67         struct list_head flushing_ios;  /* io_units which are waiting for log
68                                          * cache flush */
69         struct list_head finished_ios;  /* io_units that have settled down in the log disk */
70         struct bio flush_bio;
71
72         struct kmem_cache *io_kc;
73
74         struct md_thread *reclaim_thread;
75         unsigned long reclaim_target;   /* amount of space that needs to be
76                                          * reclaimed.  if it's 0, reclaim the
77                                          * space used by io_units which are in
78                                          * IO_UNIT_STRIPE_END state (i.e. reclaim
79                                          * doesn't wait for a specific io_unit
80                                          * to switch to IO_UNIT_STRIPE_END
81                                          * state) */
82         wait_queue_head_t iounit_wait;
83
84         struct list_head no_space_stripes; /* pending stripes, log has no space */
85         spinlock_t no_space_stripes_lock;
86 };
87
88 /*
89  * an IO range starts at a meta data block and ends at the next meta data
90  * block. The io_unit's meta data block tracks the data/parity that follows it.
91  * An io_unit is written to the log disk with normal writes; as we always flush
92  * the log disk first and only then start moving data to the raid disks, there
93  * is no requirement to write the io_unit with FLUSH/FUA
94  */
95 struct r5l_io_unit {
96         struct r5l_log *log;
97
98         struct page *meta_page; /* store meta block */
99         int meta_offset;        /* current offset in meta_page */
100
101         struct bio_list bios;
102         atomic_t pending_io;    /* pending bios not written to log yet */
103         struct bio *current_bio;/* current_bio accepting new data */
104
105         atomic_t pending_stripe;/* how many stripes not flushed to raid */
106         u64 seq;                /* seq number of the metablock */
107         sector_t log_start;     /* where the io_unit starts */
108         sector_t log_end;       /* where the io_unit ends */
109         struct list_head log_sibling; /* log->running_ios */
110         struct list_head stripe_list; /* stripes added to the io_unit */
111
112         int state;
113 };
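/*
 * Rough on-disk picture of one io_unit (illustrative sketch, derived from
 * r5l_new_meta() and r5l_append_payload_page() below):
 *
 *   log_start                                               log_end
 *   v                                                       v
 *   | meta block | data page | data page | ... | parity page(s) |
 *
 * Each slot is one 4k block (BLOCK_SECTORS sectors); the meta block describes
 * the data/parity pages following it, and log_end is also the next io_unit's
 * log_start.
 */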
114
115 /* r5l_io_unit state */
116 enum r5l_io_unit_state {
117         IO_UNIT_RUNNING = 0,    /* accepting new IO */
118         IO_UNIT_IO_START = 1,   /* io_unit bios have started writing to the
119                                  * log; no longer accepting new bios */
120         IO_UNIT_IO_END = 2,     /* io_unit bios have finished writing to the log */
121         IO_UNIT_STRIPE_END = 3, /* stripe data has finished writing to the raid */
122 };
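/*
 * Expected progression, as implemented below: r5l_submit_current_io() moves a
 * unit from IO_UNIT_RUNNING to IO_UNIT_IO_START, r5l_log_endio() marks it
 * IO_UNIT_IO_END once all of its bios have completed, and
 * __r5l_stripe_write_finished() marks it IO_UNIT_STRIPE_END after every
 * attached stripe has been written to the raid disks.
 */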
123
124 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
125 {
126         start += inc;
127         if (start >= log->device_size)
128                 start = start - log->device_size;
129         return start;
130 }
131
132 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
133                                   sector_t end)
134 {
135         if (end >= start)
136                 return end - start;
137         else
138                 return end + log->device_size - start;
139 }
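/*
 * For illustration (hypothetical numbers): with device_size == 1024 sectors,
 * r5l_ring_add(log, 1016, 16) wraps around to sector 8, and
 * r5l_ring_distance(log, 1016, 8) gives 8 + 1024 - 1016 = 16 sectors.
 */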
140
141 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
142 {
143         sector_t used_size;
144
145         used_size = r5l_ring_distance(log, log->last_checkpoint,
146                                         log->log_start);
147
148         return log->device_size > used_size + size;
149 }
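/*
 * Note the strict '>' above: presumably at least one block always stays
 * unused between head and tail, so log_start == last_checkpoint can only
 * mean "empty", never "full".
 */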
150
151 static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
152 {
153         struct r5l_io_unit *io;
154         /* We can't handle memory allocation failure so far */
155         gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
156
157         io = kmem_cache_zalloc(log->io_kc, gfp);
158         io->log = log;
159         io->meta_page = alloc_page(gfp | __GFP_ZERO);
160
161         bio_list_init(&io->bios);
162         INIT_LIST_HEAD(&io->log_sibling);
163         INIT_LIST_HEAD(&io->stripe_list);
164         io->state = IO_UNIT_RUNNING;
165         return io;
166 }
167
168 static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
169 {
170         __free_page(io->meta_page);
171         kmem_cache_free(log->io_kc, io);
172 }
173
174 static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
175                                   enum r5l_io_unit_state state)
176 {
177         struct r5l_io_unit *io;
178
179         while (!list_empty(from)) {
180                 io = list_first_entry(from, struct r5l_io_unit, log_sibling);
181                 /* don't change list order */
182                 if (io->state >= state)
183                         list_move_tail(&io->log_sibling, to);
184                 else
185                         break;
186         }
187 }
188
189 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
190                                     enum r5l_io_unit_state state)
191 {
192         if (WARN_ON(io->state >= state))
193                 return;
194         io->state = state;
195 }
196
197 static void r5l_io_run_stripes(struct r5l_io_unit *io)
198 {
199         struct stripe_head *sh, *next;
200
201         list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
202                 list_del_init(&sh->log_list);
203                 set_bit(STRIPE_HANDLE, &sh->state);
204                 raid5_release_stripe(sh);
205         }
206 }
207
208 /* XXX: totally ignores I/O errors */
209 static void r5l_log_endio(struct bio *bio)
210 {
211         struct r5l_io_unit *io = bio->bi_private;
212         struct r5l_log *log = io->log;
213         unsigned long flags;
214
215         bio_put(bio);
216
217         if (!atomic_dec_and_test(&io->pending_io))
218                 return;
219
220         spin_lock_irqsave(&log->io_list_lock, flags);
221         __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
222         r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
223                         IO_UNIT_IO_END);
224         spin_unlock_irqrestore(&log->io_list_lock, flags);
225
226         md_wakeup_thread(log->rdev->mddev->thread);
227 }
228
229 static void r5l_submit_current_io(struct r5l_log *log)
230 {
231         struct r5l_io_unit *io = log->current_io;
232         struct r5l_meta_block *block;
233         struct bio *bio;
234         unsigned long flags;
235         u32 crc;
236
237         if (!io)
238                 return;
239
240         block = page_address(io->meta_page);
241         block->meta_size = cpu_to_le32(io->meta_offset);
242         crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
243         block->checksum = cpu_to_le32(crc);
244
245         log->current_io = NULL;
246         spin_lock_irqsave(&log->io_list_lock, flags);
247         __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
248         spin_unlock_irqrestore(&log->io_list_lock, flags);
249
250         while ((bio = bio_list_pop(&io->bios))) {
251                 /* all IO must start from rdev->data_offset */
252                 bio->bi_iter.bi_sector += log->rdev->data_offset;
253                 submit_bio(WRITE, bio);
254         }
255 }
256
257 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
258 {
259         struct r5l_io_unit *io;
260         struct r5l_meta_block *block;
261         struct bio *bio;
262
263         io = r5l_alloc_io_unit(log);
264
265         block = page_address(io->meta_page);
266         block->magic = cpu_to_le32(R5LOG_MAGIC);
267         block->version = R5LOG_VERSION;
268         block->seq = cpu_to_le64(log->seq);
269         block->position = cpu_to_le64(log->log_start);
270
271         io->log_start = log->log_start;
272         io->meta_offset = sizeof(struct r5l_meta_block);
273         io->seq = log->seq;
274
275         bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
276         io->current_bio = bio;
277         bio->bi_rw = WRITE;
278         bio->bi_bdev = log->rdev->bdev;
279         bio->bi_iter.bi_sector = log->log_start;
280         bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
281         bio->bi_end_io = r5l_log_endio;
282         bio->bi_private = io;
283
284         bio_list_add(&io->bios, bio);
285         atomic_inc(&io->pending_io);
286
287         log->seq++;
288         log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
289         io->log_end = log->log_start;
290         /* current bio hit disk end */
291         if (log->log_start == 0)
292                 io->current_bio = NULL;
293
294         spin_lock_irq(&log->io_list_lock);
295         list_add_tail(&io->log_sibling, &log->running_ios);
296         spin_unlock_irq(&log->io_list_lock);
297
298         return io;
299 }
300
301 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
302 {
303         struct r5l_io_unit *io;
304
305         io = log->current_io;
306         if (io && io->meta_offset + payload_size > PAGE_SIZE)
307                 r5l_submit_current_io(log);
308         io = log->current_io;
309         if (io)
310                 return 0;
311
312         log->current_io = r5l_new_meta(log);
313         return 0;
314 }
315
316 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
317                                     sector_t location,
318                                     u32 checksum1, u32 checksum2,
319                                     bool checksum2_valid)
320 {
321         struct r5l_io_unit *io = log->current_io;
322         struct r5l_payload_data_parity *payload;
323
324         payload = page_address(io->meta_page) + io->meta_offset;
325         payload->header.type = cpu_to_le16(type);
326         payload->header.flags = cpu_to_le16(0);
327         payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
328                                     (PAGE_SHIFT - 9));
329         payload->location = cpu_to_le64(location);
330         payload->checksum[0] = cpu_to_le32(checksum1);
331         if (checksum2_valid)
332                 payload->checksum[1] = cpu_to_le32(checksum2);
333
334         io->meta_offset += sizeof(struct r5l_payload_data_parity) +
335                 sizeof(__le32) * (1 + !!checksum2_valid);
336 }
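/*
 * Following the arithmetic above, payload->size is recorded in sectors: 8
 * (one 4k page) for a data payload, 16 for a parity payload that carries
 * both P and Q, and meta_offset grows by the payload header plus one __le32
 * checksum per page described.
 */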
337
338 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
339 {
340         struct r5l_io_unit *io = log->current_io;
341
342 alloc_bio:
343         if (!io->current_bio) {
344                 struct bio *bio;
345
346                 bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
347                 bio->bi_rw = WRITE;
348                 bio->bi_bdev = log->rdev->bdev;
349                 bio->bi_iter.bi_sector = log->log_start;
350                 bio->bi_end_io = r5l_log_endio;
351                 bio->bi_private = io;
352                 bio_list_add(&io->bios, bio);
353                 atomic_inc(&io->pending_io);
354                 io->current_bio = bio;
355         }
356         if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
357                 io->current_bio = NULL;
358                 goto alloc_bio;
359         }
360         log->log_start = r5l_ring_add(log, log->log_start,
361                                       BLOCK_SECTORS);
362         /* current bio hit disk end */
363         if (log->log_start == 0)
364                 io->current_bio = NULL;
365
366         io->log_end = log->log_start;
367 }
368
369 static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
370                            int data_pages, int parity_pages)
371 {
372         int i;
373         int meta_size;
374         struct r5l_io_unit *io;
375
376         meta_size =
377                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
378                  * data_pages) +
379                 sizeof(struct r5l_payload_data_parity) +
380                 sizeof(__le32) * parity_pages;
381
382         r5l_get_meta(log, meta_size);
383         io = log->current_io;
384
385         for (i = 0; i < sh->disks; i++) {
386                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
387                         continue;
388                 if (i == sh->pd_idx || i == sh->qd_idx)
389                         continue;
390                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
391                                         raid5_compute_blocknr(sh, i, 0),
392                                         sh->dev[i].log_checksum, 0, false);
393                 r5l_append_payload_page(log, sh->dev[i].page);
394         }
395
396         if (sh->qd_idx >= 0) {
397                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
398                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
399                                         sh->dev[sh->qd_idx].log_checksum, true);
400                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
401                 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
402         } else {
403                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
404                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
405                                         0, false);
406                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
407         }
408
409         list_add_tail(&sh->log_list, &io->stripe_list);
410         atomic_inc(&io->pending_stripe);
411         sh->log_io = io;
412 }
413
414 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
415 /*
416  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
417  * data from log to raid disks), so we shouldn't wait for reclaim here
418  */
419 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
420 {
421         int write_disks = 0;
422         int data_pages, parity_pages;
423         int meta_size;
424         int reserve;
425         int i;
426
427         if (!log)
428                 return -EAGAIN;
429         /* Don't support stripe batch */
430         if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
431             test_bit(STRIPE_SYNCING, &sh->state)) {
432                 /* the stripe is written to log, we start writing it to raid */
433                 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
434                 return -EAGAIN;
435         }
436
437         for (i = 0; i < sh->disks; i++) {
438                 void *addr;
439
440                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
441                         continue;
442                 write_disks++;
443                 /* checksum is already calculated in last run */
444                 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
445                         continue;
446                 addr = kmap_atomic(sh->dev[i].page);
447                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
448                                                     addr, PAGE_SIZE);
449                 kunmap_atomic(addr);
450         }
451         parity_pages = 1 + !!(sh->qd_idx >= 0);
452         data_pages = write_disks - parity_pages;
453
454         meta_size =
455                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
456                  * data_pages) +
457                 sizeof(struct r5l_payload_data_parity) +
458                 sizeof(__le32) * parity_pages;
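        /*
         * For illustration: each data page needs its own payload header plus
         * one checksum, while the parity pages (P, and Q if present) share a
         * single payload header with one checksum each, matching how
         * r5l_log_stripe() appends the payloads.
         */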
459         /* Doesn't work with very big raid array */
460         if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
461                 return -EINVAL;
462
463         set_bit(STRIPE_LOG_TRAPPED, &sh->state);
464         /*
465          * The stripe must enter state machine again to finish the write, so
466          * don't delay.
467          */
468         clear_bit(STRIPE_DELAYED, &sh->state);
469         atomic_inc(&sh->count);
470
471         mutex_lock(&log->io_mutex);
472         /* meta + data */
473         reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
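        /*
         * With 4k pages this reserves 8 sectors per page: one block for the
         * meta block plus one block for each data/parity page to be logged.
         */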
474         if (r5l_has_free_space(log, reserve))
475                 r5l_log_stripe(log, sh, data_pages, parity_pages);
476         else {
477                 spin_lock(&log->no_space_stripes_lock);
478                 list_add_tail(&sh->log_list, &log->no_space_stripes);
479                 spin_unlock(&log->no_space_stripes_lock);
480
481                 r5l_wake_reclaim(log, reserve);
482         }
483         mutex_unlock(&log->io_mutex);
484
485         return 0;
486 }
487
488 void r5l_write_stripe_run(struct r5l_log *log)
489 {
490         if (!log)
491                 return;
492         mutex_lock(&log->io_mutex);
493         r5l_submit_current_io(log);
494         mutex_unlock(&log->io_mutex);
495 }
496
497 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
498 {
499         if (!log)
500                 return -ENODEV;
501         /*
502          * we flush the log disk cache first, then write stripe data to the raid
503          * disks. So if this bio has finished, the log disk cache is already
504          * flushed. Recovery guarantees that we can recover the bio's data from
505          * the log disk, so we don't need to flush again
506          */
507         if (bio->bi_iter.bi_size == 0) {
508                 bio_endio(bio);
509                 return 0;
510         }
511         bio->bi_rw &= ~REQ_FLUSH;
512         return -EAGAIN;
513 }
514
515 /* This will run after log space is reclaimed */
516 static void r5l_run_no_space_stripes(struct r5l_log *log)
517 {
518         struct stripe_head *sh;
519
520         spin_lock(&log->no_space_stripes_lock);
521         while (!list_empty(&log->no_space_stripes)) {
522                 sh = list_first_entry(&log->no_space_stripes,
523                                       struct stripe_head, log_list);
524                 list_del_init(&sh->log_list);
525                 set_bit(STRIPE_HANDLE, &sh->state);
526                 raid5_release_stripe(sh);
527         }
528         spin_unlock(&log->no_space_stripes_lock);
529 }
530
531 static sector_t r5l_reclaimable_space(struct r5l_log *log)
532 {
533         return r5l_ring_distance(log, log->last_checkpoint,
534                                  log->next_checkpoint);
535 }
536
537 static bool r5l_complete_finished_ios(struct r5l_log *log)
538 {
539         struct r5l_io_unit *io, *next;
540         bool found = false;
541
542         assert_spin_locked(&log->io_list_lock);
543
544         list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
545                 /* don't change list order */
546                 if (io->state < IO_UNIT_STRIPE_END)
547                         break;
548
549                 log->next_checkpoint = io->log_start;
550                 log->next_cp_seq = io->seq;
551
552                 list_del(&io->log_sibling);
553                 r5l_free_io_unit(log, io);
554
555                 found = true;
556         }
557
558         return found;
559 }
560
561 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
562 {
563         struct r5l_log *log = io->log;
564         unsigned long flags;
565
566         spin_lock_irqsave(&log->io_list_lock, flags);
567         __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
568
569         if (!r5l_complete_finished_ios(log)) {
570                 spin_unlock_irqrestore(&log->io_list_lock, flags);
571                 return;
572         }
573
574         if (r5l_reclaimable_space(log) > log->max_free_space)
575                 r5l_wake_reclaim(log, 0);
576
577         spin_unlock_irqrestore(&log->io_list_lock, flags);
578         wake_up(&log->iounit_wait);
579 }
580
581 void r5l_stripe_write_finished(struct stripe_head *sh)
582 {
583         struct r5l_io_unit *io;
584
585         io = sh->log_io;
586         sh->log_io = NULL;
587
588         if (io && atomic_dec_and_test(&io->pending_stripe))
589                 __r5l_stripe_write_finished(io);
590 }
591
592 static void r5l_log_flush_endio(struct bio *bio)
593 {
594         struct r5l_log *log = container_of(bio, struct r5l_log,
595                 flush_bio);
596         unsigned long flags;
597         struct r5l_io_unit *io;
598
599         spin_lock_irqsave(&log->io_list_lock, flags);
600         list_for_each_entry(io, &log->flushing_ios, log_sibling)
601                 r5l_io_run_stripes(io);
602         list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
603         spin_unlock_irqrestore(&log->io_list_lock, flags);
604 }
605
606 /*
607  * Start dispatching IO to the raid disks.
608  * The log is a sequence of io_units, each headed by a meta block. One
609  * situation we want to avoid: a broken meta block in the middle of the log
610  * prevents recovery from finding the meta blocks at the head of the log.
611  * So if an operation requires a meta block at the head to be persistent in
612  * the log, the meta blocks before it must be persistent as well. A case:
613  *
614  * stripe data/parity is in the log and we start writing the stripe to the
615  * raid disks; the data/parity must be persistent in the log first.
616  *
617  * The solution is to strictly maintain io_unit list order: stripes of an
618  * io_unit go to raid only once it and all older io_units are in the log.
619  */
620 void r5l_flush_stripe_to_raid(struct r5l_log *log)
621 {
622         bool do_flush;
623         if (!log)
624                 return;
625
626         spin_lock_irq(&log->io_list_lock);
627         /* flush bio is running */
628         if (!list_empty(&log->flushing_ios)) {
629                 spin_unlock_irq(&log->io_list_lock);
630                 return;
631         }
632         list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
633         do_flush = !list_empty(&log->flushing_ios);
634         spin_unlock_irq(&log->io_list_lock);
635
636         if (!do_flush)
637                 return;
638         bio_reset(&log->flush_bio);
639         log->flush_bio.bi_bdev = log->rdev->bdev;
640         log->flush_bio.bi_end_io = r5l_log_flush_endio;
641         submit_bio(WRITE_FLUSH, &log->flush_bio);
642 }
643
644 static void r5l_write_super(struct r5l_log *log, sector_t cp);
645 static void r5l_do_reclaim(struct r5l_log *log)
646 {
647         sector_t reclaim_target = xchg(&log->reclaim_target, 0);
648         sector_t reclaimable;
649         sector_t next_checkpoint;
650         u64 next_cp_seq;
651
652         spin_lock_irq(&log->io_list_lock);
653         /*
654          * wait until enough io_units become reclaimable. We must not change the
655          * order: reclaimable and unreclaimable io_units can be mixed in the list,
656          * and we shouldn't reuse the space of an unreclaimable io_unit
657          */
658         while (1) {
659                 reclaimable = r5l_reclaimable_space(log);
660                 if (reclaimable >= reclaim_target ||
661                     (list_empty(&log->running_ios) &&
662                      list_empty(&log->io_end_ios) &&
663                      list_empty(&log->flushing_ios) &&
664                      list_empty(&log->finished_ios)))
665                         break;
666
667                 md_wakeup_thread(log->rdev->mddev->thread);
668                 wait_event_lock_irq(log->iounit_wait,
669                                     r5l_reclaimable_space(log) > reclaimable,
670                                     log->io_list_lock);
671         }
672
673         next_checkpoint = log->next_checkpoint;
674         next_cp_seq = log->next_cp_seq;
675         spin_unlock_irq(&log->io_list_lock);
676
677         BUG_ON(reclaimable < 0);
678         if (reclaimable == 0)
679                 return;
680
681         /*
682          * write_super will flush the cache of each raid disk. We must write the
683          * super here, because the log area might be reused soon and we don't
684          * want to confuse recovery
685          */
686         r5l_write_super(log, next_checkpoint);
687
688         mutex_lock(&log->io_mutex);
689         log->last_checkpoint = next_checkpoint;
690         log->last_cp_seq = next_cp_seq;
691         mutex_unlock(&log->io_mutex);
692
693         r5l_run_no_space_stripes(log);
694 }
695
696 static void r5l_reclaim_thread(struct md_thread *thread)
697 {
698         struct mddev *mddev = thread->mddev;
699         struct r5conf *conf = mddev->private;
700         struct r5l_log *log = conf->log;
701
702         if (!log)
703                 return;
704         r5l_do_reclaim(log);
705 }
706
707 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
708 {
709         unsigned long target;
710         unsigned long new = (unsigned long)space; /* overflow in theory */
711
712         do {
713                 target = log->reclaim_target;
714                 if (new < target)
715                         return;
716         } while (cmpxchg(&log->reclaim_target, target, new) != target);
717         md_wakeup_thread(log->reclaim_thread);
718 }
719
720 void r5l_quiesce(struct r5l_log *log, int state)
721 {
722         if (!log || state == 2)
723                 return;
724         if (state == 0) {
725                 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
726                                         log->rdev->mddev, "reclaim");
727         } else if (state == 1) {
728                 /*
729                  * at this point all stripes are finished, so io_unit is at
730                  * least in STRIPE_END state
731                  */
732                 r5l_wake_reclaim(log, -1L);
733                 md_unregister_thread(&log->reclaim_thread);
734                 r5l_do_reclaim(log);
735         }
736 }
737
738 struct r5l_recovery_ctx {
739         struct page *meta_page;         /* current meta */
740         sector_t meta_total_blocks;     /* total size of current meta and data */
741         sector_t pos;                   /* recovery position */
742         u64 seq;                        /* recovery position seq */
743 };
744
745 static int r5l_read_meta_block(struct r5l_log *log,
746                                struct r5l_recovery_ctx *ctx)
747 {
748         struct page *page = ctx->meta_page;
749         struct r5l_meta_block *mb;
750         u32 crc, stored_crc;
751
752         if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
753                 return -EIO;
754
755         mb = page_address(page);
756         stored_crc = le32_to_cpu(mb->checksum);
757         mb->checksum = 0;
758
759         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
760             le64_to_cpu(mb->seq) != ctx->seq ||
761             mb->version != R5LOG_VERSION ||
762             le64_to_cpu(mb->position) != ctx->pos)
763                 return -EINVAL;
764
765         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
766         if (stored_crc != crc)
767                 return -EINVAL;
768
769         if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
770                 return -EINVAL;
771
772         ctx->meta_total_blocks = BLOCK_SECTORS;
773
774         return 0;
775 }
776
777 static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
778                                          struct r5l_recovery_ctx *ctx,
779                                          sector_t stripe_sect,
780                                          int *offset, sector_t *log_offset)
781 {
782         struct r5conf *conf = log->rdev->mddev->private;
783         struct stripe_head *sh;
784         struct r5l_payload_data_parity *payload;
785         int disk_index;
786
787         sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
788         while (1) {
789                 payload = page_address(ctx->meta_page) + *offset;
790
791                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
792                         raid5_compute_sector(conf,
793                                              le64_to_cpu(payload->location), 0,
794                                              &disk_index, sh);
795
796                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
797                                      sh->dev[disk_index].page, READ, false);
798                         sh->dev[disk_index].log_checksum =
799                                 le32_to_cpu(payload->checksum[0]);
800                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
801                         ctx->meta_total_blocks += BLOCK_SECTORS;
802                 } else {
803                         disk_index = sh->pd_idx;
804                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
805                                      sh->dev[disk_index].page, READ, false);
806                         sh->dev[disk_index].log_checksum =
807                                 le32_to_cpu(payload->checksum[0]);
808                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
809
810                         if (sh->qd_idx >= 0) {
811                                 disk_index = sh->qd_idx;
812                                 sync_page_io(log->rdev,
813                                              r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
814                                              PAGE_SIZE, sh->dev[disk_index].page,
815                                              READ, false);
816                                 sh->dev[disk_index].log_checksum =
817                                         le32_to_cpu(payload->checksum[1]);
818                                 set_bit(R5_Wantwrite,
819                                         &sh->dev[disk_index].flags);
820                         }
821                         ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
822                 }
823
824                 *log_offset = r5l_ring_add(log, *log_offset,
825                                            le32_to_cpu(payload->size));
826                 *offset += sizeof(struct r5l_payload_data_parity) +
827                         sizeof(__le32) *
828                         (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
829                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
830                         break;
831         }
832
833         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
834                 void *addr;
835                 u32 checksum;
836
837                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
838                         continue;
839                 addr = kmap_atomic(sh->dev[disk_index].page);
840                 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
841                 kunmap_atomic(addr);
842                 if (checksum != sh->dev[disk_index].log_checksum)
843                         goto error;
844         }
845
846         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
847                 struct md_rdev *rdev, *rrdev;
848
849                 if (!test_and_clear_bit(R5_Wantwrite,
850                                         &sh->dev[disk_index].flags))
851                         continue;
852
853                 /* in case device is broken */
854                 rdev = rcu_dereference(conf->disks[disk_index].rdev);
855                 if (rdev)
856                         sync_page_io(rdev, stripe_sect, PAGE_SIZE,
857                                      sh->dev[disk_index].page, WRITE, false);
858                 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
859                 if (rrdev)
860                         sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
861                                      sh->dev[disk_index].page, WRITE, false);
862         }
863         raid5_release_stripe(sh);
864         return 0;
865
866 error:
867         for (disk_index = 0; disk_index < sh->disks; disk_index++)
868                 sh->dev[disk_index].flags = 0;
869         raid5_release_stripe(sh);
870         return -EINVAL;
871 }
872
873 static int r5l_recovery_flush_one_meta(struct r5l_log *log,
874                                        struct r5l_recovery_ctx *ctx)
875 {
876         struct r5conf *conf = log->rdev->mddev->private;
877         struct r5l_payload_data_parity *payload;
878         struct r5l_meta_block *mb;
879         int offset;
880         sector_t log_offset;
881         sector_t stripe_sector;
882
883         mb = page_address(ctx->meta_page);
884         offset = sizeof(struct r5l_meta_block);
885         log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
886
887         while (offset < le32_to_cpu(mb->meta_size)) {
888                 int dd;
889
890                 payload = (void *)mb + offset;
891                 stripe_sector = raid5_compute_sector(conf,
892                                                      le64_to_cpu(payload->location), 0, &dd, NULL);
893                 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
894                                                   &offset, &log_offset))
895                         return -EINVAL;
896         }
897         return 0;
898 }
899
900 /* copy data/parity from log to raid disks */
901 static void r5l_recovery_flush_log(struct r5l_log *log,
902                                    struct r5l_recovery_ctx *ctx)
903 {
904         while (1) {
905                 if (r5l_read_meta_block(log, ctx))
906                         return;
907                 if (r5l_recovery_flush_one_meta(log, ctx))
908                         return;
909                 ctx->seq++;
910                 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
911         }
912 }
913
914 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
915                                           u64 seq)
916 {
917         struct page *page;
918         struct r5l_meta_block *mb;
919         u32 crc;
920
921         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
922         if (!page)
923                 return -ENOMEM;
924         mb = page_address(page);
925         mb->magic = cpu_to_le32(R5LOG_MAGIC);
926         mb->version = R5LOG_VERSION;
927         mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
928         mb->seq = cpu_to_le64(seq);
929         mb->position = cpu_to_le64(pos);
930         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
931         mb->checksum = cpu_to_le32(crc);
932
933         if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
934                 __free_page(page);
935                 return -EIO;
936         }
937         __free_page(page);
938         return 0;
939 }
940
941 static int r5l_recovery_log(struct r5l_log *log)
942 {
943         struct r5l_recovery_ctx ctx;
944
945         ctx.pos = log->last_checkpoint;
946         ctx.seq = log->last_cp_seq;
947         ctx.meta_page = alloc_page(GFP_KERNEL);
948         if (!ctx.meta_page)
949                 return -ENOMEM;
950
951         r5l_recovery_flush_log(log, &ctx);
952         __free_page(ctx.meta_page);
953
954         /*
955          * we did a recovery. Now ctx.pos points to an invalid meta block. The
956          * new log will start here, but we can't let the superblock point to the
957          * last valid meta block. The log might look like:
958          * | meta 1| meta 2| meta 3|
959          * meta 1 is valid, meta 2 is invalid, and meta 3 could still look valid.
960          * If the superblock kept pointing to meta 1 and we wrote a new valid
961          * meta 2n, then after another crash recovery would start from meta 1,
962          * accept meta 2n, and then wrongly accept meta 3 as valid as well.
963          * The solution is to create the new meta at meta 2's position with seq
964          * == meta 1's seq + 10 and let the superblock point to it. Recovery will
965          * then not treat meta 3 as valid, because its seq doesn't match
966          */
967         if (ctx.seq > log->last_cp_seq + 1) {
968                 int ret;
969
970                 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
971                 if (ret)
972                         return ret;
973                 log->seq = ctx.seq + 11;
974                 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
975                 r5l_write_super(log, ctx.pos);
976         } else {
977                 log->log_start = ctx.pos;
978                 log->seq = ctx.seq;
979         }
980         return 0;
981 }
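/*
 * Concrete (hypothetical) numbers for the comment in r5l_recovery_log()
 * above: if the last valid meta block had seq 100, recovery stops at the
 * block that should have carried seq 101. The empty meta block written there
 * gets seq 111 (ctx.seq + 10) and log->seq becomes 112, so a stale block
 * further on with seq 102 can never match the expected sequence on a later
 * recovery.
 */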
982
983 static void r5l_write_super(struct r5l_log *log, sector_t cp)
984 {
985         struct mddev *mddev = log->rdev->mddev;
986
987         log->rdev->journal_tail = cp;
988         set_bit(MD_CHANGE_DEVS, &mddev->flags);
989 }
990
991 static int r5l_load_log(struct r5l_log *log)
992 {
993         struct md_rdev *rdev = log->rdev;
994         struct page *page;
995         struct r5l_meta_block *mb;
996         sector_t cp = log->rdev->journal_tail;
997         u32 stored_crc, expected_crc;
998         bool create_super = false;
999         int ret;
1000
1001         /* Make sure it's valid */
1002         if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1003                 cp = 0;
1004         page = alloc_page(GFP_KERNEL);
1005         if (!page)
1006                 return -ENOMEM;
1007
1008         if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
1009                 ret = -EIO;
1010                 goto ioerr;
1011         }
1012         mb = page_address(page);
1013
1014         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1015             mb->version != R5LOG_VERSION) {
1016                 create_super = true;
1017                 goto create;
1018         }
1019         stored_crc = le32_to_cpu(mb->checksum);
1020         mb->checksum = 0;
1021         expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1022         if (stored_crc != expected_crc) {
1023                 create_super = true;
1024                 goto create;
1025         }
1026         if (le64_to_cpu(mb->position) != cp) {
1027                 create_super = true;
1028                 goto create;
1029         }
1030 create:
1031         if (create_super) {
1032                 log->last_cp_seq = prandom_u32();
1033                 cp = 0;
1034                 /*
1035                  * Make sure the super points to the correct address. The log
1036                  * might have data very soon. If the super doesn't have the correct
1037                  * log tail address, recovery can't find the log
1038                  */
1039                 r5l_write_super(log, cp);
1040         } else
1041                 log->last_cp_seq = le64_to_cpu(mb->seq);
1042
1043         log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1044         log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1045         if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1046                 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1047         log->last_checkpoint = cp;
1048
1049         __free_page(page);
1050
1051         return r5l_recovery_log(log);
1052 ioerr:
1053         __free_page(page);
1054         return ret;
1055 }
1056
1057 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1058 {
1059         struct r5l_log *log;
1060
1061         if (PAGE_SIZE != 4096)
1062                 return -EINVAL;
1063         log = kzalloc(sizeof(*log), GFP_KERNEL);
1064         if (!log)
1065                 return -ENOMEM;
1066         log->rdev = rdev;
1067
1068         log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1069                                        sizeof(rdev->mddev->uuid));
1070
1071         mutex_init(&log->io_mutex);
1072
1073         spin_lock_init(&log->io_list_lock);
1074         INIT_LIST_HEAD(&log->running_ios);
1075         INIT_LIST_HEAD(&log->io_end_ios);
1076         INIT_LIST_HEAD(&log->flushing_ios);
1077         INIT_LIST_HEAD(&log->finished_ios);
1078         bio_init(&log->flush_bio);
1079
1080         log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1081         if (!log->io_kc)
1082                 goto io_kc;
1083
1084         log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1085                                                  log->rdev->mddev, "reclaim");
1086         if (!log->reclaim_thread)
1087                 goto reclaim_thread;
1088         init_waitqueue_head(&log->iounit_wait);
1089
1090         INIT_LIST_HEAD(&log->no_space_stripes);
1091         spin_lock_init(&log->no_space_stripes_lock);
1092
1093         if (r5l_load_log(log))
1094                 goto error;
1095
1096         conf->log = log;
1097         return 0;
1098 error:
1099         md_unregister_thread(&log->reclaim_thread);
1100 reclaim_thread:
1101         kmem_cache_destroy(log->io_kc);
1102 io_kc:
1103         kfree(log);
1104         return -EINVAL;
1105 }
1106
1107 void r5l_exit_log(struct r5l_log *log)
1108 {
1109         md_unregister_thread(&log->reclaim_thread);
1110         kmem_cache_destroy(log->io_kc);
1111         kfree(log);
1112 }