raid5-cache: use bio chaining
1 /*
2  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  */
14 #include <linux/kernel.h>
15 #include <linux/wait.h>
16 #include <linux/blkdev.h>
17 #include <linux/slab.h>
18 #include <linux/raid/md_p.h>
19 #include <linux/crc32c.h>
20 #include <linux/random.h>
21 #include "md.h"
22 #include "raid5.h"
23
24 /*
25  * metadata/data are stored on disk in 4k units (blocks) regardless of the
26  * underlying hardware sector size. Only works with PAGE_SIZE == 4096
27  */
28 #define BLOCK_SECTORS (8)
29
30 /*
31  * reclaim runs once reclaimable space exceeds 1/4 of the disk size or 10G,
32  * whichever is smaller. This keeps recovery from scanning a very long log
33  */
34 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
35 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
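/*
 * Worked example (illustrative sizes): on a 100G log device the 1/4 rule
 * gives ~25G, which is larger than the 10G cap, so max_free_space is
 * clamped to RECLAIM_MAX_FREE_SPACE (10G expressed in 512-byte sectors);
 * see r5l_load_log().
 */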
36
37 struct r5l_log {
38         struct md_rdev *rdev;
39
40         u32 uuid_checksum;
41
42         sector_t device_size;           /* log device size, rounded down to
43                                          * a multiple of BLOCK_SECTORS */
44         sector_t max_free_space;        /* reclaim runs once reclaimable
45                                          * space reaches this size */
46
47         sector_t last_checkpoint;       /* log tail. where recovery scan
48                                          * starts from */
49         u64 last_cp_seq;                /* log tail sequence */
50
51         sector_t log_start;             /* log head. where new data appends */
52         u64 seq;                        /* log head sequence */
53
54         sector_t next_checkpoint;
55         u64 next_cp_seq;
56
57         struct mutex io_mutex;
58         struct r5l_io_unit *current_io; /* current io_unit accepting new data */
59
60         spinlock_t io_list_lock;
61         struct list_head running_ios;   /* io_units which are still running,
62                                          * and have not yet been completely
63                                          * written to the log */
64         struct list_head io_end_ios;    /* io_units which have been completely
65                                          * written to the log but not yet written
66                                          * to the RAID */
67         struct list_head flushing_ios;  /* io_units which are waiting for log
68                                          * cache flush */
69         struct list_head finished_ios;  /* io_units which settle down in log disk */
70         struct bio flush_bio;
71
72         struct kmem_cache *io_kc;
73
74         struct md_thread *reclaim_thread;
75         unsigned long reclaim_target;   /* amount of space that needs to be
76                                          * reclaimed.  if it's 0, reclaim only
77                                          * the space used by io_units which are
78                                          * already in IO_UNIT_STRIPE_END state
79                                          * (i.e. reclaim doesn't wait for any
80                                          * specific io_unit to switch to
81                                          * IO_UNIT_STRIPE_END state) */
82         wait_queue_head_t iounit_wait;
83
84         struct list_head no_space_stripes; /* pending stripes, log has no space */
85         spinlock_t no_space_stripes_lock;
86
87         bool need_cache_flush;
88 };
89
90 /*
91  * an IO range starts at a meta data block and ends at the next meta data
92  * block. The io_unit's meta data block tracks the data/parity that follows
93  * it. An io_unit is written to the log disk with normal writes; since we
94  * always flush the log disk first and only then start moving data to the
95  * raid disks, there is no requirement to write the io_unit with FLUSH/FUA
96  */
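/*
 * Illustrative on-disk layout of a single io_unit (4k blocks):
 *
 *   | meta block | data page | data page | ... | parity page(s) |
 *   ^ io->log_start                                             ^ io->log_end
 *
 * The meta block carries an r5l_meta_block header followed by
 * r5l_payload_data_parity entries describing the data/parity pages that
 * follow it in the log.
 */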
97 struct r5l_io_unit {
98         struct r5l_log *log;
99
100         struct page *meta_page; /* store meta block */
101         int meta_offset;        /* current offset in meta_page */
102
103         struct bio *current_bio;/* current_bio accepting new data */
104
105         atomic_t pending_stripe;/* how many stripes not flushed to raid */
106         u64 seq;                /* seq number of the metablock */
107         sector_t log_start;     /* where the io_unit starts */
108         sector_t log_end;       /* where the io_unit ends */
109         struct list_head log_sibling; /* log->running_ios */
110         struct list_head stripe_list; /* stripes added to the io_unit */
111
112         int state;
113         bool need_split_bio;
114 };
115
116 /* r5l_io_unit state */
117 enum r5l_io_unit_state {
118         IO_UNIT_RUNNING = 0,    /* accepting new IO */
119         IO_UNIT_IO_START = 1,   /* io_unit bio started writing to the log,
120                                  * no longer accepting new bios */
121         IO_UNIT_IO_END = 2,     /* io_unit bio finished writing to the log */
122         IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to the raid */
123 };
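/*
 * Sketch of the state transitions implemented below:
 *
 *   IO_UNIT_RUNNING     -> IO_UNIT_IO_START    (r5l_submit_current_io)
 *   IO_UNIT_IO_START    -> IO_UNIT_IO_END      (r5l_log_endio)
 *   IO_UNIT_IO_END      -> IO_UNIT_STRIPE_END  (__r5l_stripe_write_finished)
 *
 * Only io_units that reached IO_UNIT_STRIPE_END can have their log space
 * reclaimed; see r5l_complete_finished_ios().
 */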
124
125 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
126 {
127         start += inc;
128         if (start >= log->device_size)
129                 start = start - log->device_size;
130         return start;
131 }
132
133 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
134                                   sector_t end)
135 {
136         if (end >= start)
137                 return end - start;
138         else
139                 return end + log->device_size - start;
140 }
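/*
 * Worked example of the ring arithmetic (illustrative numbers): with
 * device_size == 1024 sectors, r5l_ring_add(log, 1016, 16) wraps to 8 and
 * r5l_ring_distance(log, 1016, 8) == 8 + 1024 - 1016 == 16.
 */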
141
142 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
143 {
144         sector_t used_size;
145
146         used_size = r5l_ring_distance(log, log->last_checkpoint,
147                                         log->log_start);
148
149         return log->device_size > used_size + size;
150 }
151
152 static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
153 {
154         __free_page(io->meta_page);
155         kmem_cache_free(log->io_kc, io);
156 }
157
158 static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
159                                   enum r5l_io_unit_state state)
160 {
161         struct r5l_io_unit *io;
162
163         while (!list_empty(from)) {
164                 io = list_first_entry(from, struct r5l_io_unit, log_sibling);
165                 /* don't change list order */
166                 if (io->state >= state)
167                         list_move_tail(&io->log_sibling, to);
168                 else
169                         break;
170         }
171 }
172
173 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
174                                     enum r5l_io_unit_state state)
175 {
176         if (WARN_ON(io->state >= state))
177                 return;
178         io->state = state;
179 }
180
181 static void r5l_io_run_stripes(struct r5l_io_unit *io)
182 {
183         struct stripe_head *sh, *next;
184
185         list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
186                 list_del_init(&sh->log_list);
187                 set_bit(STRIPE_HANDLE, &sh->state);
188                 raid5_release_stripe(sh);
189         }
190 }
191
192 /* XXX: totally ignores I/O errors */
193 static void r5l_log_run_stripes(struct r5l_log *log)
194 {
195         struct r5l_io_unit *io, *next;
196
197         assert_spin_locked(&log->io_list_lock);
198
199         list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
200                 /* don't change list order */
201                 if (io->state < IO_UNIT_IO_END)
202                         break;
203
204                 list_move_tail(&io->log_sibling, &log->finished_ios);
205                 r5l_io_run_stripes(io);
206         }
207 }
208
209 static void r5l_log_endio(struct bio *bio)
210 {
211         struct r5l_io_unit *io = bio->bi_private;
212         struct r5l_log *log = io->log;
213         unsigned long flags;
214
215         bio_put(bio);
216
217         spin_lock_irqsave(&log->io_list_lock, flags);
218         __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
219         if (log->need_cache_flush)
220                 r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
221                                       IO_UNIT_IO_END);
222         else
223                 r5l_log_run_stripes(log);
224         spin_unlock_irqrestore(&log->io_list_lock, flags);
225
226         if (log->need_cache_flush)
227                 md_wakeup_thread(log->rdev->mddev->thread);
228 }
229
230 static void r5l_submit_current_io(struct r5l_log *log)
231 {
232         struct r5l_io_unit *io = log->current_io;
233         struct r5l_meta_block *block;
234         unsigned long flags;
235         u32 crc;
236
237         if (!io)
238                 return;
239
240         block = page_address(io->meta_page);
241         block->meta_size = cpu_to_le32(io->meta_offset);
242         crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
243         block->checksum = cpu_to_le32(crc);
244
245         log->current_io = NULL;
246         spin_lock_irqsave(&log->io_list_lock, flags);
247         __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
248         spin_unlock_irqrestore(&log->io_list_lock, flags);
249
250         submit_bio(WRITE, io->current_bio);
251 }
252
253 static struct bio *r5l_bio_alloc(struct r5l_log *log)
254 {
255         struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
256
257         bio->bi_rw = WRITE;
258         bio->bi_bdev = log->rdev->bdev;
259         bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
260
261         return bio;
262 }
263
264 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
265 {
266         log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
267
268         /*
269          * If we filled up the log device, start again from the beginning,
270          * which will require a new bio.
271          *
272          * Note: for this to work properly the log size needs to be a multiple
273          * of BLOCK_SECTORS.
274          */
275         if (log->log_start == 0)
276                 io->need_split_bio = true;
277
278         io->log_end = log->log_start;
279 }
280
281 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
282 {
283         struct r5l_io_unit *io;
284         struct r5l_meta_block *block;
285
286         /* We can't handle memory allocation failures so far */
287         io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
288         io->log = log;
289         INIT_LIST_HEAD(&io->log_sibling);
290         INIT_LIST_HEAD(&io->stripe_list);
291         io->state = IO_UNIT_RUNNING;
292
293         io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
294         block = page_address(io->meta_page);
295         block->magic = cpu_to_le32(R5LOG_MAGIC);
296         block->version = R5LOG_VERSION;
297         block->seq = cpu_to_le64(log->seq);
298         block->position = cpu_to_le64(log->log_start);
299
300         io->log_start = log->log_start;
301         io->meta_offset = sizeof(struct r5l_meta_block);
302         io->seq = log->seq++;
303
304         io->current_bio = r5l_bio_alloc(log);
305         io->current_bio->bi_end_io = r5l_log_endio;
306         io->current_bio->bi_private = io;
307         bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
308
309         r5_reserve_log_entry(log, io);
310
311         spin_lock_irq(&log->io_list_lock);
312         list_add_tail(&io->log_sibling, &log->running_ios);
313         spin_unlock_irq(&log->io_list_lock);
314
315         return io;
316 }
317
318 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
319 {
320         if (log->current_io &&
321             log->current_io->meta_offset + payload_size > PAGE_SIZE)
322                 r5l_submit_current_io(log);
323
324         if (!log->current_io)
325                 log->current_io = r5l_new_meta(log);
326         return 0;
327 }
328
329 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
330                                     sector_t location,
331                                     u32 checksum1, u32 checksum2,
332                                     bool checksum2_valid)
333 {
334         struct r5l_io_unit *io = log->current_io;
335         struct r5l_payload_data_parity *payload;
336
337         payload = page_address(io->meta_page) + io->meta_offset;
338         payload->header.type = cpu_to_le16(type);
339         payload->header.flags = cpu_to_le16(0);
340         payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
341                                     (PAGE_SHIFT - 9));
342         payload->location = cpu_to_le64(location);
343         payload->checksum[0] = cpu_to_le32(checksum1);
344         if (checksum2_valid)
345                 payload->checksum[1] = cpu_to_le32(checksum2);
346
347         io->meta_offset += sizeof(struct r5l_payload_data_parity) +
348                 sizeof(__le32) * (1 + !!checksum2_valid);
349 }
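/*
 * Example with 4K pages (PAGE_SHIFT - 9 == 3): a data payload describes one
 * page and gets size == 8 sectors plus one checksum, while a RAID6 parity
 * payload describes the P and Q pages and gets size == 16 sectors plus two
 * checksums. meta_offset advances by sizeof(struct r5l_payload_data_parity)
 * plus 4 bytes per checksum.
 */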
350
351 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
352 {
353         struct r5l_io_unit *io = log->current_io;
354
355         if (io->need_split_bio) {
356                 struct bio *prev = io->current_bio;
357
358                 io->current_bio = r5l_bio_alloc(log);
359                 bio_chain(io->current_bio, prev);
360
361                 submit_bio(WRITE, prev);
362         }
363
364         if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
365                 BUG();
366
367         r5_reserve_log_entry(log, io);
368 }
369
370 static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
371                            int data_pages, int parity_pages)
372 {
373         int i;
374         int meta_size;
375         struct r5l_io_unit *io;
376
377         meta_size =
378                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
379                  * data_pages) +
380                 sizeof(struct r5l_payload_data_parity) +
381                 sizeof(__le32) * parity_pages;
382
383         r5l_get_meta(log, meta_size);
384         io = log->current_io;
385
386         for (i = 0; i < sh->disks; i++) {
387                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
388                         continue;
389                 if (i == sh->pd_idx || i == sh->qd_idx)
390                         continue;
391                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
392                                         raid5_compute_blocknr(sh, i, 0),
393                                         sh->dev[i].log_checksum, 0, false);
394                 r5l_append_payload_page(log, sh->dev[i].page);
395         }
396
397         if (sh->qd_idx >= 0) {
398                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
399                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
400                                         sh->dev[sh->qd_idx].log_checksum, true);
401                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
402                 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
403         } else {
404                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
405                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
406                                         0, false);
407                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
408         }
409
410         list_add_tail(&sh->log_list, &io->stripe_list);
411         atomic_inc(&io->pending_stripe);
412         sh->log_io = io;
413 }
414
415 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
416 /*
417  * This runs in raid5d, and reclaim could wait for raid5d too (when reclaim
418  * flushes data from the log to raid disks), so we shouldn't wait for it here
419  */
420 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
421 {
422         int write_disks = 0;
423         int data_pages, parity_pages;
424         int meta_size;
425         int reserve;
426         int i;
427
428         if (!log)
429                 return -EAGAIN;
430         /* Don't support stripe batch */
431         if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
432             test_bit(STRIPE_SYNCING, &sh->state)) {
433                 /* the stripe is written to log, we start writing it to raid */
434                 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
435                 return -EAGAIN;
436         }
437
438         for (i = 0; i < sh->disks; i++) {
439                 void *addr;
440
441                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
442                         continue;
443                 write_disks++;
444                 /* the checksum was already calculated in the last run */
445                 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
446                         continue;
447                 addr = kmap_atomic(sh->dev[i].page);
448                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
449                                                     addr, PAGE_SIZE);
450                 kunmap_atomic(addr);
451         }
452         parity_pages = 1 + !!(sh->qd_idx >= 0);
453         data_pages = write_disks - parity_pages;
454
455         meta_size =
456                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
457                  * data_pages) +
458                 sizeof(struct r5l_payload_data_parity) +
459                 sizeof(__le32) * parity_pages;
460         /* Doesn't work with very big raid arrays */
461         if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
462                 return -EINVAL;
463
464         set_bit(STRIPE_LOG_TRAPPED, &sh->state);
465         /*
466          * The stripe must enter state machine again to finish the write, so
467          * don't delay.
468          */
469         clear_bit(STRIPE_DELAYED, &sh->state);
470         atomic_inc(&sh->count);
471
472         mutex_lock(&log->io_mutex);
473         /* meta + data */
474         reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
475         if (r5l_has_free_space(log, reserve))
476                 r5l_log_stripe(log, sh, data_pages, parity_pages);
477         else {
478                 spin_lock(&log->no_space_stripes_lock);
479                 list_add_tail(&sh->log_list, &log->no_space_stripes);
480                 spin_unlock(&log->no_space_stripes_lock);
481
482                 r5l_wake_reclaim(log, reserve);
483         }
484         mutex_unlock(&log->io_mutex);
485
486         return 0;
487 }
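/*
 * Space reservation example (illustrative geometry): a full-stripe write on
 * a 5-disk RAID5 array has write_disks == 5 (4 data + 1 parity), so
 * reserve == (1 + 5) << (PAGE_SHIFT - 9) == 48 sectors, i.e. one 4k meta
 * block plus five 4k data/parity pages in the log.
 */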
488
489 void r5l_write_stripe_run(struct r5l_log *log)
490 {
491         if (!log)
492                 return;
493         mutex_lock(&log->io_mutex);
494         r5l_submit_current_io(log);
495         mutex_unlock(&log->io_mutex);
496 }
497
498 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
499 {
500         if (!log)
501                 return -ENODEV;
502         /*
503          * we flush log disk cache first, then write stripe data to raid disks.
504          * So once a bio has finished, the log disk cache is already flushed.
505          * Recovery guarantees we can recover the bio's data from the log
506          * disk, so we don't need to flush again
507          */
508         if (bio->bi_iter.bi_size == 0) {
509                 bio_endio(bio);
510                 return 0;
511         }
512         bio->bi_rw &= ~REQ_FLUSH;
513         return -EAGAIN;
514 }
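/*
 * For example, an empty REQ_FLUSH bio is completed here immediately, while
 * a flush request that also carries data has its REQ_FLUSH flag stripped
 * and is returned to the caller (-EAGAIN) to be handled as a normal write.
 */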
515
516 /* This will run after log space is reclaimed */
517 static void r5l_run_no_space_stripes(struct r5l_log *log)
518 {
519         struct stripe_head *sh;
520
521         spin_lock(&log->no_space_stripes_lock);
522         while (!list_empty(&log->no_space_stripes)) {
523                 sh = list_first_entry(&log->no_space_stripes,
524                                       struct stripe_head, log_list);
525                 list_del_init(&sh->log_list);
526                 set_bit(STRIPE_HANDLE, &sh->state);
527                 raid5_release_stripe(sh);
528         }
529         spin_unlock(&log->no_space_stripes_lock);
530 }
531
532 static sector_t r5l_reclaimable_space(struct r5l_log *log)
533 {
534         return r5l_ring_distance(log, log->last_checkpoint,
535                                  log->next_checkpoint);
536 }
537
538 static bool r5l_complete_finished_ios(struct r5l_log *log)
539 {
540         struct r5l_io_unit *io, *next;
541         bool found = false;
542
543         assert_spin_locked(&log->io_list_lock);
544
545         list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
546                 /* don't change list order */
547                 if (io->state < IO_UNIT_STRIPE_END)
548                         break;
549
550                 log->next_checkpoint = io->log_start;
551                 log->next_cp_seq = io->seq;
552
553                 list_del(&io->log_sibling);
554                 r5l_free_io_unit(log, io);
555
556                 found = true;
557         }
558
559         return found;
560 }
561
562 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
563 {
564         struct r5l_log *log = io->log;
565         unsigned long flags;
566
567         spin_lock_irqsave(&log->io_list_lock, flags);
568         __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
569
570         if (!r5l_complete_finished_ios(log)) {
571                 spin_unlock_irqrestore(&log->io_list_lock, flags);
572                 return;
573         }
574
575         if (r5l_reclaimable_space(log) > log->max_free_space)
576                 r5l_wake_reclaim(log, 0);
577
578         spin_unlock_irqrestore(&log->io_list_lock, flags);
579         wake_up(&log->iounit_wait);
580 }
581
582 void r5l_stripe_write_finished(struct stripe_head *sh)
583 {
584         struct r5l_io_unit *io;
585
586         io = sh->log_io;
587         sh->log_io = NULL;
588
589         if (io && atomic_dec_and_test(&io->pending_stripe))
590                 __r5l_stripe_write_finished(io);
591 }
592
593 static void r5l_log_flush_endio(struct bio *bio)
594 {
595         struct r5l_log *log = container_of(bio, struct r5l_log,
596                 flush_bio);
597         unsigned long flags;
598         struct r5l_io_unit *io;
599
600         spin_lock_irqsave(&log->io_list_lock, flags);
601         list_for_each_entry(io, &log->flushing_ios, log_sibling)
602                 r5l_io_run_stripes(io);
603         list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
604         spin_unlock_irqrestore(&log->io_list_lock, flags);
605 }
606
607 /*
608  * Starting to dispatch IO to the raid disks.
609  * io_units (meta blocks) make up the log. There is one situation we want
610  * to avoid: a broken meta block in the middle of the log prevents recovery
611  * from finding any meta block after it. If an operation requires a meta
612  * block to be persistent in the log, we must make sure every meta block
613  * before it is persistent in the log too. A case is:
614  *
615  * stripe data/parity is in the log and we start writing the stripe to the
616  * raid disks. The stripe data/parity must be persistent in the log before
617  * we do the write to the raid disks. The solution is to strictly maintain
618  * io_unit list order: we only write the stripes of an io_unit to the raid
619  * disks once it is the first io_unit whose data/parity is fully in the log.
620  */
621 void r5l_flush_stripe_to_raid(struct r5l_log *log)
622 {
623         bool do_flush;
624
625         if (!log || !log->need_cache_flush)
626                 return;
627
628         spin_lock_irq(&log->io_list_lock);
629         /* flush bio is running */
630         if (!list_empty(&log->flushing_ios)) {
631                 spin_unlock_irq(&log->io_list_lock);
632                 return;
633         }
634         list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
635         do_flush = !list_empty(&log->flushing_ios);
636         spin_unlock_irq(&log->io_list_lock);
637
638         if (!do_flush)
639                 return;
640         bio_reset(&log->flush_bio);
641         log->flush_bio.bi_bdev = log->rdev->bdev;
642         log->flush_bio.bi_end_io = r5l_log_flush_endio;
643         submit_bio(WRITE_FLUSH, &log->flush_bio);
644 }
645
646 static void r5l_write_super(struct r5l_log *log, sector_t cp);
647 static void r5l_do_reclaim(struct r5l_log *log)
648 {
649         sector_t reclaim_target = xchg(&log->reclaim_target, 0);
650         sector_t reclaimable;
651         sector_t next_checkpoint;
652         u64 next_cp_seq;
653
654         spin_lock_irq(&log->io_list_lock);
655         /*
656          * move the proper io_units to the reclaim list. We should not change
657          * the order: reclaimable and unreclaimable io_units can be mixed in the
658          * list, and we shouldn't reuse the space of an unreclaimable io_unit
659          */
660         while (1) {
661                 reclaimable = r5l_reclaimable_space(log);
662                 if (reclaimable >= reclaim_target ||
663                     (list_empty(&log->running_ios) &&
664                      list_empty(&log->io_end_ios) &&
665                      list_empty(&log->flushing_ios) &&
666                      list_empty(&log->finished_ios)))
667                         break;
668
669                 md_wakeup_thread(log->rdev->mddev->thread);
670                 wait_event_lock_irq(log->iounit_wait,
671                                     r5l_reclaimable_space(log) > reclaimable,
672                                     log->io_list_lock);
673         }
674
675         next_checkpoint = log->next_checkpoint;
676         next_cp_seq = log->next_cp_seq;
677         spin_unlock_irq(&log->io_list_lock);
678
679         BUG_ON(reclaimable < 0);
680         if (reclaimable == 0)
681                 return;
682
683         /*
684          * write_super will flush cache of each raid disk. We must write super
685          * here, because the log area might be reused soon and we don't want to
686          * confuse recovery
687          */
688         r5l_write_super(log, next_checkpoint);
689
690         mutex_lock(&log->io_mutex);
691         log->last_checkpoint = next_checkpoint;
692         log->last_cp_seq = next_cp_seq;
693         mutex_unlock(&log->io_mutex);
694
695         r5l_run_no_space_stripes(log);
696 }
697
698 static void r5l_reclaim_thread(struct md_thread *thread)
699 {
700         struct mddev *mddev = thread->mddev;
701         struct r5conf *conf = mddev->private;
702         struct r5l_log *log = conf->log;
703
704         if (!log)
705                 return;
706         r5l_do_reclaim(log);
707 }
708
709 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
710 {
711         unsigned long target;
712         unsigned long new = (unsigned long)space; /* overflow in theory */
713
714         do {
715                 target = log->reclaim_target;
716                 if (new < target)
717                         return;
718         } while (cmpxchg(&log->reclaim_target, target, new) != target);
719         md_wakeup_thread(log->reclaim_thread);
720 }
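/*
 * Example: if reclaim_target is already 256 sectors, a request for 128 is
 * dropped (the pending target already covers it), while a request for 512
 * raises the target; r5l_do_reclaim() later xchg()s the target back to 0.
 */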
721
722 void r5l_quiesce(struct r5l_log *log, int state)
723 {
724         if (!log || state == 2)
725                 return;
726         if (state == 0) {
727                 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
728                                         log->rdev->mddev, "reclaim");
729         } else if (state == 1) {
730                 /*
731                  * at this point all stripes are finished, so every io_unit is
732                  * at least in IO_UNIT_STRIPE_END state
733                  */
734                 r5l_wake_reclaim(log, -1L);
735                 md_unregister_thread(&log->reclaim_thread);
736                 r5l_do_reclaim(log);
737         }
738 }
739
740 struct r5l_recovery_ctx {
741         struct page *meta_page;         /* current meta */
742         sector_t meta_total_blocks;     /* total size of current meta and data */
743         sector_t pos;                   /* recovery position */
744         u64 seq;                        /* recovery position seq */
745 };
746
747 static int r5l_read_meta_block(struct r5l_log *log,
748                                struct r5l_recovery_ctx *ctx)
749 {
750         struct page *page = ctx->meta_page;
751         struct r5l_meta_block *mb;
752         u32 crc, stored_crc;
753
754         if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
755                 return -EIO;
756
757         mb = page_address(page);
758         stored_crc = le32_to_cpu(mb->checksum);
759         mb->checksum = 0;
760
761         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
762             le64_to_cpu(mb->seq) != ctx->seq ||
763             mb->version != R5LOG_VERSION ||
764             le64_to_cpu(mb->position) != ctx->pos)
765                 return -EINVAL;
766
767         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
768         if (stored_crc != crc)
769                 return -EINVAL;
770
771         if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
772                 return -EINVAL;
773
774         ctx->meta_total_blocks = BLOCK_SECTORS;
775
776         return 0;
777 }
778
779 static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
780                                          struct r5l_recovery_ctx *ctx,
781                                          sector_t stripe_sect,
782                                          int *offset, sector_t *log_offset)
783 {
784         struct r5conf *conf = log->rdev->mddev->private;
785         struct stripe_head *sh;
786         struct r5l_payload_data_parity *payload;
787         int disk_index;
788
789         sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
790         while (1) {
791                 payload = page_address(ctx->meta_page) + *offset;
792
793                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
794                         raid5_compute_sector(conf,
795                                              le64_to_cpu(payload->location), 0,
796                                              &disk_index, sh);
797
798                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
799                                      sh->dev[disk_index].page, READ, false);
800                         sh->dev[disk_index].log_checksum =
801                                 le32_to_cpu(payload->checksum[0]);
802                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
803                         ctx->meta_total_blocks += BLOCK_SECTORS;
804                 } else {
805                         disk_index = sh->pd_idx;
806                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
807                                      sh->dev[disk_index].page, READ, false);
808                         sh->dev[disk_index].log_checksum =
809                                 le32_to_cpu(payload->checksum[0]);
810                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
811
812                         if (sh->qd_idx >= 0) {
813                                 disk_index = sh->qd_idx;
814                                 sync_page_io(log->rdev,
815                                              r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
816                                              PAGE_SIZE, sh->dev[disk_index].page,
817                                              READ, false);
818                                 sh->dev[disk_index].log_checksum =
819                                         le32_to_cpu(payload->checksum[1]);
820                                 set_bit(R5_Wantwrite,
821                                         &sh->dev[disk_index].flags);
822                         }
823                         ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
824                 }
825
826                 *log_offset = r5l_ring_add(log, *log_offset,
827                                            le32_to_cpu(payload->size));
828                 *offset += sizeof(struct r5l_payload_data_parity) +
829                         sizeof(__le32) *
830                         (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
831                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
832                         break;
833         }
834
835         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
836                 void *addr;
837                 u32 checksum;
838
839                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
840                         continue;
841                 addr = kmap_atomic(sh->dev[disk_index].page);
842                 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
843                 kunmap_atomic(addr);
844                 if (checksum != sh->dev[disk_index].log_checksum)
845                         goto error;
846         }
847
848         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
849                 struct md_rdev *rdev, *rrdev;
850
851                 if (!test_and_clear_bit(R5_Wantwrite,
852                                         &sh->dev[disk_index].flags))
853                         continue;
854
855                 /* in case device is broken */
856                 rdev = rcu_dereference(conf->disks[disk_index].rdev);
857                 if (rdev)
858                         sync_page_io(rdev, stripe_sect, PAGE_SIZE,
859                                      sh->dev[disk_index].page, WRITE, false);
860                 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
861                 if (rrdev)
862                         sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
863                                      sh->dev[disk_index].page, WRITE, false);
864         }
865         raid5_release_stripe(sh);
866         return 0;
867
868 error:
869         for (disk_index = 0; disk_index < sh->disks; disk_index++)
870                 sh->dev[disk_index].flags = 0;
871         raid5_release_stripe(sh);
872         return -EINVAL;
873 }
874
875 static int r5l_recovery_flush_one_meta(struct r5l_log *log,
876                                        struct r5l_recovery_ctx *ctx)
877 {
878         struct r5conf *conf = log->rdev->mddev->private;
879         struct r5l_payload_data_parity *payload;
880         struct r5l_meta_block *mb;
881         int offset;
882         sector_t log_offset;
883         sector_t stripe_sector;
884
885         mb = page_address(ctx->meta_page);
886         offset = sizeof(struct r5l_meta_block);
887         log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
888
889         while (offset < le32_to_cpu(mb->meta_size)) {
890                 int dd;
891
892                 payload = (void *)mb + offset;
893                 stripe_sector = raid5_compute_sector(conf,
894                                                      le64_to_cpu(payload->location), 0, &dd, NULL);
895                 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
896                                                   &offset, &log_offset))
897                         return -EINVAL;
898         }
899         return 0;
900 }
901
902 /* copy data/parity from log to raid disks */
903 static void r5l_recovery_flush_log(struct r5l_log *log,
904                                    struct r5l_recovery_ctx *ctx)
905 {
906         while (1) {
907                 if (r5l_read_meta_block(log, ctx))
908                         return;
909                 if (r5l_recovery_flush_one_meta(log, ctx))
910                         return;
911                 ctx->seq++;
912                 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
913         }
914 }
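/*
 * Example of how the scan advances (illustrative stripe): a meta block
 * describing three data pages plus RAID5 parity (max_degraded == 1) ends up
 * with meta_total_blocks == (1 + 3 + 1) * BLOCK_SECTORS == 40 sectors, so
 * the next iteration reads the meta block at
 * r5l_ring_add(log, ctx->pos, 40) with ctx->seq incremented by one.
 */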
915
916 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
917                                           u64 seq)
918 {
919         struct page *page;
920         struct r5l_meta_block *mb;
921         u32 crc;
922
923         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
924         if (!page)
925                 return -ENOMEM;
926         mb = page_address(page);
927         mb->magic = cpu_to_le32(R5LOG_MAGIC);
928         mb->version = R5LOG_VERSION;
929         mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
930         mb->seq = cpu_to_le64(seq);
931         mb->position = cpu_to_le64(pos);
932         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
933         mb->checksum = cpu_to_le32(crc);
934
935         if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
936                 __free_page(page);
937                 return -EIO;
938         }
939         __free_page(page);
940         return 0;
941 }
942
943 static int r5l_recovery_log(struct r5l_log *log)
944 {
945         struct r5l_recovery_ctx ctx;
946
947         ctx.pos = log->last_checkpoint;
948         ctx.seq = log->last_cp_seq;
949         ctx.meta_page = alloc_page(GFP_KERNEL);
950         if (!ctx.meta_page)
951                 return -ENOMEM;
952
953         r5l_recovery_flush_log(log, &ctx);
954         __free_page(ctx.meta_page);
955
956         /*
957          * we did a recovery. Now ctx.pos points to an invalid meta block. The
958          * new log will start here, but we can't let the superblock point to
959          * the last valid meta block. The log might look like:
960          * | meta 1| meta 2| meta 3|
961          * meta 1 is valid, meta 2 is invalid and meta 3 could still be valid.
962          * If the superblock points to meta 1 and we write a new valid meta 2n,
963          * then after another crash recovery will again start from meta 1. Since
964          * meta 2n is valid now, recovery will think meta 3 is valid too, which
965          * is wrong. The solution is to create a new meta block in meta 2's place
966          * with seq == meta 1's seq + 10 and let the superblock point to it. That
967          * recovery will then not treat meta 3 as valid, because its seq doesn't match
968          */
969         if (ctx.seq > log->last_cp_seq + 1) {
970                 int ret;
971
972                 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
973                 if (ret)
974                         return ret;
975                 log->seq = ctx.seq + 11;
976                 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
977                 r5l_write_super(log, ctx.pos);
978         } else {
979                 log->log_start = ctx.pos;
980                 log->seq = ctx.seq;
981         }
982         return 0;
983 }
984
985 static void r5l_write_super(struct r5l_log *log, sector_t cp)
986 {
987         struct mddev *mddev = log->rdev->mddev;
988
989         log->rdev->journal_tail = cp;
990         set_bit(MD_CHANGE_DEVS, &mddev->flags);
991 }
992
993 static int r5l_load_log(struct r5l_log *log)
994 {
995         struct md_rdev *rdev = log->rdev;
996         struct page *page;
997         struct r5l_meta_block *mb;
998         sector_t cp = log->rdev->journal_tail;
999         u32 stored_crc, expected_crc;
1000         bool create_super = false;
1001         int ret;
1002
1003         /* Make sure it's valid */
1004         if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
1005                 cp = 0;
1006         page = alloc_page(GFP_KERNEL);
1007         if (!page)
1008                 return -ENOMEM;
1009
1010         if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
1011                 ret = -EIO;
1012                 goto ioerr;
1013         }
1014         mb = page_address(page);
1015
1016         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1017             mb->version != R5LOG_VERSION) {
1018                 create_super = true;
1019                 goto create;
1020         }
1021         stored_crc = le32_to_cpu(mb->checksum);
1022         mb->checksum = 0;
1023         expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1024         if (stored_crc != expected_crc) {
1025                 create_super = true;
1026                 goto create;
1027         }
1028         if (le64_to_cpu(mb->position) != cp) {
1029                 create_super = true;
1030                 goto create;
1031         }
1032 create:
1033         if (create_super) {
1034                 log->last_cp_seq = prandom_u32();
1035                 cp = 0;
1036                 /*
1037                  * Make sure the superblock points to the correct address. The log
1038                  * might get data very soon. If the superblock doesn't have the
1039                  * correct log tail address, recovery can't find the log
1040                  */
1041                 r5l_write_super(log, cp);
1042         } else
1043                 log->last_cp_seq = le64_to_cpu(mb->seq);
1044
1045         log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1046         log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1047         if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1048                 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1049         log->last_checkpoint = cp;
1050
1051         __free_page(page);
1052
1053         return r5l_recovery_log(log);
1054 ioerr:
1055         __free_page(page);
1056         return ret;
1057 }
1058
1059 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1060 {
1061         struct r5l_log *log;
1062
1063         if (PAGE_SIZE != 4096)
1064                 return -EINVAL;
1065         log = kzalloc(sizeof(*log), GFP_KERNEL);
1066         if (!log)
1067                 return -ENOMEM;
1068         log->rdev = rdev;
1069
1070         log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
1071
1072         log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1073                                        sizeof(rdev->mddev->uuid));
1074
1075         mutex_init(&log->io_mutex);
1076
1077         spin_lock_init(&log->io_list_lock);
1078         INIT_LIST_HEAD(&log->running_ios);
1079         INIT_LIST_HEAD(&log->io_end_ios);
1080         INIT_LIST_HEAD(&log->flushing_ios);
1081         INIT_LIST_HEAD(&log->finished_ios);
1082         bio_init(&log->flush_bio);
1083
1084         log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1085         if (!log->io_kc)
1086                 goto io_kc;
1087
1088         log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1089                                                  log->rdev->mddev, "reclaim");
1090         if (!log->reclaim_thread)
1091                 goto reclaim_thread;
1092         init_waitqueue_head(&log->iounit_wait);
1093
1094         INIT_LIST_HEAD(&log->no_space_stripes);
1095         spin_lock_init(&log->no_space_stripes_lock);
1096
1097         if (r5l_load_log(log))
1098                 goto error;
1099
1100         conf->log = log;
1101         return 0;
1102 error:
1103         md_unregister_thread(&log->reclaim_thread);
1104 reclaim_thread:
1105         kmem_cache_destroy(log->io_kc);
1106 io_kc:
1107         kfree(log);
1108         return -EINVAL;
1109 }
1110
1111 void r5l_exit_log(struct r5l_log *log)
1112 {
1113         md_unregister_thread(&log->reclaim_thread);
1114         kmem_cache_destroy(log->io_kc);
1115         kfree(log);
1116 }