1 /*
2  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  */
14 #include <linux/kernel.h>
15 #include <linux/wait.h>
16 #include <linux/blkdev.h>
17 #include <linux/slab.h>
18 #include <linux/raid/md_p.h>
19 #include <linux/crc32c.h>
20 #include <linux/random.h>
21 #include "md.h"
22 #include "raid5.h"
23
24 /*
25  * metadata/data are stored on disk in 4k units (blocks) regardless of the
26  * underlying hardware sector size. Only works with PAGE_SIZE == 4096.
27  */
28 #define BLOCK_SECTORS (8)
29
30 /*
31  * reclaim runs when reclaimable space reaches 1/4 of the disk size or 10G,
32  * whichever is smaller. This prevents recovery from scanning a very long log.
33  */
34 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
35 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
36
37 struct r5l_log {
38         struct md_rdev *rdev;
39
40         u32 uuid_checksum;
41
42         sector_t device_size;           /* log device size, rounded down to
43                                          * BLOCK_SECTORS */
44         sector_t max_free_space;        /* reclaim runs once reclaimable
45                                          * space reaches this size */
46
47         sector_t last_checkpoint;       /* log tail. where recovery scan
48                                          * starts from */
49         u64 last_cp_seq;                /* log tail sequence */
50
51         sector_t log_start;             /* log head. where new data appends */
52         u64 seq;                        /* log head sequence */
53
54         struct mutex io_mutex;
55         struct r5l_io_unit *current_io; /* current io_unit accepting new data */
56
57         spinlock_t io_list_lock;
58         struct list_head running_ios;   /* io_units which are still running,
59                                          * and have not yet been completely
60                                          * written to the log */
61         struct list_head io_end_ios;    /* io_units which have been completely
62                                          * written to the log but not yet written
63                                          * to the RAID */
64         struct list_head flushing_ios;  /* io_units which are waiting for log
65                                          * cache flush */
66         struct list_head flushed_ios;   /* io_units settled in the log disk (cache flushed) */
67         struct bio flush_bio;
68         struct list_head stripe_end_ios;/* io_units which have been completely
69                                          * written to the RAID but have not yet
70                                          * been considered for updating super */
71
72         struct kmem_cache *io_kc;
73
74         struct md_thread *reclaim_thread;
75         unsigned long reclaim_target;   /* number of sectors that need to be
76                                          * reclaimed.  if it's 0, reclaim the space
77                                          * used by io_units which are in
78                                          * IO_UNIT_STRIPE_END state (i.e. reclaim
79                                          * doesn't wait for a specific io_unit
80                                          * to switch to IO_UNIT_STRIPE_END
81                                          * state) */
82
83         struct list_head no_space_stripes; /* pending stripes, log has no space */
84         spinlock_t no_space_stripes_lock;
85 };
86
87 /*
88  * An IO range starts at a meta data block and ends at the next meta data
89  * block. The io_unit's meta data block tracks the data/parity that follows it
90  * (see the layout sketch below). An io_unit is written to the log disk with
91  * normal writes; since we always flush the log disk before moving data to the
92  * raid disks, there is no need to write the io_unit with FLUSH/FUA.
93  */
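/*
 * A rough sketch of the on-disk layout of one io_unit, as built by
 * r5l_new_meta() and r5l_log_stripe() (illustrative only, not to scale):
 *
 *   +------------+------+------+-----+--------+------+--------+
 *   | meta block | data | data | ... | parity | data | parity | ...
 *   +------------+------+------+-----+--------+------+--------+
 *
 * The meta block and every data/parity page each occupy one 4k block. For
 * every page that follows, the meta block carries an r5l_payload_data_parity
 * entry (type, location, checksums); one io_unit can cover several stripes
 * until the meta page fills up or the io_unit is submitted.
 */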
94 struct r5l_io_unit {
95         struct r5l_log *log;
96
97         struct page *meta_page; /* store meta block */
98         int meta_offset;        /* current offset in meta_page */
99
100         struct bio_list bios;
101         atomic_t pending_io;    /* pending bios not written to log yet */
102         struct bio *current_bio;/* current_bio accepting new data */
103
104         atomic_t pending_stripe;/* how many stripes not flushed to raid */
105         u64 seq;                /* seq number of the metablock */
106         sector_t log_start;     /* where the io_unit starts */
107         sector_t log_end;       /* where the io_unit ends */
108         struct list_head log_sibling; /* log->running_ios */
109         struct list_head stripe_list; /* stripes added to the io_unit */
110
111         int state;
112         wait_queue_head_t wait_state;
113 };
114
115 /* r5l_io_unit state */
116 enum r5l_io_unit_state {
117         IO_UNIT_RUNNING = 0,    /* accepting new IO */
118         IO_UNIT_IO_START = 1,   /* io_unit bios have started writing to the
119                                  * log; no new bios are accepted */
120         IO_UNIT_IO_END = 2,     /* io_unit bios have finished writing to the log */
121         IO_UNIT_STRIPE_END = 3, /* stripe data has finished writing to the raid */
122 };
123
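/*
 * A sketch of how an io_unit moves through the lists in struct r5l_log as its
 * state advances (derived from __r5l_set_io_unit_state(),
 * r5l_flush_stripe_to_raid() and r5l_log_flush_endio(); illustrative only):
 *
 *   IO_UNIT_RUNNING/IO_START  -> log->running_ios
 *   IO_UNIT_IO_END            -> log->io_end_ios
 *   (flush bio submitted)     -> log->flushing_ios
 *   (flush bio completed)     -> log->flushed_ios, stripes released for
 *                                RAID writes
 *   IO_UNIT_STRIPE_END        -> log->stripe_end_ios, space becomes
 *                                reclaimable
 */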
124 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
125 {
126         start += inc;
127         if (start >= log->device_size)
128                 start = start - log->device_size;
129         return start;
130 }
131
132 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
133                                   sector_t end)
134 {
135         if (end >= start)
136                 return end - start;
137         else
138                 return end + log->device_size - start;
139 }
140
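/*
 * Worked example of the ring arithmetic above, assuming a (hypothetical)
 * device_size of 8192 sectors:
 *
 *   r5l_ring_add(log, 8000, 400)       -> 208   (wraps past the device end)
 *   r5l_ring_distance(log, 8000, 208)  -> 400   (end < start, so wrap)
 *   r5l_ring_distance(log, 100, 500)   -> 400   (no wrap)
 */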
141 static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
142 {
143         sector_t used_size;
144
145         used_size = r5l_ring_distance(log, log->last_checkpoint,
146                                         log->log_start);
147
148         return log->device_size > used_size + size;
149 }
150
151 static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
152 {
153         struct r5l_io_unit *io;
154         /* We can't handle memory allocation failure so far */
155         gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;
156
157         io = kmem_cache_zalloc(log->io_kc, gfp);
158         io->log = log;
159         io->meta_page = alloc_page(gfp | __GFP_ZERO);
160
161         bio_list_init(&io->bios);
162         INIT_LIST_HEAD(&io->log_sibling);
163         INIT_LIST_HEAD(&io->stripe_list);
164         io->state = IO_UNIT_RUNNING;
165         init_waitqueue_head(&io->wait_state);
166         return io;
167 }
168
169 static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
170 {
171         __free_page(io->meta_page);
172         kmem_cache_free(log->io_kc, io);
173 }
174
175 static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
176                                   enum r5l_io_unit_state state)
177 {
178         struct r5l_io_unit *io;
179
180         while (!list_empty(from)) {
181                 io = list_first_entry(from, struct r5l_io_unit, log_sibling);
182                 /* don't change list order */
183                 if (io->state >= state)
184                         list_move_tail(&io->log_sibling, to);
185                 else
186                         break;
187         }
188 }
189
190 /*
191  * We don't want too many io_units to reside in the stripe_end_ios list, which
192  * would waste a lot of memory, so we try to remove some. But we must keep at
193  * least 2 io_units: the superblock must point to a valid meta block, and if
194  * that is the last meta block, recovery can scan less.
195  */
196 static void r5l_compress_stripe_end_list(struct r5l_log *log)
197 {
198         struct r5l_io_unit *first, *last, *io;
199
200         first = list_first_entry(&log->stripe_end_ios,
201                                  struct r5l_io_unit, log_sibling);
202         last = list_last_entry(&log->stripe_end_ios,
203                                struct r5l_io_unit, log_sibling);
204         if (first == last)
205                 return;
206         list_del(&first->log_sibling);
207         list_del(&last->log_sibling);
208         while (!list_empty(&log->stripe_end_ios)) {
209                 io = list_first_entry(&log->stripe_end_ios,
210                                       struct r5l_io_unit, log_sibling);
211                 list_del(&io->log_sibling);
212                 first->log_end = io->log_end;
213                 r5l_free_io_unit(log, io);
214         }
215         list_add_tail(&first->log_sibling, &log->stripe_end_ios);
216         list_add_tail(&last->log_sibling, &log->stripe_end_ios);
217 }
218
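/*
 * Illustrative example of r5l_compress_stripe_end_list(): with five io_units
 * A B C D E on stripe_end_ios, B, C and D are freed, A->log_end is advanced
 * to D->log_end, and the list becomes A E. Only the covered log range and the
 * first/last meta blocks are kept.
 */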
219 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
220 static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
221                                     enum r5l_io_unit_state state)
222 {
223         struct r5l_log *log = io->log;
224
225         if (WARN_ON(io->state >= state))
226                 return;
227         io->state = state;
228         if (state == IO_UNIT_IO_END)
229                 r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
230                                       IO_UNIT_IO_END);
231         if (state == IO_UNIT_STRIPE_END) {
232                 struct r5l_io_unit *last;
233                 sector_t reclaimable_space;
234
235                 r5l_move_io_unit_list(&log->flushed_ios, &log->stripe_end_ios,
236                                       IO_UNIT_STRIPE_END);
237
238                 last = list_last_entry(&log->stripe_end_ios,
239                                        struct r5l_io_unit, log_sibling);
240                 reclaimable_space = r5l_ring_distance(log, log->last_checkpoint,
241                                                       last->log_end);
242                 if (reclaimable_space >= log->max_free_space)
243                         r5l_wake_reclaim(log, 0);
244
245                 r5l_compress_stripe_end_list(log);
246         }
247         wake_up(&io->wait_state);
248 }
249
250 static void r5l_set_io_unit_state(struct r5l_io_unit *io,
251                                   enum r5l_io_unit_state state)
252 {
253         struct r5l_log *log = io->log;
254         unsigned long flags;
255
256         spin_lock_irqsave(&log->io_list_lock, flags);
257         __r5l_set_io_unit_state(io, state);
258         spin_unlock_irqrestore(&log->io_list_lock, flags);
259 }
260
261 /* XXX: totally ignores I/O errors */
262 static void r5l_log_endio(struct bio *bio)
263 {
264         struct r5l_io_unit *io = bio->bi_private;
265         struct r5l_log *log = io->log;
266
267         bio_put(bio);
268
269         if (!atomic_dec_and_test(&io->pending_io))
270                 return;
271
272         r5l_set_io_unit_state(io, IO_UNIT_IO_END);
273         md_wakeup_thread(log->rdev->mddev->thread);
274 }
275
276 static void r5l_submit_current_io(struct r5l_log *log)
277 {
278         struct r5l_io_unit *io = log->current_io;
279         struct r5l_meta_block *block;
280         struct bio *bio;
281         u32 crc;
282
283         if (!io)
284                 return;
285
286         block = page_address(io->meta_page);
287         block->meta_size = cpu_to_le32(io->meta_offset);
288         crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
289         block->checksum = cpu_to_le32(crc);
290
291         log->current_io = NULL;
292         r5l_set_io_unit_state(io, IO_UNIT_IO_START);
293
294         while ((bio = bio_list_pop(&io->bios))) {
295                 /* all IO must start from rdev->data_offset */
296                 bio->bi_iter.bi_sector += log->rdev->data_offset;
297                 submit_bio(WRITE, bio);
298         }
299 }
300
301 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
302 {
303         struct r5l_io_unit *io;
304         struct r5l_meta_block *block;
305         struct bio *bio;
306
307         io = r5l_alloc_io_unit(log);
308
309         block = page_address(io->meta_page);
310         block->magic = cpu_to_le32(R5LOG_MAGIC);
311         block->version = R5LOG_VERSION;
312         block->seq = cpu_to_le64(log->seq);
313         block->position = cpu_to_le64(log->log_start);
314
315         io->log_start = log->log_start;
316         io->meta_offset = sizeof(struct r5l_meta_block);
317         io->seq = log->seq;
318
319         bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
320         io->current_bio = bio;
321         bio->bi_rw = WRITE;
322         bio->bi_bdev = log->rdev->bdev;
323         bio->bi_iter.bi_sector = log->log_start;
324         bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
325         bio->bi_end_io = r5l_log_endio;
326         bio->bi_private = io;
327
328         bio_list_add(&io->bios, bio);
329         atomic_inc(&io->pending_io);
330
331         log->seq++;
332         log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
333         io->log_end = log->log_start;
334         /* current bio hit disk end */
335         if (log->log_start == 0)
336                 io->current_bio = NULL;
337
338         spin_lock_irq(&log->io_list_lock);
339         list_add_tail(&io->log_sibling, &log->running_ios);
340         spin_unlock_irq(&log->io_list_lock);
341
342         return io;
343 }
344
345 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
346 {
347         struct r5l_io_unit *io;
348
349         io = log->current_io;
350         if (io && io->meta_offset + payload_size > PAGE_SIZE)
351                 r5l_submit_current_io(log);
352         io = log->current_io;
353         if (io)
354                 return 0;
355
356         log->current_io = r5l_new_meta(log);
357         return 0;
358 }
359
360 static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
361                                     sector_t location,
362                                     u32 checksum1, u32 checksum2,
363                                     bool checksum2_valid)
364 {
365         struct r5l_io_unit *io = log->current_io;
366         struct r5l_payload_data_parity *payload;
367
368         payload = page_address(io->meta_page) + io->meta_offset;
369         payload->header.type = cpu_to_le16(type);
370         payload->header.flags = cpu_to_le16(0);
371         payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
372                                     (PAGE_SHIFT - 9));
373         payload->location = cpu_to_le64(location);
374         payload->checksum[0] = cpu_to_le32(checksum1);
375         if (checksum2_valid)
376                 payload->checksum[1] = cpu_to_le32(checksum2);
377
378         io->meta_offset += sizeof(struct r5l_payload_data_parity) +
379                 sizeof(__le32) * (1 + !!checksum2_valid);
380 }
381
382 static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
383 {
384         struct r5l_io_unit *io = log->current_io;
385
386 alloc_bio:
387         if (!io->current_bio) {
388                 struct bio *bio;
389
390                 bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
391                 bio->bi_rw = WRITE;
392                 bio->bi_bdev = log->rdev->bdev;
393                 bio->bi_iter.bi_sector = log->log_start;
394                 bio->bi_end_io = r5l_log_endio;
395                 bio->bi_private = io;
396                 bio_list_add(&io->bios, bio);
397                 atomic_inc(&io->pending_io);
398                 io->current_bio = bio;
399         }
400         if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
401                 io->current_bio = NULL;
402                 goto alloc_bio;
403         }
404         log->log_start = r5l_ring_add(log, log->log_start,
405                                       BLOCK_SECTORS);
406         /* current bio hit disk end */
407         if (log->log_start == 0)
408                 io->current_bio = NULL;
409
410         io->log_end = log->log_start;
411 }
412
413 static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
414                            int data_pages, int parity_pages)
415 {
416         int i;
417         int meta_size;
418         struct r5l_io_unit *io;
419
420         meta_size =
421                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
422                  * data_pages) +
423                 sizeof(struct r5l_payload_data_parity) +
424                 sizeof(__le32) * parity_pages;
425
426         r5l_get_meta(log, meta_size);
427         io = log->current_io;
428
429         for (i = 0; i < sh->disks; i++) {
430                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
431                         continue;
432                 if (i == sh->pd_idx || i == sh->qd_idx)
433                         continue;
434                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
435                                         raid5_compute_blocknr(sh, i, 0),
436                                         sh->dev[i].log_checksum, 0, false);
437                 r5l_append_payload_page(log, sh->dev[i].page);
438         }
439
440         if (sh->qd_idx >= 0) {
441                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
442                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
443                                         sh->dev[sh->qd_idx].log_checksum, true);
444                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
445                 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
446         } else {
447                 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
448                                         sh->sector, sh->dev[sh->pd_idx].log_checksum,
449                                         0, false);
450                 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
451         }
452
453         list_add_tail(&sh->log_list, &io->stripe_list);
454         atomic_inc(&io->pending_stripe);
455         sh->log_io = io;
456 }
457
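/*
 * Worked example of the meta_size computation in r5l_log_stripe() and
 * r5l_write_stripe(), assuming a hypothetical RAID6 stripe with 4 data pages
 * to write plus P and Q (parity_pages = 2):
 *
 *   meta_size = 4 * (sizeof(struct r5l_payload_data_parity) + 4)
 *             + sizeof(struct r5l_payload_data_parity) + 2 * 4
 *
 * i.e. one payload descriptor with one checksum per data page, plus a single
 * parity payload carrying both the P and Q checksums. The log space reserved
 * for the stripe is (1 + write_disks) blocks: the meta block plus one block
 * per data/parity page.
 */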
458 /*
459  * Runs in raid5d; reclaim could in turn wait for raid5d (when it flushes
460  * data from the log to the raid disks), so we shouldn't wait for reclaim here.
461  */
462 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
463 {
464         int write_disks = 0;
465         int data_pages, parity_pages;
466         int meta_size;
467         int reserve;
468         int i;
469
470         if (!log)
471                 return -EAGAIN;
472         /* Don't support stripe batch */
473         if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
474             test_bit(STRIPE_SYNCING, &sh->state)) {
475                 /* the stripe is written to log, we start writing it to raid */
476                 clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
477                 return -EAGAIN;
478         }
479
480         for (i = 0; i < sh->disks; i++) {
481                 void *addr;
482
483                 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
484                         continue;
485                 write_disks++;
486                 /* checksum is already calculated in last run */
487                 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
488                         continue;
489                 addr = kmap_atomic(sh->dev[i].page);
490                 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
491                                                     addr, PAGE_SIZE);
492                 kunmap_atomic(addr);
493         }
494         parity_pages = 1 + !!(sh->qd_idx >= 0);
495         data_pages = write_disks - parity_pages;
496
497         meta_size =
498                 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
499                  * data_pages) +
500                 sizeof(struct r5l_payload_data_parity) +
501                 sizeof(__le32) * parity_pages;
502         /* Doesn't work with very big raid array */
503         if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
504                 return -EINVAL;
505
506         set_bit(STRIPE_LOG_TRAPPED, &sh->state);
507         atomic_inc(&sh->count);
508
509         mutex_lock(&log->io_mutex);
510         /* meta + data */
511         reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
512         if (r5l_has_free_space(log, reserve))
513                 r5l_log_stripe(log, sh, data_pages, parity_pages);
514         else {
515                 spin_lock(&log->no_space_stripes_lock);
516                 list_add_tail(&sh->log_list, &log->no_space_stripes);
517                 spin_unlock(&log->no_space_stripes_lock);
518
519                 r5l_wake_reclaim(log, reserve);
520         }
521         mutex_unlock(&log->io_mutex);
522
523         return 0;
524 }
525
526 void r5l_write_stripe_run(struct r5l_log *log)
527 {
528         if (!log)
529                 return;
530         mutex_lock(&log->io_mutex);
531         r5l_submit_current_io(log);
532         mutex_unlock(&log->io_mutex);
533 }
534
535 /* This will run after log space is reclaimed */
536 static void r5l_run_no_space_stripes(struct r5l_log *log)
537 {
538         struct stripe_head *sh;
539
540         spin_lock(&log->no_space_stripes_lock);
541         while (!list_empty(&log->no_space_stripes)) {
542                 sh = list_first_entry(&log->no_space_stripes,
543                                       struct stripe_head, log_list);
544                 list_del_init(&sh->log_list);
545                 set_bit(STRIPE_HANDLE, &sh->state);
546                 raid5_release_stripe(sh);
547         }
548         spin_unlock(&log->no_space_stripes_lock);
549 }
550
551 void r5l_stripe_write_finished(struct stripe_head *sh)
552 {
553         struct r5l_io_unit *io;
554
555         /* Don't support stripe batch */
556         io = sh->log_io;
557         if (!io)
558                 return;
559         sh->log_io = NULL;
560
561         if (atomic_dec_and_test(&io->pending_stripe))
562                 r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
563 }
564
565 static void r5l_log_flush_endio(struct bio *bio)
566 {
567         struct r5l_log *log = container_of(bio, struct r5l_log,
568                 flush_bio);
569         unsigned long flags;
570         struct r5l_io_unit *io;
571         struct stripe_head *sh;
572
573         spin_lock_irqsave(&log->io_list_lock, flags);
574         list_for_each_entry(io, &log->flushing_ios, log_sibling) {
575                 while (!list_empty(&io->stripe_list)) {
576                         sh = list_first_entry(&io->stripe_list,
577                                 struct stripe_head, log_list);
578                         list_del_init(&sh->log_list);
579                         set_bit(STRIPE_HANDLE, &sh->state);
580                         raid5_release_stripe(sh);
581                 }
582         }
583         list_splice_tail_init(&log->flushing_ios, &log->flushed_ios);
584         spin_unlock_irqrestore(&log->io_list_lock, flags);
585 }
586
587 /*
588  * Start dispatching IO to the raid disks.
589  * The log consists of io_units (meta blocks plus data/parity). A broken meta
590  * block in the middle of the log prevents recovery from finding the meta at
591  * the head of the log. If an operation requires the meta at the head to be
592  * persistent in the log, the meta before it must be persistent too. A case:
593  *
594  * stripe data/parity is in the log and we start writing the stripe to the
595  * raid disks; the data/parity must be persistent in the log before we write.
596  *
597  * The solution is to strictly maintain io_unit list order: an io_unit's
598  * stripes are only written to the raid disks once it and all earlier
599  * io_units have their data/parity persistent in the log.
600  */
601 void r5l_flush_stripe_to_raid(struct r5l_log *log)
602 {
603         bool do_flush;
604         if (!log)
605                 return;
606
607         spin_lock_irq(&log->io_list_lock);
608         /* flush bio is running */
609         if (!list_empty(&log->flushing_ios)) {
610                 spin_unlock_irq(&log->io_list_lock);
611                 return;
612         }
613         list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
614         do_flush = !list_empty(&log->flushing_ios);
615         spin_unlock_irq(&log->io_list_lock);
616
617         if (!do_flush)
618                 return;
619         bio_reset(&log->flush_bio);
620         log->flush_bio.bi_bdev = log->rdev->bdev;
621         log->flush_bio.bi_end_io = r5l_log_flush_endio;
622         submit_bio(WRITE_FLUSH, &log->flush_bio);
623 }
624
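/*
 * A sketch of the resulting flush state machine, as implemented by
 * r5l_flush_stripe_to_raid() and r5l_log_flush_endio() (illustrative only):
 *
 *   1. raid5d calls r5l_flush_stripe_to_raid(); io_units whose log writes have
 *      completed are moved from io_end_ios to flushing_ios and a single
 *      WRITE_FLUSH bio is submitted to the log device.
 *   2. r5l_log_flush_endio() runs when the flush completes; every stripe of
 *      every flushing io_unit is marked STRIPE_HANDLE and released, and the
 *      io_units move to flushed_ios.
 *   3. Once a stripe finishes its RAID writes, r5l_stripe_write_finished()
 *      eventually drives its io_unit to IO_UNIT_STRIPE_END.
 */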
625 static void r5l_kick_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
626 {
627         md_wakeup_thread(log->rdev->mddev->thread);
628         wait_event(io->wait_state, io->state >= IO_UNIT_STRIPE_END);
629 }
630
631 static void r5l_write_super(struct r5l_log *log, sector_t cp);
632 static void r5l_do_reclaim(struct r5l_log *log)
633 {
634         struct r5l_io_unit *io, *last;
635         LIST_HEAD(list);
636         sector_t free = 0;
637         sector_t reclaim_target = xchg(&log->reclaim_target, 0);
638
639         spin_lock_irq(&log->io_list_lock);
640         /*
641          * Move the appropriate io_units to the reclaim list. We should not
642          * change the order: reclaimable and unreclaimable io_units can be mixed
643          * in the list, and we shouldn't reuse the space of an unreclaimable io_unit.
644          */
645         while (1) {
646                 struct list_head *target_list = NULL;
647
648                 while (!list_empty(&log->stripe_end_ios)) {
649                         io = list_first_entry(&log->stripe_end_ios,
650                                               struct r5l_io_unit, log_sibling);
651                         list_move_tail(&io->log_sibling, &list);
652                         free += r5l_ring_distance(log, io->log_start,
653                                                   io->log_end);
654                 }
655
656                 if (free >= reclaim_target ||
657                     (list_empty(&log->running_ios) &&
658                      list_empty(&log->io_end_ios) &&
659                      list_empty(&log->flushing_ios) &&
660                      list_empty(&log->flushed_ios)))
661                         break;
662
663                 /* The waiting below mostly happens when we shut down the raid */
664                 if (!list_empty(&log->flushed_ios))
665                         target_list = &log->flushed_ios;
666                 else if (!list_empty(&log->flushing_ios))
667                         target_list = &log->flushing_ios;
668                 else if (!list_empty(&log->io_end_ios))
669                         target_list = &log->io_end_ios;
670                 else if (!list_empty(&log->running_ios))
671                         target_list = &log->running_ios;
672
673                 io = list_first_entry(target_list,
674                                       struct r5l_io_unit, log_sibling);
675                 spin_unlock_irq(&log->io_list_lock);
676                 /* nobody else can delete the io, we are safe */
677                 r5l_kick_io_unit(log, io);
678                 spin_lock_irq(&log->io_list_lock);
679         }
680         spin_unlock_irq(&log->io_list_lock);
681
682         if (list_empty(&list))
683                 return;
684
685         /* the super always points to the last valid meta */
686         last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
687         /*
688          * write_super will flush the cache of each raid disk. We must write the
689          * super here, because the log area might be reused soon and we don't
690          * want to confuse recovery.
691          */
692         r5l_write_super(log, last->log_start);
693
694         mutex_lock(&log->io_mutex);
695         log->last_checkpoint = last->log_start;
696         log->last_cp_seq = last->seq;
697         mutex_unlock(&log->io_mutex);
698         r5l_run_no_space_stripes(log);
699
700         while (!list_empty(&list)) {
701                 io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
702                 list_del(&io->log_sibling);
703                 r5l_free_io_unit(log, io);
704         }
705 }
706
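/*
 * Illustrative reclaim outcome (hypothetical numbers): if the log tail
 * (last_checkpoint) sits at sector 1024 and the last reclaimed io_unit's
 * log_start is 4096, r5l_do_reclaim() writes the super to point at 4096,
 * advances last_checkpoint to 4096, and the ring space between 1024 and 4096
 * becomes available again for r5l_write_stripe().
 */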
707 static void r5l_reclaim_thread(struct md_thread *thread)
708 {
709         struct mddev *mddev = thread->mddev;
710         struct r5conf *conf = mddev->private;
711         struct r5l_log *log = conf->log;
712
713         if (!log)
714                 return;
715         r5l_do_reclaim(log);
716 }
717
718 static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
719 {
720         unsigned long target;
721         unsigned long new = (unsigned long)space; /* overflow in theory */
722
723         do {
724                 target = log->reclaim_target;
725                 if (new < target)
726                         return;
727         } while (cmpxchg(&log->reclaim_target, target, new) != target);
728         md_wakeup_thread(log->reclaim_thread);
729 }
730
731 struct r5l_recovery_ctx {
732         struct page *meta_page;         /* current meta */
733         sector_t meta_total_blocks;     /* total size of current meta and data */
734         sector_t pos;                   /* recovery position */
735         u64 seq;                        /* recovery position seq */
736 };
737
738 static int r5l_read_meta_block(struct r5l_log *log,
739                                struct r5l_recovery_ctx *ctx)
740 {
741         struct page *page = ctx->meta_page;
742         struct r5l_meta_block *mb;
743         u32 crc, stored_crc;
744
745         if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
746                 return -EIO;
747
748         mb = page_address(page);
749         stored_crc = le32_to_cpu(mb->checksum);
750         mb->checksum = 0;
751
752         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
753             le64_to_cpu(mb->seq) != ctx->seq ||
754             mb->version != R5LOG_VERSION ||
755             le64_to_cpu(mb->position) != ctx->pos)
756                 return -EINVAL;
757
758         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
759         if (stored_crc != crc)
760                 return -EINVAL;
761
762         if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
763                 return -EINVAL;
764
765         ctx->meta_total_blocks = BLOCK_SECTORS;
766
767         return 0;
768 }
769
770 static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
771                                          struct r5l_recovery_ctx *ctx,
772                                          sector_t stripe_sect,
773                                          int *offset, sector_t *log_offset)
774 {
775         struct r5conf *conf = log->rdev->mddev->private;
776         struct stripe_head *sh;
777         struct r5l_payload_data_parity *payload;
778         int disk_index;
779
780         sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
781         while (1) {
782                 payload = page_address(ctx->meta_page) + *offset;
783
784                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
785                         raid5_compute_sector(conf,
786                                              le64_to_cpu(payload->location), 0,
787                                              &disk_index, sh);
788
789                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
790                                      sh->dev[disk_index].page, READ, false);
791                         sh->dev[disk_index].log_checksum =
792                                 le32_to_cpu(payload->checksum[0]);
793                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
794                         ctx->meta_total_blocks += BLOCK_SECTORS;
795                 } else {
796                         disk_index = sh->pd_idx;
797                         sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
798                                      sh->dev[disk_index].page, READ, false);
799                         sh->dev[disk_index].log_checksum =
800                                 le32_to_cpu(payload->checksum[0]);
801                         set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
802
803                         if (sh->qd_idx >= 0) {
804                                 disk_index = sh->qd_idx;
805                                 sync_page_io(log->rdev,
806                                              r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
807                                              PAGE_SIZE, sh->dev[disk_index].page,
808                                              READ, false);
809                                 sh->dev[disk_index].log_checksum =
810                                         le32_to_cpu(payload->checksum[1]);
811                                 set_bit(R5_Wantwrite,
812                                         &sh->dev[disk_index].flags);
813                         }
814                         ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
815                 }
816
817                 *log_offset = r5l_ring_add(log, *log_offset,
818                                            le32_to_cpu(payload->size));
819                 *offset += sizeof(struct r5l_payload_data_parity) +
820                         sizeof(__le32) *
821                         (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
822                 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
823                         break;
824         }
825
826         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
827                 void *addr;
828                 u32 checksum;
829
830                 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
831                         continue;
832                 addr = kmap_atomic(sh->dev[disk_index].page);
833                 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
834                 kunmap_atomic(addr);
835                 if (checksum != sh->dev[disk_index].log_checksum)
836                         goto error;
837         }
838
839         for (disk_index = 0; disk_index < sh->disks; disk_index++) {
840                 struct md_rdev *rdev, *rrdev;
841
842                 if (!test_and_clear_bit(R5_Wantwrite,
843                                         &sh->dev[disk_index].flags))
844                         continue;
845
846                 /* in case device is broken */
847                 rdev = rcu_dereference(conf->disks[disk_index].rdev);
848                 if (rdev)
849                         sync_page_io(rdev, stripe_sect, PAGE_SIZE,
850                                      sh->dev[disk_index].page, WRITE, false);
851                 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
852                 if (rrdev)
853                         sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
854                                      sh->dev[disk_index].page, WRITE, false);
855         }
856         raid5_release_stripe(sh);
857         return 0;
858
859 error:
860         for (disk_index = 0; disk_index < sh->disks; disk_index++)
861                 sh->dev[disk_index].flags = 0;
862         raid5_release_stripe(sh);
863         return -EINVAL;
864 }
865
866 static int r5l_recovery_flush_one_meta(struct r5l_log *log,
867                                        struct r5l_recovery_ctx *ctx)
868 {
869         struct r5conf *conf = log->rdev->mddev->private;
870         struct r5l_payload_data_parity *payload;
871         struct r5l_meta_block *mb;
872         int offset;
873         sector_t log_offset;
874         sector_t stripe_sector;
875
876         mb = page_address(ctx->meta_page);
877         offset = sizeof(struct r5l_meta_block);
878         log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
879
880         while (offset < le32_to_cpu(mb->meta_size)) {
881                 int dd;
882
883                 payload = (void *)mb + offset;
884                 stripe_sector = raid5_compute_sector(conf,
885                                                      le64_to_cpu(payload->location), 0, &dd, NULL);
886                 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
887                                                   &offset, &log_offset))
888                         return -EINVAL;
889         }
890         return 0;
891 }
892
893 /* copy data/parity from log to raid disks */
894 static void r5l_recovery_flush_log(struct r5l_log *log,
895                                    struct r5l_recovery_ctx *ctx)
896 {
897         while (1) {
898                 if (r5l_read_meta_block(log, ctx))
899                         return;
900                 if (r5l_recovery_flush_one_meta(log, ctx))
901                         return;
902                 ctx->seq++;
903                 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
904         }
905 }
906
907 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
908                                           u64 seq)
909 {
910         struct page *page;
911         struct r5l_meta_block *mb;
912         u32 crc;
913
914         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
915         if (!page)
916                 return -ENOMEM;
917         mb = page_address(page);
918         mb->magic = cpu_to_le32(R5LOG_MAGIC);
919         mb->version = R5LOG_VERSION;
920         mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
921         mb->seq = cpu_to_le64(seq);
922         mb->position = cpu_to_le64(pos);
923         crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
924         mb->checksum = cpu_to_le32(crc);
925
926         if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
927                 __free_page(page);
928                 return -EIO;
929         }
930         __free_page(page);
931         return 0;
932 }
933
934 static int r5l_recovery_log(struct r5l_log *log)
935 {
936         struct r5l_recovery_ctx ctx;
937
938         ctx.pos = log->last_checkpoint;
939         ctx.seq = log->last_cp_seq;
940         ctx.meta_page = alloc_page(GFP_KERNEL);
941         if (!ctx.meta_page)
942                 return -ENOMEM;
943
944         r5l_recovery_flush_log(log, &ctx);
945         __free_page(ctx.meta_page);
946
947         /*
948          * We did a recovery. Now ctx.pos points to an invalid meta block, and
949          * the new log will start here. But we can't let the superblock point to
950          * the last valid meta block. The log might look like:
951          * | meta 1 | meta 2 | meta 3 |
952          * meta 1 is valid, meta 2 is invalid and meta 3 could be valid. If the
953          * superblock points to meta 1 and we write a new valid meta 2n, a later
954          * crash makes recovery start from meta 1 again; since meta 2n is now
955          * valid, recovery would think meta 3 is valid too, which is wrong. The
956          * solution is to create the new meta at meta 2's position with seq ==
957          * meta 1's seq + 10 and let the superblock point to meta 2. Recovery
958          * then won't treat meta 3 as valid, because its seq doesn't match.
959          */
960         if (ctx.seq > log->last_cp_seq + 1) {
961                 int ret;
962
963                 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
964                 if (ret)
965                         return ret;
966                 log->seq = ctx.seq + 11;
967                 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
968                 r5l_write_super(log, ctx.pos);
969         } else {
970                 log->log_start = ctx.pos;
971                 log->seq = ctx.seq;
972         }
973         return 0;
974 }
975
976 static void r5l_write_super(struct r5l_log *log, sector_t cp)
977 {
978         struct mddev *mddev = log->rdev->mddev;
979
980         log->rdev->journal_tail = cp;
981         set_bit(MD_CHANGE_DEVS, &mddev->flags);
982 }
983
984 static int r5l_load_log(struct r5l_log *log)
985 {
986         struct md_rdev *rdev = log->rdev;
987         struct page *page;
988         struct r5l_meta_block *mb;
989         sector_t cp = log->rdev->journal_tail;
990         u32 stored_crc, expected_crc;
991         bool create_super = false;
992         int ret;
993
994         /* Make sure it's valid */
995         if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
996                 cp = 0;
997         page = alloc_page(GFP_KERNEL);
998         if (!page)
999                 return -ENOMEM;
1000
1001         if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
1002                 ret = -EIO;
1003                 goto ioerr;
1004         }
1005         mb = page_address(page);
1006
1007         if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
1008             mb->version != R5LOG_VERSION) {
1009                 create_super = true;
1010                 goto create;
1011         }
1012         stored_crc = le32_to_cpu(mb->checksum);
1013         mb->checksum = 0;
1014         expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1015         if (stored_crc != expected_crc) {
1016                 create_super = true;
1017                 goto create;
1018         }
1019         if (le64_to_cpu(mb->position) != cp) {
1020                 create_super = true;
1021                 goto create;
1022         }
1023 create:
1024         if (create_super) {
1025                 log->last_cp_seq = prandom_u32();
1026                 cp = 0;
1027                 /*
1028                  * Make sure the super points to the correct address. The log
1029                  * might have data very soon. If the super doesn't have the
1030                  * correct log tail address, recovery can't find the log.
1031                  */
1032                 r5l_write_super(log, cp);
1033         } else
1034                 log->last_cp_seq = le64_to_cpu(mb->seq);
1035
1036         log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
1037         log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
1038         if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
1039                 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1040         log->last_checkpoint = cp;
1041
1042         __free_page(page);
1043
1044         return r5l_recovery_log(log);
1045 ioerr:
1046         __free_page(page);
1047         return ret;
1048 }
1049
1050 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1051 {
1052         struct r5l_log *log;
1053
1054         if (PAGE_SIZE != 4096)
1055                 return -EINVAL;
1056         log = kzalloc(sizeof(*log), GFP_KERNEL);
1057         if (!log)
1058                 return -ENOMEM;
1059         log->rdev = rdev;
1060
1061         log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1062                                        sizeof(rdev->mddev->uuid));
1063
1064         mutex_init(&log->io_mutex);
1065
1066         spin_lock_init(&log->io_list_lock);
1067         INIT_LIST_HEAD(&log->running_ios);
1068         INIT_LIST_HEAD(&log->io_end_ios);
1069         INIT_LIST_HEAD(&log->stripe_end_ios);
1070         INIT_LIST_HEAD(&log->flushing_ios);
1071         INIT_LIST_HEAD(&log->flushed_ios);
1072         bio_init(&log->flush_bio);
1073
1074         log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1075         if (!log->io_kc)
1076                 goto io_kc;
1077
1078         log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
1079                                                  log->rdev->mddev, "reclaim");
1080         if (!log->reclaim_thread)
1081                 goto reclaim_thread;
1082
1083         INIT_LIST_HEAD(&log->no_space_stripes);
1084         spin_lock_init(&log->no_space_stripes_lock);
1085
1086         if (r5l_load_log(log))
1087                 goto error;
1088
1089         conf->log = log;
1090         return 0;
1091 error:
1092         md_unregister_thread(&log->reclaim_thread);
1093 reclaim_thread:
1094         kmem_cache_destroy(log->io_kc);
1095 io_kc:
1096         kfree(log);
1097         return -EINVAL;
1098 }
1099
1100 void r5l_exit_log(struct r5l_log *log)
1101 {
1102         /*
1103          * at this point all stripes are finished, so every io_unit is at least
1104          * in STRIPE_END state
1105          */
1106         r5l_wake_reclaim(log, -1L);
1107         md_unregister_thread(&log->reclaim_thread);
1108         r5l_do_reclaim(log);
1109         /*
1110          * force a super update; r5l_do_reclaim might have updated the super
1111          * and mddev->thread is already stopped
1112          */
1113         md_update_sb(log->rdev->mddev, 1);
1114
1115         kmem_cache_destroy(log->io_kc);
1116         kfree(log);
1117 }