fs/btrfs/raid56.c

   1 /*
   2  * Copyright (C) 2012 Fusion-io  All rights reserved.
   3  * Copyright (C) 2012 Intel Corp. All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public
   7  * License v2 as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public
  15  * License along with this program; if not, write to the
  16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17  * Boston, MA 021110-1307, USA.
  18  */
  19 #include <linux/sched.h>
  20 #include <linux/wait.h>
  21 #include <linux/bio.h>
  22 #include <linux/slab.h>
  23 #include <linux/buffer_head.h>
  24 #include <linux/blkdev.h>
  25 #include <linux/random.h>
  26 #include <linux/iocontext.h>
  27 #include <linux/capability.h>
  28 #include <linux/ratelimit.h>
  29 #include <linux/kthread.h>
  30 #include <linux/raid/pq.h>
  31 #include <linux/hash.h>
  32 #include <linux/list_sort.h>
  33 #include <linux/raid/xor.h>
  34 #include <linux/vmalloc.h>
  35 #include <asm/div64.h>
  36 #include "ctree.h"
  37 #include "extent_map.h"
  38 #include "disk-io.h"
  39 #include "transaction.h"
  40 #include "print-tree.h"
  41 #include "volumes.h"
  42 #include "raid56.h"
  43 #include "async-thread.h"
  44 #include "check-integrity.h"
  45 #include "rcu-string.h"
  46
  47 /* set when additional merges to this rbio are not allowed */
  48 #define RBIO_RMW_LOCKED_BIT     1
  49
  50 /*
  51  * set when this rbio is sitting in the hash, but it is just a cache
  52  * of past RMW
  53  */
  54 #define RBIO_CACHE_BIT          2
  55
  56 /*
  57  * set when it is safe to trust the stripe_pages for caching
  58  */
  59 #define RBIO_CACHE_READY_BIT    3
  60
  61 /*
  62  * bbio and raid_map is managed by the caller, so we shouldn't free
  63  * them here. And besides that, all rbios with this flag should not
  64  * be cached, because we need raid_map to check the rbios' stripe
  65  * is the same or not, but it is very likely that the caller has
  66  * free raid_map, so don't cache those rbios.
  67  */
  68 #define RBIO_HOLD_BBIO_MAP_BIT  4
  69
  70 #define RBIO_CACHE_SIZE 1024
  71
  72 struct btrfs_raid_bio {
  73         struct btrfs_fs_info *fs_info;
  74         struct btrfs_bio *bbio;
  75
  76         /*
  77          * logical block numbers for the start of each stripe
  78          * The last one or two are p/q.  These are sorted,
  79          * so raid_map[0] is the start of our full stripe
  80          */
  81         u64 *raid_map;
  82
  83         /* while we're doing rmw on a stripe
  84          * we put it into a hash table so we can
  85          * lock the stripe and merge more rbios
  86          * into it.
  87          */
  88         struct list_head hash_list;
  89
  90         /*
  91          * LRU list for the stripe cache
  92          */
  93         struct list_head stripe_cache;
  94
  95         /*
  96          * for scheduling work in the helper threads
  97          */
  98         struct btrfs_work work;
  99
 100         /*
 101          * bio list and bio_list_lock are used
 102          * to add more bios into the stripe
 103          * in hopes of avoiding the full rmw
 104          */
 105         struct bio_list bio_list;
 106         spinlock_t bio_list_lock;
 107
 108         /* also protected by the bio_list_lock, the
 109          * plug list is used by the plugging code
 110          * to collect partial bios while plugged.  The
 111          * stripe locking code also uses it to hand off
 112          * the stripe lock to the next pending IO
 113          */
 114         struct list_head plug_list;
 115
 116         /*
 117          * flags that tell us if it is safe to
 118          * merge with this bio
 119          */
 120         unsigned long flags;
 121
 122         /* size of each individual stripe on disk */
 123         int stripe_len;
 124
 125         /* number of data stripes (no p/q) */
 126         int nr_data;
 127
 128         /*
 129          * set if we're doing a parity rebuild
 130          * for a read from higher up, which is handled
 131          * differently from a parity rebuild as part of
 132          * rmw
 133          */
 134         int read_rebuild;
 135
 136         /* first bad stripe */
 137         int faila;
 138
 139         /* second bad stripe (for raid6 use) */
 140         int failb;
 141
 142         /*
 143          * number of pages needed to represent the full
 144          * stripe
 145          */
 146         int nr_pages;
 147
 148         /*
 149          * size of all the bios in the bio_list.  This
 150          * helps us decide if the rbio maps to a full
 151          * stripe or not
 152          */
 153         int bio_list_bytes;
 154
 155         atomic_t refs;
 156
 157
 158         atomic_t stripes_pending;
 159
 160         atomic_t error;
 161         /*
 162          * these are two arrays of pointers.  We allocate the
 163          * rbio big enough to hold them both and setup their
 164          * locations when the rbio is allocated
 165          */
 166
 167         /* pointers to pages that we allocated for
 168          * reading/writing stripes directly from the disk (including P/Q)
 169          */
 170         struct page **stripe_pages;
 171
 172         /*
 173          * pointers to the pages in the bio_list.  Stored
 174          * here for faster lookup
 175          */
 176         struct page **bio_pages;
 177 };
 178
 179 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 180 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 181 static void rmw_work(struct btrfs_work *work);
 182 static void read_rebuild_work(struct btrfs_work *work);
 183 static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
 184 static void async_read_rebuild(struct btrfs_raid_bio *rbio);
 185 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 186 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 187 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 188 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 189 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 190
 191 /*
 192  * the stripe hash table is used for locking, and to collect
 193  * bios in hopes of making a full stripe
 194  */
 195 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 196 {
 197         struct btrfs_stripe_hash_table *table;
 198         struct btrfs_stripe_hash_table *x;
 199         struct btrfs_stripe_hash *cur;
 200         struct btrfs_stripe_hash *h;
 201         int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 202         int i;
 203         int table_size;
 204
 205         if (info->stripe_hash_table)
 206                 return 0;
 207
 208         /*
 209          * The table is large, starting with order 4 and can go as high as
 210          * order 7 in case lock debugging is turned on.
 211          *
 212          * Try harder to allocate and fallback to vmalloc to lower the chance
 213          * of a failing mount.
 214          */
 215         table_size = sizeof(*table) + sizeof(*h) * num_entries;
 216         table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
 217         if (!table) {
 218                 table = vzalloc(table_size);
 219                 if (!table)
 220                         return -ENOMEM;
 221         }
 222
 223         spin_lock_init(&table->cache_lock);
 224         INIT_LIST_HEAD(&table->stripe_cache);
 225
 226         h = table->table;
 227
 228         for (i = 0; i < num_entries; i++) {
 229                 cur = h + i;
 230                 INIT_LIST_HEAD(&cur->hash_list);
 231                 spin_lock_init(&cur->lock);
 232                 init_waitqueue_head(&cur->wait);
 233         }
 234
 235         x = cmpxchg(&info->stripe_hash_table, NULL, table);
 236         if (x) {
 237                 if (is_vmalloc_addr(x))
 238                         vfree(x);
 239                 else
 240                         kfree(x);
 241         }
 242         return 0;
 243 }
 244
 245 /*
 246  * caching an rbio means to copy anything from the
 247  * bio_pages array into the stripe_pages array.  We
 248  * use the page uptodate bit in the stripe cache array
 249  * to indicate if it has valid data
 250  *
 251  * once the caching is done, we set the cache ready
 252  * bit.
 253  */
 254 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 255 {
 256         int i;
 257         char *s;
 258         char *d;
 259         int ret;
 260
 261         ret = alloc_rbio_pages(rbio);
 262         if (ret)
 263                 return;
 264
 265         for (i = 0; i < rbio->nr_pages; i++) {
 266                 if (!rbio->bio_pages[i])
 267                         continue;
 268
 269                 s = kmap(rbio->bio_pages[i]);
 270                 d = kmap(rbio->stripe_pages[i]);
 271
 272                 memcpy(d, s, PAGE_CACHE_SIZE);
 273
 274                 kunmap(rbio->bio_pages[i]);
 275                 kunmap(rbio->stripe_pages[i]);
 276                 SetPageUptodate(rbio->stripe_pages[i]);
 277         }
 278         set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 279 }
 280
 281 /*
 282  * we hash on the first logical address of the stripe
 283  */
 284 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 285 {
 286         u64 num = rbio->raid_map[0];
 287
 288         /*
 289          * we shift down quite a bit.  We're using byte
 290          * addressing, and most of the lower bits are zeros.
 291          * This tends to upset hash_64, and it consistently
 292          * returns just one or two different values.
 293          *
 294          * shifting off the lower bits fixes things.
 295          */
 296         return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 297 }
 298
 299 /*
 300  * stealing an rbio means taking all the uptodate pages from the stripe
 301  * array in the source rbio and putting them into the destination rbio
 302  */
 303 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 304 {
 305         int i;
 306         struct page *s;
 307         struct page *d;
 308
 309         if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 310                 return;
 311
 312         for (i = 0; i < dest->nr_pages; i++) {
 313                 s = src->stripe_pages[i];
 314                 if (!s || !PageUptodate(s)) {
 315                         continue;
 316                 }
 317
 318                 d = dest->stripe_pages[i];
 319                 if (d)
 320                         __free_page(d);
 321
 322                 dest->stripe_pages[i] = s;
 323                 src->stripe_pages[i] = NULL;
 324         }
 325 }
 326
 327 /*
 328  * merging means we take the bio_list from the victim and
 329  * splice it into the destination.  The victim should
 330  * be discarded afterwards.
 331  *
 332  * must be called with dest->rbio_list_lock held
 333  */
 334 static void merge_rbio(struct btrfs_raid_bio *dest,
 335                        struct btrfs_raid_bio *victim)
 336 {
 337         bio_list_merge(&dest->bio_list, &victim->bio_list);
 338         dest->bio_list_bytes += victim->bio_list_bytes;
 339         bio_list_init(&victim->bio_list);
 340 }
 341
 342 /*
 343  * used to prune items that are in the cache.  The caller
 344  * must hold the hash table lock.
 345  */
 346 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 347 {
 348         int bucket = rbio_bucket(rbio);
 349         struct btrfs_stripe_hash_table *table;
 350         struct btrfs_stripe_hash *h;
 351         int freeit = 0;
 352
 353         /*
 354          * check the bit again under the hash table lock.
 355          */
 356         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 357                 return;
 358
 359         table = rbio->fs_info->stripe_hash_table;
 360         h = table->table + bucket;
 361
 362         /* hold the lock for the bucket because we may be
 363          * removing it from the hash table
 364          */
 365         spin_lock(&h->lock);
 366
 367         /*
 368          * hold the lock for the bio list because we need
 369          * to make sure the bio list is empty
 370          */
 371         spin_lock(&rbio->bio_list_lock);
 372
 373         if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 374                 list_del_init(&rbio->stripe_cache);
 375                 table->cache_size -= 1;
 376                 freeit = 1;
 377
 378                 /* if the bio list isn't empty, this rbio is
 379                  * still involved in an IO.  We take it out
 380                  * of the cache list, and drop the ref that
 381                  * was held for the list.
 382                  *
 383                  * If the bio_list was empty, we also remove
 384                  * the rbio from the hash_table, and drop
 385                  * the corresponding ref
 386                  */
 387                 if (bio_list_empty(&rbio->bio_list)) {
 388                         if (!list_empty(&rbio->hash_list)) {
 389                                 list_del_init(&rbio->hash_list);
 390                                 atomic_dec(&rbio->refs);
 391                                 BUG_ON(!list_empty(&rbio->plug_list));
 392                         }
 393                 }
 394         }
 395
 396         spin_unlock(&rbio->bio_list_lock);
 397         spin_unlock(&h->lock);
 398
 399         if (freeit)
 400                 __free_raid_bio(rbio);
 401 }
 402
 403 /*
 404  * prune a given rbio from the cache
 405  */
 406 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 407 {
 408         struct btrfs_stripe_hash_table *table;
 409         unsigned long flags;
 410
 411         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 412                 return;
 413
 414         table = rbio->fs_info->stripe_hash_table;
 415
 416         spin_lock_irqsave(&table->cache_lock, flags);
 417         __remove_rbio_from_cache(rbio);
 418         spin_unlock_irqrestore(&table->cache_lock, flags);
 419 }
 420
 421 /*
 422  * remove everything in the cache
 423  */
 424 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 425 {
 426         struct btrfs_stripe_hash_table *table;
 427         unsigned long flags;
 428         struct btrfs_raid_bio *rbio;
 429
 430         table = info->stripe_hash_table;
 431
 432         spin_lock_irqsave(&table->cache_lock, flags);
 433         while (!list_empty(&table->stripe_cache)) {
 434                 rbio = list_entry(table->stripe_cache.next,
 435                                   struct btrfs_raid_bio,
 436                                   stripe_cache);
 437                 __remove_rbio_from_cache(rbio);
 438         }
 439         spin_unlock_irqrestore(&table->cache_lock, flags);
 440 }
 441
 442 /*
 443  * remove all cached entries and free the hash table
 444  * used by unmount
 445  */
 446 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 447 {
 448         if (!info->stripe_hash_table)
 449                 return;
 450         btrfs_clear_rbio_cache(info);
 451         if (is_vmalloc_addr(info->stripe_hash_table))
 452                 vfree(info->stripe_hash_table);
 453         else
 454                 kfree(info->stripe_hash_table);
 455         info->stripe_hash_table = NULL;
 456 }
 457
 458 /*
 459  * insert an rbio into the stripe cache.  It
 460  * must have already been prepared by calling
 461  * cache_rbio_pages
 462  *
 463  * If this rbio was already cached, it gets
 464  * moved to the front of the lru.
 465  *
 466  * If the size of the rbio cache is too big, we
 467  * prune an item.
 468  */
 469 static void cache_rbio(struct btrfs_raid_bio *rbio)
 470 {
 471         struct btrfs_stripe_hash_table *table;
 472         unsigned long flags;
 473
 474         if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 475                 return;
 476
 477         table = rbio->fs_info->stripe_hash_table;
 478
 479         spin_lock_irqsave(&table->cache_lock, flags);
 480         spin_lock(&rbio->bio_list_lock);
 481
 482         /* bump our ref if we were not in the list before */
 483         if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 484                 atomic_inc(&rbio->refs);
 485
 486         if (!list_empty(&rbio->stripe_cache)){
 487                 list_move(&rbio->stripe_cache, &table->stripe_cache);
 488         } else {
 489                 list_add(&rbio->stripe_cache, &table->stripe_cache);
 490                 table->cache_size += 1;
 491         }
 492
 493         spin_unlock(&rbio->bio_list_lock);
 494
 495         if (table->cache_size > RBIO_CACHE_SIZE) {
 496                 struct btrfs_raid_bio *found;
 497
 498                 found = list_entry(table->stripe_cache.prev,
 499                                   struct btrfs_raid_bio,
 500                                   stripe_cache);
 501
 502                 if (found != rbio)
 503                         __remove_rbio_from_cache(found);
 504         }
 505
 506         spin_unlock_irqrestore(&table->cache_lock, flags);
 507         return;
 508 }
 509
 510 /*
 511  * helper function to run the xor_blocks api.  It is only
 512  * able to do MAX_XOR_BLOCKS at a time, so we need to
 513  * loop through.
 514  */
 515 static void run_xor(void **pages, int src_cnt, ssize_t len)
 516 {
 517         int src_off = 0;
 518         int xor_src_cnt = 0;
 519         void *dest = pages[src_cnt];
 520
 521         while(src_cnt > 0) {
 522                 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 523                 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 524
 525                 src_cnt -= xor_src_cnt;
 526                 src_off += xor_src_cnt;
 527         }
 528 }
 529
 530 /*
 531  * returns true if the bio list inside this rbio
 532  * covers an entire stripe (no rmw required).
 533  * Must be called with the bio list lock held, or
 534  * at a time when you know it is impossible to add
 535  * new bios into the list
 536  */
 537 static int __rbio_is_full(struct btrfs_raid_bio *rbio)
 538 {
 539         unsigned long size = rbio->bio_list_bytes;
 540         int ret = 1;
 541
 542         if (size != rbio->nr_data * rbio->stripe_len)
 543                 ret = 0;
 544
 545         BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 546         return ret;
 547 }
 548
 549 static int rbio_is_full(struct btrfs_raid_bio *rbio)
 550 {
 551         unsigned long flags;
 552         int ret;
 553
 554         spin_lock_irqsave(&rbio->bio_list_lock, flags);
 555         ret = __rbio_is_full(rbio);
 556         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 557         return ret;
 558 }
 559
 560 /*
 561  * returns 1 if it is safe to merge two rbios together.
 562  * The merging is safe if the two rbios correspond to
 563  * the same stripe and if they are both going in the same
 564  * direction (read vs write), and if neither one is
 565  * locked for final IO
 566  *
 567  * The caller is responsible for locking such that
 568  * rmw_locked is safe to test
 569  */
 570 static int rbio_can_merge(struct btrfs_raid_bio *last,
 571                           struct btrfs_raid_bio *cur)
 572 {
 573         if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 574             test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 575                 return 0;
 576
 577         /*
 578          * we can't merge with cached rbios, since the
 579          * idea is that when we merge the destination
 580          * rbio is going to run our IO for us.  We can
 581          * steal from cached rbio's though, other functions
 582          * handle that.
 583          */
 584         if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 585             test_bit(RBIO_CACHE_BIT, &cur->flags))
 586                 return 0;
 587
 588         if (last->raid_map[0] !=
 589             cur->raid_map[0])
 590                 return 0;
 591
 592         /* reads can't merge with writes */
 593         if (last->read_rebuild !=
 594             cur->read_rebuild) {
 595                 return 0;
 596         }
 597
 598         return 1;
 599 }
 600
 601 /*
 602  * helper to index into the pstripe
 603  */
 604 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 605 {
 606         index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 607         return rbio->stripe_pages[index];
 608 }
 609
 610 /*
 611  * helper to index into the qstripe, returns null
 612  * if there is no qstripe
 613  */
 614 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 615 {
 616         if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
 617                 return NULL;
 618
 619         index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
 620                 PAGE_CACHE_SHIFT;
 621         return rbio->stripe_pages[index];
 622 }
 623
 624 /*
 625  * The first stripe in the table for a logical address
 626  * has the lock.  rbios are added in one of three ways:
 627  *
 628  * 1) Nobody has the stripe locked yet.  The rbio is given
 629  * the lock and 0 is returned.  The caller must start the IO
 630  * themselves.
 631  *
 632  * 2) Someone has the stripe locked, but we're able to merge
 633  * with the lock owner.  The rbio is freed and the IO will
 634  * start automatically along with the existing rbio.  1 is returned.
 635  *
 636  * 3) Someone has the stripe locked, but we're not able to merge.
 637  * The rbio is added to the lock owner's plug list, or merged into
 638  * an rbio already on the plug list.  When the lock owner unlocks,
 639  * the next rbio on the list is run and the IO is started automatically.
 640  * 1 is returned
 641  *
 642  * If we return 0, the caller still owns the rbio and must continue with
 643  * IO submission.  If we return 1, the caller must assume the rbio has
 644  * already been freed.
 645  */
 646 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 647 {
 648         int bucket = rbio_bucket(rbio);
 649         struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
 650         struct btrfs_raid_bio *cur;
 651         struct btrfs_raid_bio *pending;
 652         unsigned long flags;
 653         DEFINE_WAIT(wait);
 654         struct btrfs_raid_bio *freeit = NULL;
 655         struct btrfs_raid_bio *cache_drop = NULL;
 656         int ret = 0;
 657         int walk = 0;
 658
 659         spin_lock_irqsave(&h->lock, flags);
 660         list_for_each_entry(cur, &h->hash_list, hash_list) {
 661                 walk++;
 662                 if (cur->raid_map[0] == rbio->raid_map[0]) {
 663                         spin_lock(&cur->bio_list_lock);
 664
 665                         /* can we steal this cached rbio's pages? */
 666                         if (bio_list_empty(&cur->bio_list) &&
 667                             list_empty(&cur->plug_list) &&
 668                             test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 669                             !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 670                                 list_del_init(&cur->hash_list);
 671                                 atomic_dec(&cur->refs);
 672
 673                                 steal_rbio(cur, rbio);
 674                                 cache_drop = cur;
 675                                 spin_unlock(&cur->bio_list_lock);
 676
 677                                 goto lockit;
 678                         }
 679
 680                         /* can we merge into the lock owner? */
 681                         if (rbio_can_merge(cur, rbio)) {
 682                                 merge_rbio(cur, rbio);
 683                                 spin_unlock(&cur->bio_list_lock);
 684                                 freeit = rbio;
 685                                 ret = 1;
 686                                 goto out;
 687                         }
 688
 689
 690                         /*
 691                          * we couldn't merge with the running
 692                          * rbio, see if we can merge with the
 693                          * pending ones.  We don't have to
 694                          * check for rmw_locked because there
 695                          * is no way they are inside finish_rmw
 696                          * right now
 697                          */
 698                         list_for_each_entry(pending, &cur->plug_list,
 699                                             plug_list) {
 700                                 if (rbio_can_merge(pending, rbio)) {
 701                                         merge_rbio(pending, rbio);
 702                                         spin_unlock(&cur->bio_list_lock);
 703                                         freeit = rbio;
 704                                         ret = 1;
 705                                         goto out;
 706                                 }
 707                         }
 708
 709                         /* no merging, put us on the tail of the plug list,
 710                          * our rbio will be started with the currently
 711                          * running rbio unlocks
 712                          */
 713                         list_add_tail(&rbio->plug_list, &cur->plug_list);
 714                         spin_unlock(&cur->bio_list_lock);
 715                         ret = 1;
 716                         goto out;
 717                 }
 718         }
 719 lockit:
 720         atomic_inc(&rbio->refs);
 721         list_add(&rbio->hash_list, &h->hash_list);
 722 out:
 723         spin_unlock_irqrestore(&h->lock, flags);
 724         if (cache_drop)
 725                 remove_rbio_from_cache(cache_drop);
 726         if (freeit)
 727                 __free_raid_bio(freeit);
 728         return ret;
 729 }
 730
 731 /*
 732  * called as rmw or parity rebuild is completed.  If the plug list has more
 733  * rbios waiting for this stripe, the next one on the list will be started
 734  */
 735 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 736 {
 737         int bucket;
 738         struct btrfs_stripe_hash *h;
 739         unsigned long flags;
 740         int keep_cache = 0;
 741
 742         bucket = rbio_bucket(rbio);
 743         h = rbio->fs_info->stripe_hash_table->table + bucket;
 744
 745         if (list_empty(&rbio->plug_list))
 746                 cache_rbio(rbio);
 747
 748         spin_lock_irqsave(&h->lock, flags);
 749         spin_lock(&rbio->bio_list_lock);
 750
 751         if (!list_empty(&rbio->hash_list)) {
 752                 /*
 753                  * if we're still cached and there is no other IO
 754                  * to perform, just leave this rbio here for others
 755                  * to steal from later
 756                  */
 757                 if (list_empty(&rbio->plug_list) &&
 758                     test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 759                         keep_cache = 1;
 760                         clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 761                         BUG_ON(!bio_list_empty(&rbio->bio_list));
 762                         goto done;
 763                 }
 764
 765                 list_del_init(&rbio->hash_list);
 766                 atomic_dec(&rbio->refs);
 767
 768                 /*
 769                  * we use the plug list to hold all the rbios
 770                  * waiting for the chance to lock this stripe.
 771                  * hand the lock over to one of them.
 772                  */
 773                 if (!list_empty(&rbio->plug_list)) {
 774                         struct btrfs_raid_bio *next;
 775                         struct list_head *head = rbio->plug_list.next;
 776
 777                         next = list_entry(head, struct btrfs_raid_bio,
 778                                           plug_list);
 779
 780                         list_del_init(&rbio->plug_list);
 781
 782                         list_add(&next->hash_list, &h->hash_list);
 783                         atomic_inc(&next->refs);
 784                         spin_unlock(&rbio->bio_list_lock);
 785                         spin_unlock_irqrestore(&h->lock, flags);
 786
 787                         if (next->read_rebuild)
 788                                 async_read_rebuild(next);
 789                         else {
 790                                 steal_rbio(rbio, next);
 791                                 async_rmw_stripe(next);
 792                         }
 793
 794                         goto done_nolock;
 795                 } else  if (waitqueue_active(&h->wait)) {
 796                         spin_unlock(&rbio->bio_list_lock);
 797                         spin_unlock_irqrestore(&h->lock, flags);
 798                         wake_up(&h->wait);
 799                         goto done_nolock;
 800                 }
 801         }
 802 done:
 803         spin_unlock(&rbio->bio_list_lock);
 804         spin_unlock_irqrestore(&h->lock, flags);
 805
 806 done_nolock:
 807         if (!keep_cache)
 808                 remove_rbio_from_cache(rbio);
 809 }
 810
 811 static inline void
 812 __free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
 813 {
 814         if (need) {
 815                 kfree(raid_map);
 816                 kfree(bbio);
 817         }
 818 }
 819
 820 static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
 821 {
 822         __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
 823                         !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
 824 }
 825
 826 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 827 {
 828         int i;
 829
 830         WARN_ON(atomic_read(&rbio->refs) < 0);
 831         if (!atomic_dec_and_test(&rbio->refs))
 832                 return;
 833
 834         WARN_ON(!list_empty(&rbio->stripe_cache));
 835         WARN_ON(!list_empty(&rbio->hash_list));
 836         WARN_ON(!bio_list_empty(&rbio->bio_list));
 837
 838         for (i = 0; i < rbio->nr_pages; i++) {
 839                 if (rbio->stripe_pages[i]) {
 840                         __free_page(rbio->stripe_pages[i]);
 841                         rbio->stripe_pages[i] = NULL;
 842                 }
 843         }
 844
 845         free_bbio_and_raid_map(rbio);
 846
 847         kfree(rbio);
 848 }
 849
 850 static void free_raid_bio(struct btrfs_raid_bio *rbio)
 851 {
 852         unlock_stripe(rbio);
 853         __free_raid_bio(rbio);
 854 }
 855
 856 /*
 857  * this frees the rbio and runs through all the bios in the
 858  * bio_list and calls end_io on them
 859  */
 860 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
 861 {
 862         struct bio *cur = bio_list_get(&rbio->bio_list);
 863         struct bio *next;
 864         free_raid_bio(rbio);
 865
 866         while (cur) {
 867                 next = cur->bi_next;
 868                 cur->bi_next = NULL;
 869                 if (uptodate)
 870                         set_bit(BIO_UPTODATE, &cur->bi_flags);
 871                 bio_endio(cur, err);
 872                 cur = next;
 873         }
 874 }
 875
 876 /*
 877  * end io function used by finish_rmw.  When we finally
 878  * get here, we've written a full stripe
 879  */
 880 static void raid_write_end_io(struct bio *bio, int err)
 881 {
 882         struct btrfs_raid_bio *rbio = bio->bi_private;
 883
 884         if (err)
 885                 fail_bio_stripe(rbio, bio);
 886
 887         bio_put(bio);
 888
 889         if (!atomic_dec_and_test(&rbio->stripes_pending))
 890                 return;
 891
 892         err = 0;
 893
 894         /* OK, we have read all the stripes we need to. */
 895         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
 896                 err = -EIO;
 897
 898         rbio_orig_end_io(rbio, err, 0);
 899         return;
 900 }
 901
 902 /*
 903  * the read/modify/write code wants to use the original bio for
 904  * any pages it included, and then use the rbio for everything
 905  * else.  This function decides if a given index (stripe number)
 906  * and page number in that stripe fall inside the original bio
 907  * or the rbio.
 908  *
 909  * if you set bio_list_only, you'll get a NULL back for any ranges
 910  * that are outside the bio_list
 911  *
 912  * This doesn't take any refs on anything, you get a bare page pointer
 913  * and the caller must bump refs as required.
 914  *
 915  * You must call index_rbio_pages once before you can trust
 916  * the answers from this function.
 917  */
 918 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
 919                                  int index, int pagenr, int bio_list_only)
 920 {
 921         int chunk_page;
 922         struct page *p = NULL;
 923
 924         chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
 925
 926         spin_lock_irq(&rbio->bio_list_lock);
 927         p = rbio->bio_pages[chunk_page];
 928         spin_unlock_irq(&rbio->bio_list_lock);
 929
 930         if (p || bio_list_only)
 931                 return p;
 932
 933         return rbio->stripe_pages[chunk_page];
 934 }
 935
 936 /*
 937  * number of pages we need for the entire stripe across all the
 938  * drives
 939  */
 940 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 941 {
 942         unsigned long nr = stripe_len * nr_stripes;
 943         return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
 944 }
 945
 946 /*
 947  * allocation and initial setup for the btrfs_raid_bio.  Not
 948  * this does not allocate any pages for rbio->pages.
 949  */
 950 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 951                           struct btrfs_bio *bbio, u64 *raid_map,
 952                           u64 stripe_len)
 953 {
 954         struct btrfs_raid_bio *rbio;
 955         int nr_data = 0;
 956         int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
 957         void *p;
 958
 959         rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
 960                         GFP_NOFS);
 961         if (!rbio)
 962                 return ERR_PTR(-ENOMEM);
 963
 964         bio_list_init(&rbio->bio_list);
 965         INIT_LIST_HEAD(&rbio->plug_list);
 966         spin_lock_init(&rbio->bio_list_lock);
 967         INIT_LIST_HEAD(&rbio->stripe_cache);
 968         INIT_LIST_HEAD(&rbio->hash_list);
 969         rbio->bbio = bbio;
 970         rbio->raid_map = raid_map;
 971         rbio->fs_info = root->fs_info;
 972         rbio->stripe_len = stripe_len;
 973         rbio->nr_pages = num_pages;
 974         rbio->faila = -1;
 975         rbio->failb = -1;
 976         atomic_set(&rbio->refs, 1);
 977         atomic_set(&rbio->error, 0);
 978         atomic_set(&rbio->stripes_pending, 0);
 979
 980         /*
 981          * the stripe_pages and bio_pages array point to the extra
 982          * memory we allocated past the end of the rbio
 983          */
 984         p = rbio + 1;
 985         rbio->stripe_pages = p;
 986         rbio->bio_pages = p + sizeof(struct page *) * num_pages;
 987
 988         if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
 989                 nr_data = bbio->num_stripes - 2;
 990         else
 991                 nr_data = bbio->num_stripes - 1;
 992
 993         rbio->nr_data = nr_data;
 994         return rbio;
 995 }
 996
 997 /* allocate pages for all the stripes in the bio, including parity */
 998 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 999 {
1000         int i;
1001         struct page *page;
1002
1003         for (i = 0; i < rbio->nr_pages; i++) {
1004                 if (rbio->stripe_pages[i])
1005                         continue;
1006                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1007                 if (!page)
1008                         return -ENOMEM;
1009                 rbio->stripe_pages[i] = page;
1010                 ClearPageUptodate(page);
1011         }
1012         return 0;
1013 }
1014
1015 /* allocate pages for just the p/q stripes */
1016 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1017 {
1018         int i;
1019         struct page *page;
1020
1021         i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
1022
1023         for (; i < rbio->nr_pages; i++) {
1024                 if (rbio->stripe_pages[i])
1025                         continue;
1026                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1027                 if (!page)
1028                         return -ENOMEM;
1029                 rbio->stripe_pages[i] = page;
1030         }
1031         return 0;
1032 }
1033
1034 /*
1035  * add a single page from a specific stripe into our list of bios for IO
1036  * this will try to merge into existing bios if possible, and returns
1037  * zero if all went well.
1038  */
1039 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1040                             struct bio_list *bio_list,
1041                             struct page *page,
1042                             int stripe_nr,
1043                             unsigned long page_index,
1044                             unsigned long bio_max_len)
1045 {
1046         struct bio *last = bio_list->tail;
1047         u64 last_end = 0;
1048         int ret;
1049         struct bio *bio;
1050         struct btrfs_bio_stripe *stripe;
1051         u64 disk_start;
1052
1053         stripe = &rbio->bbio->stripes[stripe_nr];
1054         disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1055
1056         /* if the device is missing, just fail this stripe */
1057         if (!stripe->dev->bdev)
1058                 return fail_rbio_index(rbio, stripe_nr);
1059
1060         /* see if we can add this page onto our existing bio */
1061         if (last) {
1062                 last_end = (u64)last->bi_iter.bi_sector << 9;
1063                 last_end += last->bi_iter.bi_size;
1064
1065                 /*
1066                  * we can't merge these if they are from different
1067                  * devices or if they are not contiguous
1068                  */
1069                 if (last_end == disk_start && stripe->dev->bdev &&
1070                     test_bit(BIO_UPTODATE, &last->bi_flags) &&
1071                     last->bi_bdev == stripe->dev->bdev) {
1072                         ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1073                         if (ret == PAGE_CACHE_SIZE)
1074                                 return 0;
1075                 }
1076         }
1077
1078         /* put a new bio on the list */
1079         bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1080         if (!bio)
1081                 return -ENOMEM;
1082
1083         bio->bi_iter.bi_size = 0;
1084         bio->bi_bdev = stripe->dev->bdev;
1085         bio->bi_iter.bi_sector = disk_start >> 9;
1086         set_bit(BIO_UPTODATE, &bio->bi_flags);
1087
1088         bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1089         bio_list_add(bio_list, bio);
1090         return 0;
1091 }
1092
1093 /*
1094  * while we're doing the read/modify/write cycle, we could
1095  * have errors in reading pages off the disk.  This checks
1096  * for errors and if we're not able to read the page it'll
1097  * trigger parity reconstruction.  The rmw will be finished
1098  * after we've reconstructed the failed stripes
1099  */
1100 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1101 {
1102         if (rbio->faila >= 0 || rbio->failb >= 0) {
1103                 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1104                 __raid56_parity_recover(rbio);
1105         } else {
1106                 finish_rmw(rbio);
1107         }
1108 }
1109
1110 /*
1111  * these are just the pages from the rbio array, not from anything
1112  * the FS sent down to us
1113  */
1114 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1115 {
1116         int index;
1117         index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1118         index += page;
1119         return rbio->stripe_pages[index];
1120 }
1121
1122 /*
1123  * helper function to walk our bio list and populate the bio_pages array with
1124  * the result.  This seems expensive, but it is faster than constantly
1125  * searching through the bio list as we setup the IO in finish_rmw or stripe
1126  * reconstruction.
1127  *
1128  * This must be called before you trust the answers from page_in_rbio
1129  */
1130 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1131 {
1132         struct bio *bio;
1133         u64 start;
1134         unsigned long stripe_offset;
1135         unsigned long page_index;
1136         struct page *p;
1137         int i;
1138
1139         spin_lock_irq(&rbio->bio_list_lock);
1140         bio_list_for_each(bio, &rbio->bio_list) {
1141                 start = (u64)bio->bi_iter.bi_sector << 9;
1142                 stripe_offset = start - rbio->raid_map[0];
1143                 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1144
1145                 for (i = 0; i < bio->bi_vcnt; i++) {
1146                         p = bio->bi_io_vec[i].bv_page;
1147                         rbio->bio_pages[page_index + i] = p;
1148                 }
1149         }
1150         spin_unlock_irq(&rbio->bio_list_lock);
1151 }
1152
1153 /*
1154  * this is called from one of two situations.  We either
1155  * have a full stripe from the higher layers, or we've read all
1156  * the missing bits off disk.
1157  *
1158  * This will calculate the parity and then send down any
1159  * changed blocks.
1160  */
1161 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1162 {
1163         struct btrfs_bio *bbio = rbio->bbio;
1164         void *pointers[bbio->num_stripes];
1165         int stripe_len = rbio->stripe_len;
1166         int nr_data = rbio->nr_data;
1167         int stripe;
1168         int pagenr;
1169         int p_stripe = -1;
1170         int q_stripe = -1;
1171         struct bio_list bio_list;
1172         struct bio *bio;
1173         int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1174         int ret;
1175
1176         bio_list_init(&bio_list);
1177
1178         if (bbio->num_stripes - rbio->nr_data == 1) {
1179                 p_stripe = bbio->num_stripes - 1;
1180         } else if (bbio->num_stripes - rbio->nr_data == 2) {
1181                 p_stripe = bbio->num_stripes - 2;
1182                 q_stripe = bbio->num_stripes - 1;
1183         } else {
1184                 BUG();
1185         }
1186
1187         /* at this point we either have a full stripe,
1188          * or we've read the full stripe from the drive.
1189          * recalculate the parity and write the new results.
1190          *
1191          * We're not allowed to add any new bios to the
1192          * bio list here, anyone else that wants to
1193          * change this stripe needs to do their own rmw.
1194          */
1195         spin_lock_irq(&rbio->bio_list_lock);
1196         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1197         spin_unlock_irq(&rbio->bio_list_lock);
1198
1199         atomic_set(&rbio->error, 0);
1200
1201         /*
1202          * now that we've set rmw_locked, run through the
1203          * bio list one last time and map the page pointers
1204          *
1205          * We don't cache full rbios because we're assuming
1206          * the higher layers are unlikely to use this area of
1207          * the disk again soon.  If they do use it again,
1208          * hopefully they will send another full bio.
1209          */
1210         index_rbio_pages(rbio);
1211         if (!rbio_is_full(rbio))
1212                 cache_rbio_pages(rbio);
1213         else
1214                 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1215
1216         for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1217                 struct page *p;
1218                 /* first collect one page from each data stripe */
1219                 for (stripe = 0; stripe < nr_data; stripe++) {
1220                         p = page_in_rbio(rbio, stripe, pagenr, 0);
1221                         pointers[stripe] = kmap(p);
1222                 }
1223
1224                 /* then add the parity stripe */
1225                 p = rbio_pstripe_page(rbio, pagenr);
1226                 SetPageUptodate(p);
1227                 pointers[stripe++] = kmap(p);
1228
1229                 if (q_stripe != -1) {
1230
1231                         /*
1232                          * raid6, add the qstripe and call the
1233                          * library function to fill in our p/q
1234                          */
1235                         p = rbio_qstripe_page(rbio, pagenr);
1236                         SetPageUptodate(p);
1237                         pointers[stripe++] = kmap(p);
1238
1239                         raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1240                                                 pointers);
1241                 } else {
1242                         /* raid5 */
1243                         memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1244                         run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1245                 }
1246
1247
1248                 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1249                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1250         }
1251
1252         /*
1253          * time to start writing.  Make bios for everything from the
1254          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1255          * everything else.
1256          */
1257         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1258                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1259                         struct page *page;
1260                         if (stripe < rbio->nr_data) {
1261                                 page = page_in_rbio(rbio, stripe, pagenr, 1);
1262                                 if (!page)
1263                                         continue;
1264                         } else {
1265                                page = rbio_stripe_page(rbio, stripe, pagenr);
1266                         }
1267
1268                         ret = rbio_add_io_page(rbio, &bio_list,
1269                                        page, stripe, pagenr, rbio->stripe_len);
1270                         if (ret)
1271                                 goto cleanup;
1272                 }
1273         }
1274
1275         atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1276         BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1277
1278         while (1) {
1279                 bio = bio_list_pop(&bio_list);
1280                 if (!bio)
1281                         break;
1282
1283                 bio->bi_private = rbio;
1284                 bio->bi_end_io = raid_write_end_io;
1285                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1286                 submit_bio(WRITE, bio);
1287         }
1288         return;
1289
1290 cleanup:
1291         rbio_orig_end_io(rbio, -EIO, 0);
1292 }
1293
1294 /*
1295  * helper to find the stripe number for a given bio.  Used to figure out which
1296  * stripe has failed.  This expects the bio to correspond to a physical disk,
1297  * so it looks up based on physical sector numbers.
1298  */
1299 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1300                            struct bio *bio)
1301 {
1302         u64 physical = bio->bi_iter.bi_sector;
1303         u64 stripe_start;
1304         int i;
1305         struct btrfs_bio_stripe *stripe;
1306
1307         physical <<= 9;
1308
1309         for (i = 0; i < rbio->bbio->num_stripes; i++) {
1310                 stripe = &rbio->bbio->stripes[i];
1311                 stripe_start = stripe->physical;
1312                 if (physical >= stripe_start &&
1313                     physical < stripe_start + rbio->stripe_len) {
1314                         return i;
1315                 }
1316         }
1317         return -1;
1318 }
1319
1320 /*
1321  * helper to find the stripe number for a given
1322  * bio (before mapping).  Used to figure out which stripe has
1323  * failed.  This looks up based on logical block numbers.
1324  */
1325 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1326                                    struct bio *bio)
1327 {
1328         u64 logical = bio->bi_iter.bi_sector;
1329         u64 stripe_start;
1330         int i;
1331
1332         logical <<= 9;
1333
1334         for (i = 0; i < rbio->nr_data; i++) {
1335                 stripe_start = rbio->raid_map[i];
1336                 if (logical >= stripe_start &&
1337                     logical < stripe_start + rbio->stripe_len) {
1338                         return i;
1339                 }
1340         }
1341         return -1;
1342 }
1343
1344 /*
1345  * returns -EIO if we had too many failures
1346  */
1347 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1348 {
1349         unsigned long flags;
1350         int ret = 0;
1351
1352         spin_lock_irqsave(&rbio->bio_list_lock, flags);
1353
1354         /* we already know this stripe is bad, move on */
1355         if (rbio->faila == failed || rbio->failb == failed)
1356                 goto out;
1357
1358         if (rbio->faila == -1) {
1359                 /* first failure on this rbio */
1360                 rbio->faila = failed;
1361                 atomic_inc(&rbio->error);
1362         } else if (rbio->failb == -1) {
1363                 /* second failure on this rbio */
1364                 rbio->failb = failed;
1365                 atomic_inc(&rbio->error);
1366         } else {
1367                 ret = -EIO;
1368         }
1369 out:
1370         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1371
1372         return ret;
1373 }
1374
1375 /*
1376  * helper to fail a stripe based on a physical disk
1377  * bio.
1378  */
1379 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1380                            struct bio *bio)
1381 {
1382         int failed = find_bio_stripe(rbio, bio);
1383
1384         if (failed < 0)
1385                 return -EIO;
1386
1387         return fail_rbio_index(rbio, failed);
1388 }
1389
1390 /*
1391  * this sets each page in the bio uptodate.  It should only be used on private
1392  * rbio pages, nothing that comes in from the higher layers
1393  */
1394 static void set_bio_pages_uptodate(struct bio *bio)
1395 {
1396         int i;
1397         struct page *p;
1398
1399         for (i = 0; i < bio->bi_vcnt; i++) {
1400                 p = bio->bi_io_vec[i].bv_page;
1401                 SetPageUptodate(p);
1402         }
1403 }
1404
1405 /*
1406  * end io for the read phase of the rmw cycle.  All the bios here are physical
1407  * stripe bios we've read from the disk so we can recalculate the parity of the
1408  * stripe.
1409  *
1410  * This will usually kick off finish_rmw once all the bios are read in, but it
1411  * may trigger parity reconstruction if we had any errors along the way
1412  */
1413 static void raid_rmw_end_io(struct bio *bio, int err)
1414 {
1415         struct btrfs_raid_bio *rbio = bio->bi_private;
1416
1417         if (err)
1418                 fail_bio_stripe(rbio, bio);
1419         else
1420                 set_bio_pages_uptodate(bio);
1421
1422         bio_put(bio);
1423
1424         if (!atomic_dec_and_test(&rbio->stripes_pending))
1425                 return;
1426
1427         err = 0;
1428         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1429                 goto cleanup;
1430
1431         /*
1432          * this will normally call finish_rmw to start our write
1433          * but if there are any failed stripes we'll reconstruct
1434          * from parity first
1435          */
1436         validate_rbio_for_rmw(rbio);
1437         return;
1438
1439 cleanup:
1440
1441         rbio_orig_end_io(rbio, -EIO, 0);
1442 }
1443
1444 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1445 {
1446         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1447                         rmw_work, NULL, NULL);
1448
1449         btrfs_queue_work(rbio->fs_info->rmw_workers,
1450                          &rbio->work);
1451 }
1452
1453 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1454 {
1455         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1456                         read_rebuild_work, NULL, NULL);
1457
1458         btrfs_queue_work(rbio->fs_info->rmw_workers,
1459                          &rbio->work);
1460 }
1461
1462 /*
1463  * the stripe must be locked by the caller.  It will
1464  * unlock after all the writes are done
1465  */
1466 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1467 {
1468         int bios_to_read = 0;
1469         struct bio_list bio_list;
1470         int ret;
1471         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1472         int pagenr;
1473         int stripe;
1474         struct bio *bio;
1475
1476         bio_list_init(&bio_list);
1477
1478         ret = alloc_rbio_pages(rbio);
1479         if (ret)
1480                 goto cleanup;
1481
1482         index_rbio_pages(rbio);
1483
1484         atomic_set(&rbio->error, 0);
1485         /*
1486          * build a list of bios to read all the missing parts of this
1487          * stripe
1488          */
1489         for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1490                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1491                         struct page *page;
1492                         /*
1493                          * we want to find all the pages missing from
1494                          * the rbio and read them from the disk.  If
1495                          * page_in_rbio finds a page in the bio list
1496                          * we don't need to read it off the stripe.
1497                          */
1498                         page = page_in_rbio(rbio, stripe, pagenr, 1);
1499                         if (page)
1500                                 continue;
1501
1502                         page = rbio_stripe_page(rbio, stripe, pagenr);
1503                         /*
1504                          * the bio cache may have handed us an uptodate
1505                          * page.  If so, be happy and use it
1506                          */
1507                         if (PageUptodate(page))
1508                                 continue;
1509
1510                         ret = rbio_add_io_page(rbio, &bio_list, page,
1511                                        stripe, pagenr, rbio->stripe_len);
1512                         if (ret)
1513                                 goto cleanup;
1514                 }
1515         }
1516
1517         bios_to_read = bio_list_size(&bio_list);
1518         if (!bios_to_read) {
1519                 /*
1520                  * this can happen if others have merged with
1521                  * us, it means there is nothing left to read.
1522                  * But if there are missing devices it may not be
1523                  * safe to do the full stripe write yet.
1524                  */
1525                 goto finish;
1526         }
1527
1528         /*
1529          * the bbio may be freed once we submit the last bio.  Make sure
1530          * not to touch it after that
1531          */
1532         atomic_set(&rbio->stripes_pending, bios_to_read);
1533         while (1) {
1534                 bio = bio_list_pop(&bio_list);
1535                 if (!bio)
1536                         break;
1537
1538                 bio->bi_private = rbio;
1539                 bio->bi_end_io = raid_rmw_end_io;
1540
1541                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1542                                     BTRFS_WQ_ENDIO_RAID56);
1543
1544                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1545                 submit_bio(READ, bio);
1546         }
1547         /* the actual write will happen once the reads are done */
1548         return 0;
1549
1550 cleanup:
1551         rbio_orig_end_io(rbio, -EIO, 0);
1552         return -EIO;
1553
1554 finish:
1555         validate_rbio_for_rmw(rbio);
1556         return 0;
1557 }
1558
1559 /*
1560  * if the upper layers pass in a full stripe, we thank them by only allocating
1561  * enough pages to hold the parity, and sending it all down quickly.
1562  */
1563 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1564 {
1565         int ret;
1566
1567         ret = alloc_rbio_parity_pages(rbio);
1568         if (ret) {
1569                 __free_raid_bio(rbio);
1570                 return ret;
1571         }
1572
1573         ret = lock_stripe_add(rbio);
1574         if (ret == 0)
1575                 finish_rmw(rbio);
1576         return 0;
1577 }
1578
1579 /*
1580  * partial stripe writes get handed over to async helpers.
1581  * We're really hoping to merge a few more writes into this
1582  * rbio before calculating new parity
1583  */
1584 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1585 {
1586         int ret;
1587
1588         ret = lock_stripe_add(rbio);
1589         if (ret == 0)
1590                 async_rmw_stripe(rbio);
1591         return 0;
1592 }
1593
1594 /*
1595  * sometimes while we were reading from the drive to
1596  * recalculate parity, enough new bios come into create
1597  * a full stripe.  So we do a check here to see if we can
1598  * go directly to finish_rmw
1599  */
1600 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1601 {
1602         /* head off into rmw land if we don't have a full stripe */
1603         if (!rbio_is_full(rbio))
1604                 return partial_stripe_write(rbio);
1605         return full_stripe_write(rbio);
1606 }
1607
1608 /*
1609  * We use plugging call backs to collect full stripes.
1610  * Any time we get a partial stripe write while plugged
1611  * we collect it into a list.  When the unplug comes down,
1612  * we sort the list by logical block number and merge
1613  * everything we can into the same rbios
1614  */
1615 struct btrfs_plug_cb {
1616         struct blk_plug_cb cb;
1617         struct btrfs_fs_info *info;
1618         struct list_head rbio_list;
1619         struct btrfs_work work;
1620 };
1621
1622 /*
1623  * rbios on the plug list are sorted for easier merging.
1624  */
1625 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1626 {
1627         struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1628                                                  plug_list);
1629         struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1630                                                  plug_list);
1631         u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1632         u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1633
1634         if (a_sector < b_sector)
1635                 return -1;
1636         if (a_sector > b_sector)
1637                 return 1;
1638         return 0;
1639 }
1640
1641 static void run_plug(struct btrfs_plug_cb *plug)
1642 {
1643         struct btrfs_raid_bio *cur;
1644         struct btrfs_raid_bio *last = NULL;
1645
1646         /*
1647          * sort our plug list then try to merge
1648          * everything we can in hopes of creating full
1649          * stripes.
1650          */
1651         list_sort(NULL, &plug->rbio_list, plug_cmp);
1652         while (!list_empty(&plug->rbio_list)) {
1653                 cur = list_entry(plug->rbio_list.next,
1654                                  struct btrfs_raid_bio, plug_list);
1655                 list_del_init(&cur->plug_list);
1656
1657                 if (rbio_is_full(cur)) {
1658                         /* we have a full stripe, send it down */
1659                         full_stripe_write(cur);
1660                         continue;
1661                 }
1662                 if (last) {
1663                         if (rbio_can_merge(last, cur)) {
1664                                 merge_rbio(last, cur);
1665                                 __free_raid_bio(cur);
1666                                 continue;
1667
1668                         }
1669                         __raid56_parity_write(last);
1670                 }
1671                 last = cur;
1672         }
1673         if (last) {
1674                 __raid56_parity_write(last);
1675         }
1676         kfree(plug);
1677 }
1678
1679 /*
1680  * if the unplug comes from schedule, we have to push the
1681  * work off to a helper thread
1682  */
1683 static void unplug_work(struct btrfs_work *work)
1684 {
1685         struct btrfs_plug_cb *plug;
1686         plug = container_of(work, struct btrfs_plug_cb, work);
1687         run_plug(plug);
1688 }
1689
1690 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1691 {
1692         struct btrfs_plug_cb *plug;
1693         plug = container_of(cb, struct btrfs_plug_cb, cb);
1694
1695         if (from_schedule) {
1696                 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1697                                 unplug_work, NULL, NULL);
1698                 btrfs_queue_work(plug->info->rmw_workers,
1699                                  &plug->work);
1700                 return;
1701         }
1702         run_plug(plug);
1703 }
1704
1705 /*
1706  * our main entry point for writes from the rest of the FS.
1707  */
1708 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1709                         struct btrfs_bio *bbio, u64 *raid_map,
1710                         u64 stripe_len)
1711 {
1712         struct btrfs_raid_bio *rbio;
1713         struct btrfs_plug_cb *plug = NULL;
1714         struct blk_plug_cb *cb;
1715
1716         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1717         if (IS_ERR(rbio)) {
1718                 __free_bbio_and_raid_map(bbio, raid_map, 1);
1719                 return PTR_ERR(rbio);
1720         }
1721         bio_list_add(&rbio->bio_list, bio);
1722         rbio->bio_list_bytes = bio->bi_iter.bi_size;
1723
1724         /*
1725          * don't plug on full rbios, just get them out the door
1726          * as quickly as we can
1727          */
1728         if (rbio_is_full(rbio))
1729                 return full_stripe_write(rbio);
1730
1731         cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1732                                sizeof(*plug));
1733         if (cb) {
1734                 plug = container_of(cb, struct btrfs_plug_cb, cb);
1735                 if (!plug->info) {
1736                         plug->info = root->fs_info;
1737                         INIT_LIST_HEAD(&plug->rbio_list);
1738                 }
1739                 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1740         } else {
1741                 return __raid56_parity_write(rbio);
1742         }
1743         return 0;
1744 }
1745
1746 /*
1747  * all parity reconstruction happens here.  We've read in everything
1748  * we can find from the drives and this does the heavy lifting of
1749  * sorting the good from the bad.
1750  */
1751 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1752 {
1753         int pagenr, stripe;
1754         void **pointers;
1755         int faila = -1, failb = -1;
1756         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1757         struct page *page;
1758         int err;
1759         int i;
1760
1761         pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1762                            GFP_NOFS);
1763         if (!pointers) {
1764                 err = -ENOMEM;
1765                 goto cleanup_io;
1766         }
1767
1768         faila = rbio->faila;
1769         failb = rbio->failb;
1770
1771         if (rbio->read_rebuild) {
1772                 spin_lock_irq(&rbio->bio_list_lock);
1773                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1774                 spin_unlock_irq(&rbio->bio_list_lock);
1775         }
1776
1777         index_rbio_pages(rbio);
1778
1779         for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1780                 /* setup our array of pointers with pages
1781                  * from each stripe
1782                  */
1783                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1784                         /*
1785                          * if we're rebuilding a read, we have to use
1786                          * pages from the bio list
1787                          */
1788                         if (rbio->read_rebuild &&
1789                             (stripe == faila || stripe == failb)) {
1790                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1791                         } else {
1792                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1793                         }
1794                         pointers[stripe] = kmap(page);
1795                 }
1796
1797                 /* all raid6 handling here */
1798                 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1799                     RAID6_Q_STRIPE) {
1800
1801                         /*
1802                          * single failure, rebuild from parity raid5
1803                          * style
1804                          */
1805                         if (failb < 0) {
1806                                 if (faila == rbio->nr_data) {
1807                                         /*
1808                                          * Just the P stripe has failed, without
1809                                          * a bad data or Q stripe.
1810                                          * TODO, we should redo the xor here.
1811                                          */
1812                                         err = -EIO;
1813                                         goto cleanup;
1814                                 }
1815                                 /*
1816                                  * a single failure in raid6 is rebuilt
1817                                  * in the pstripe code below
1818                                  */
1819                                 goto pstripe;
1820                         }
1821
1822                         /* make sure our ps and qs are in order */
1823                         if (faila > failb) {
1824                                 int tmp = failb;
1825                                 failb = faila;
1826                                 faila = tmp;
1827                         }
1828
1829                         /* if the q stripe is failed, do a pstripe reconstruction
1830                          * from the xors.
1831                          * If both the q stripe and the P stripe are failed, we're
1832                          * here due to a crc mismatch and we can't give them the
1833                          * data they want
1834                          */
1835                         if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1836                                 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1837                                         err = -EIO;
1838                                         goto cleanup;
1839                                 }
1840                                 /*
1841                                  * otherwise we have one bad data stripe and
1842                                  * a good P stripe.  raid5!
1843                                  */
1844                                 goto pstripe;
1845                         }
1846
1847                         if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1848                                 raid6_datap_recov(rbio->bbio->num_stripes,
1849                                                   PAGE_SIZE, faila, pointers);
1850                         } else {
1851                                 raid6_2data_recov(rbio->bbio->num_stripes,
1852                                                   PAGE_SIZE, faila, failb,
1853                                                   pointers);
1854                         }
1855                 } else {
1856                         void *p;
1857
1858                         /* rebuild from P stripe here (raid5 or raid6) */
1859                         BUG_ON(failb != -1);
1860 pstripe:
1861                         /* Copy parity block into failed block to start with */
1862                         memcpy(pointers[faila],
1863                                pointers[rbio->nr_data],
1864                                PAGE_CACHE_SIZE);
1865
1866                         /* rearrange the pointer array */
1867                         p = pointers[faila];
1868                         for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1869                                 pointers[stripe] = pointers[stripe + 1];
1870                         pointers[rbio->nr_data - 1] = p;
1871
1872                         /* xor in the rest */
1873                         run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1874                 }
1875                 /* if we're doing this rebuild as part of an rmw, go through
1876                  * and set all of our private rbio pages in the
1877                  * failed stripes as uptodate.  This way finish_rmw will
1878                  * know they can be trusted.  If this was a read reconstruction,
1879                  * other endio functions will fiddle the uptodate bits
1880                  */
1881                 if (!rbio->read_rebuild) {
1882                         for (i = 0;  i < nr_pages; i++) {
1883                                 if (faila != -1) {
1884                                         page = rbio_stripe_page(rbio, faila, i);
1885                                         SetPageUptodate(page);
1886                                 }
1887                                 if (failb != -1) {
1888                                         page = rbio_stripe_page(rbio, failb, i);
1889                                         SetPageUptodate(page);
1890                                 }
1891                         }
1892                 }
1893                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1894                         /*
1895                          * if we're rebuilding a read, we have to use
1896                          * pages from the bio list
1897                          */
1898                         if (rbio->read_rebuild &&
1899                             (stripe == faila || stripe == failb)) {
1900                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1901                         } else {
1902                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1903                         }
1904                         kunmap(page);
1905                 }
1906         }
1907
1908         err = 0;
1909 cleanup:
1910         kfree(pointers);
1911
1912 cleanup_io:
1913
1914         if (rbio->read_rebuild) {
1915                 if (err == 0 &&
1916                     !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
1917                         cache_rbio_pages(rbio);
1918                 else
1919                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1920
1921                 rbio_orig_end_io(rbio, err, err == 0);
1922         } else if (err == 0) {
1923                 rbio->faila = -1;
1924                 rbio->failb = -1;
1925                 finish_rmw(rbio);
1926         } else {
1927                 rbio_orig_end_io(rbio, err, 0);
1928         }
1929 }
1930
1931 /*
1932  * This is called only for stripes we've read from disk to
1933  * reconstruct the parity.
1934  */
1935 static void raid_recover_end_io(struct bio *bio, int err)
1936 {
1937         struct btrfs_raid_bio *rbio = bio->bi_private;
1938
1939         /*
1940          * we only read stripe pages off the disk, set them
1941          * up to date if there were no errors
1942          */
1943         if (err)
1944                 fail_bio_stripe(rbio, bio);
1945         else
1946                 set_bio_pages_uptodate(bio);
1947         bio_put(bio);
1948
1949         if (!atomic_dec_and_test(&rbio->stripes_pending))
1950                 return;
1951
1952         if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1953                 rbio_orig_end_io(rbio, -EIO, 0);
1954         else
1955                 __raid_recover_end_io(rbio);
1956 }
1957
1958 /*
1959  * reads everything we need off the disk to reconstruct
1960  * the parity. endio handlers trigger final reconstruction
1961  * when the IO is done.
1962  *
1963  * This is used both for reads from the higher layers and for
1964  * parity construction required to finish a rmw cycle.
1965  */
1966 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1967 {
1968         int bios_to_read = 0;
1969         struct btrfs_bio *bbio = rbio->bbio;
1970         struct bio_list bio_list;
1971         int ret;
1972         int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1973         int pagenr;
1974         int stripe;
1975         struct bio *bio;
1976
1977         bio_list_init(&bio_list);
1978
1979         ret = alloc_rbio_pages(rbio);
1980         if (ret)
1981                 goto cleanup;
1982
1983         atomic_set(&rbio->error, 0);
1984
1985         /*
1986          * read everything that hasn't failed.  Thanks to the
1987          * stripe cache, it is possible that some or all of these
1988          * pages are going to be uptodate.
1989          */
1990         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1991                 if (rbio->faila == stripe || rbio->failb == stripe) {
1992                         atomic_inc(&rbio->error);
1993                         continue;
1994                 }
1995
1996                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1997                         struct page *p;
1998
1999                         /*
2000                          * the rmw code may have already read this
2001                          * page in
2002                          */
2003                         p = rbio_stripe_page(rbio, stripe, pagenr);
2004                         if (PageUptodate(p))
2005                                 continue;
2006
2007                         ret = rbio_add_io_page(rbio, &bio_list,
2008                                        rbio_stripe_page(rbio, stripe, pagenr),
2009                                        stripe, pagenr, rbio->stripe_len);
2010                         if (ret < 0)
2011                                 goto cleanup;
2012                 }
2013         }
2014
2015         bios_to_read = bio_list_size(&bio_list);
2016         if (!bios_to_read) {
2017                 /*
2018                  * we might have no bios to read just because the pages
2019                  * were up to date, or we might have no bios to read because
2020                  * the devices were gone.
2021                  */
2022                 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2023                         __raid_recover_end_io(rbio);
2024                         goto out;
2025                 } else {
2026                         goto cleanup;
2027                 }
2028         }
2029
2030         /*
2031          * the bbio may be freed once we submit the last bio.  Make sure
2032          * not to touch it after that
2033          */
2034         atomic_set(&rbio->stripes_pending, bios_to_read);
2035         while (1) {
2036                 bio = bio_list_pop(&bio_list);
2037                 if (!bio)
2038                         break;
2039
2040                 bio->bi_private = rbio;
2041                 bio->bi_end_io = raid_recover_end_io;
2042
2043                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2044                                     BTRFS_WQ_ENDIO_RAID56);
2045
2046                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2047                 submit_bio(READ, bio);
2048         }
2049 out:
2050         return 0;
2051
2052 cleanup:
2053         if (rbio->read_rebuild)
2054                 rbio_orig_end_io(rbio, -EIO, 0);
2055         return -EIO;
2056 }
2057
2058 /*
2059  * the main entry point for reads from the higher layers.  This
2060  * is really only called when the normal read path had a failure,
2061  * so we assume the bio they send down corresponds to a failed part
2062  * of the drive.
2063  */
2064 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2065                           struct btrfs_bio *bbio, u64 *raid_map,
2066                           u64 stripe_len, int mirror_num, int hold_bbio)
2067 {
2068         struct btrfs_raid_bio *rbio;
2069         int ret;
2070
2071         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2072         if (IS_ERR(rbio)) {
2073                 __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
2074                 return PTR_ERR(rbio);
2075         }
2076
2077         if (hold_bbio)
2078                 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2079         rbio->read_rebuild = 1;
2080         bio_list_add(&rbio->bio_list, bio);
2081         rbio->bio_list_bytes = bio->bi_iter.bi_size;
2082
2083         rbio->faila = find_logical_bio_stripe(rbio, bio);
2084         if (rbio->faila == -1) {
2085                 BUG();
2086                 __free_bbio_and_raid_map(bbio, raid_map, !hold_bbio);
2087                 kfree(rbio);
2088                 return -EIO;
2089         }
2090
2091         /*
2092          * reconstruct from the q stripe if they are
2093          * asking for mirror 3
2094          */
2095         if (mirror_num == 3)
2096                 rbio->failb = bbio->num_stripes - 2;
2097
2098         ret = lock_stripe_add(rbio);
2099
2100         /*
2101          * __raid56_parity_recover will end the bio with
2102          * any errors it hits.  We don't want to return
2103          * its error value up the stack because our caller
2104          * will end up calling bio_endio with any nonzero
2105          * return
2106          */
2107         if (ret == 0)
2108                 __raid56_parity_recover(rbio);
2109         /*
2110          * our rbio has been added to the list of
2111          * rbios that will be handled after the
2112          * currently lock owner is done
2113          */
2114         return 0;
2115
2116 }
2117
2118 static void rmw_work(struct btrfs_work *work)
2119 {
2120         struct btrfs_raid_bio *rbio;
2121
2122         rbio = container_of(work, struct btrfs_raid_bio, work);
2123         raid56_rmw_stripe(rbio);
2124 }
2125
2126 static void read_rebuild_work(struct btrfs_work *work)
2127 {
2128         struct btrfs_raid_bio *rbio;
2129
2130         rbio = container_of(work, struct btrfs_raid_bio, work);
2131         __raid56_parity_recover(rbio);
2132 }