fs/btrfs/volumes.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18 #include <linux/sched.h>
  19 #include <linux/bio.h>
  20 #include <linux/slab.h>
  21 #include <linux/buffer_head.h>
  22 #include <linux/blkdev.h>
  23 #include <linux/iocontext.h>
  24 #include <linux/capability.h>
  25 #include <linux/ratelimit.h>
  26 #include <linux/kthread.h>
  27 #include <linux/raid/pq.h>
  28 #include <linux/semaphore.h>
  29 #include <linux/uuid.h>
  30 #include <asm/div64.h>
  31 #include "ctree.h"
  32 #include "extent_map.h"
  33 #include "disk-io.h"
  34 #include "transaction.h"
  35 #include "print-tree.h"
  36 #include "volumes.h"
  37 #include "raid56.h"
  38 #include "async-thread.h"
  39 #include "check-integrity.h"
  40 #include "rcu-string.h"
  41 #include "math.h"
  42 #include "dev-replace.h"
  43 #include "sysfs.h"
  44
  45 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
  46         [BTRFS_RAID_RAID10] = {
  47                 .sub_stripes    = 2,
  48                 .dev_stripes    = 1,
  49                 .devs_max       = 0,    /* 0 == as many as possible */
  50                 .devs_min       = 4,
  51                 .tolerated_failures = 1,
  52                 .devs_increment = 2,
  53                 .ncopies        = 2,
  54         },
  55         [BTRFS_RAID_RAID1] = {
  56                 .sub_stripes    = 1,
  57                 .dev_stripes    = 1,
  58                 .devs_max       = 2,
  59                 .devs_min       = 2,
  60                 .tolerated_failures = 1,
  61                 .devs_increment = 2,
  62                 .ncopies        = 2,
  63         },
  64         [BTRFS_RAID_DUP] = {
  65                 .sub_stripes    = 1,
  66                 .dev_stripes    = 2,
  67                 .devs_max       = 1,
  68                 .devs_min       = 1,
  69                 .tolerated_failures = 0,
  70                 .devs_increment = 1,
  71                 .ncopies        = 2,
  72         },
  73         [BTRFS_RAID_RAID0] = {
  74                 .sub_stripes    = 1,
  75                 .dev_stripes    = 1,
  76                 .devs_max       = 0,
  77                 .devs_min       = 2,
  78                 .tolerated_failures = 0,
  79                 .devs_increment = 1,
  80                 .ncopies        = 1,
  81         },
  82         [BTRFS_RAID_SINGLE] = {
  83                 .sub_stripes    = 1,
  84                 .dev_stripes    = 1,
  85                 .devs_max       = 1,
  86                 .devs_min       = 1,
  87                 .tolerated_failures = 0,
  88                 .devs_increment = 1,
  89                 .ncopies        = 1,
  90         },
  91         [BTRFS_RAID_RAID5] = {
  92                 .sub_stripes    = 1,
  93                 .dev_stripes    = 1,
  94                 .devs_max       = 0,
  95                 .devs_min       = 2,
  96                 .tolerated_failures = 1,
  97                 .devs_increment = 1,
  98                 .ncopies        = 2,
  99         },
 100         [BTRFS_RAID_RAID6] = {
 101                 .sub_stripes    = 1,
 102                 .dev_stripes    = 1,
 103                 .devs_max       = 0,
 104                 .devs_min       = 3,
 105                 .tolerated_failures = 2,
 106                 .devs_increment = 1,
 107                 .ncopies        = 3,
 108         },
 109 };
 110
 111 const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
 112         [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
 113         [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
 114         [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
 115         [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
 116         [BTRFS_RAID_SINGLE] = 0,
 117         [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
 118         [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
 119 };
 120
 121 /*
 122  * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
 123  * condition is not met. Zero means there's no corresponding
 124  * BTRFS_ERROR_DEV_*_NOT_MET value.
 125  */
 126 const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
 127         [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
 128         [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
 129         [BTRFS_RAID_DUP]    = 0,
 130         [BTRFS_RAID_RAID0]  = 0,
 131         [BTRFS_RAID_SINGLE] = 0,
 132         [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
 133         [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 134 };
 135
 136 static int init_first_rw_device(struct btrfs_trans_handle *trans,
 137                                 struct btrfs_fs_info *fs_info);
 138 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 139 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
 140 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 141 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 142 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 143                              enum btrfs_map_op op,
 144                              u64 logical, u64 *length,
 145                              struct btrfs_bio **bbio_ret,
 146                              int mirror_num, int need_raid_map);
 147
/*
 * fs_uuids is the global list head that btrfs_fs_devices structures are
 * linked onto through their ->list member.
 * NOTE(review): uuid_mutex presumably serialises access to this list; the
 * accessor below does not take it -- confirm the locking rule at callers.
 */
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
/* Return a pointer to the file-local fs_uuids list head. */
struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}
 154
 155 static struct btrfs_fs_devices *__alloc_fs_devices(void)
 156 {
 157         struct btrfs_fs_devices *fs_devs;
 158
 159         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
 160         if (!fs_devs)
 161                 return ERR_PTR(-ENOMEM);
 162
 163         mutex_init(&fs_devs->device_list_mutex);
 164
 165         INIT_LIST_HEAD(&fs_devs->devices);
 166         INIT_LIST_HEAD(&fs_devs->resized_devices);
 167         INIT_LIST_HEAD(&fs_devs->alloc_list);
 168         INIT_LIST_HEAD(&fs_devs->list);
 169
 170         return fs_devs;
 171 }
 172
 173 /**
 174  * alloc_fs_devices - allocate struct btrfs_fs_devices
 175  * @fsid:       a pointer to UUID for this FS.  If NULL a new UUID is
 176  *              generated.
 177  *
 178  * Return: a pointer to a new &struct btrfs_fs_devices on success;
 179  * ERR_PTR() on error.  Returned struct is not linked onto any lists and
 180  * can be destroyed with kfree() right away.
 181  */
 182 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
 183 {
 184         struct btrfs_fs_devices *fs_devs;
 185
 186         fs_devs = __alloc_fs_devices();
 187         if (IS_ERR(fs_devs))
 188                 return fs_devs;
 189
 190         if (fsid)
 191                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 192         else
 193                 generate_random_uuid(fs_devs->fsid);
 194
 195         return fs_devs;
 196 }
 197
 198 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 199 {
 200         struct btrfs_device *device;
 201         WARN_ON(fs_devices->opened);
 202         while (!list_empty(&fs_devices->devices)) {
 203                 device = list_entry(fs_devices->devices.next,
 204                                     struct btrfs_device, dev_list);
 205                 list_del(&device->dev_list);
 206                 rcu_string_free(device->name);
 207                 kfree(device);
 208         }
 209         kfree(fs_devices);
 210 }
 211
 212 static void btrfs_kobject_uevent(struct block_device *bdev,
 213                                  enum kobject_action action)
 214 {
 215         int ret;
 216
 217         ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
 218         if (ret)
 219                 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
 220                         action,
 221                         kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
 222                         &disk_to_dev(bdev->bd_disk)->kobj);
 223 }
 224
 225 void btrfs_cleanup_fs_uuids(void)
 226 {
 227         struct btrfs_fs_devices *fs_devices;
 228
 229         while (!list_empty(&fs_uuids)) {
 230                 fs_devices = list_entry(fs_uuids.next,
 231                                         struct btrfs_fs_devices, list);
 232                 list_del(&fs_devices->list);
 233                 free_fs_devices(fs_devices);
 234         }
 235 }
 236
 237 static struct btrfs_device *__alloc_device(void)
 238 {
 239         struct btrfs_device *dev;
 240
 241         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 242         if (!dev)
 243                 return ERR_PTR(-ENOMEM);
 244
 245         INIT_LIST_HEAD(&dev->dev_list);
 246         INIT_LIST_HEAD(&dev->dev_alloc_list);
 247         INIT_LIST_HEAD(&dev->resized_list);
 248
 249         spin_lock_init(&dev->io_lock);
 250
 251         spin_lock_init(&dev->reada_lock);
 252         atomic_set(&dev->reada_in_flight, 0);
 253         atomic_set(&dev->dev_stats_ccnt, 0);
 254         btrfs_device_data_ordered_init(dev);
 255         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 256         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 257
 258         return dev;
 259 }
 260
 261 static noinline struct btrfs_device *__find_device(struct list_head *head,
 262                                                    u64 devid, u8 *uuid)
 263 {
 264         struct btrfs_device *dev;
 265
 266         list_for_each_entry(dev, head, dev_list) {
 267                 if (dev->devid == devid &&
 268                     (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 269                         return dev;
 270                 }
 271         }
 272         return NULL;
 273 }
 274
 275 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 276 {
 277         struct btrfs_fs_devices *fs_devices;
 278
 279         list_for_each_entry(fs_devices, &fs_uuids, list) {
 280                 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 281                         return fs_devices;
 282         }
 283         return NULL;
 284 }
 285
 286 static int
 287 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 288                       int flush, struct block_device **bdev,
 289                       struct buffer_head **bh)
 290 {
 291         int ret;
 292
 293         *bdev = blkdev_get_by_path(device_path, flags, holder);
 294
 295         if (IS_ERR(*bdev)) {
 296                 ret = PTR_ERR(*bdev);
 297                 goto error;
 298         }
 299
 300         if (flush)
 301                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
 302         ret = set_blocksize(*bdev, 4096);
 303         if (ret) {
 304                 blkdev_put(*bdev, flags);
 305                 goto error;
 306         }
 307         invalidate_bdev(*bdev);
 308         *bh = btrfs_read_dev_super(*bdev);
 309         if (IS_ERR(*bh)) {
 310                 ret = PTR_ERR(*bh);
 311                 blkdev_put(*bdev, flags);
 312                 goto error;
 313         }
 314
 315         return 0;
 316
 317 error:
 318         *bdev = NULL;
 319         *bh = NULL;
 320         return ret;
 321 }
 322
 323 static void requeue_list(struct btrfs_pending_bios *pending_bios,
 324                         struct bio *head, struct bio *tail)
 325 {
 326
 327         struct bio *old_head;
 328
 329         old_head = pending_bios->head;
 330         pending_bios->head = head;
 331         if (pending_bios->tail)
 332                 tail->bi_next = old_head;
 333         else
 334                 pending_bios->tail = tail;
 335 }
 336
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 *
 * @device: the device whose pending_sync_bios and pending_bios lists are
 *          drained.  Both lists are only touched with device->io_lock held;
 *          the bios themselves are submitted with the lock dropped.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;
        /*
         * waiters on async_submit_wait are woken below once nr_async_bios
         * drops under two thirds of the async submit limit
         */
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

        /* every jump to loop_lock is made with device->io_lock held */
loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        /*
         * force_reg is set whenever the sync list was picked, so the next
         * pass through here takes the regular list even if sync bios are
         * still queued
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        /* the chosen list now belongs to us via pending/tail */
        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        /*
                         * put the unprocessed remainder back and pick a
                         * list again; the lock stays held across the jump
                         */
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                /* detach the first bio from the private chain */
                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                /*
                 * atomic_dec_return implies a barrier for waitqueue_active
                 */
                if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                /*
                 * num_run is reset at loop_lock, batch_run counts over the
                 * whole call
                 */
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        /*
                         * give up for now: hand the remainder back, mark
                         * the device as still running and requeue the
                         * work item
                         */
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        /*
         * final check under the lock: bios may have been queued since the
         * lists were last looked at
         */
        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}
 536
 537 static void pending_bios_fn(struct btrfs_work *work)
 538 {
 539         struct btrfs_device *device;
 540
 541         device = container_of(work, struct btrfs_device, work);
 542         run_scheduled_bios(device);
 543 }
 544
 545
 546 void btrfs_free_stale_device(struct btrfs_device *cur_dev)
 547 {
 548         struct btrfs_fs_devices *fs_devs;
 549         struct btrfs_device *dev;
 550
 551         if (!cur_dev->name)
 552                 return;
 553
 554         list_for_each_entry(fs_devs, &fs_uuids, list) {
 555                 int del = 1;
 556
 557                 if (fs_devs->opened)
 558                         continue;
 559                 if (fs_devs->seeding)
 560                         continue;
 561
 562                 list_for_each_entry(dev, &fs_devs->devices, dev_list) {
 563
 564                         if (dev == cur_dev)
 565                                 continue;
 566                         if (!dev->name)
 567                                 continue;
 568
 569                         /*
 570                          * Todo: This won't be enough. What if the same device
 571                          * comes back (with new uuid and) with its mapper path?
 572                          * But for now, this does help as mostly an admin will
 573                          * either use mapper or non mapper path throughout.
 574                          */
 575                         rcu_read_lock();
 576                         del = strcmp(rcu_str_deref(dev->name),
 577                                                 rcu_str_deref(cur_dev->name));
 578                         rcu_read_unlock();
 579                         if (!del)
 580                                 break;
 581                 }
 582
 583                 if (!del) {
 584                         /* delete the stale device */
 585                         if (fs_devs->num_devices == 1) {
 586                                 btrfs_sysfs_remove_fsid(fs_devs);
 587                                 list_del(&fs_devs->list);
 588                                 free_fs_devices(fs_devs);
 589                         } else {
 590                                 fs_devs->num_devices--;
 591                                 list_del(&dev->dev_list);
 592                                 rcu_string_free(dev->name);
 593                                 kfree(dev);
 594                         }
 595                         break;
 596                 }
 597         }
 598 }
 599
 600 /*
 601  * Add new device to list of registered devices
 602  *
 603  * Returns:
 604  * 1   - first time device is seen
 605  * 0   - device already known
 606  * < 0 - error
 607  */
 608 static noinline int device_list_add(const char *path,
 609                            struct btrfs_super_block *disk_super,
 610                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
 611 {
 612         struct btrfs_device *device;
 613         struct btrfs_fs_devices *fs_devices;
 614         struct rcu_string *name;
 615         int ret = 0;
 616         u64 found_transid = btrfs_super_generation(disk_super);
 617
 618         fs_devices = find_fsid(disk_super->fsid);
 619         if (!fs_devices) {
 620                 fs_devices = alloc_fs_devices(disk_super->fsid);
 621                 if (IS_ERR(fs_devices))
 622                         return PTR_ERR(fs_devices);
 623
 624                 list_add(&fs_devices->list, &fs_uuids);
 625
 626                 device = NULL;
 627         } else {
 628                 device = __find_device(&fs_devices->devices, devid,
 629                                        disk_super->dev_item.uuid);
 630         }
 631
 632         if (!device) {
 633                 if (fs_devices->opened)
 634                         return -EBUSY;
 635
 636                 device = btrfs_alloc_device(NULL, &devid,
 637                                             disk_super->dev_item.uuid);
 638                 if (IS_ERR(device)) {
 639                         /* we can safely leave the fs_devices entry around */
 640                         return PTR_ERR(device);
 641                 }
 642
 643                 name = rcu_string_strdup(path, GFP_NOFS);
 644                 if (!name) {
 645                         kfree(device);
 646                         return -ENOMEM;
 647                 }
 648                 rcu_assign_pointer(device->name, name);
 649
 650                 mutex_lock(&fs_devices->device_list_mutex);
 651                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 652                 fs_devices->num_devices++;
 653                 mutex_unlock(&fs_devices->device_list_mutex);
 654
 655                 ret = 1;
 656                 device->fs_devices = fs_devices;
 657         } else if (!device->name || strcmp(device->name->str, path)) {
 658                 /*
 659                  * When FS is already mounted.
 660                  * 1. If you are here and if the device->name is NULL that
 661                  *    means this device was missing at time of FS mount.
 662                  * 2. If you are here and if the device->name is different
 663                  *    from 'path' that means either
 664                  *      a. The same device disappeared and reappeared with
 665                  *         different name. or
 666                  *      b. The missing-disk-which-was-replaced, has
 667                  *         reappeared now.
 668                  *
 669                  * We must allow 1 and 2a above. But 2b would be a spurious
 670                  * and unintentional.
 671                  *
 672                  * Further in case of 1 and 2a above, the disk at 'path'
 673                  * would have missed some transaction when it was away and
 674                  * in case of 2a the stale bdev has to be updated as well.
 675                  * 2b must not be allowed at all time.
 676                  */
 677
 678                 /*
 679                  * For now, we do allow update to btrfs_fs_device through the
 680                  * btrfs dev scan cli after FS has been mounted.  We're still
 681                  * tracking a problem where systems fail mount by subvolume id
 682                  * when we reject replacement on a mounted FS.
 683                  */
 684                 if (!fs_devices->opened && found_transid < device->generation) {
 685                         /*
 686                          * That is if the FS is _not_ mounted and if you
 687                          * are here, that means there is more than one
 688                          * disk with same uuid and devid.We keep the one
 689                          * with larger generation number or the last-in if
 690                          * generation are equal.
 691                          */
 692                         return -EEXIST;
 693                 }
 694
 695                 name = rcu_string_strdup(path, GFP_NOFS);
 696                 if (!name)
 697                         return -ENOMEM;
 698                 rcu_string_free(device->name);
 699                 rcu_assign_pointer(device->name, name);
 700                 if (device->missing) {
 701                         fs_devices->missing_devices--;
 702                         device->missing = 0;
 703                 }
 704         }
 705
 706         /*
 707          * Unmount does not free the btrfs_device struct but would zero
 708          * generation along with most of the other members. So just update
 709          * it back. We need it to pick the disk with largest generation
 710          * (as above).
 711          */
 712         if (!fs_devices->opened)
 713                 device->generation = found_transid;
 714
 715         /*
 716          * if there is new btrfs on an already registered device,
 717          * then remove the stale device entry.
 718          */
 719         if (ret > 0)
 720                 btrfs_free_stale_device(device);
 721
 722         *fs_devices_ret = fs_devices;
 723
 724         return ret;
 725 }
 726
 727 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 728 {
 729         struct btrfs_fs_devices *fs_devices;
 730         struct btrfs_device *device;
 731         struct btrfs_device *orig_dev;
 732
 733         fs_devices = alloc_fs_devices(orig->fsid);
 734         if (IS_ERR(fs_devices))
 735                 return fs_devices;
 736
 737         mutex_lock(&orig->device_list_mutex);
 738         fs_devices->total_devices = orig->total_devices;
 739
 740         /* We have held the volume lock, it is safe to get the devices. */
 741         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 742                 struct rcu_string *name;
 743
 744                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
 745                                             orig_dev->uuid);
 746                 if (IS_ERR(device))
 747                         goto error;
 748
 749                 /*
 750                  * This is ok to do without rcu read locked because we hold the
 751                  * uuid mutex so nothing we touch in here is going to disappear.
 752                  */
 753                 if (orig_dev->name) {
 754                         name = rcu_string_strdup(orig_dev->name->str,
 755                                         GFP_KERNEL);
 756                         if (!name) {
 757                                 kfree(device);
 758                                 goto error;
 759                         }
 760                         rcu_assign_pointer(device->name, name);
 761                 }
 762
 763                 list_add(&device->dev_list, &fs_devices->devices);
 764                 device->fs_devices = fs_devices;
 765                 fs_devices->num_devices++;
 766         }
 767         mutex_unlock(&orig->device_list_mutex);
 768         return fs_devices;
 769 error:
 770         mutex_unlock(&orig->device_list_mutex);
 771         free_fs_devices(fs_devices);
 772         return ERR_PTR(-ENOMEM);
 773 }
 774
 775 void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
 776 {
 777         struct btrfs_device *device, *next;
 778         struct btrfs_device *latest_dev = NULL;
 779
 780         mutex_lock(&uuid_mutex);
 781 again:
 782         /* This is the initialized path, it is safe to release the devices. */
 783         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 784                 if (device->in_fs_metadata) {
 785                         if (!device->is_tgtdev_for_dev_replace &&
 786                             (!latest_dev ||
 787                              device->generation > latest_dev->generation)) {
 788                                 latest_dev = device;
 789                         }
 790                         continue;
 791                 }
 792
 793                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
 794                         /*
 795                          * In the first step, keep the device which has
 796                          * the correct fsid and the devid that is used
 797                          * for the dev_replace procedure.
 798                          * In the second step, the dev_replace state is
 799                          * read from the device tree and it is known
 800                          * whether the procedure is really active or
 801                          * not, which means whether this device is
 802                          * used or whether it should be removed.
 803                          */
 804                         if (step == 0 || device->is_tgtdev_for_dev_replace) {
 805                                 continue;
 806                         }
 807                 }
 808                 if (device->bdev) {
 809                         blkdev_put(device->bdev, device->mode);
 810                         device->bdev = NULL;
 811                         fs_devices->open_devices--;
 812                 }
 813                 if (device->writeable) {
 814                         list_del_init(&device->dev_alloc_list);
 815                         device->writeable = 0;
 816                         if (!device->is_tgtdev_for_dev_replace)
 817                                 fs_devices->rw_devices--;
 818                 }
 819                 list_del_init(&device->dev_list);
 820                 fs_devices->num_devices--;
 821                 rcu_string_free(device->name);
 822                 kfree(device);
 823         }
 824
 825         if (fs_devices->seed) {
 826                 fs_devices = fs_devices->seed;
 827                 goto again;
 828         }
 829
 830         fs_devices->latest_bdev = latest_dev->bdev;
 831
 832         mutex_unlock(&uuid_mutex);
 833 }
 834
 835 static void __free_device(struct work_struct *work)
 836 {
 837         struct btrfs_device *device;
 838
 839         device = container_of(work, struct btrfs_device, rcu_work);
 840         rcu_string_free(device->name);
 841         kfree(device);
 842 }
 843
 844 static void free_device(struct rcu_head *head)
 845 {
 846         struct btrfs_device *device;
 847
 848         device = container_of(head, struct btrfs_device, rcu);
 849
 850         INIT_WORK(&device->rcu_work, __free_device);
 851         schedule_work(&device->rcu_work);
 852 }
 853
 854 static void btrfs_close_bdev(struct btrfs_device *device)
 855 {
 856         if (device->bdev && device->writeable) {
 857                 sync_blockdev(device->bdev);
 858                 invalidate_bdev(device->bdev);
 859         }
 860
 861         if (device->bdev)
 862                 blkdev_put(device->bdev, device->mode);
 863 }
 864
 865 static void btrfs_prepare_close_one_device(struct btrfs_device *device)
 866 {
 867         struct btrfs_fs_devices *fs_devices = device->fs_devices;
 868         struct btrfs_device *new_device;
 869         struct rcu_string *name;
 870
 871         if (device->bdev)
 872                 fs_devices->open_devices--;
 873
 874         if (device->writeable &&
 875             device->devid != BTRFS_DEV_REPLACE_DEVID) {
 876                 list_del_init(&device->dev_alloc_list);
 877                 fs_devices->rw_devices--;
 878         }
 879
 880         if (device->missing)
 881                 fs_devices->missing_devices--;
 882
 883         new_device = btrfs_alloc_device(NULL, &device->devid,
 884                                         device->uuid);
 885         BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
 886
 887         /* Safe because we are under uuid_mutex */
 888         if (device->name) {
 889                 name = rcu_string_strdup(device->name->str, GFP_NOFS);
 890                 BUG_ON(!name); /* -ENOMEM */
 891                 rcu_assign_pointer(new_device->name, name);
 892         }
 893
 894         list_replace_rcu(&device->dev_list, &new_device->dev_list);
 895         new_device->fs_devices = device->fs_devices;
 896 }
 897
 898 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 899 {
 900         struct btrfs_device *device, *tmp;
 901         struct list_head pending_put;
 902
 903         INIT_LIST_HEAD(&pending_put);
 904
 905         if (--fs_devices->opened > 0)
 906                 return 0;
 907
 908         mutex_lock(&fs_devices->device_list_mutex);
 909         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
 910                 btrfs_prepare_close_one_device(device);
 911                 list_add(&device->dev_list, &pending_put);
 912         }
 913         mutex_unlock(&fs_devices->device_list_mutex);
 914
 915         /*
 916          * btrfs_show_devname() is using the device_list_mutex,
 917          * sometimes call to blkdev_put() leads vfs calling
 918          * into this func. So do put outside of device_list_mutex,
 919          * as of now.
 920          */
 921         while (!list_empty(&pending_put)) {
 922                 device = list_first_entry(&pending_put,
 923                                 struct btrfs_device, dev_list);
 924                 list_del(&device->dev_list);
 925                 btrfs_close_bdev(device);
 926                 call_rcu(&device->rcu, free_device);
 927         }
 928
 929         WARN_ON(fs_devices->open_devices);
 930         WARN_ON(fs_devices->rw_devices);
 931         fs_devices->opened = 0;
 932         fs_devices->seeding = 0;
 933
 934         return 0;
 935 }
 936
 937 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 938 {
 939         struct btrfs_fs_devices *seed_devices = NULL;
 940         int ret;
 941
 942         mutex_lock(&uuid_mutex);
 943         ret = __btrfs_close_devices(fs_devices);
 944         if (!fs_devices->opened) {
 945                 seed_devices = fs_devices->seed;
 946                 fs_devices->seed = NULL;
 947         }
 948         mutex_unlock(&uuid_mutex);
 949
 950         while (seed_devices) {
 951                 fs_devices = seed_devices;
 952                 seed_devices = fs_devices->seed;
 953                 __btrfs_close_devices(fs_devices);
 954                 free_fs_devices(fs_devices);
 955         }
 956         /*
 957          * Wait for rcu kworkers under __btrfs_close_devices
 958          * to finish all blkdev_puts so device is really
 959          * free when umount is done.
 960          */
 961         rcu_barrier();
 962         return ret;
 963 }
 964
 965 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 966                                 fmode_t flags, void *holder)
 967 {
 968         struct request_queue *q;
 969         struct block_device *bdev;
 970         struct list_head *head = &fs_devices->devices;
 971         struct btrfs_device *device;
 972         struct btrfs_device *latest_dev = NULL;
 973         struct buffer_head *bh;
 974         struct btrfs_super_block *disk_super;
 975         u64 devid;
 976         int seeding = 1;
 977         int ret = 0;
 978
 979         flags |= FMODE_EXCL;
 980
 981         list_for_each_entry(device, head, dev_list) {
 982                 if (device->bdev)
 983                         continue;
 984                 if (!device->name)
 985                         continue;
 986
 987                 /* Just open everything we can; ignore failures here */
 988                 if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 989                                             &bdev, &bh))
 990                         continue;
 991
 992                 disk_super = (struct btrfs_super_block *)bh->b_data;
 993                 devid = btrfs_stack_device_id(&disk_super->dev_item);
 994                 if (devid != device->devid)
 995                         goto error_brelse;
 996
 997                 if (memcmp(device->uuid, disk_super->dev_item.uuid,
 998                            BTRFS_UUID_SIZE))
 999                         goto error_brelse;
1000
1001                 device->generation = btrfs_super_generation(disk_super);
1002                 if (!latest_dev ||
1003                     device->generation > latest_dev->generation)
1004                         latest_dev = device;
1005
1006                 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
1007                         device->writeable = 0;
1008                 } else {
1009                         device->writeable = !bdev_read_only(bdev);
1010                         seeding = 0;
1011                 }
1012
1013                 q = bdev_get_queue(bdev);
1014                 if (blk_queue_discard(q))
1015                         device->can_discard = 1;
1016                 if (!blk_queue_nonrot(q))
1017                         fs_devices->rotating = 1;
1018
1019                 device->bdev = bdev;
1020                 device->in_fs_metadata = 0;
1021                 device->mode = flags;
1022
1023                 fs_devices->open_devices++;
1024                 if (device->writeable &&
1025                     device->devid != BTRFS_DEV_REPLACE_DEVID) {
1026                         fs_devices->rw_devices++;
1027                         list_add(&device->dev_alloc_list,
1028                                  &fs_devices->alloc_list);
1029                 }
1030                 brelse(bh);
1031                 continue;
1032
1033 error_brelse:
1034                 brelse(bh);
1035                 blkdev_put(bdev, flags);
1036                 continue;
1037         }
1038         if (fs_devices->open_devices == 0) {
1039                 ret = -EINVAL;
1040                 goto out;
1041         }
1042         fs_devices->seeding = seeding;
1043         fs_devices->opened = 1;
1044         fs_devices->latest_bdev = latest_dev->bdev;
1045         fs_devices->total_rw_bytes = 0;
1046 out:
1047         return ret;
1048 }
1049
1050 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1051                        fmode_t flags, void *holder)
1052 {
1053         int ret;
1054
1055         mutex_lock(&uuid_mutex);
1056         if (fs_devices->opened) {
1057                 fs_devices->opened++;
1058                 ret = 0;
1059         } else {
1060                 ret = __btrfs_open_devices(fs_devices, flags, holder);
1061         }
1062         mutex_unlock(&uuid_mutex);
1063         return ret;
1064 }
1065
/*
 * Undo a successful btrfs_read_disk_super(): unmap @page and drop the
 * reference on it.
 */
void btrfs_release_disk_super(struct page *page)
{
	kunmap(page);
	put_page(page);
}
1071
1072 int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1073                 struct page **page, struct btrfs_super_block **disk_super)
1074 {
1075         void *p;
1076         pgoff_t index;
1077
1078         /* make sure our super fits in the device */
1079         if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1080                 return 1;
1081
1082         /* make sure our super fits in the page */
1083         if (sizeof(**disk_super) > PAGE_SIZE)
1084                 return 1;
1085
1086         /* make sure our super doesn't straddle pages on disk */
1087         index = bytenr >> PAGE_SHIFT;
1088         if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
1089                 return 1;
1090
1091         /* pull in the page with our super */
1092         *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
1093                                    index, GFP_KERNEL);
1094
1095         if (IS_ERR_OR_NULL(*page))
1096                 return 1;
1097
1098         p = kmap(*page);
1099
1100         /* align our pointer to the offset of the super block */
1101         *disk_super = p + (bytenr & ~PAGE_MASK);
1102
1103         if (btrfs_super_bytenr(*disk_super) != bytenr ||
1104             btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
1105                 btrfs_release_disk_super(*page);
1106                 return 1;
1107         }
1108
1109         if ((*disk_super)->label[0] &&
1110                 (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
1111                 (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
1112
1113         return 0;
1114 }
1115
1116 /*
1117  * Look for a btrfs signature on a device. This may be called out of the mount path
1118  * and we are not allowed to call set_blocksize during the scan. The superblock
1119  * is read via pagecache
1120  */
1121 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
1122                           struct btrfs_fs_devices **fs_devices_ret)
1123 {
1124         struct btrfs_super_block *disk_super;
1125         struct block_device *bdev;
1126         struct page *page;
1127         int ret = -EINVAL;
1128         u64 devid;
1129         u64 transid;
1130         u64 total_devices;
1131         u64 bytenr;
1132
1133         /*
1134          * we would like to check all the supers, but that would make
1135          * a btrfs mount succeed after a mkfs from a different FS.
1136          * So, we need to add a special mount option to scan for
1137          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1138          */
1139         bytenr = btrfs_sb_offset(0);
1140         flags |= FMODE_EXCL;
1141         mutex_lock(&uuid_mutex);
1142
1143         bdev = blkdev_get_by_path(path, flags, holder);
1144         if (IS_ERR(bdev)) {
1145                 ret = PTR_ERR(bdev);
1146                 goto error;
1147         }
1148
1149         if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
1150                 goto error_bdev_put;
1151
1152         devid = btrfs_stack_device_id(&disk_super->dev_item);
1153         transid = btrfs_super_generation(disk_super);
1154         total_devices = btrfs_super_num_devices(disk_super);
1155
1156         ret = device_list_add(path, disk_super, devid, fs_devices_ret);
1157         if (ret > 0) {
1158                 if (disk_super->label[0]) {
1159                         pr_info("BTRFS: device label %s ", disk_super->label);
1160                 } else {
1161                         pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
1162                 }
1163
1164                 pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
1165                 ret = 0;
1166         }
1167         if (!ret && fs_devices_ret)
1168                 (*fs_devices_ret)->total_devices = total_devices;
1169
1170         btrfs_release_disk_super(page);
1171
1172 error_bdev_put:
1173         blkdev_put(bdev, flags);
1174 error:
1175         mutex_unlock(&uuid_mutex);
1176         return ret;
1177 }
1178
1179 /* helper to account the used device space in the range */
1180 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
1181                                    u64 end, u64 *length)
1182 {
1183         struct btrfs_key key;
1184         struct btrfs_root *root = device->fs_info->dev_root;
1185         struct btrfs_dev_extent *dev_extent;
1186         struct btrfs_path *path;
1187         u64 extent_end;
1188         int ret;
1189         int slot;
1190         struct extent_buffer *l;
1191
1192         *length = 0;
1193
1194         if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
1195                 return 0;
1196
1197         path = btrfs_alloc_path();
1198         if (!path)
1199                 return -ENOMEM;
1200         path->reada = READA_FORWARD;
1201
1202         key.objectid = device->devid;
1203         key.offset = start;
1204         key.type = BTRFS_DEV_EXTENT_KEY;
1205
1206         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1207         if (ret < 0)
1208                 goto out;
1209         if (ret > 0) {
1210                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1211                 if (ret < 0)
1212                         goto out;
1213         }
1214
1215         while (1) {
1216                 l = path->nodes[0];
1217                 slot = path->slots[0];
1218                 if (slot >= btrfs_header_nritems(l)) {
1219                         ret = btrfs_next_leaf(root, path);
1220                         if (ret == 0)
1221                                 continue;
1222                         if (ret < 0)
1223                                 goto out;
1224
1225                         break;
1226                 }
1227                 btrfs_item_key_to_cpu(l, &key, slot);
1228
1229                 if (key.objectid < device->devid)
1230                         goto next;
1231
1232                 if (key.objectid > device->devid)
1233                         break;
1234
1235                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1236                         goto next;
1237
1238                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1239                 extent_end = key.offset + btrfs_dev_extent_length(l,
1240                                                                   dev_extent);
1241                 if (key.offset <= start && extent_end > end) {
1242                         *length = end - start + 1;
1243                         break;
1244                 } else if (key.offset <= start && extent_end > start)
1245                         *length += extent_end - start;
1246                 else if (key.offset > start && extent_end <= end)
1247                         *length += extent_end - key.offset;
1248                 else if (key.offset > start && key.offset <= end) {
1249                         *length += end - key.offset + 1;
1250                         break;
1251                 } else if (key.offset > end)
1252                         break;
1253
1254 next:
1255                 path->slots[0]++;
1256         }
1257         ret = 0;
1258 out:
1259         btrfs_free_path(path);
1260         return ret;
1261 }
1262
1263 static int contains_pending_extent(struct btrfs_transaction *transaction,
1264                                    struct btrfs_device *device,
1265                                    u64 *start, u64 len)
1266 {
1267         struct btrfs_fs_info *fs_info = device->fs_info;
1268         struct extent_map *em;
1269         struct list_head *search_list = &fs_info->pinned_chunks;
1270         int ret = 0;
1271         u64 physical_start = *start;
1272
1273         if (transaction)
1274                 search_list = &transaction->pending_chunks;
1275 again:
1276         list_for_each_entry(em, search_list, list) {
1277                 struct map_lookup *map;
1278                 int i;
1279
1280                 map = em->map_lookup;
1281                 for (i = 0; i < map->num_stripes; i++) {
1282                         u64 end;
1283
1284                         if (map->stripes[i].dev != device)
1285                                 continue;
1286                         if (map->stripes[i].physical >= physical_start + len ||
1287                             map->stripes[i].physical + em->orig_block_len <=
1288                             physical_start)
1289                                 continue;
1290                         /*
1291                          * Make sure that while processing the pinned list we do
1292                          * not override our *start with a lower value, because
1293                          * we can have pinned chunks that fall within this
1294                          * device hole and that have lower physical addresses
1295                          * than the pending chunks we processed before. If we
1296                          * do not take this special care we can end up getting
1297                          * 2 pending chunks that start at the same physical
1298                          * device offsets because the end offset of a pinned
1299                          * chunk can be equal to the start offset of some
1300                          * pending chunk.
1301                          */
1302                         end = map->stripes[i].physical + em->orig_block_len;
1303                         if (end > *start) {
1304                                 *start = end;
1305                                 ret = 1;
1306                         }
1307                 }
1308         }
1309         if (search_list != &fs_info->pinned_chunks) {
1310                 search_list = &fs_info->pinned_chunks;
1311                 goto again;
1312         }
1313
1314         return ret;
1315 }
1316
1317
1318 /*
1319  * find_free_dev_extent_start - find free space in the specified device
1320  * @device:       the device which we search the free space in
1321  * @num_bytes:    the size of the free space that we need
1322  * @search_start: the position from which to begin the search
1323  * @start:        store the start of the free space.
1324  * @len:          the size of the free space. that we find, or the size
1325  *                of the max free space if we don't find suitable free space
1326  *
1327  * this uses a pretty simple search, the expectation is that it is
1328  * called very infrequently and that a given device has a small number
1329  * of extents
1330  *
1331  * @start is used to store the start of the free space if we find. But if we
1332  * don't find suitable free space, it will be used to store the start position
1333  * of the max free space.
1334  *
1335  * @len is used to store the size of the free space that we find.
1336  * But if we don't find suitable free space, it is used to store the size of
1337  * the max free space.
1338  */
1339 int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1340                                struct btrfs_device *device, u64 num_bytes,
1341                                u64 search_start, u64 *start, u64 *len)
1342 {
1343         struct btrfs_fs_info *fs_info = device->fs_info;
1344         struct btrfs_root *root = fs_info->dev_root;
1345         struct btrfs_key key;
1346         struct btrfs_dev_extent *dev_extent;
1347         struct btrfs_path *path;
1348         u64 hole_size;
1349         u64 max_hole_start;
1350         u64 max_hole_size;
1351         u64 extent_end;
1352         u64 search_end = device->total_bytes;
1353         int ret;
1354         int slot;
1355         struct extent_buffer *l;
1356         u64 min_search_start;
1357
1358         /*
1359          * We don't want to overwrite the superblock on the drive nor any area
1360          * used by the boot loader (grub for example), so we make sure to start
1361          * at an offset of at least 1MB.
1362          */
1363         min_search_start = max(fs_info->alloc_start, 1024ull * 1024);
1364         search_start = max(search_start, min_search_start);
1365
1366         path = btrfs_alloc_path();
1367         if (!path)
1368                 return -ENOMEM;
1369
1370         max_hole_start = search_start;
1371         max_hole_size = 0;
1372
1373 again:
1374         if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1375                 ret = -ENOSPC;
1376                 goto out;
1377         }
1378
1379         path->reada = READA_FORWARD;
1380         path->search_commit_root = 1;
1381         path->skip_locking = 1;
1382
1383         key.objectid = device->devid;
1384         key.offset = search_start;
1385         key.type = BTRFS_DEV_EXTENT_KEY;
1386
1387         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1388         if (ret < 0)
1389                 goto out;
1390         if (ret > 0) {
1391                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1392                 if (ret < 0)
1393                         goto out;
1394         }
1395
1396         while (1) {
1397                 l = path->nodes[0];
1398                 slot = path->slots[0];
1399                 if (slot >= btrfs_header_nritems(l)) {
1400                         ret = btrfs_next_leaf(root, path);
1401                         if (ret == 0)
1402                                 continue;
1403                         if (ret < 0)
1404                                 goto out;
1405
1406                         break;
1407                 }
1408                 btrfs_item_key_to_cpu(l, &key, slot);
1409
1410                 if (key.objectid < device->devid)
1411                         goto next;
1412
1413                 if (key.objectid > device->devid)
1414                         break;
1415
1416                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1417                         goto next;
1418
1419                 if (key.offset > search_start) {
1420                         hole_size = key.offset - search_start;
1421
1422                         /*
1423                          * Have to check before we set max_hole_start, otherwise
1424                          * we could end up sending back this offset anyway.
1425                          */
1426                         if (contains_pending_extent(transaction, device,
1427                                                     &search_start,
1428                                                     hole_size)) {
1429                                 if (key.offset >= search_start) {
1430                                         hole_size = key.offset - search_start;
1431                                 } else {
1432                                         WARN_ON_ONCE(1);
1433                                         hole_size = 0;
1434                                 }
1435                         }
1436
1437                         if (hole_size > max_hole_size) {
1438                                 max_hole_start = search_start;
1439                                 max_hole_size = hole_size;
1440                         }
1441
1442                         /*
1443                          * If this free space is greater than which we need,
1444                          * it must be the max free space that we have found
1445                          * until now, so max_hole_start must point to the start
1446                          * of this free space and the length of this free space
1447                          * is stored in max_hole_size. Thus, we return
1448                          * max_hole_start and max_hole_size and go back to the
1449                          * caller.
1450                          */
1451                         if (hole_size >= num_bytes) {
1452                                 ret = 0;
1453                                 goto out;
1454                         }
1455                 }
1456
1457                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1458                 extent_end = key.offset + btrfs_dev_extent_length(l,
1459                                                                   dev_extent);
1460                 if (extent_end > search_start)
1461                         search_start = extent_end;
1462 next:
1463                 path->slots[0]++;
1464                 cond_resched();
1465         }
1466
1467         /*
1468          * At this point, search_start should be the end of
1469          * allocated dev extents, and when shrinking the device,
1470          * search_end may be smaller than search_start.
1471          */
1472         if (search_end > search_start) {
1473                 hole_size = search_end - search_start;
1474
1475                 if (contains_pending_extent(transaction, device, &search_start,
1476                                             hole_size)) {
1477                         btrfs_release_path(path);
1478                         goto again;
1479                 }
1480
1481                 if (hole_size > max_hole_size) {
1482                         max_hole_start = search_start;
1483                         max_hole_size = hole_size;
1484                 }
1485         }
1486
1487         /* See above. */
1488         if (max_hole_size < num_bytes)
1489                 ret = -ENOSPC;
1490         else
1491                 ret = 0;
1492
1493 out:
1494         btrfs_free_path(path);
1495         *start = max_hole_start;
1496         if (len)
1497                 *len = max_hole_size;
1498         return ret;
1499 }
1500
1501 int find_free_dev_extent(struct btrfs_trans_handle *trans,
1502                          struct btrfs_device *device, u64 num_bytes,
1503                          u64 *start, u64 *len)
1504 {
1505         /* FIXME use last free of some kind */
1506         return find_free_dev_extent_start(trans->transaction, device,
1507                                           num_bytes, 0, start, len);
1508 }
1509
1510 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1511                           struct btrfs_device *device,
1512                           u64 start, u64 *dev_extent_len)
1513 {
1514         struct btrfs_fs_info *fs_info = device->fs_info;
1515         struct btrfs_root *root = fs_info->dev_root;
1516         int ret;
1517         struct btrfs_path *path;
1518         struct btrfs_key key;
1519         struct btrfs_key found_key;
1520         struct extent_buffer *leaf = NULL;
1521         struct btrfs_dev_extent *extent = NULL;
1522
1523         path = btrfs_alloc_path();
1524         if (!path)
1525                 return -ENOMEM;
1526
1527         key.objectid = device->devid;
1528         key.offset = start;
1529         key.type = BTRFS_DEV_EXTENT_KEY;
1530 again:
1531         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1532         if (ret > 0) {
1533                 ret = btrfs_previous_item(root, path, key.objectid,
1534                                           BTRFS_DEV_EXTENT_KEY);
1535                 if (ret)
1536                         goto out;
1537                 leaf = path->nodes[0];
1538                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1539                 extent = btrfs_item_ptr(leaf, path->slots[0],
1540                                         struct btrfs_dev_extent);
1541                 BUG_ON(found_key.offset > start || found_key.offset +
1542                        btrfs_dev_extent_length(leaf, extent) < start);
1543                 key = found_key;
1544                 btrfs_release_path(path);
1545                 goto again;
1546         } else if (ret == 0) {
1547                 leaf = path->nodes[0];
1548                 extent = btrfs_item_ptr(leaf, path->slots[0],
1549                                         struct btrfs_dev_extent);
1550         } else {
1551                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1552                 goto out;
1553         }
1554
1555         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1556
1557         ret = btrfs_del_item(trans, root, path);
1558         if (ret) {
1559                 btrfs_handle_fs_error(fs_info, ret,
1560                                       "Failed to remove dev extent item");
1561         } else {
1562                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1563         }
1564 out:
1565         btrfs_free_path(path);
1566         return ret;
1567 }
1568
1569 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1570                                   struct btrfs_device *device,
1571                                   u64 chunk_tree, u64 chunk_objectid,
1572                                   u64 chunk_offset, u64 start, u64 num_bytes)
1573 {
1574         int ret;
1575         struct btrfs_path *path;
1576         struct btrfs_fs_info *fs_info = device->fs_info;
1577         struct btrfs_root *root = fs_info->dev_root;
1578         struct btrfs_dev_extent *extent;
1579         struct extent_buffer *leaf;
1580         struct btrfs_key key;
1581
1582         WARN_ON(!device->in_fs_metadata);
1583         WARN_ON(device->is_tgtdev_for_dev_replace);
1584         path = btrfs_alloc_path();
1585         if (!path)
1586                 return -ENOMEM;
1587
1588         key.objectid = device->devid;
1589         key.offset = start;
1590         key.type = BTRFS_DEV_EXTENT_KEY;
1591         ret = btrfs_insert_empty_item(trans, root, path, &key,
1592                                       sizeof(*extent));
1593         if (ret)
1594                 goto out;
1595
1596         leaf = path->nodes[0];
1597         extent = btrfs_item_ptr(leaf, path->slots[0],
1598                                 struct btrfs_dev_extent);
1599         btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1600         btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1601         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1602
1603         write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);
1604
1605         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1606         btrfs_mark_buffer_dirty(leaf);
1607 out:
1608         btrfs_free_path(path);
1609         return ret;
1610 }
1611
1612 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1613 {
1614         struct extent_map_tree *em_tree;
1615         struct extent_map *em;
1616         struct rb_node *n;
1617         u64 ret = 0;
1618
1619         em_tree = &fs_info->mapping_tree.map_tree;
1620         read_lock(&em_tree->lock);
1621         n = rb_last(&em_tree->map);
1622         if (n) {
1623                 em = rb_entry(n, struct extent_map, rb_node);
1624                 ret = em->start + em->len;
1625         }
1626         read_unlock(&em_tree->lock);
1627
1628         return ret;
1629 }
1630
1631 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1632                                     u64 *devid_ret)
1633 {
1634         int ret;
1635         struct btrfs_key key;
1636         struct btrfs_key found_key;
1637         struct btrfs_path *path;
1638
1639         path = btrfs_alloc_path();
1640         if (!path)
1641                 return -ENOMEM;
1642
1643         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1644         key.type = BTRFS_DEV_ITEM_KEY;
1645         key.offset = (u64)-1;
1646
1647         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1648         if (ret < 0)
1649                 goto error;
1650
1651         BUG_ON(ret == 0); /* Corruption */
1652
1653         ret = btrfs_previous_item(fs_info->chunk_root, path,
1654                                   BTRFS_DEV_ITEMS_OBJECTID,
1655                                   BTRFS_DEV_ITEM_KEY);
1656         if (ret) {
1657                 *devid_ret = 1;
1658         } else {
1659                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1660                                       path->slots[0]);
1661                 *devid_ret = found_key.offset + 1;
1662         }
1663         ret = 0;
1664 error:
1665         btrfs_free_path(path);
1666         return ret;
1667 }
1668
1669 /*
1670  * the device information is stored in the chunk root
1671  * the btrfs_device struct should be fully filled in
1672  */
1673 static int btrfs_add_device(struct btrfs_trans_handle *trans,
1674                             struct btrfs_fs_info *fs_info,
1675                             struct btrfs_device *device)
1676 {
1677         struct btrfs_root *root = fs_info->chunk_root;
1678         int ret;
1679         struct btrfs_path *path;
1680         struct btrfs_dev_item *dev_item;
1681         struct extent_buffer *leaf;
1682         struct btrfs_key key;
1683         unsigned long ptr;
1684
1685         path = btrfs_alloc_path();
1686         if (!path)
1687                 return -ENOMEM;
1688
1689         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1690         key.type = BTRFS_DEV_ITEM_KEY;
1691         key.offset = device->devid;
1692
1693         ret = btrfs_insert_empty_item(trans, root, path, &key,
1694                                       sizeof(*dev_item));
1695         if (ret)
1696                 goto out;
1697
1698         leaf = path->nodes[0];
1699         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1700
1701         btrfs_set_device_id(leaf, dev_item, device->devid);
1702         btrfs_set_device_generation(leaf, dev_item, 0);
1703         btrfs_set_device_type(leaf, dev_item, device->type);
1704         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1705         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1706         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1707         btrfs_set_device_total_bytes(leaf, dev_item,
1708                                      btrfs_device_get_disk_total_bytes(device));
1709         btrfs_set_device_bytes_used(leaf, dev_item,
1710                                     btrfs_device_get_bytes_used(device));
1711         btrfs_set_device_group(leaf, dev_item, 0);
1712         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1713         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1714         btrfs_set_device_start_offset(leaf, dev_item, 0);
1715
1716         ptr = btrfs_device_uuid(dev_item);
1717         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1718         ptr = btrfs_device_fsid(dev_item);
1719         write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1720         btrfs_mark_buffer_dirty(leaf);
1721
1722         ret = 0;
1723 out:
1724         btrfs_free_path(path);
1725         return ret;
1726 }
1727
1728 /*
1729  * Function to update ctime/mtime for a given device path.
1730  * Mainly used for ctime/mtime based probe like libblkid.
1731  */
1732 static void update_dev_time(const char *path_name)
1733 {
1734         struct file *filp;
1735
1736         filp = filp_open(path_name, O_RDWR, 0);
1737         if (IS_ERR(filp))
1738                 return;
1739         file_update_time(filp);
1740         filp_close(filp, NULL);
1741 }
1742
1743 static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1744                              struct btrfs_device *device)
1745 {
1746         struct btrfs_root *root = fs_info->chunk_root;
1747         int ret;
1748         struct btrfs_path *path;
1749         struct btrfs_key key;
1750         struct btrfs_trans_handle *trans;
1751
1752         path = btrfs_alloc_path();
1753         if (!path)
1754                 return -ENOMEM;
1755
1756         trans = btrfs_start_transaction(root, 0);
1757         if (IS_ERR(trans)) {
1758                 btrfs_free_path(path);
1759                 return PTR_ERR(trans);
1760         }
1761         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1762         key.type = BTRFS_DEV_ITEM_KEY;
1763         key.offset = device->devid;
1764
1765         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1766         if (ret < 0)
1767                 goto out;
1768
1769         if (ret > 0) {
1770                 ret = -ENOENT;
1771                 goto out;
1772         }
1773
1774         ret = btrfs_del_item(trans, root, path);
1775         if (ret)
1776                 goto out;
1777 out:
1778         btrfs_free_path(path);
1779         btrfs_commit_transaction(trans);
1780         return ret;
1781 }
1782
1783 /*
1784  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1785  * filesystem. It's up to the caller to adjust that number regarding eg. device
1786  * replace.
1787  */
1788 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1789                 u64 num_devices)
1790 {
1791         u64 all_avail;
1792         unsigned seq;
1793         int i;
1794
1795         do {
1796                 seq = read_seqbegin(&fs_info->profiles_lock);
1797
1798                 all_avail = fs_info->avail_data_alloc_bits |
1799                             fs_info->avail_system_alloc_bits |
1800                             fs_info->avail_metadata_alloc_bits;
1801         } while (read_seqretry(&fs_info->profiles_lock, seq));
1802
1803         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1804                 if (!(all_avail & btrfs_raid_group[i]))
1805                         continue;
1806
1807                 if (num_devices < btrfs_raid_array[i].devs_min) {
1808                         int ret = btrfs_raid_mindev_error[i];
1809
1810                         if (ret)
1811                                 return ret;
1812                 }
1813         }
1814
1815         return 0;
1816 }
1817
1818 struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
1819                                         struct btrfs_device *device)
1820 {
1821         struct btrfs_device *next_device;
1822
1823         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1824                 if (next_device != device &&
1825                         !next_device->missing && next_device->bdev)
1826                         return next_device;
1827         }
1828
1829         return NULL;
1830 }
1831
1832 /*
1833  * Helper function to check if the given device is part of s_bdev / latest_bdev
1834  * and replace it with the provided or the next active device, in the context
1835  * where this function called, there should be always be another device (or
1836  * this_dev) which is active.
1837  */
1838 void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1839                 struct btrfs_device *device, struct btrfs_device *this_dev)
1840 {
1841         struct btrfs_device *next_device;
1842
1843         if (this_dev)
1844                 next_device = this_dev;
1845         else
1846                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1847                                                                 device);
1848         ASSERT(next_device);
1849
1850         if (fs_info->sb->s_bdev &&
1851                         (fs_info->sb->s_bdev == device->bdev))
1852                 fs_info->sb->s_bdev = next_device->bdev;
1853
1854         if (fs_info->fs_devices->latest_bdev == device->bdev)
1855                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1856 }
1857
1858 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1859                 u64 devid)
1860 {
1861         struct btrfs_device *device;
1862         struct btrfs_fs_devices *cur_devices;
1863         u64 num_devices;
1864         int ret = 0;
1865         bool clear_super = false;
1866
1867         mutex_lock(&uuid_mutex);
1868
1869         num_devices = fs_info->fs_devices->num_devices;
1870         btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
1871         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1872                 WARN_ON(num_devices < 1);
1873                 num_devices--;
1874         }
1875         btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
1876
1877         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1878         if (ret)
1879                 goto out;
1880
1881         ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1882                                            &device);
1883         if (ret)
1884                 goto out;
1885
1886         if (device->is_tgtdev_for_dev_replace) {
1887                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1888                 goto out;
1889         }
1890
1891         if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
1892                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1893                 goto out;
1894         }
1895
1896         if (device->writeable) {
1897                 mutex_lock(&fs_info->chunk_mutex);
1898                 list_del_init(&device->dev_alloc_list);
1899                 device->fs_devices->rw_devices--;
1900                 mutex_unlock(&fs_info->chunk_mutex);
1901                 clear_super = true;
1902         }
1903
1904         mutex_unlock(&uuid_mutex);
1905         ret = btrfs_shrink_device(device, 0);
1906         mutex_lock(&uuid_mutex);
1907         if (ret)
1908                 goto error_undo;
1909
1910         /*
1911          * TODO: the superblock still includes this device in its num_devices
1912          * counter although write_all_supers() is not locked out. This
1913          * could give a filesystem state which requires a degraded mount.
1914          */
1915         ret = btrfs_rm_dev_item(fs_info, device);
1916         if (ret)
1917                 goto error_undo;
1918
1919         device->in_fs_metadata = 0;
1920         btrfs_scrub_cancel_dev(fs_info, device);
1921
1922         /*
1923          * the device list mutex makes sure that we don't change
1924          * the device list while someone else is writing out all
1925          * the device supers. Whoever is writing all supers, should
1926          * lock the device list mutex before getting the number of
1927          * devices in the super block (super_copy). Conversely,
1928          * whoever updates the number of devices in the super block
1929          * (super_copy) should hold the device list mutex.
1930          */
1931
1932         cur_devices = device->fs_devices;
1933         mutex_lock(&fs_info->fs_devices->device_list_mutex);
1934         list_del_rcu(&device->dev_list);
1935
1936         device->fs_devices->num_devices--;
1937         device->fs_devices->total_devices--;
1938
1939         if (device->missing)
1940                 device->fs_devices->missing_devices--;
1941
1942         btrfs_assign_next_active_device(fs_info, device, NULL);
1943
1944         if (device->bdev) {
1945                 device->fs_devices->open_devices--;
1946                 /* remove sysfs entry */
1947                 btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
1948         }
1949
1950         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
1951         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
1952         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1953
1954         /*
1955          * at this point, the device is zero sized and detached from
1956          * the devices list.  All that's left is to zero out the old
1957          * supers and free the device.
1958          */
1959         if (device->writeable)
1960                 btrfs_scratch_superblocks(device->bdev, device->name->str);
1961
1962         btrfs_close_bdev(device);
1963         call_rcu(&device->rcu, free_device);
1964
1965         if (cur_devices->open_devices == 0) {
1966                 struct btrfs_fs_devices *fs_devices;
1967                 fs_devices = fs_info->fs_devices;
1968                 while (fs_devices) {
1969                         if (fs_devices->seed == cur_devices) {
1970                                 fs_devices->seed = cur_devices->seed;
1971                                 break;
1972                         }
1973                         fs_devices = fs_devices->seed;
1974                 }
1975                 cur_devices->seed = NULL;
1976                 __btrfs_close_devices(cur_devices);
1977                 free_fs_devices(cur_devices);
1978         }
1979
1980         fs_info->num_tolerated_disk_barrier_failures =
1981                 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
1982
1983 out:
1984         mutex_unlock(&uuid_mutex);
1985         return ret;
1986
1987 error_undo:
1988         if (device->writeable) {
1989                 mutex_lock(&fs_info->chunk_mutex);
1990                 list_add(&device->dev_alloc_list,
1991                          &fs_info->fs_devices->alloc_list);
1992                 device->fs_devices->rw_devices++;
1993                 mutex_unlock(&fs_info->chunk_mutex);
1994         }
1995         goto out;
1996 }
1997
1998 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1999                                         struct btrfs_device *srcdev)
2000 {
2001         struct btrfs_fs_devices *fs_devices;
2002
2003         WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
2004
2005         /*
2006          * in case of fs with no seed, srcdev->fs_devices will point
2007          * to fs_devices of fs_info. However when the dev being replaced is
2008          * a seed dev it will point to the seed's local fs_devices. In short
2009          * srcdev will have its correct fs_devices in both the cases.
2010          */
2011         fs_devices = srcdev->fs_devices;
2012
2013         list_del_rcu(&srcdev->dev_list);
2014         list_del_rcu(&srcdev->dev_alloc_list);
2015         fs_devices->num_devices--;
2016         if (srcdev->missing)
2017                 fs_devices->missing_devices--;
2018
2019         if (srcdev->writeable)
2020                 fs_devices->rw_devices--;
2021
2022         if (srcdev->bdev)
2023                 fs_devices->open_devices--;
2024 }
2025
2026 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2027                                       struct btrfs_device *srcdev)
2028 {
2029         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2030
2031         if (srcdev->writeable) {
2032                 /* zero out the old super if it is writable */
2033                 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2034         }
2035
2036         btrfs_close_bdev(srcdev);
2037
2038         call_rcu(&srcdev->rcu, free_device);
2039
2040         /*
2041          * unless fs_devices is seed fs, num_devices shouldn't go
2042          * zero
2043          */
2044         BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
2045
2046         /* if this is no devs we rather delete the fs_devices */
2047         if (!fs_devices->num_devices) {
2048                 struct btrfs_fs_devices *tmp_fs_devices;
2049
2050                 tmp_fs_devices = fs_info->fs_devices;
2051                 while (tmp_fs_devices) {
2052                         if (tmp_fs_devices->seed == fs_devices) {
2053                                 tmp_fs_devices->seed = fs_devices->seed;
2054                                 break;
2055                         }
2056                         tmp_fs_devices = tmp_fs_devices->seed;
2057                 }
2058                 fs_devices->seed = NULL;
2059                 __btrfs_close_devices(fs_devices);
2060                 free_fs_devices(fs_devices);
2061         }
2062 }
2063
2064 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2065                                       struct btrfs_device *tgtdev)
2066 {
2067         mutex_lock(&uuid_mutex);
2068         WARN_ON(!tgtdev);
2069         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2070
2071         btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2072
2073         if (tgtdev->bdev)
2074                 fs_info->fs_devices->open_devices--;
2075
2076         fs_info->fs_devices->num_devices--;
2077
2078         btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2079
2080         list_del_rcu(&tgtdev->dev_list);
2081
2082         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2083         mutex_unlock(&uuid_mutex);
2084
2085         /*
2086          * The update_dev_time() with in btrfs_scratch_superblocks()
2087          * may lead to a call to btrfs_show_devname() which will try
2088          * to hold device_list_mutex. And here this device
2089          * is already out of device list, so we don't have to hold
2090          * the device_list_mutex lock.
2091          */
2092         btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2093
2094         btrfs_close_bdev(tgtdev);
2095         call_rcu(&tgtdev->rcu, free_device);
2096 }
2097
2098 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2099                                      const char *device_path,
2100                                      struct btrfs_device **device)
2101 {
2102         int ret = 0;
2103         struct btrfs_super_block *disk_super;
2104         u64 devid;
2105         u8 *dev_uuid;
2106         struct block_device *bdev;
2107         struct buffer_head *bh;
2108
2109         *device = NULL;
2110         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2111                                     fs_info->bdev_holder, 0, &bdev, &bh);
2112         if (ret)
2113                 return ret;
2114         disk_super = (struct btrfs_super_block *)bh->b_data;
2115         devid = btrfs_stack_device_id(&disk_super->dev_item);
2116         dev_uuid = disk_super->dev_item.uuid;
2117         *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2118         brelse(bh);
2119         if (!*device)
2120                 ret = -ENOENT;
2121         blkdev_put(bdev, FMODE_READ);
2122         return ret;
2123 }
2124
2125 int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2126                                          const char *device_path,
2127                                          struct btrfs_device **device)
2128 {
2129         *device = NULL;
2130         if (strcmp(device_path, "missing") == 0) {
2131                 struct list_head *devices;
2132                 struct btrfs_device *tmp;
2133
2134                 devices = &fs_info->fs_devices->devices;
2135                 /*
2136                  * It is safe to read the devices since the volume_mutex
2137                  * is held by the caller.
2138                  */
2139                 list_for_each_entry(tmp, devices, dev_list) {
2140                         if (tmp->in_fs_metadata && !tmp->bdev) {
2141                                 *device = tmp;
2142                                 break;
2143                         }
2144                 }
2145
2146                 if (!*device)
2147                         return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2148
2149                 return 0;
2150         } else {
2151                 return btrfs_find_device_by_path(fs_info, device_path, device);
2152         }
2153 }
2154
2155 /*
2156  * Lookup a device given by device id, or the path if the id is 0.
2157  */
2158 int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2159                                  const char *devpath,
2160                                  struct btrfs_device **device)
2161 {
2162         int ret;
2163
2164         if (devid) {
2165                 ret = 0;
2166                 *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2167                 if (!*device)
2168                         ret = -ENOENT;
2169         } else {
2170                 if (!devpath || !devpath[0])
2171                         return -EINVAL;
2172
2173                 ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2174                                                            device);
2175         }
2176         return ret;
2177 }
2178
2179 /*
2180  * does all the dirty work required for changing file system's UUID.
2181  */
2182 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2183 {
2184         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2185         struct btrfs_fs_devices *old_devices;
2186         struct btrfs_fs_devices *seed_devices;
2187         struct btrfs_super_block *disk_super = fs_info->super_copy;
2188         struct btrfs_device *device;
2189         u64 super_flags;
2190
2191         BUG_ON(!mutex_is_locked(&uuid_mutex));
2192         if (!fs_devices->seeding)
2193                 return -EINVAL;
2194
2195         seed_devices = __alloc_fs_devices();
2196         if (IS_ERR(seed_devices))
2197                 return PTR_ERR(seed_devices);
2198
2199         old_devices = clone_fs_devices(fs_devices);
2200         if (IS_ERR(old_devices)) {
2201                 kfree(seed_devices);
2202                 return PTR_ERR(old_devices);
2203         }
2204
2205         list_add(&old_devices->list, &fs_uuids);
2206
2207         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2208         seed_devices->opened = 1;
2209         INIT_LIST_HEAD(&seed_devices->devices);
2210         INIT_LIST_HEAD(&seed_devices->alloc_list);
2211         mutex_init(&seed_devices->device_list_mutex);
2212
2213         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2214         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2215                               synchronize_rcu);
2216         list_for_each_entry(device, &seed_devices->devices, dev_list)
2217                 device->fs_devices = seed_devices;
2218
2219         mutex_lock(&fs_info->chunk_mutex);
2220         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2221         mutex_unlock(&fs_info->chunk_mutex);
2222
2223         fs_devices->seeding = 0;
2224         fs_devices->num_devices = 0;
2225         fs_devices->open_devices = 0;
2226         fs_devices->missing_devices = 0;
2227         fs_devices->rotating = 0;
2228         fs_devices->seed = seed_devices;
2229
2230         generate_random_uuid(fs_devices->fsid);
2231         memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2232         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2233         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2234
2235         super_flags = btrfs_super_flags(disk_super) &
2236                       ~BTRFS_SUPER_FLAG_SEEDING;
2237         btrfs_set_super_flags(disk_super, super_flags);
2238
2239         return 0;
2240 }
2241
2242 /*
2243  * Store the expected generation for seed devices in device items.
2244  */
2245 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2246                                struct btrfs_fs_info *fs_info)
2247 {
2248         struct btrfs_root *root = fs_info->chunk_root;
2249         struct btrfs_path *path;
2250         struct extent_buffer *leaf;
2251         struct btrfs_dev_item *dev_item;
2252         struct btrfs_device *device;
2253         struct btrfs_key key;
2254         u8 fs_uuid[BTRFS_UUID_SIZE];
2255         u8 dev_uuid[BTRFS_UUID_SIZE];
2256         u64 devid;
2257         int ret;
2258
2259         path = btrfs_alloc_path();
2260         if (!path)
2261                 return -ENOMEM;
2262
2263         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2264         key.offset = 0;
2265         key.type = BTRFS_DEV_ITEM_KEY;
2266
2267         while (1) {
2268                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2269                 if (ret < 0)
2270                         goto error;
2271
2272                 leaf = path->nodes[0];
2273 next_slot:
2274                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2275                         ret = btrfs_next_leaf(root, path);
2276                         if (ret > 0)
2277                                 break;
2278                         if (ret < 0)
2279                                 goto error;
2280                         leaf = path->nodes[0];
2281                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2282                         btrfs_release_path(path);
2283                         continue;
2284                 }
2285
2286                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2287                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2288                     key.type != BTRFS_DEV_ITEM_KEY)
2289                         break;
2290
2291                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2292                                           struct btrfs_dev_item);
2293                 devid = btrfs_device_id(leaf, dev_item);
2294                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2295                                    BTRFS_UUID_SIZE);
2296                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2297                                    BTRFS_UUID_SIZE);
2298                 device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2299                 BUG_ON(!device); /* Logic error */
2300
2301                 if (device->fs_devices->seeding) {
2302                         btrfs_set_device_generation(leaf, dev_item,
2303                                                     device->generation);
2304                         btrfs_mark_buffer_dirty(leaf);
2305                 }
2306
2307                 path->slots[0]++;
2308                 goto next_slot;
2309         }
2310         ret = 0;
2311 error:
2312         btrfs_free_path(path);
2313         return ret;
2314 }
2315
2316 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2317 {
2318         struct btrfs_root *root = fs_info->dev_root;
2319         struct request_queue *q;
2320         struct btrfs_trans_handle *trans;
2321         struct btrfs_device *device;
2322         struct block_device *bdev;
2323         struct list_head *devices;
2324         struct super_block *sb = fs_info->sb;
2325         struct rcu_string *name;
2326         u64 tmp;
2327         int seeding_dev = 0;
2328         int ret = 0;
2329
2330         if ((sb->s_flags & MS_RDONLY) && !fs_info->fs_devices->seeding)
2331                 return -EROFS;
2332
2333         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2334                                   fs_info->bdev_holder);
2335         if (IS_ERR(bdev))
2336                 return PTR_ERR(bdev);
2337
2338         if (fs_info->fs_devices->seeding) {
2339                 seeding_dev = 1;
2340                 down_write(&sb->s_umount);
2341                 mutex_lock(&uuid_mutex);
2342         }
2343
2344         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2345
2346         devices = &fs_info->fs_devices->devices;
2347
2348         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2349         list_for_each_entry(device, devices, dev_list) {
2350                 if (device->bdev == bdev) {
2351                         ret = -EEXIST;
2352                         mutex_unlock(
2353                                 &fs_info->fs_devices->device_list_mutex);
2354                         goto error;
2355                 }
2356         }
2357         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2358
2359         device = btrfs_alloc_device(fs_info, NULL, NULL);
2360         if (IS_ERR(device)) {
2361                 /* we can safely leave the fs_devices entry around */
2362                 ret = PTR_ERR(device);
2363                 goto error;
2364         }
2365
2366         name = rcu_string_strdup(device_path, GFP_KERNEL);
2367         if (!name) {
2368                 kfree(device);
2369                 ret = -ENOMEM;
2370                 goto error;
2371         }
2372         rcu_assign_pointer(device->name, name);
2373
2374         trans = btrfs_start_transaction(root, 0);
2375         if (IS_ERR(trans)) {
2376                 rcu_string_free(device->name);
2377                 kfree(device);
2378                 ret = PTR_ERR(trans);
2379                 goto error;
2380         }
2381
2382         q = bdev_get_queue(bdev);
2383         if (blk_queue_discard(q))
2384                 device->can_discard = 1;
2385         device->writeable = 1;
2386         device->generation = trans->transid;
2387         device->io_width = fs_info->sectorsize;
2388         device->io_align = fs_info->sectorsize;
2389         device->sector_size = fs_info->sectorsize;
2390         device->total_bytes = i_size_read(bdev->bd_inode);
2391         device->disk_total_bytes = device->total_bytes;
2392         device->commit_total_bytes = device->total_bytes;
2393         device->fs_info = fs_info;
2394         device->bdev = bdev;
2395         device->in_fs_metadata = 1;
2396         device->is_tgtdev_for_dev_replace = 0;
2397         device->mode = FMODE_EXCL;
2398         device->dev_stats_valid = 1;
2399         set_blocksize(device->bdev, 4096);
2400
2401         if (seeding_dev) {
2402                 sb->s_flags &= ~MS_RDONLY;
2403                 ret = btrfs_prepare_sprout(fs_info);
2404                 BUG_ON(ret); /* -ENOMEM */
2405         }
2406
2407         device->fs_devices = fs_info->fs_devices;
2408
2409         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2410         mutex_lock(&fs_info->chunk_mutex);
2411         list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2412         list_add(&device->dev_alloc_list,
2413                  &fs_info->fs_devices->alloc_list);
2414         fs_info->fs_devices->num_devices++;
2415         fs_info->fs_devices->open_devices++;
2416         fs_info->fs_devices->rw_devices++;
2417         fs_info->fs_devices->total_devices++;
2418         fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2419
2420         spin_lock(&fs_info->free_chunk_lock);
2421         fs_info->free_chunk_space += device->total_bytes;
2422         spin_unlock(&fs_info->free_chunk_lock);
2423
2424         if (!blk_queue_nonrot(q))
2425                 fs_info->fs_devices->rotating = 1;
2426
2427         tmp = btrfs_super_total_bytes(fs_info->super_copy);
2428         btrfs_set_super_total_bytes(fs_info->super_copy,
2429                                     tmp + device->total_bytes);
2430
2431         tmp = btrfs_super_num_devices(fs_info->super_copy);
2432         btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2433
2434         /* add sysfs device entry */
2435         btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2436
2437         /*
2438          * we've got more storage, clear any full flags on the space
2439          * infos
2440          */
2441         btrfs_clear_space_info_full(fs_info);
2442
2443         mutex_unlock(&fs_info->chunk_mutex);
2444         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2445
2446         if (seeding_dev) {
2447                 mutex_lock(&fs_info->chunk_mutex);
2448                 ret = init_first_rw_device(trans, fs_info);
2449                 mutex_unlock(&fs_info->chunk_mutex);
2450                 if (ret) {
2451                         btrfs_abort_transaction(trans, ret);
2452                         goto error_trans;
2453                 }
2454         }
2455
2456         ret = btrfs_add_device(trans, fs_info, device);
2457         if (ret) {
2458                 btrfs_abort_transaction(trans, ret);
2459                 goto error_trans;
2460         }
2461
2462         if (seeding_dev) {
2463                 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2464
2465                 ret = btrfs_finish_sprout(trans, fs_info);
2466                 if (ret) {
2467                         btrfs_abort_transaction(trans, ret);
2468                         goto error_trans;
2469                 }
2470
2471                 /* Sprouting would change fsid of the mounted root,
2472                  * so rename the fsid on the sysfs
2473                  */
2474                 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2475                                                 fs_info->fsid);
2476                 if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2477                         btrfs_warn(fs_info,
2478                                    "sysfs: failed to create fsid for sprout");
2479         }
2480
2481         fs_info->num_tolerated_disk_barrier_failures =
2482                 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2483         ret = btrfs_commit_transaction(trans);
2484
2485         if (seeding_dev) {
2486                 mutex_unlock(&uuid_mutex);
2487                 up_write(&sb->s_umount);
2488
2489                 if (ret) /* transaction commit */
2490                         return ret;
2491
2492                 ret = btrfs_relocate_sys_chunks(fs_info);
2493                 if (ret < 0)
2494                         btrfs_handle_fs_error(fs_info, ret,
2495                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2496                 trans = btrfs_attach_transaction(root);
2497                 if (IS_ERR(trans)) {
2498                         if (PTR_ERR(trans) == -ENOENT)
2499                                 return 0;
2500                         return PTR_ERR(trans);
2501                 }
2502                 ret = btrfs_commit_transaction(trans);
2503         }
2504
2505         /* Update ctime/mtime for libblkid */
2506         update_dev_time(device_path);
2507         return ret;
2508
2509 error_trans:
2510         btrfs_end_transaction(trans);
2511         rcu_string_free(device->name);
2512         btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2513         kfree(device);
2514 error:
2515         blkdev_put(bdev, FMODE_EXCL);
2516         if (seeding_dev) {
2517                 mutex_unlock(&uuid_mutex);
2518                 up_write(&sb->s_umount);
2519         }
2520         return ret;
2521 }
2522
2523 int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2524                                   const char *device_path,
2525                                   struct btrfs_device *srcdev,
2526                                   struct btrfs_device **device_out)
2527 {
2528         struct request_queue *q;
2529         struct btrfs_device *device;
2530         struct block_device *bdev;
2531         struct list_head *devices;
2532         struct rcu_string *name;
2533         u64 devid = BTRFS_DEV_REPLACE_DEVID;
2534         int ret = 0;
2535
2536         *device_out = NULL;
2537         if (fs_info->fs_devices->seeding) {
2538                 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2539                 return -EINVAL;
2540         }
2541
2542         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2543                                   fs_info->bdev_holder);
2544         if (IS_ERR(bdev)) {
2545                 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2546                 return PTR_ERR(bdev);
2547         }
2548
2549         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2550
2551         devices = &fs_info->fs_devices->devices;
2552         list_for_each_entry(device, devices, dev_list) {
2553                 if (device->bdev == bdev) {
2554                         btrfs_err(fs_info,
2555                                   "target device is in the filesystem!");
2556                         ret = -EEXIST;
2557                         goto error;
2558                 }
2559         }
2560
2561
2562         if (i_size_read(bdev->bd_inode) <
2563             btrfs_device_get_total_bytes(srcdev)) {
2564                 btrfs_err(fs_info,
2565                           "target device is smaller than source device!");
2566                 ret = -EINVAL;
2567                 goto error;
2568         }
2569
2570
2571         device = btrfs_alloc_device(NULL, &devid, NULL);
2572         if (IS_ERR(device)) {
2573                 ret = PTR_ERR(device);
2574                 goto error;
2575         }
2576
2577         name = rcu_string_strdup(device_path, GFP_NOFS);
2578         if (!name) {
2579                 kfree(device);
2580                 ret = -ENOMEM;
2581                 goto error;
2582         }
2583         rcu_assign_pointer(device->name, name);
2584
2585         q = bdev_get_queue(bdev);
2586         if (blk_queue_discard(q))
2587                 device->can_discard = 1;
2588         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2589         device->writeable = 1;
2590         device->generation = 0;
2591         device->io_width = fs_info->sectorsize;
2592         device->io_align = fs_info->sectorsize;
2593         device->sector_size = fs_info->sectorsize;
2594         device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2595         device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2596         device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2597         ASSERT(list_empty(&srcdev->resized_list));
2598         device->commit_total_bytes = srcdev->commit_total_bytes;
2599         device->commit_bytes_used = device->bytes_used;
2600         device->fs_info = fs_info;
2601         device->bdev = bdev;
2602         device->in_fs_metadata = 1;
2603         device->is_tgtdev_for_dev_replace = 1;
2604         device->mode = FMODE_EXCL;
2605         device->dev_stats_valid = 1;
2606         set_blocksize(device->bdev, 4096);
2607         device->fs_devices = fs_info->fs_devices;
2608         list_add(&device->dev_list, &fs_info->fs_devices->devices);
2609         fs_info->fs_devices->num_devices++;
2610         fs_info->fs_devices->open_devices++;
2611         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2612
2613         *device_out = device;
2614         return ret;
2615
2616 error:
2617         blkdev_put(bdev, FMODE_EXCL);
2618         return ret;
2619 }
2620
2621 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2622                                               struct btrfs_device *tgtdev)
2623 {
2624         u32 sectorsize = fs_info->sectorsize;
2625
2626         WARN_ON(fs_info->fs_devices->rw_devices == 0);
2627         tgtdev->io_width = sectorsize;
2628         tgtdev->io_align = sectorsize;
2629         tgtdev->sector_size = sectorsize;
2630         tgtdev->fs_info = fs_info;
2631         tgtdev->in_fs_metadata = 1;
2632 }
2633
2634 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2635                                         struct btrfs_device *device)
2636 {
2637         int ret;
2638         struct btrfs_path *path;
2639         struct btrfs_root *root = device->fs_info->chunk_root;
2640         struct btrfs_dev_item *dev_item;
2641         struct extent_buffer *leaf;
2642         struct btrfs_key key;
2643
2644         path = btrfs_alloc_path();
2645         if (!path)
2646                 return -ENOMEM;
2647
2648         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2649         key.type = BTRFS_DEV_ITEM_KEY;
2650         key.offset = device->devid;
2651
2652         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2653         if (ret < 0)
2654                 goto out;
2655
2656         if (ret > 0) {
2657                 ret = -ENOENT;
2658                 goto out;
2659         }
2660
2661         leaf = path->nodes[0];
2662         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2663
2664         btrfs_set_device_id(leaf, dev_item, device->devid);
2665         btrfs_set_device_type(leaf, dev_item, device->type);
2666         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2667         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2668         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2669         btrfs_set_device_total_bytes(leaf, dev_item,
2670                                      btrfs_device_get_disk_total_bytes(device));
2671         btrfs_set_device_bytes_used(leaf, dev_item,
2672                                     btrfs_device_get_bytes_used(device));
2673         btrfs_mark_buffer_dirty(leaf);
2674
2675 out:
2676         btrfs_free_path(path);
2677         return ret;
2678 }
2679
2680 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2681                       struct btrfs_device *device, u64 new_size)
2682 {
2683         struct btrfs_fs_info *fs_info = device->fs_info;
2684         struct btrfs_super_block *super_copy = fs_info->super_copy;
2685         struct btrfs_fs_devices *fs_devices;
2686         u64 old_total;
2687         u64 diff;
2688
2689         if (!device->writeable)
2690                 return -EACCES;
2691
2692         mutex_lock(&fs_info->chunk_mutex);
2693         old_total = btrfs_super_total_bytes(super_copy);
2694         diff = new_size - device->total_bytes;
2695
2696         if (new_size <= device->total_bytes ||
2697             device->is_tgtdev_for_dev_replace) {
2698                 mutex_unlock(&fs_info->chunk_mutex);
2699                 return -EINVAL;
2700         }
2701
2702         fs_devices = fs_info->fs_devices;
2703
2704         btrfs_set_super_total_bytes(super_copy, old_total + diff);
2705         device->fs_devices->total_rw_bytes += diff;
2706
2707         btrfs_device_set_total_bytes(device, new_size);
2708         btrfs_device_set_disk_total_bytes(device, new_size);
2709         btrfs_clear_space_info_full(device->fs_info);
2710         if (list_empty(&device->resized_list))
2711                 list_add_tail(&device->resized_list,
2712                               &fs_devices->resized_devices);
2713         mutex_unlock(&fs_info->chunk_mutex);
2714
2715         return btrfs_update_device(trans, device);
2716 }
2717
2718 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2719                             struct btrfs_fs_info *fs_info, u64 chunk_objectid,
2720                             u64 chunk_offset)
2721 {
2722         struct btrfs_root *root = fs_info->chunk_root;
2723         int ret;
2724         struct btrfs_path *path;
2725         struct btrfs_key key;
2726
2727         path = btrfs_alloc_path();
2728         if (!path)
2729                 return -ENOMEM;
2730
2731         key.objectid = chunk_objectid;
2732         key.offset = chunk_offset;
2733         key.type = BTRFS_CHUNK_ITEM_KEY;
2734
2735         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2736         if (ret < 0)
2737                 goto out;
2738         else if (ret > 0) { /* Logic error or corruption */
2739                 btrfs_handle_fs_error(fs_info, -ENOENT,
2740                                       "Failed lookup while freeing chunk.");
2741                 ret = -ENOENT;
2742                 goto out;
2743         }
2744
2745         ret = btrfs_del_item(trans, root, path);
2746         if (ret < 0)
2747                 btrfs_handle_fs_error(fs_info, ret,
2748                                       "Failed to delete chunk item.");
2749 out:
2750         btrfs_free_path(path);
2751         return ret;
2752 }
2753
2754 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
2755                                u64 chunk_objectid, u64 chunk_offset)
2756 {
2757         struct btrfs_super_block *super_copy = fs_info->super_copy;
2758         struct btrfs_disk_key *disk_key;
2759         struct btrfs_chunk *chunk;
2760         u8 *ptr;
2761         int ret = 0;
2762         u32 num_stripes;
2763         u32 array_size;
2764         u32 len = 0;
2765         u32 cur;
2766         struct btrfs_key key;
2767
2768         mutex_lock(&fs_info->chunk_mutex);
2769         array_size = btrfs_super_sys_array_size(super_copy);
2770
2771         ptr = super_copy->sys_chunk_array;
2772         cur = 0;
2773
2774         while (cur < array_size) {
2775                 disk_key = (struct btrfs_disk_key *)ptr;
2776                 btrfs_disk_key_to_cpu(&key, disk_key);
2777
2778                 len = sizeof(*disk_key);
2779
2780                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2781                         chunk = (struct btrfs_chunk *)(ptr + len);
2782                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2783                         len += btrfs_chunk_item_size(num_stripes);
2784                 } else {
2785                         ret = -EIO;
2786                         break;
2787                 }
2788                 if (key.objectid == chunk_objectid &&
2789                     key.offset == chunk_offset) {
2790                         memmove(ptr, ptr + len, array_size - (cur + len));
2791                         array_size -= len;
2792                         btrfs_set_super_sys_array_size(super_copy, array_size);
2793                 } else {
2794                         ptr += len;
2795                         cur += len;
2796                 }
2797         }
2798         mutex_unlock(&fs_info->chunk_mutex);
2799         return ret;
2800 }
2801
2802 static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2803                                         u64 logical, u64 length)
2804 {
2805         struct extent_map_tree *em_tree;
2806         struct extent_map *em;
2807
2808         em_tree = &fs_info->mapping_tree.map_tree;
2809         read_lock(&em_tree->lock);
2810         em = lookup_extent_mapping(em_tree, logical, length);
2811         read_unlock(&em_tree->lock);
2812
2813         if (!em) {
2814                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2815                            logical, length);
2816                 return ERR_PTR(-EINVAL);
2817         }
2818
2819         if (em->start > logical || em->start + em->len < logical) {
2820                 btrfs_crit(fs_info,
2821                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2822                            logical, length, em->start, em->start + em->len);
2823                 free_extent_map(em);
2824                 return ERR_PTR(-EINVAL);
2825         }
2826
2827         /* callers are responsible for dropping em's ref. */
2828         return em;
2829 }
2830
2831 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2832                        struct btrfs_fs_info *fs_info, u64 chunk_offset)
2833 {
2834         struct extent_map *em;
2835         struct map_lookup *map;
2836         u64 dev_extent_len = 0;
2837         u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2838         int i, ret = 0;
2839         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2840
2841         em = get_chunk_map(fs_info, chunk_offset, 1);
2842         if (IS_ERR(em)) {
2843                 /*
2844                  * This is a logic error, but we don't want to just rely on the
2845                  * user having built with ASSERT enabled, so if ASSERT doesn't
2846                  * do anything we still error out.
2847                  */
2848                 ASSERT(0);
2849                 return PTR_ERR(em);
2850         }
2851         map = em->map_lookup;
2852         mutex_lock(&fs_info->chunk_mutex);
2853         check_system_chunk(trans, fs_info, map->type);
2854         mutex_unlock(&fs_info->chunk_mutex);
2855
2856         /*
2857          * Take the device list mutex to prevent races with the final phase of
2858          * a device replace operation that replaces the device object associated
2859          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2860          */
2861         mutex_lock(&fs_devices->device_list_mutex);
2862         for (i = 0; i < map->num_stripes; i++) {
2863                 struct btrfs_device *device = map->stripes[i].dev;
2864                 ret = btrfs_free_dev_extent(trans, device,
2865                                             map->stripes[i].physical,
2866                                             &dev_extent_len);
2867                 if (ret) {
2868                         mutex_unlock(&fs_devices->device_list_mutex);
2869                         btrfs_abort_transaction(trans, ret);
2870                         goto out;
2871                 }
2872
2873                 if (device->bytes_used > 0) {
2874                         mutex_lock(&fs_info->chunk_mutex);
2875                         btrfs_device_set_bytes_used(device,
2876                                         device->bytes_used - dev_extent_len);
2877                         spin_lock(&fs_info->free_chunk_lock);
2878                         fs_info->free_chunk_space += dev_extent_len;
2879                         spin_unlock(&fs_info->free_chunk_lock);
2880                         btrfs_clear_space_info_full(fs_info);
2881                         mutex_unlock(&fs_info->chunk_mutex);
2882                 }
2883
2884                 if (map->stripes[i].dev) {
2885                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2886                         if (ret) {
2887                                 mutex_unlock(&fs_devices->device_list_mutex);
2888                                 btrfs_abort_transaction(trans, ret);
2889                                 goto out;
2890                         }
2891                 }
2892         }
2893         mutex_unlock(&fs_devices->device_list_mutex);
2894
2895         ret = btrfs_free_chunk(trans, fs_info, chunk_objectid, chunk_offset);
2896         if (ret) {
2897                 btrfs_abort_transaction(trans, ret);
2898                 goto out;
2899         }
2900
2901         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2902
2903         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2904                 ret = btrfs_del_sys_chunk(fs_info, chunk_objectid,
2905                                           chunk_offset);
2906                 if (ret) {
2907                         btrfs_abort_transaction(trans, ret);
2908                         goto out;
2909                 }
2910         }
2911
2912         ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
2913         if (ret) {
2914                 btrfs_abort_transaction(trans, ret);
2915                 goto out;
2916         }
2917
2918 out:
2919         /* once for us */
2920         free_extent_map(em);
2921         return ret;
2922 }
2923
2924 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2925 {
2926         struct btrfs_root *root = fs_info->chunk_root;
2927         struct btrfs_trans_handle *trans;
2928         int ret;
2929
2930         /*
2931          * Prevent races with automatic removal of unused block groups.
2932          * After we relocate and before we remove the chunk with offset
2933          * chunk_offset, automatic removal of the block group can kick in,
2934          * resulting in a failure when calling btrfs_remove_chunk() below.
2935          *
2936          * Make sure to acquire this mutex before doing a tree search (dev
2937          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2938          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2939          * we release the path used to search the chunk/dev tree and before
2940          * the current task acquires this mutex and calls us.
2941          */
2942         ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
2943
2944         ret = btrfs_can_relocate(fs_info, chunk_offset);
2945         if (ret)
2946                 return -ENOSPC;
2947
2948         /* step one, relocate all the extents inside this chunk */
2949         btrfs_scrub_pause(fs_info);
2950         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2951         btrfs_scrub_continue(fs_info);
2952         if (ret)
2953                 return ret;
2954
2955         trans = btrfs_start_trans_remove_block_group(root->fs_info,
2956                                                      chunk_offset);
2957         if (IS_ERR(trans)) {
2958                 ret = PTR_ERR(trans);
2959                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2960                 return ret;
2961         }
2962
2963         /*
2964          * step two, delete the device extents and the
2965          * chunk tree entries
2966          */
2967         ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
2968         btrfs_end_transaction(trans);
2969         return ret;
2970 }
2971
2972 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2973 {
2974         struct btrfs_root *chunk_root = fs_info->chunk_root;
2975         struct btrfs_path *path;
2976         struct extent_buffer *leaf;
2977         struct btrfs_chunk *chunk;
2978         struct btrfs_key key;
2979         struct btrfs_key found_key;
2980         u64 chunk_type;
2981         bool retried = false;
2982         int failed = 0;
2983         int ret;
2984
2985         path = btrfs_alloc_path();
2986         if (!path)
2987                 return -ENOMEM;
2988
2989 again:
2990         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2991         key.offset = (u64)-1;
2992         key.type = BTRFS_CHUNK_ITEM_KEY;
2993
2994         while (1) {
2995                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
2996                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2997                 if (ret < 0) {
2998                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2999                         goto error;
3000                 }
3001                 BUG_ON(ret == 0); /* Corruption */
3002
3003                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3004                                           key.type);
3005                 if (ret)
3006                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3007                 if (ret < 0)
3008                         goto error;
3009                 if (ret > 0)
3010                         break;
3011
3012                 leaf = path->nodes[0];
3013                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3014
3015                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3016                                        struct btrfs_chunk);
3017                 chunk_type = btrfs_chunk_type(leaf, chunk);
3018                 btrfs_release_path(path);
3019
3020                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3021                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3022                         if (ret == -ENOSPC)
3023                                 failed++;
3024                         else
3025                                 BUG_ON(ret);
3026                 }
3027                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3028
3029                 if (found_key.offset == 0)
3030                         break;
3031                 key.offset = found_key.offset - 1;
3032         }
3033         ret = 0;
3034         if (failed && !retried) {
3035                 failed = 0;
3036                 retried = true;
3037                 goto again;
3038         } else if (WARN_ON(failed && retried)) {
3039                 ret = -ENOSPC;
3040         }
3041 error:
3042         btrfs_free_path(path);
3043         return ret;
3044 }
3045
3046 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3047                                struct btrfs_balance_control *bctl)
3048 {
3049         struct btrfs_root *root = fs_info->tree_root;
3050         struct btrfs_trans_handle *trans;
3051         struct btrfs_balance_item *item;
3052         struct btrfs_disk_balance_args disk_bargs;
3053         struct btrfs_path *path;
3054         struct extent_buffer *leaf;
3055         struct btrfs_key key;
3056         int ret, err;
3057
3058         path = btrfs_alloc_path();
3059         if (!path)
3060                 return -ENOMEM;
3061
3062         trans = btrfs_start_transaction(root, 0);
3063         if (IS_ERR(trans)) {
3064                 btrfs_free_path(path);
3065                 return PTR_ERR(trans);
3066         }
3067
3068         key.objectid = BTRFS_BALANCE_OBJECTID;
3069         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3070         key.offset = 0;
3071
3072         ret = btrfs_insert_empty_item(trans, root, path, &key,
3073                                       sizeof(*item));
3074         if (ret)
3075                 goto out;
3076
3077         leaf = path->nodes[0];
3078         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3079
3080         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3081
3082         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3083         btrfs_set_balance_data(leaf, item, &disk_bargs);
3084         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3085         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3086         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3087         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3088
3089         btrfs_set_balance_flags(leaf, item, bctl->flags);
3090
3091         btrfs_mark_buffer_dirty(leaf);
3092 out:
3093         btrfs_free_path(path);
3094         err = btrfs_commit_transaction(trans);
3095         if (err && !ret)
3096                 ret = err;
3097         return ret;
3098 }
3099
3100 static int del_balance_item(struct btrfs_fs_info *fs_info)
3101 {
3102         struct btrfs_root *root = fs_info->tree_root;
3103         struct btrfs_trans_handle *trans;
3104         struct btrfs_path *path;
3105         struct btrfs_key key;
3106         int ret, err;
3107
3108         path = btrfs_alloc_path();
3109         if (!path)
3110                 return -ENOMEM;
3111
3112         trans = btrfs_start_transaction(root, 0);
3113         if (IS_ERR(trans)) {
3114                 btrfs_free_path(path);
3115                 return PTR_ERR(trans);
3116         }
3117
3118         key.objectid = BTRFS_BALANCE_OBJECTID;
3119         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3120         key.offset = 0;
3121
3122         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3123         if (ret < 0)
3124                 goto out;
3125         if (ret > 0) {
3126                 ret = -ENOENT;
3127                 goto out;
3128         }
3129
3130         ret = btrfs_del_item(trans, root, path);
3131 out:
3132         btrfs_free_path(path);
3133         err = btrfs_commit_transaction(trans);
3134         if (err && !ret)
3135                 ret = err;
3136         return ret;
3137 }
3138
3139 /*
3140  * This is a heuristic used to reduce the number of chunks balanced on
3141  * resume after balance was interrupted.
3142  */
3143 static void update_balance_args(struct btrfs_balance_control *bctl)
3144 {
3145         /*
3146          * Turn on soft mode for chunk types that were being converted.
3147          */
3148         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3149                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3150         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3151                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3152         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3153                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3154
3155         /*
3156          * Turn on usage filter if is not already used.  The idea is
3157          * that chunks that we have already balanced should be
3158          * reasonably full.  Don't do it for chunks that are being
3159          * converted - that will keep us from relocating unconverted
3160          * (albeit full) chunks.
3161          */
3162         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3163             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3164             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3165                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3166                 bctl->data.usage = 90;
3167         }
3168         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3169             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3170             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3171                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3172                 bctl->sys.usage = 90;
3173         }
3174         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3175             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3176             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3177                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3178                 bctl->meta.usage = 90;
3179         }
3180 }
3181
3182 /*
3183  * Should be called with both balance and volume mutexes held to
3184  * serialize other volume operations (add_dev/rm_dev/resize) with
3185  * restriper.  Same goes for unset_balance_control.
3186  */
3187 static void set_balance_control(struct btrfs_balance_control *bctl)
3188 {
3189         struct btrfs_fs_info *fs_info = bctl->fs_info;
3190
3191         BUG_ON(fs_info->balance_ctl);
3192
3193         spin_lock(&fs_info->balance_lock);
3194         fs_info->balance_ctl = bctl;
3195         spin_unlock(&fs_info->balance_lock);
3196 }
3197
3198 static void unset_balance_control(struct btrfs_fs_info *fs_info)
3199 {
3200         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3201
3202         BUG_ON(!fs_info->balance_ctl);
3203
3204         spin_lock(&fs_info->balance_lock);
3205         fs_info->balance_ctl = NULL;
3206         spin_unlock(&fs_info->balance_lock);
3207
3208         kfree(bctl);
3209 }
3210
3211 /*
3212  * Balance filters.  Return 1 if chunk should be filtered out
3213  * (should not be balanced).
3214  */
3215 static int chunk_profiles_filter(u64 chunk_type,
3216                                  struct btrfs_balance_args *bargs)
3217 {
3218         chunk_type = chunk_to_extended(chunk_type) &
3219                                 BTRFS_EXTENDED_PROFILE_MASK;
3220
3221         if (bargs->profiles & chunk_type)
3222                 return 0;
3223
3224         return 1;
3225 }
3226
3227 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3228                               struct btrfs_balance_args *bargs)
3229 {
3230         struct btrfs_block_group_cache *cache;
3231         u64 chunk_used;
3232         u64 user_thresh_min;
3233         u64 user_thresh_max;
3234         int ret = 1;
3235
3236         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3237         chunk_used = btrfs_block_group_used(&cache->item);
3238
3239         if (bargs->usage_min == 0)
3240                 user_thresh_min = 0;
3241         else
3242                 user_thresh_min = div_factor_fine(cache->key.offset,
3243                                         bargs->usage_min);
3244
3245         if (bargs->usage_max == 0)
3246                 user_thresh_max = 1;
3247         else if (bargs->usage_max > 100)
3248                 user_thresh_max = cache->key.offset;
3249         else
3250                 user_thresh_max = div_factor_fine(cache->key.offset,
3251                                         bargs->usage_max);
3252
3253         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3254                 ret = 0;
3255
3256         btrfs_put_block_group(cache);
3257         return ret;
3258 }
3259
3260 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3261                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3262 {
3263         struct btrfs_block_group_cache *cache;
3264         u64 chunk_used, user_thresh;
3265         int ret = 1;
3266
3267         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3268         chunk_used = btrfs_block_group_used(&cache->item);
3269
3270         if (bargs->usage_min == 0)
3271                 user_thresh = 1;
3272         else if (bargs->usage > 100)
3273                 user_thresh = cache->key.offset;
3274         else
3275                 user_thresh = div_factor_fine(cache->key.offset,
3276                                               bargs->usage);
3277
3278         if (chunk_used < user_thresh)
3279                 ret = 0;
3280
3281         btrfs_put_block_group(cache);
3282         return ret;
3283 }
3284
3285 static int chunk_devid_filter(struct extent_buffer *leaf,
3286                               struct btrfs_chunk *chunk,
3287                               struct btrfs_balance_args *bargs)
3288 {
3289         struct btrfs_stripe *stripe;
3290         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3291         int i;
3292
3293         for (i = 0; i < num_stripes; i++) {
3294                 stripe = btrfs_stripe_nr(chunk, i);
3295                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3296                         return 0;
3297         }
3298
3299         return 1;
3300 }
3301
3302 /* [pstart, pend) */
3303 static int chunk_drange_filter(struct extent_buffer *leaf,
3304                                struct btrfs_chunk *chunk,
3305                                u64 chunk_offset,
3306                                struct btrfs_balance_args *bargs)
3307 {
3308         struct btrfs_stripe *stripe;
3309         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3310         u64 stripe_offset;
3311         u64 stripe_length;
3312         int factor;
3313         int i;
3314
3315         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3316                 return 0;
3317
3318         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3319              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3320                 factor = num_stripes / 2;
3321         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3322                 factor = num_stripes - 1;
3323         } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3324                 factor = num_stripes - 2;
3325         } else {
3326                 factor = num_stripes;
3327         }
3328
3329         for (i = 0; i < num_stripes; i++) {
3330                 stripe = btrfs_stripe_nr(chunk, i);
3331                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3332                         continue;
3333
3334                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3335                 stripe_length = btrfs_chunk_length(leaf, chunk);
3336                 stripe_length = div_u64(stripe_length, factor);
3337
3338                 if (stripe_offset < bargs->pend &&
3339                     stripe_offset + stripe_length > bargs->pstart)
3340                         return 0;
3341         }
3342
3343         return 1;
3344 }
3345
3346 /* [vstart, vend) */
3347 static int chunk_vrange_filter(struct extent_buffer *leaf,
3348                                struct btrfs_chunk *chunk,
3349                                u64 chunk_offset,
3350                                struct btrfs_balance_args *bargs)
3351 {
3352         if (chunk_offset < bargs->vend &&
3353             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3354                 /* at least part of the chunk is inside this vrange */
3355                 return 0;
3356
3357         return 1;
3358 }
3359
3360 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3361                                struct btrfs_chunk *chunk,
3362                                struct btrfs_balance_args *bargs)
3363 {
3364         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3365
3366         if (bargs->stripes_min <= num_stripes
3367                         && num_stripes <= bargs->stripes_max)
3368                 return 0;
3369
3370         return 1;
3371 }
3372
3373 static int chunk_soft_convert_filter(u64 chunk_type,
3374                                      struct btrfs_balance_args *bargs)
3375 {
3376         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3377                 return 0;
3378
3379         chunk_type = chunk_to_extended(chunk_type) &
3380                                 BTRFS_EXTENDED_PROFILE_MASK;
3381
3382         if (bargs->target == chunk_type)
3383                 return 1;
3384
3385         return 0;
3386 }
3387
3388 static int should_balance_chunk(struct btrfs_fs_info *fs_info,
3389                                 struct extent_buffer *leaf,
3390                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3391 {
3392         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3393         struct btrfs_balance_args *bargs = NULL;
3394         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3395
3396         /* type filter */
3397         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3398               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3399                 return 0;
3400         }
3401
3402         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3403                 bargs = &bctl->data;
3404         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3405                 bargs = &bctl->sys;
3406         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3407                 bargs = &bctl->meta;
3408
3409         /* profiles filter */
3410         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3411             chunk_profiles_filter(chunk_type, bargs)) {
3412                 return 0;
3413         }
3414
3415         /* usage filter */
3416         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3417             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3418                 return 0;
3419         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3420             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3421                 return 0;
3422         }
3423
3424         /* devid filter */
3425         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3426             chunk_devid_filter(leaf, chunk, bargs)) {
3427                 return 0;
3428         }
3429
3430         /* drange filter, makes sense only with devid filter */
3431         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3432             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
3433                 return 0;
3434         }
3435
3436         /* vrange filter */
3437         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3438             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3439                 return 0;
3440         }
3441
3442         /* stripes filter */
3443         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3444             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3445                 return 0;
3446         }
3447
3448         /* soft profile changing mode */
3449         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3450             chunk_soft_convert_filter(chunk_type, bargs)) {
3451                 return 0;
3452         }
3453
3454         /*
3455          * limited by count, must be the last filter
3456          */
3457         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3458                 if (bargs->limit == 0)
3459                         return 0;
3460                 else
3461                         bargs->limit--;
3462         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3463                 /*
3464                  * Same logic as the 'limit' filter; the minimum cannot be
3465                  * determined here because we do not have the global information
3466                  * about the count of all chunks that satisfy the filters.
3467                  */
3468                 if (bargs->limit_max == 0)
3469                         return 0;
3470                 else
3471                         bargs->limit_max--;
3472         }
3473
3474         return 1;
3475 }
3476
/*
 * Core balance loop for the balance control installed in @fs_info.
 *
 * Step one walks all devices and, for writeable devices that are not a
 * dev-replace target and have too little unallocated space, shrinks the
 * device by a small amount and grows it back to its old size.
 *
 * Step two walks the chunk tree from the highest chunk offset downwards,
 * twice.  The first ("counting") pass only counts the chunks accepted by
 * should_balance_chunk() (stat.expected and the per-type counters); the
 * second pass relocates the accepted chunks.
 *
 * Returns 0 on success or a negative errno.  -ECANCELED is returned when a
 * cancel request (or, outside the counting pass, a pause request) is seen.
 * -ENOSPC is returned when at least one relocation failed with ENOSPC and
 * no other error occurred.
 */
3477 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3478 {
3479         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3480         struct btrfs_root *chunk_root = fs_info->chunk_root;
3481         struct btrfs_root *dev_root = fs_info->dev_root;
3482         struct list_head *devices;
3483         struct btrfs_device *device;
3484         u64 old_size;
3485         u64 size_to_free;
3486         u64 chunk_type;
3487         struct btrfs_chunk *chunk;
3488         struct btrfs_path *path = NULL;
3489         struct btrfs_key key;
3490         struct btrfs_key found_key;
3491         struct btrfs_trans_handle *trans;
3492         struct extent_buffer *leaf;
3493         int slot;
3494         int ret;
3495         int enospc_errors = 0;
3496         bool counting = true;
3497         /*
             * The single value limit and min/max limits use the same bytes in
             * the balance args, so remembering 'limit' here remembers both.
             */
3498         u64 limit_data = bctl->data.limit;
3499         u64 limit_meta = bctl->meta.limit;
3500         u64 limit_sys = bctl->sys.limit;
3501         u32 count_data = 0;
3502         u32 count_meta = 0;
3503         u32 count_sys = 0;
3504         int chunk_reserved = 0;
3505         u64 bytes_used = 0;
3506
3507         /* step one make some room on all the devices */
3508         devices = &fs_info->fs_devices->devices;
3509         list_for_each_entry(device, devices, dev_list) {
3510                 old_size = btrfs_device_get_total_bytes(device);
3511                 size_to_free = div_factor(old_size, 1);
3512                 size_to_free = min_t(u64, size_to_free, SZ_1M);
                     /*
                      * Skip devices that are read-only, already have more than
                      * size_to_free bytes unused, or are a dev-replace target.
                      */
3513                 if (!device->writeable ||
3514                     btrfs_device_get_total_bytes(device) -
3515                     btrfs_device_get_bytes_used(device) > size_to_free ||
3516                     device->is_tgtdev_for_dev_replace)
3517                         continue;
3518
3519                 ret = btrfs_shrink_device(device, old_size - size_to_free);
                     /* ENOSPC here is not fatal: stop step one and go on */
3520                 if (ret == -ENOSPC)
3521                         break;
3522                 if (ret) {
3523                         /* btrfs_shrink_device never returns ret > 0 */
3524                         WARN_ON(ret > 0);
3525                         goto error;
3526                 }
3527
3528                 trans = btrfs_start_transaction(dev_root, 0);
3529                 if (IS_ERR(trans)) {
3530                         ret = PTR_ERR(trans);
3531                         btrfs_info_in_rcu(fs_info,
3532                  "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3533                                           rcu_str_deref(device->name), ret,
3534                                           old_size, old_size - size_to_free);
3535                         goto error;
3536                 }
3537
3538                 ret = btrfs_grow_device(trans, device, old_size);
3539                 if (ret) {
3540                         btrfs_end_transaction(trans);
3541                         /* btrfs_grow_device never returns ret > 0 */
3542                         WARN_ON(ret > 0);
3543                         btrfs_info_in_rcu(fs_info,
3544                  "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3545                                           rcu_str_deref(device->name), ret,
3546                                           old_size, old_size - size_to_free);
3547                         goto error;
3548                 }
3549
3550                 btrfs_end_transaction(trans);
3551         }
3552
3553         /* step two, relocate all the chunks */
3554         path = btrfs_alloc_path();
3555         if (!path) {
3556                 ret = -ENOMEM;
3557                 goto error;
3558         }
3559
3560         /* zero out stat counters */
3561         spin_lock(&fs_info->balance_lock);
3562         memset(&bctl->stat, 0, sizeof(bctl->stat));
3563         spin_unlock(&fs_info->balance_lock);
3564 again:
3565         if (!counting) {
3566                 /*
3567                  * The single value limit and min/max limits use the same bytes
3568                  * in the balance args.  The counting pass consumed them in
                      * should_balance_chunk(), so restore the saved values
                      * before the relocation pass.
3569                  */
3570                 bctl->data.limit = limit_data;
3571                 bctl->meta.limit = limit_meta;
3572                 bctl->sys.limit = limit_sys;
3573         }
             /* start from the highest possible chunk offset and walk down */
3574         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3575         key.offset = (u64)-1;
3576         key.type = BTRFS_CHUNK_ITEM_KEY;
3577
3578         while (1) {
                     /* a pause request is ignored while only counting */
3579                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3580                     atomic_read(&fs_info->balance_cancel_req)) {
3581                         ret = -ECANCELED;
3582                         goto error;
3583                 }
3584
                     /*
                      * delete_unused_bgs_mutex is held from the search until
                      * the chunk is skipped, counted or relocated; every exit
                      * below unlocks it.
                      */
3585                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3586                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3587                 if (ret < 0) {
3588                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3589                         goto error;
3590                 }
3591
3592                 /*
3593                  * this shouldn't happen, it means the last relocate
3594                  * failed
3595                  */
3596                 if (ret == 0)
3597                         BUG(); /* FIXME break ? */
3598
3599                 ret = btrfs_previous_item(chunk_root, path, 0,
3600                                           BTRFS_CHUNK_ITEM_KEY);
                     /* no previous chunk item: this pass is done */
3601                 if (ret) {
3602                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3603                         ret = 0;
3604                         break;
3605                 }
3606
3607                 leaf = path->nodes[0];
3608                 slot = path->slots[0];
3609                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3610
3611                 if (found_key.objectid != key.objectid) {
3612                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3613                         break;
3614                 }
3615
3616                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3617                 chunk_type = btrfs_chunk_type(leaf, chunk);
3618
3619                 if (!counting) {
3620                         spin_lock(&fs_info->balance_lock);
3621                         bctl->stat.considered++;
3622                         spin_unlock(&fs_info->balance_lock);
3623                 }
3624
3625                 ret = should_balance_chunk(fs_info, leaf, chunk,
3626                                            found_key.offset);
3627
                     /* leaf and chunk must not be used past this point */
3628                 btrfs_release_path(path);
3629                 if (!ret) {
3630                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3631                         goto loop;
3632                 }
3633
3634                 if (counting) {
3635                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3636                         spin_lock(&fs_info->balance_lock);
3637                         bctl->stat.expected++;
3638                         spin_unlock(&fs_info->balance_lock);
3639
3640                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3641                                 count_data++;
3642                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3643                                 count_sys++;
3644                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3645                                 count_meta++;
3646
3647                         goto loop;
3648                 }
3649
3650                 /*
3651                  * Apply limit_min filter, no need to check if the LIMITS
3652                  * filter is used, limit_min is 0 by default
3653                  */
3654                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3655                                         count_data < bctl->data.limit_min)
3656                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3657                                         count_meta < bctl->meta.limit_min)
3658                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3659                                         count_sys < bctl->sys.limit_min)) {
3660                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3661                         goto loop;
3662                 }
3663
3664                 ASSERT(fs_info->data_sinfo);
3665                 spin_lock(&fs_info->data_sinfo->lock);
3666                 bytes_used = fs_info->data_sinfo->bytes_used;
3667                 spin_unlock(&fs_info->data_sinfo->lock);
3668
                     /*
                      * About to relocate a data chunk while the data space
                      * info reports no used bytes: force allocation of a data
                      * chunk first, at most once per call (chunk_reserved).
                      */
3669                 if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3670                     !chunk_reserved && !bytes_used) {
3671                         trans = btrfs_start_transaction(chunk_root, 0);
3672                         if (IS_ERR(trans)) {
3673                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3674                                 ret = PTR_ERR(trans);
3675                                 goto error;
3676                         }
3677
3678                         ret = btrfs_force_chunk_alloc(trans, fs_info,
3679                                                       BTRFS_BLOCK_GROUP_DATA);
3680                         btrfs_end_transaction(trans);
3681                         if (ret < 0) {
3682                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3683                                 goto error;
3684                         }
3685                         chunk_reserved = 1;
3686                 }
3687
3688                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3689                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
                     /* ENOSPC is counted and the walk continues; others abort */
3690                 if (ret && ret != -ENOSPC)
3691                         goto error;
3692                 if (ret == -ENOSPC) {
3693                         enospc_errors++;
3694                 } else {
3695                         spin_lock(&fs_info->balance_lock);
3696                         bctl->stat.completed++;
3697                         spin_unlock(&fs_info->balance_lock);
3698                 }
3699 loop:
                     /* continue just below the chunk that was handled */
3700                 if (found_key.offset == 0)
3701                         break;
3702                 key.offset = found_key.offset - 1;
3703         }
3704
             /* counting pass finished: run the relocation pass */
3705         if (counting) {
3706                 btrfs_release_path(path);
3707                 counting = false;
3708                 goto again;
3709         }
3710 error:
             /* also reached on success; path may still be NULL here */
3711         btrfs_free_path(path);
3712         if (enospc_errors) {
3713                 btrfs_info(fs_info, "%d enospc errors during balance",
3714                            enospc_errors);
3715                 if (!ret)
3716                         ret = -ENOSPC;
3717         }
3718
3719         return ret;
3720 }
3721
3722 /**
3723  * alloc_profile_is_valid - see if a given profile is valid and reduced
3724  * @flags: profile to validate
3725  * @extended: if true @flags is treated as an extended profile
3726  */
3727 static int alloc_profile_is_valid(u64 flags, int extended)
3728 {
3729         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3730                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3731
3732         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3733
3734         /* 1) check that all other bits are zeroed */
3735         if (flags & ~mask)
3736                 return 0;
3737
3738         /* 2) see if profile is reduced */
3739         if (flags == 0)
3740                 return !extended; /* "0" is valid for usual profiles */
3741
3742         /* true if exactly one bit set */
3743         return (flags & (flags - 1)) == 0;
3744 }
3745
3746 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3747 {
3748         /* cancel requested || normal exit path */
3749         return atomic_read(&fs_info->balance_cancel_req) ||
3750                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3751                  atomic_read(&fs_info->balance_cancel_req) == 0);
3752 }
3753
3754 static void __cancel_balance(struct btrfs_fs_info *fs_info)
3755 {
3756         int ret;
3757
3758         unset_balance_control(fs_info);
3759         ret = del_balance_item(fs_info);
3760         if (ret)
3761                 btrfs_handle_fs_error(fs_info, ret, NULL);
3762
3763         clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
3764 }
3765
3766 /* Non-zero return value signifies invalidity */
3767 static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
3768                 u64 allowed)
3769 {
3770         return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3771                 (!alloc_profile_is_valid(bctl_arg->target, 1) ||
3772                  (bctl_arg->target & ~allowed)));
3773 }
3774
3775 /*
      * Validate the balance request in @bctl, record it with
      * insert_balance_item() and run it through __btrfs_balance().
      *
3776  * Should be called with both balance and volume mutexes held
      *
      * The balance mutex is dropped while __btrfs_balance() runs and taken
      * again afterwards.  If @bargs is not NULL it is zeroed and filled in by
      * update_ioctl_balance_args() after the run.
      *
      * When validation fails (label 'out'), a resumed balance is torn down
      * with __cancel_balance(); otherwise @bctl is freed here and the
      * exclusive-operation bit is cleared.
      *
      * Returns 0 or a negative errno.
3777  */
3778 int btrfs_balance(struct btrfs_balance_control *bctl,
3779                   struct btrfs_ioctl_balance_args *bargs)
3780 {
3781         struct btrfs_fs_info *fs_info = bctl->fs_info;
3782         u64 meta_target, data_target;
3783         u64 allowed;
3784         int mixed = 0;
3785         int ret;
3786         u64 num_devices;
3787         unsigned seq;
3788
             /* refuse to start while closing or with a pause/cancel pending */
3789         if (btrfs_fs_closing(fs_info) ||
3790             atomic_read(&fs_info->balance_pause_req) ||
3791             atomic_read(&fs_info->balance_cancel_req)) {
3792                 ret = -EINVAL;
3793                 goto out;
3794         }
3795
3796         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3797         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3798                 mixed = 1;
3799
3800         /*
3801          * In case of mixed groups both data and meta should be picked,
3802          * and identical options should be given for both of them.
3803          */
3804         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3805         if (mixed && (bctl->flags & allowed)) {
3806                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3807                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3808                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3809                         btrfs_err(fs_info,
3810                                   "with mixed groups data and metadata balance options must be the same");
3811                         ret = -EINVAL;
3812                         goto out;
3813                 }
3814         }
3815
             /* one device is discounted while a device replace is ongoing */
3816         num_devices = fs_info->fs_devices->num_devices;
3817         btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3818         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3819                 BUG_ON(num_devices < 1);
3820                 num_devices--;
3821         }
3822         btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
             /* conversion targets accepted for this number of devices */
3823         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
3824         if (num_devices > 1)
3825                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3826         if (num_devices > 2)
3827                 allowed |= BTRFS_BLOCK_GROUP_RAID5;
3828         if (num_devices > 3)
3829                 allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3830                             BTRFS_BLOCK_GROUP_RAID6);
3831         if (validate_convert_profile(&bctl->data, allowed)) {
3832                 btrfs_err(fs_info,
3833                           "unable to start balance with target data profile %llu",
3834                           bctl->data.target);
3835                 ret = -EINVAL;
3836                 goto out;
3837         }
3838         if (validate_convert_profile(&bctl->meta, allowed)) {
3839                 btrfs_err(fs_info,
3840                           "unable to start balance with target metadata profile %llu",
3841                           bctl->meta.target);
3842                 ret = -EINVAL;
3843                 goto out;
3844         }
3845         if (validate_convert_profile(&bctl->sys, allowed)) {
3846                 btrfs_err(fs_info,
3847                           "unable to start balance with target system profile %llu",
3848                           bctl->sys.target);
3849                 ret = -EINVAL;
3850                 goto out;
3851         }
3852
3853         /* allow to reduce meta or sys integrity only if force set */
3854         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3855                         BTRFS_BLOCK_GROUP_RAID10 |
3856                         BTRFS_BLOCK_GROUP_RAID5 |
3857                         BTRFS_BLOCK_GROUP_RAID6;
             /* avail_*_alloc_bits are read under the profiles_lock seqlock */
3858         do {
3859                 seq = read_seqbegin(&fs_info->profiles_lock);
3860
3861                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3862                      (fs_info->avail_system_alloc_bits & allowed) &&
3863                      !(bctl->sys.target & allowed)) ||
3864                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3865                      (fs_info->avail_metadata_alloc_bits & allowed) &&
3866                      !(bctl->meta.target & allowed))) {
3867                         if (bctl->flags & BTRFS_BALANCE_FORCE) {
3868                                 btrfs_info(fs_info,
3869                                            "force reducing metadata integrity");
3870                         } else {
3871                                 btrfs_err(fs_info,
3872                                           "balance will reduce metadata integrity, use force if you want this");
3873                                 ret = -EINVAL;
3874                                 goto out;
3875                         }
3876                 }
3877         } while (read_seqretry(&fs_info->profiles_lock, seq));
3878
3879         /* if we're not converting, the target field is uninitialized */
3880         meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
3881                 bctl->meta.target : fs_info->avail_metadata_alloc_bits;
3882         data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
3883                 bctl->data.target : fs_info->avail_data_alloc_bits;
             /* only a warning: the balance still goes ahead */
3884         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
3885                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
3886                 btrfs_warn(fs_info,
3887                            "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
3888                            meta_target, data_target);
3889         }
3890
             /*
              * While converting system chunks, use the lower of the current
              * value and the one of the target profile; it is recalculated
              * after the balance below.
              */
3891         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3892                 fs_info->num_tolerated_disk_barrier_failures = min(
3893                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
3894                         btrfs_get_num_tolerated_disk_barrier_failures(
3895                                 bctl->sys.target));
3896         }
3897
3898         ret = insert_balance_item(fs_info, bctl);
3899         if (ret && ret != -EEXIST)
3900                 goto out;
3901
             /*
              * -EEXIST is expected exactly when resuming.  A fresh balance
              * installs @bctl as the active control, a resumed one goes
              * through update_balance_args() instead.
              */
3902         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3903                 BUG_ON(ret == -EEXIST);
3904                 set_balance_control(bctl);
3905         } else {
3906                 BUG_ON(ret != -EEXIST);
3907                 spin_lock(&fs_info->balance_lock);
3908                 update_balance_args(bctl);
3909                 spin_unlock(&fs_info->balance_lock);
3910         }
3911
             /* balance_mutex is not held while the balance itself runs */
3912         atomic_inc(&fs_info->balance_running);
3913         mutex_unlock(&fs_info->balance_mutex);
3914
3915         ret = __btrfs_balance(fs_info);
3916
3917         mutex_lock(&fs_info->balance_mutex);
3918         atomic_dec(&fs_info->balance_running);
3919
3920         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3921                 fs_info->num_tolerated_disk_barrier_failures =
3922                         btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3923         }
3924
3925         if (bargs) {
3926                 memset(bargs, 0, sizeof(*bargs));
3927                 update_ioctl_balance_args(fs_info, 0, bargs);
3928         }
3929
             /*
              * Close the balance on errors other than ECANCELED/ENOSPC, on a
              * cancel request, or on normal completion.  bctl is freed by
              * __cancel_balance() and must not be touched afterwards.
              */
3930         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3931             balance_need_close(fs_info)) {
3932                 __cancel_balance(fs_info);
3933         }
3934
3935         wake_up(&fs_info->balance_wait_q);
3936
3937         return ret;
3938 out:
             /*
              * A resumed balance is torn down via __cancel_balance(); a fresh
              * one was never installed, so bctl is freed directly.
              */
3939         if (bctl->flags & BTRFS_BALANCE_RESUME)
3940                 __cancel_balance(fs_info);
3941         else {
3942                 kfree(bctl);
3943                 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
3944         }
3945         return ret;
3946 }
3947
3948 static int balance_kthread(void *data)
3949 {
3950         struct btrfs_fs_info *fs_info = data;
3951         int ret = 0;
3952
3953         mutex_lock(&fs_info->volume_mutex);
3954         mutex_lock(&fs_info->balance_mutex);
3955
3956         if (fs_info->balance_ctl) {
3957                 btrfs_info(fs_info, "continuing balance");
3958                 ret = btrfs_balance(fs_info->balance_ctl, NULL);
3959         }
3960
3961         mutex_unlock(&fs_info->balance_mutex);
3962         mutex_unlock(&fs_info->volume_mutex);
3963
3964         return ret;
3965 }
3966
3967 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3968 {
3969         struct task_struct *tsk;
3970
3971         spin_lock(&fs_info->balance_lock);
3972         if (!fs_info->balance_ctl) {
3973                 spin_unlock(&fs_info->balance_lock);
3974                 return 0;
3975         }
3976         spin_unlock(&fs_info->balance_lock);
3977
3978         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
3979                 btrfs_info(fs_info, "force skipping balance");
3980                 return 0;
3981         }
3982
3983         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3984         return PTR_ERR_OR_ZERO(tsk);
3985 }
3986
3987 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3988 {
3989         struct btrfs_balance_control *bctl;
3990         struct btrfs_balance_item *item;
3991         struct btrfs_disk_balance_args disk_bargs;
3992         struct btrfs_path *path;
3993         struct extent_buffer *leaf;
3994         struct btrfs_key key;
3995         int ret;
3996
3997         path = btrfs_alloc_path();
3998         if (!path)
3999                 return -ENOMEM;
4000
4001         key.objectid = BTRFS_BALANCE_OBJECTID;
4002         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4003         key.offset = 0;
4004
4005         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4006         if (ret < 0)
4007                 goto out;
4008         if (ret > 0) { /* ret = -ENOENT; */
4009                 ret = 0;
4010                 goto out;
4011         }
4012
4013         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4014         if (!bctl) {
4015                 ret = -ENOMEM;
4016                 goto out;
4017         }
4018
4019         leaf = path->nodes[0];
4020         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4021
4022         bctl->fs_info = fs_info;
4023         bctl->flags = btrfs_balance_flags(leaf, item);
4024         bctl->flags |= BTRFS_BALANCE_RESUME;
4025
4026         btrfs_balance_data(leaf, item, &disk_bargs);
4027         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4028         btrfs_balance_meta(leaf, item, &disk_bargs);
4029         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4030         btrfs_balance_sys(leaf, item, &disk_bargs);
4031         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4032
4033         WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
4034
4035         mutex_lock(&fs_info->volume_mutex);
4036         mutex_lock(&fs_info->balance_mutex);
4037
4038         set_balance_control(bctl);
4039
4040         mutex_unlock(&fs_info->balance_mutex);
4041         mutex_unlock(&fs_info->volume_mutex);
4042 out:
4043         btrfs_free_path(path);
4044         return ret;
4045 }
4046
4047 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4048 {
4049         int ret = 0;
4050
4051         mutex_lock(&fs_info->balance_mutex);
4052         if (!fs_info->balance_ctl) {
4053                 mutex_unlock(&fs_info->balance_mutex);
4054                 return -ENOTCONN;
4055         }
4056
4057         if (atomic_read(&fs_info->balance_running)) {
4058                 atomic_inc(&fs_info->balance_pause_req);
4059                 mutex_unlock(&fs_info->balance_mutex);
4060
4061                 wait_event(fs_info->balance_wait_q,
4062                            atomic_read(&fs_info->balance_running) == 0);
4063
4064                 mutex_lock(&fs_info->balance_mutex);
4065                 /* we are good with balance_ctl ripped off from under us */
4066                 BUG_ON(atomic_read(&fs_info->balance_running));
4067                 atomic_dec(&fs_info->balance_pause_req);
4068         } else {
4069                 ret = -ENOTCONN;
4070         }
4071
4072         mutex_unlock(&fs_info->balance_mutex);
4073         return ret;
4074 }
4075
4076 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4077 {
4078         if (fs_info->sb->s_flags & MS_RDONLY)
4079                 return -EROFS;
4080
4081         mutex_lock(&fs_info->balance_mutex);
4082         if (!fs_info->balance_ctl) {
4083                 mutex_unlock(&fs_info->balance_mutex);
4084                 return -ENOTCONN;
4085         }
4086
4087         atomic_inc(&fs_info->balance_cancel_req);
4088         /*
4089          * if we are running just wait and return, balance item is
4090          * deleted in btrfs_balance in this case
4091          */
4092         if (atomic_read(&fs_info->balance_running)) {
4093                 mutex_unlock(&fs_info->balance_mutex);
4094                 wait_event(fs_info->balance_wait_q,
4095                            atomic_read(&fs_info->balance_running) == 0);
4096                 mutex_lock(&fs_info->balance_mutex);
4097         } else {
4098                 /* __cancel_balance needs volume_mutex */
4099                 mutex_unlock(&fs_info->balance_mutex);
4100                 mutex_lock(&fs_info->volume_mutex);
4101                 mutex_lock(&fs_info->balance_mutex);
4102
4103                 if (fs_info->balance_ctl)
4104                         __cancel_balance(fs_info);
4105
4106                 mutex_unlock(&fs_info->volume_mutex);
4107         }
4108
4109         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
4110         atomic_dec(&fs_info->balance_cancel_req);
4111         mutex_unlock(&fs_info->balance_mutex);
4112         return 0;
4113 }
4114
4115 static int btrfs_uuid_scan_kthread(void *data)
4116 {
4117         struct btrfs_fs_info *fs_info = data;
4118         struct btrfs_root *root = fs_info->tree_root;
4119         struct btrfs_key key;
4120         struct btrfs_key max_key;
4121         struct btrfs_path *path = NULL;
4122         int ret = 0;
4123         struct extent_buffer *eb;
4124         int slot;
4125         struct btrfs_root_item root_item;
4126         u32 item_size;
4127         struct btrfs_trans_handle *trans = NULL;
4128
4129         path = btrfs_alloc_path();
4130         if (!path) {
4131                 ret = -ENOMEM;
4132                 goto out;
4133         }
4134
4135         key.objectid = 0;
4136         key.type = BTRFS_ROOT_ITEM_KEY;
4137         key.offset = 0;
4138
4139         max_key.objectid = (u64)-1;
4140         max_key.type = BTRFS_ROOT_ITEM_KEY;
4141         max_key.offset = (u64)-1;
4142
4143         while (1) {
4144                 ret = btrfs_search_forward(root, &key, path, 0);
4145                 if (ret) {
4146                         if (ret > 0)
4147                                 ret = 0;
4148                         break;
4149                 }
4150
4151                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4152                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4153                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4154                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
4155                         goto skip;
4156
4157                 eb = path->nodes[0];
4158                 slot = path->slots[0];
4159                 item_size = btrfs_item_size_nr(eb, slot);
4160                 if (item_size < sizeof(root_item))
4161                         goto skip;
4162
4163                 read_extent_buffer(eb, &root_item,
4164                                    btrfs_item_ptr_offset(eb, slot),
4165                                    (int)sizeof(root_item));
4166                 if (btrfs_root_refs(&root_item) == 0)
4167                         goto skip;
4168
4169                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4170                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
4171                         if (trans)
4172                                 goto update_tree;
4173
4174                         btrfs_release_path(path);
4175                         /*
4176                          * 1 - subvol uuid item
4177                          * 1 - received_subvol uuid item
4178                          */
4179                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4180                         if (IS_ERR(trans)) {
4181                                 ret = PTR_ERR(trans);
4182                                 break;
4183                         }
4184                         continue;
4185                 } else {
4186                         goto skip;
4187                 }
4188 update_tree:
4189                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4190                         ret = btrfs_uuid_tree_add(trans, fs_info,
4191                                                   root_item.uuid,
4192                                                   BTRFS_UUID_KEY_SUBVOL,
4193                                                   key.objectid);
4194                         if (ret < 0) {
4195                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4196                                         ret);
4197                                 break;
4198                         }
4199                 }
4200
4201                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4202                         ret = btrfs_uuid_tree_add(trans, fs_info,
4203                                                   root_item.received_uuid,
4204                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4205                                                   key.objectid);
4206                         if (ret < 0) {
4207                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4208                                         ret);
4209                                 break;
4210                         }
4211                 }
4212
4213 skip:
4214                 if (trans) {
4215                         ret = btrfs_end_transaction(trans);
4216                         trans = NULL;
4217                         if (ret)
4218                                 break;
4219                 }
4220
4221                 btrfs_release_path(path);
4222                 if (key.offset < (u64)-1) {
4223                         key.offset++;
4224                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4225                         key.offset = 0;
4226                         key.type = BTRFS_ROOT_ITEM_KEY;
4227                 } else if (key.objectid < (u64)-1) {
4228                         key.offset = 0;
4229                         key.type = BTRFS_ROOT_ITEM_KEY;
4230                         key.objectid++;
4231                 } else {
4232                         break;
4233                 }
4234                 cond_resched();
4235         }
4236
4237 out:
4238         btrfs_free_path(path);
4239         if (trans && !IS_ERR(trans))
4240                 btrfs_end_transaction(trans);
4241         if (ret)
4242                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4243         else
4244                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4245         up(&fs_info->uuid_tree_rescan_sem);
4246         return 0;
4247 }
4248
4249 /*
4250  * Callback for btrfs_uuid_tree_iterate().
4251  * returns:
4252  * 0    check succeeded, the entry is not outdated.
4253  * < 0  if an error occurred.
4254  * > 0  if the check failed, which means the caller shall remove the entry.
4255  */
4256 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
4257                                        u8 *uuid, u8 type, u64 subid)
4258 {
4259         struct btrfs_key key;
4260         int ret = 0;
4261         struct btrfs_root *subvol_root;
4262
4263         if (type != BTRFS_UUID_KEY_SUBVOL &&
4264             type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
4265                 goto out;
4266
4267         key.objectid = subid;
4268         key.type = BTRFS_ROOT_ITEM_KEY;
4269         key.offset = (u64)-1;
4270         subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
4271         if (IS_ERR(subvol_root)) {
4272                 ret = PTR_ERR(subvol_root);
4273                 if (ret == -ENOENT)
4274                         ret = 1;
4275                 goto out;
4276         }
4277
4278         switch (type) {
4279         case BTRFS_UUID_KEY_SUBVOL:
4280                 if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
4281                         ret = 1;
4282                 break;
4283         case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
4284                 if (memcmp(uuid, subvol_root->root_item.received_uuid,
4285                            BTRFS_UUID_SIZE))
4286                         ret = 1;
4287                 break;
4288         }
4289
4290 out:
4291         return ret;
4292 }
4293
4294 static int btrfs_uuid_rescan_kthread(void *data)
4295 {
4296         struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
4297         int ret;
4298
4299         /*
4300          * 1st step is to iterate through the existing UUID tree and
4301          * to delete all entries that contain outdated data.
4302          * 2nd step is to add all missing entries to the UUID tree.
4303          */
4304         ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
4305         if (ret < 0) {
4306                 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
4307                 up(&fs_info->uuid_tree_rescan_sem);
4308                 return ret;
4309         }
4310         return btrfs_uuid_scan_kthread(data);
4311 }
4312
4313 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4314 {
4315         struct btrfs_trans_handle *trans;
4316         struct btrfs_root *tree_root = fs_info->tree_root;
4317         struct btrfs_root *uuid_root;
4318         struct task_struct *task;
4319         int ret;
4320
4321         /*
4322          * 1 - root node
4323          * 1 - root item
4324          */
4325         trans = btrfs_start_transaction(tree_root, 2);
4326         if (IS_ERR(trans))
4327                 return PTR_ERR(trans);
4328
4329         uuid_root = btrfs_create_tree(trans, fs_info,
4330                                       BTRFS_UUID_TREE_OBJECTID);
4331         if (IS_ERR(uuid_root)) {
4332                 ret = PTR_ERR(uuid_root);
4333                 btrfs_abort_transaction(trans, ret);
4334                 btrfs_end_transaction(trans);
4335                 return ret;
4336         }
4337
4338         fs_info->uuid_root = uuid_root;
4339
4340         ret = btrfs_commit_transaction(trans);
4341         if (ret)
4342                 return ret;
4343
4344         down(&fs_info->uuid_tree_rescan_sem);
4345         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4346         if (IS_ERR(task)) {
4347                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4348                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4349                 up(&fs_info->uuid_tree_rescan_sem);
4350                 return PTR_ERR(task);
4351         }
4352
4353         return 0;
4354 }
4355
4356 int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
4357 {
4358         struct task_struct *task;
4359
4360         down(&fs_info->uuid_tree_rescan_sem);
4361         task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
4362         if (IS_ERR(task)) {
4363                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4364                 btrfs_warn(fs_info, "failed to start uuid_rescan task");
4365                 up(&fs_info->uuid_tree_rescan_sem);
4366                 return PTR_ERR(task);
4367         }
4368
4369         return 0;
4370 }
4371
4372 /*
4373  * shrinking a device means finding all of the device extents past
4374  * the new size, and then following the back refs to the chunks.
4375  * The chunk relocation code actually frees the device extent
4376  */
4377 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4378 {
4379         struct btrfs_fs_info *fs_info = device->fs_info;
4380         struct btrfs_root *root = fs_info->dev_root;
4381         struct btrfs_trans_handle *trans;
4382         struct btrfs_dev_extent *dev_extent = NULL;
4383         struct btrfs_path *path;
4384         u64 length;
4385         u64 chunk_offset;
4386         int ret;
4387         int slot;
4388         int failed = 0;
4389         bool retried = false;
4390         bool checked_pending_chunks = false;
4391         struct extent_buffer *l;
4392         struct btrfs_key key;
4393         struct btrfs_super_block *super_copy = fs_info->super_copy;
4394         u64 old_total = btrfs_super_total_bytes(super_copy);
4395         u64 old_size = btrfs_device_get_total_bytes(device);
4396         u64 diff = old_size - new_size;
4397
4398         if (device->is_tgtdev_for_dev_replace)
4399                 return -EINVAL;
4400
4401         path = btrfs_alloc_path();
4402         if (!path)
4403                 return -ENOMEM;
4404
4405         path->reada = READA_FORWARD;
4406
4407         mutex_lock(&fs_info->chunk_mutex);
4408
4409         btrfs_device_set_total_bytes(device, new_size);
4410         if (device->writeable) {
4411                 device->fs_devices->total_rw_bytes -= diff;
4412                 spin_lock(&fs_info->free_chunk_lock);
4413                 fs_info->free_chunk_space -= diff;
4414                 spin_unlock(&fs_info->free_chunk_lock);
4415         }
4416         mutex_unlock(&fs_info->chunk_mutex);
4417
4418 again:
4419         key.objectid = device->devid;
4420         key.offset = (u64)-1;
4421         key.type = BTRFS_DEV_EXTENT_KEY;
4422
4423         do {
4424                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
4425                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4426                 if (ret < 0) {
4427                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4428                         goto done;
4429                 }
4430
4431                 ret = btrfs_previous_item(root, path, 0, key.type);
4432                 if (ret)
4433                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4434                 if (ret < 0)
4435                         goto done;
4436                 if (ret) {
4437                         ret = 0;
4438                         btrfs_release_path(path);
4439                         break;
4440                 }
4441
4442                 l = path->nodes[0];
4443                 slot = path->slots[0];
4444                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4445
4446                 if (key.objectid != device->devid) {
4447                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4448                         btrfs_release_path(path);
4449                         break;
4450                 }
4451
4452                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4453                 length = btrfs_dev_extent_length(l, dev_extent);
4454
4455                 if (key.offset + length <= new_size) {
4456                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4457                         btrfs_release_path(path);
4458                         break;
4459                 }
4460
4461                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4462                 btrfs_release_path(path);
4463
4464                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4465                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4466                 if (ret && ret != -ENOSPC)
4467                         goto done;
4468                 if (ret == -ENOSPC)
4469                         failed++;
4470         } while (key.offset-- > 0);
4471
4472         if (failed && !retried) {
4473                 failed = 0;
4474                 retried = true;
4475                 goto again;
4476         } else if (failed && retried) {
4477                 ret = -ENOSPC;
4478                 goto done;
4479         }
4480
4481         /* Shrinking succeeded, else we would be at "done". */
4482         trans = btrfs_start_transaction(root, 0);
4483         if (IS_ERR(trans)) {
4484                 ret = PTR_ERR(trans);
4485                 goto done;
4486         }
4487
4488         mutex_lock(&fs_info->chunk_mutex);
4489
4490         /*
4491          * We checked in the above loop all device extents that were already in
4492          * the device tree. However before we have updated the device's
4493          * total_bytes to the new size, we might have had chunk allocations that
4494          * have not complete yet (new block groups attached to transaction
4495          * handles), and therefore their device extents were not yet in the
4496          * device tree and we missed them in the loop above. So if we have any
4497          * pending chunk using a device extent that overlaps the device range
4498          * that we can not use anymore, commit the current transaction and
4499          * repeat the search on the device tree - this way we guarantee we will
4500          * not have chunks using device extents that end beyond 'new_size'.
4501          */
4502         if (!checked_pending_chunks) {
4503                 u64 start = new_size;
4504                 u64 len = old_size - new_size;
4505
4506                 if (contains_pending_extent(trans->transaction, device,
4507                                             &start, len)) {
4508                         mutex_unlock(&fs_info->chunk_mutex);
4509                         checked_pending_chunks = true;
4510                         failed = 0;
4511                         retried = false;
4512                         ret = btrfs_commit_transaction(trans);
4513                         if (ret)
4514                                 goto done;
4515                         goto again;
4516                 }
4517         }
4518
4519         btrfs_device_set_disk_total_bytes(device, new_size);
4520         if (list_empty(&device->resized_list))
4521                 list_add_tail(&device->resized_list,
4522                               &fs_info->fs_devices->resized_devices);
4523
4524         WARN_ON(diff > old_total);
4525         btrfs_set_super_total_bytes(super_copy, old_total - diff);
4526         mutex_unlock(&fs_info->chunk_mutex);
4527
4528         /* Now btrfs_update_device() will change the on-disk size. */
4529         ret = btrfs_update_device(trans, device);
4530         btrfs_end_transaction(trans);
4531 done:
4532         btrfs_free_path(path);
4533         if (ret) {
4534                 mutex_lock(&fs_info->chunk_mutex);
4535                 btrfs_device_set_total_bytes(device, old_size);
4536                 if (device->writeable)
4537                         device->fs_devices->total_rw_bytes += diff;
4538                 spin_lock(&fs_info->free_chunk_lock);
4539                 fs_info->free_chunk_space += diff;
4540                 spin_unlock(&fs_info->free_chunk_lock);
4541                 mutex_unlock(&fs_info->chunk_mutex);
4542         }
4543         return ret;
4544 }
4545
4546 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4547                            struct btrfs_key *key,
4548                            struct btrfs_chunk *chunk, int item_size)
4549 {
4550         struct btrfs_super_block *super_copy = fs_info->super_copy;
4551         struct btrfs_disk_key disk_key;
4552         u32 array_size;
4553         u8 *ptr;
4554
4555         mutex_lock(&fs_info->chunk_mutex);
4556         array_size = btrfs_super_sys_array_size(super_copy);
4557         if (array_size + item_size + sizeof(disk_key)
4558                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4559                 mutex_unlock(&fs_info->chunk_mutex);
4560                 return -EFBIG;
4561         }
4562
4563         ptr = super_copy->sys_chunk_array + array_size;
4564         btrfs_cpu_key_to_disk(&disk_key, key);
4565         memcpy(ptr, &disk_key, sizeof(disk_key));
4566         ptr += sizeof(disk_key);
4567         memcpy(ptr, chunk, item_size);
4568         item_size += sizeof(disk_key);
4569         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4570         mutex_unlock(&fs_info->chunk_mutex);
4571
4572         return 0;
4573 }
4574
4575 /*
4576  * sort the devices in descending order by max_avail, total_avail
4577  */
4578 static int btrfs_cmp_device_info(const void *a, const void *b)
4579 {
4580         const struct btrfs_device_info *di_a = a;
4581         const struct btrfs_device_info *di_b = b;
4582
4583         if (di_a->max_avail > di_b->max_avail)
4584                 return -1;
4585         if (di_a->max_avail < di_b->max_avail)
4586                 return 1;
4587         if (di_a->total_avail > di_b->total_avail)
4588                 return -1;
4589         if (di_a->total_avail < di_b->total_avail)
4590                 return 1;
4591         return 0;
4592 }
4593
4594 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
4595 {
4596         /* TODO allow them to set a preferred stripe size */
4597         return SZ_64K;
4598 }
4599
4600 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4601 {
4602         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4603                 return;
4604
4605         btrfs_set_fs_incompat(info, RAID56);
4606 }
4607
/*
 * Number of stripes that fit in one item of maximum size: the space left
 * after a struct btrfs_chunk, in units of struct btrfs_stripe, plus one.
 * The argument is parenthesized so any pointer expression can be passed.
 */
#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE((r)->fs_info)		\
			- sizeof(struct btrfs_chunk))		\
			/ sizeof(struct btrfs_stripe) + 1)
4611
/*
 * Same calculation against the super block's system chunk array, after
 * setting aside room for two disk keys and two chunk headers.
 */
#define BTRFS_MAX_DEVS_SYS_CHUNK					\
	((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE					\
	  - 2 * (sizeof(struct btrfs_disk_key)				\
		 + sizeof(struct btrfs_chunk)))				\
	 / sizeof(struct btrfs_stripe) + 1)
4616
4617 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4618                                u64 start, u64 type)
4619 {
4620         struct btrfs_fs_info *info = trans->fs_info;
4621         struct btrfs_fs_devices *fs_devices = info->fs_devices;
4622         struct list_head *cur;
4623         struct map_lookup *map = NULL;
4624         struct extent_map_tree *em_tree;
4625         struct extent_map *em;
4626         struct btrfs_device_info *devices_info = NULL;
4627         u64 total_avail;
4628         int num_stripes;        /* total number of stripes to allocate */
4629         int data_stripes;       /* number of stripes that count for
4630                                    block group size */
4631         int sub_stripes;        /* sub_stripes info for map */
4632         int dev_stripes;        /* stripes per dev */
4633         int devs_max;           /* max devs to use */
4634         int devs_min;           /* min devs needed */
4635         int devs_increment;     /* ndevs has to be a multiple of this */
4636         int ncopies;            /* how many copies to data has */
4637         int ret;
4638         u64 max_stripe_size;
4639         u64 max_chunk_size;
4640         u64 stripe_size;
4641         u64 num_bytes;
4642         u64 raid_stripe_len = BTRFS_STRIPE_LEN;
4643         int ndevs;
4644         int i;
4645         int j;
4646         int index;
4647
4648         BUG_ON(!alloc_profile_is_valid(type, 0));
4649
4650         if (list_empty(&fs_devices->alloc_list))
4651                 return -ENOSPC;
4652
4653         index = __get_raid_index(type);
4654
4655         sub_stripes = btrfs_raid_array[index].sub_stripes;
4656         dev_stripes = btrfs_raid_array[index].dev_stripes;
4657         devs_max = btrfs_raid_array[index].devs_max;
4658         devs_min = btrfs_raid_array[index].devs_min;
4659         devs_increment = btrfs_raid_array[index].devs_increment;
4660         ncopies = btrfs_raid_array[index].ncopies;
4661
4662         if (type & BTRFS_BLOCK_GROUP_DATA) {
4663                 max_stripe_size = SZ_1G;
4664                 max_chunk_size = 10 * max_stripe_size;
4665                 if (!devs_max)
4666                         devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4667         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4668                 /* for larger filesystems, use larger metadata chunks */
4669                 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4670                         max_stripe_size = SZ_1G;
4671                 else
4672                         max_stripe_size = SZ_256M;
4673                 max_chunk_size = max_stripe_size;
4674                 if (!devs_max)
4675                         devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4676         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4677                 max_stripe_size = SZ_32M;
4678                 max_chunk_size = 2 * max_stripe_size;
4679                 if (!devs_max)
4680                         devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4681         } else {
4682                 btrfs_err(info, "invalid chunk type 0x%llx requested",
4683                        type);
4684                 BUG_ON(1);
4685         }
4686
4687         /* we don't want a chunk larger than 10% of writeable space */
4688         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4689                              max_chunk_size);
4690
4691         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
4692                                GFP_NOFS);
4693         if (!devices_info)
4694                 return -ENOMEM;
4695
4696         cur = fs_devices->alloc_list.next;
4697
4698         /*
4699          * in the first pass through the devices list, we gather information
4700          * about the available holes on each device.
4701          */
4702         ndevs = 0;
4703         while (cur != &fs_devices->alloc_list) {
4704                 struct btrfs_device *device;
4705                 u64 max_avail;
4706                 u64 dev_offset;
4707
4708                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
4709
4710                 cur = cur->next;
4711
4712                 if (!device->writeable) {
4713                         WARN(1, KERN_ERR
4714                                "BTRFS: read-only device in alloc_list\n");
4715                         continue;
4716                 }
4717
4718                 if (!device->in_fs_metadata ||
4719                     device->is_tgtdev_for_dev_replace)
4720                         continue;
4721
4722                 if (device->total_bytes > device->bytes_used)
4723                         total_avail = device->total_bytes - device->bytes_used;
4724                 else
4725                         total_avail = 0;
4726
4727                 /* If there is no space on this device, skip it. */
4728                 if (total_avail == 0)
4729                         continue;
4730
4731                 ret = find_free_dev_extent(trans, device,
4732                                            max_stripe_size * dev_stripes,
4733                                            &dev_offset, &max_avail);
4734                 if (ret && ret != -ENOSPC)
4735                         goto error;
4736
4737                 if (ret == 0)
4738                         max_avail = max_stripe_size * dev_stripes;
4739
4740                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
4741                         continue;
4742
4743                 if (ndevs == fs_devices->rw_devices) {
4744                         WARN(1, "%s: found more than %llu devices\n",
4745                              __func__, fs_devices->rw_devices);
4746                         break;
4747                 }
4748                 devices_info[ndevs].dev_offset = dev_offset;
4749                 devices_info[ndevs].max_avail = max_avail;
4750                 devices_info[ndevs].total_avail = total_avail;
4751                 devices_info[ndevs].dev = device;
4752                 ++ndevs;
4753         }
4754
4755         /*
4756          * now sort the devices by hole size / available space
4757          */
4758         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4759              btrfs_cmp_device_info, NULL);
4760
4761         /* round down to number of usable stripes */
4762         ndevs -= ndevs % devs_increment;
4763
4764         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
4765                 ret = -ENOSPC;
4766                 goto error;
4767         }
4768
4769         if (devs_max && ndevs > devs_max)
4770                 ndevs = devs_max;
4771         /*
4772          * the primary goal is to maximize the number of stripes, so use as many
4773          * devices as possible, even if the stripes are not maximum sized.
4774          */
4775         stripe_size = devices_info[ndevs-1].max_avail;
4776         num_stripes = ndevs * dev_stripes;
4777
4778         /*
4779          * this will have to be fixed for RAID1 and RAID10 over
4780          * more drives
4781          */
4782         data_stripes = num_stripes / ncopies;
4783
4784         if (type & BTRFS_BLOCK_GROUP_RAID5) {
4785                 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
4786                                                          info->stripesize);
4787                 data_stripes = num_stripes - 1;
4788         }
4789         if (type & BTRFS_BLOCK_GROUP_RAID6) {
4790                 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
4791                                                          info->stripesize);
4792                 data_stripes = num_stripes - 2;
4793         }
4794
4795         /*
4796          * Use the number of data stripes to figure out how big this chunk
4797          * is really going to be in terms of logical address space,
4798          * and compare that answer with the max chunk size
4799          */
4800         if (stripe_size * data_stripes > max_chunk_size) {
4801                 u64 mask = (1ULL << 24) - 1;
4802
4803                 stripe_size = div_u64(max_chunk_size, data_stripes);
4804
4805                 /* bump the answer up to a 16MB boundary */
4806                 stripe_size = (stripe_size + mask) & ~mask;
4807
4808                 /* but don't go higher than the limits we found
4809                  * while searching for free extents
4810                  */
4811                 if (stripe_size > devices_info[ndevs-1].max_avail)
4812                         stripe_size = devices_info[ndevs-1].max_avail;
4813         }
4814
4815         stripe_size = div_u64(stripe_size, dev_stripes);
4816
4817         /* align to BTRFS_STRIPE_LEN */
4818         stripe_size = div64_u64(stripe_size, raid_stripe_len);
4819         stripe_size *= raid_stripe_len;
4820
4821         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4822         if (!map) {
4823                 ret = -ENOMEM;
4824                 goto error;
4825         }
4826         map->num_stripes = num_stripes;
4827
4828         for (i = 0; i < ndevs; ++i) {
4829                 for (j = 0; j < dev_stripes; ++j) {
4830                         int s = i * dev_stripes + j;
4831                         map->stripes[s].dev = devices_info[i].dev;
4832                         map->stripes[s].physical = devices_info[i].dev_offset +
4833                                                    j * stripe_size;
4834                 }
4835         }
4836         map->sector_size = info->sectorsize;
4837         map->stripe_len = raid_stripe_len;
4838         map->io_align = raid_stripe_len;
4839         map->io_width = raid_stripe_len;
4840         map->type = type;
4841         map->sub_stripes = sub_stripes;
4842
4843         num_bytes = stripe_size * data_stripes;
4844
4845         trace_btrfs_chunk_alloc(info, map, start, num_bytes);
4846
4847         em = alloc_extent_map();
4848         if (!em) {
4849                 kfree(map);
4850                 ret = -ENOMEM;
4851                 goto error;
4852         }
4853         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
4854         em->map_lookup = map;
4855         em->start = start;
4856         em->len = num_bytes;
4857         em->block_start = 0;
4858         em->block_len = em->len;
4859         em->orig_block_len = stripe_size;
4860
4861         em_tree = &info->mapping_tree.map_tree;
4862         write_lock(&em_tree->lock);
4863         ret = add_extent_mapping(em_tree, em, 0);
4864         if (!ret) {
4865                 list_add_tail(&em->list, &trans->transaction->pending_chunks);
4866                 refcount_inc(&em->refs);
4867         }
4868         write_unlock(&em_tree->lock);
4869         if (ret) {
4870                 free_extent_map(em);
4871                 goto error;
4872         }
4873
4874         ret = btrfs_make_block_group(trans, info, 0, type,
4875                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4876                                      start, num_bytes);
4877         if (ret)
4878                 goto error_del_extent;
4879
4880         for (i = 0; i < map->num_stripes; i++) {
4881                 num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4882                 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4883         }
4884
4885         spin_lock(&info->free_chunk_lock);
4886         info->free_chunk_space -= (stripe_size * map->num_stripes);
4887         spin_unlock(&info->free_chunk_lock);
4888
4889         free_extent_map(em);
4890         check_raid56_incompat_flag(info, type);
4891
4892         kfree(devices_info);
4893         return 0;
4894
4895 error_del_extent:
4896         write_lock(&em_tree->lock);
4897         remove_extent_mapping(em_tree, em);
4898         write_unlock(&em_tree->lock);
4899
4900         /* One for our allocation */
4901         free_extent_map(em);
4902         /* One for the tree reference */
4903         free_extent_map(em);
4904         /* One for the pending_chunks list reference */
4905         free_extent_map(em);
4906 error:
4907         kfree(devices_info);
4908         return ret;
4909 }
4910
4911 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4912                                 struct btrfs_fs_info *fs_info,
4913                                 u64 chunk_offset, u64 chunk_size)
4914 {
4915         struct btrfs_root *extent_root = fs_info->extent_root;
4916         struct btrfs_root *chunk_root = fs_info->chunk_root;
4917         struct btrfs_key key;
4918         struct btrfs_device *device;
4919         struct btrfs_chunk *chunk;
4920         struct btrfs_stripe *stripe;
4921         struct extent_map *em;
4922         struct map_lookup *map;
4923         size_t item_size;
4924         u64 dev_offset;
4925         u64 stripe_size;
4926         int i = 0;
4927         int ret = 0;
4928
4929         em = get_chunk_map(fs_info, chunk_offset, chunk_size);
4930         if (IS_ERR(em))
4931                 return PTR_ERR(em);
4932
4933         map = em->map_lookup;
4934         item_size = btrfs_chunk_item_size(map->num_stripes);
4935         stripe_size = em->orig_block_len;
4936
4937         chunk = kzalloc(item_size, GFP_NOFS);
4938         if (!chunk) {
4939                 ret = -ENOMEM;
4940                 goto out;
4941         }
4942
4943         /*
4944          * Take the device list mutex to prevent races with the final phase of
4945          * a device replace operation that replaces the device object associated
4946          * with the map's stripes, because the device object's id can change
4947          * at any time during that final phase of the device replace operation
4948          * (dev-replace.c:btrfs_dev_replace_finishing()).
4949          */
4950         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4951         for (i = 0; i < map->num_stripes; i++) {
4952                 device = map->stripes[i].dev;
4953                 dev_offset = map->stripes[i].physical;
4954
4955                 ret = btrfs_update_device(trans, device);
4956                 if (ret)
4957                         break;
4958                 ret = btrfs_alloc_dev_extent(trans, device,
4959                                              chunk_root->root_key.objectid,
4960                                              BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4961                                              chunk_offset, dev_offset,
4962                                              stripe_size);
4963                 if (ret)
4964                         break;
4965         }
4966         if (ret) {
4967                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4968                 goto out;
4969         }
4970
4971         stripe = &chunk->stripe;
4972         for (i = 0; i < map->num_stripes; i++) {
4973                 device = map->stripes[i].dev;
4974                 dev_offset = map->stripes[i].physical;
4975
4976                 btrfs_set_stack_stripe_devid(stripe, device->devid);
4977                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4978                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4979                 stripe++;
4980         }
4981         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4982
4983         btrfs_set_stack_chunk_length(chunk, chunk_size);
4984         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
4985         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
4986         btrfs_set_stack_chunk_type(chunk, map->type);
4987         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
4988         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
4989         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
4990         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
4991         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
4992
4993         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4994         key.type = BTRFS_CHUNK_ITEM_KEY;
4995         key.offset = chunk_offset;
4996
4997         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4998         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4999                 /*
5000                  * TODO: Cleanup of inserted chunk root in case of
5001                  * failure.
5002                  */
5003                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5004         }
5005
5006 out:
5007         kfree(chunk);
5008         free_extent_map(em);
5009         return ret;
5010 }
5011
5012 /*
5013  * Chunk allocation falls into two parts. The first part does works
5014  * that make the new allocated chunk useable, but not do any operation
5015  * that modifies the chunk tree. The second part does the works that
5016  * require modifying the chunk tree. This division is important for the
5017  * bootstrap process of adding storage to a seed btrfs.
5018  */
5019 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
5020                       struct btrfs_fs_info *fs_info, u64 type)
5021 {
5022         u64 chunk_offset;
5023
5024         ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
5025         chunk_offset = find_next_chunk(fs_info);
5026         return __btrfs_alloc_chunk(trans, chunk_offset, type);
5027 }
5028
5029 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
5030                                          struct btrfs_fs_info *fs_info)
5031 {
5032         struct btrfs_root *extent_root = fs_info->extent_root;
5033         u64 chunk_offset;
5034         u64 sys_chunk_offset;
5035         u64 alloc_profile;
5036         int ret;
5037
5038         chunk_offset = find_next_chunk(fs_info);
5039         alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
5040         ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
5041         if (ret)
5042                 return ret;
5043
5044         sys_chunk_offset = find_next_chunk(fs_info);
5045         alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
5046         ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
5047         return ret;
5048 }
5049
5050 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5051 {
5052         int max_errors;
5053
5054         if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5055                          BTRFS_BLOCK_GROUP_RAID10 |
5056                          BTRFS_BLOCK_GROUP_RAID5 |
5057                          BTRFS_BLOCK_GROUP_DUP)) {
5058                 max_errors = 1;
5059         } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5060                 max_errors = 2;
5061         } else {
5062                 max_errors = 0;
5063         }
5064
5065         return max_errors;
5066 }
5067
5068 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5069 {
5070         struct extent_map *em;
5071         struct map_lookup *map;
5072         int readonly = 0;
5073         int miss_ndevs = 0;
5074         int i;
5075
5076         em = get_chunk_map(fs_info, chunk_offset, 1);
5077         if (IS_ERR(em))
5078                 return 1;
5079
5080         map = em->map_lookup;
5081         for (i = 0; i < map->num_stripes; i++) {
5082                 if (map->stripes[i].dev->missing) {
5083                         miss_ndevs++;
5084                         continue;
5085                 }
5086
5087                 if (!map->stripes[i].dev->writeable) {
5088                         readonly = 1;
5089                         goto end;
5090                 }
5091         }
5092
5093         /*
5094          * If the number of missing devices is larger than max errors,
5095          * we can not write the data into that chunk successfully, so
5096          * set it readonly.
5097          */
5098         if (miss_ndevs > btrfs_chunk_max_errors(map))
5099                 readonly = 1;
5100 end:
5101         free_extent_map(em);
5102         return readonly;
5103 }
5104
5105 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
5106 {
5107         extent_map_tree_init(&tree->map_tree);
5108 }
5109
5110 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
5111 {
5112         struct extent_map *em;
5113
5114         while (1) {
5115                 write_lock(&tree->map_tree.lock);
5116                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
5117                 if (em)
5118                         remove_extent_mapping(&tree->map_tree, em);
5119                 write_unlock(&tree->map_tree.lock);
5120                 if (!em)
5121                         break;
5122                 /* once for us */
5123                 free_extent_map(em);
5124                 /* once for the tree */
5125                 free_extent_map(em);
5126         }
5127 }
5128
5129 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5130 {
5131         struct extent_map *em;
5132         struct map_lookup *map;
5133         int ret;
5134
5135         em = get_chunk_map(fs_info, logical, len);
5136         if (IS_ERR(em))
5137                 /*
5138                  * We could return errors for these cases, but that could get
5139                  * ugly and we'd probably do the same thing which is just not do
5140                  * anything else and exit, so return 1 so the callers don't try
5141                  * to use other copies.
5142                  */
5143                 return 1;
5144
5145         map = em->map_lookup;
5146         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
5147                 ret = map->num_stripes;
5148         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5149                 ret = map->sub_stripes;
5150         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5151                 ret = 2;
5152         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5153                 ret = 3;
5154         else
5155                 ret = 1;
5156         free_extent_map(em);
5157
5158         btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
5159         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5160             fs_info->dev_replace.tgtdev)
5161                 ret++;
5162         btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
5163
5164         return ret;
5165 }
5166
5167 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5168                                     struct btrfs_mapping_tree *map_tree,
5169                                     u64 logical)
5170 {
5171         struct extent_map *em;
5172         struct map_lookup *map;
5173         unsigned long len = fs_info->sectorsize;
5174
5175         em = get_chunk_map(fs_info, logical, len);
5176         WARN_ON(IS_ERR(em));
5177
5178         map = em->map_lookup;
5179         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5180                 len = map->stripe_len * nr_data_stripes(map);
5181         free_extent_map(em);
5182         return len;
5183 }
5184
5185 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
5186                            u64 logical, u64 len, int mirror_num)
5187 {
5188         struct extent_map *em;
5189         struct map_lookup *map;
5190         int ret = 0;
5191
5192         em = get_chunk_map(fs_info, logical, len);
5193         WARN_ON(IS_ERR(em));
5194
5195         map = em->map_lookup;
5196         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5197                 ret = 1;
5198         free_extent_map(em);
5199         return ret;
5200 }
5201
5202 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5203                             struct map_lookup *map, int first, int num,
5204                             int optimal, int dev_replace_is_ongoing)
5205 {
5206         int i;
5207         int tolerance;
5208         struct btrfs_device *srcdev;
5209
5210         if (dev_replace_is_ongoing &&
5211             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5212              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5213                 srcdev = fs_info->dev_replace.srcdev;
5214         else
5215                 srcdev = NULL;
5216
5217         /*
5218          * try to avoid the drive that is the source drive for a
5219          * dev-replace procedure, only choose it if no other non-missing
5220          * mirror is available
5221          */
5222         for (tolerance = 0; tolerance < 2; tolerance++) {
5223                 if (map->stripes[optimal].dev->bdev &&
5224                     (tolerance || map->stripes[optimal].dev != srcdev))
5225                         return optimal;
5226                 for (i = first; i < first + num; i++) {
5227                         if (map->stripes[i].dev->bdev &&
5228                             (tolerance || map->stripes[i].dev != srcdev))
5229                                 return i;
5230                 }
5231         }
5232
5233         /* we couldn't find one that doesn't fail.  Just return something
5234          * and the io error handling code will clean up eventually
5235          */
5236         return optimal;
5237 }
5238
5239 static inline int parity_smaller(u64 a, u64 b)
5240 {
5241         return a > b;
5242 }
5243
5244 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5245 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5246 {
5247         struct btrfs_bio_stripe s;
5248         int i;
5249         u64 l;
5250         int again = 1;
5251
5252         while (again) {
5253                 again = 0;
5254                 for (i = 0; i < num_stripes - 1; i++) {
5255                         if (parity_smaller(bbio->raid_map[i],
5256                                            bbio->raid_map[i+1])) {
5257                                 s = bbio->stripes[i];
5258                                 l = bbio->raid_map[i];
5259                                 bbio->stripes[i] = bbio->stripes[i+1];
5260                                 bbio->raid_map[i] = bbio->raid_map[i+1];
5261                                 bbio->stripes[i+1] = s;
5262                                 bbio->raid_map[i+1] = l;
5263
5264                                 again = 1;
5265                         }
5266                 }
5267         }
5268 }
5269
5270 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5271 {
5272         struct btrfs_bio *bbio = kzalloc(
5273                  /* the size of the btrfs_bio */
5274                 sizeof(struct btrfs_bio) +
5275                 /* plus the variable array for the stripes */
5276                 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5277                 /* plus the variable array for the tgt dev */
5278                 sizeof(int) * (real_stripes) +
5279                 /*
5280                  * plus the raid_map, which includes both the tgt dev
5281                  * and the stripes
5282                  */
5283                 sizeof(u64) * (total_stripes),
5284                 GFP_NOFS|__GFP_NOFAIL);
5285
5286         atomic_set(&bbio->error, 0);
5287         refcount_set(&bbio->refs, 1);
5288
5289         return bbio;
5290 }
5291
5292 void btrfs_get_bbio(struct btrfs_bio *bbio)
5293 {
5294         WARN_ON(!refcount_read(&bbio->refs));
5295         refcount_inc(&bbio->refs);
5296 }
5297
5298 void btrfs_put_bbio(struct btrfs_bio *bbio)
5299 {
5300         if (!bbio)
5301                 return;
5302         if (refcount_dec_and_test(&bbio->refs))
5303                 kfree(bbio);
5304 }
5305
5306 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5307 /*
5308  * Please note that, discard won't be sent to target device of device
5309  * replace.
5310  */
5311 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5312                                          u64 logical, u64 length,
5313                                          struct btrfs_bio **bbio_ret)
5314 {
5315         struct extent_map *em;
5316         struct map_lookup *map;
5317         struct btrfs_bio *bbio;
5318         u64 offset;
5319         u64 stripe_nr;
5320         u64 stripe_nr_end;
5321         u64 stripe_end_offset;
5322         u64 stripe_cnt;
5323         u64 stripe_len;
5324         u64 stripe_offset;
5325         u64 num_stripes;
5326         u32 stripe_index;
5327         u32 factor = 0;
5328         u32 sub_stripes = 0;
5329         u64 stripes_per_dev = 0;
5330         u32 remaining_stripes = 0;
5331         u32 last_stripe = 0;
5332         int ret = 0;
5333         int i;
5334
5335         /* discard always return a bbio */
5336         ASSERT(bbio_ret);
5337
5338         em = get_chunk_map(fs_info, logical, length);
5339         if (IS_ERR(em))
5340                 return PTR_ERR(em);
5341
5342         map = em->map_lookup;
5343         /* we don't discard raid56 yet */
5344         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5345                 ret = -EOPNOTSUPP;
5346                 goto out;
5347         }
5348
5349         offset = logical - em->start;
5350         length = min_t(u64, em->len - offset, length);
5351
5352         stripe_len = map->stripe_len;
5353         /*
5354          * stripe_nr counts the total number of stripes we have to stride
5355          * to get to this block
5356          */
5357         stripe_nr = div64_u64(offset, stripe_len);
5358
5359         /* stripe_offset is the offset of this block in its stripe */
5360         stripe_offset = offset - stripe_nr * stripe_len;
5361
5362         stripe_nr_end = round_up(offset + length, map->stripe_len);
5363         stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5364         stripe_cnt = stripe_nr_end - stripe_nr;
5365         stripe_end_offset = stripe_nr_end * map->stripe_len -
5366                             (offset + length);
5367         /*
5368          * after this, stripe_nr is the number of stripes on this
5369          * device we have to walk to find the data, and stripe_index is
5370          * the number of our device in the stripe array
5371          */
5372         num_stripes = 1;
5373         stripe_index = 0;
5374         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5375                          BTRFS_BLOCK_GROUP_RAID10)) {
5376                 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5377                         sub_stripes = 1;
5378                 else
5379                         sub_stripes = map->sub_stripes;
5380
5381                 factor = map->num_stripes / sub_stripes;
5382                 num_stripes = min_t(u64, map->num_stripes,
5383                                     sub_stripes * stripe_cnt);
5384                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5385                 stripe_index *= sub_stripes;
5386                 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5387                                               &remaining_stripes);
5388                 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5389                 last_stripe *= sub_stripes;
5390         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5391                                 BTRFS_BLOCK_GROUP_DUP)) {
5392                 num_stripes = map->num_stripes;
5393         } else {
5394                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5395                                         &stripe_index);
5396         }
5397
5398         bbio = alloc_btrfs_bio(num_stripes, 0);
5399         if (!bbio) {
5400                 ret = -ENOMEM;
5401                 goto out;
5402         }
5403
5404         for (i = 0; i < num_stripes; i++) {
5405                 bbio->stripes[i].physical =
5406                         map->stripes[stripe_index].physical +
5407                         stripe_offset + stripe_nr * map->stripe_len;
5408                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5409
5410                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5411                                  BTRFS_BLOCK_GROUP_RAID10)) {
5412                         bbio->stripes[i].length = stripes_per_dev *
5413                                 map->stripe_len;
5414
5415                         if (i / sub_stripes < remaining_stripes)
5416                                 bbio->stripes[i].length +=
5417                                         map->stripe_len;
5418
5419                         /*
5420                          * Special for the first stripe and
5421                          * the last stripe:
5422                          *
5423                          * |-------|...|-------|
5424                          *     |----------|
5425                          *    off     end_off
5426                          */
5427                         if (i < sub_stripes)
5428                                 bbio->stripes[i].length -=
5429                                         stripe_offset;
5430
5431                         if (stripe_index >= last_stripe &&
5432                             stripe_index <= (last_stripe +
5433                                              sub_stripes - 1))
5434                                 bbio->stripes[i].length -=
5435                                         stripe_end_offset;
5436
5437                         if (i == sub_stripes - 1)
5438                                 stripe_offset = 0;
5439                 } else {
5440                         bbio->stripes[i].length = length;
5441                 }
5442
5443                 stripe_index++;
5444                 if (stripe_index == map->num_stripes) {
5445                         stripe_index = 0;
5446                         stripe_nr++;
5447                 }
5448         }
5449
5450         *bbio_ret = bbio;
5451         bbio->map_type = map->type;
5452         bbio->num_stripes = num_stripes;
5453 out:
5454         free_extent_map(em);
5455         return ret;
5456 }
5457
5458 /*
5459  * In dev-replace case, for repair case (that's the only case where the mirror
5460  * is selected explicitly when calling btrfs_map_block), blocks left of the
5461  * left cursor can also be read from the target drive.
5462  *
5463  * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
5464  * array of stripes.
5465  * For READ, it also needs to be supported using the same mirror number.
5466  *
5467  * If the requested block is not left of the left cursor, EIO is returned. This
5468  * can happen because btrfs_num_copies() returns one more in the dev-replace
5469  * case.
5470  */
5471 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5472                                          u64 logical, u64 length,
5473                                          u64 srcdev_devid, int *mirror_num,
5474                                          u64 *physical)
5475 {
5476         struct btrfs_bio *bbio = NULL;
5477         int num_stripes;
5478         int index_srcdev = 0;
5479         int found = 0;
5480         u64 physical_of_found = 0;
5481         int i;
5482         int ret = 0;
5483
5484         ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5485                                 logical, &length, &bbio, 0, 0);
5486         if (ret) {
5487                 ASSERT(bbio == NULL);
5488                 return ret;
5489         }
5490
5491         num_stripes = bbio->num_stripes;
5492         if (*mirror_num > num_stripes) {
5493                 /*
5494                  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5495                  * that means that the requested area is not left of the left
5496                  * cursor
5497                  */
5498                 btrfs_put_bbio(bbio);
5499                 return -EIO;
5500         }
5501
5502         /*
5503          * process the rest of the function using the mirror_num of the source
5504          * drive. Therefore look it up first.  At the end, patch the device
5505          * pointer to the one of the target drive.
5506          */
5507         for (i = 0; i < num_stripes; i++) {
5508                 if (bbio->stripes[i].dev->devid != srcdev_devid)
5509                         continue;
5510
5511                 /*
5512                  * In case of DUP, in order to keep it simple, only add the
5513                  * mirror with the lowest physical address
5514                  */
5515                 if (found &&
5516                     physical_of_found <= bbio->stripes[i].physical)
5517                         continue;
5518
5519                 index_srcdev = i;
5520                 found = 1;
5521                 physical_of_found = bbio->stripes[i].physical;
5522         }
5523
5524         btrfs_put_bbio(bbio);
5525
5526         ASSERT(found);
5527         if (!found)
5528                 return -EIO;
5529
5530         *mirror_num = index_srcdev + 1;
5531         *physical = physical_of_found;
5532         return ret;
5533 }
5534
5535 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5536                                       struct btrfs_bio **bbio_ret,
5537                                       struct btrfs_dev_replace *dev_replace,
5538                                       int *num_stripes_ret, int *max_errors_ret)
5539 {
5540         struct btrfs_bio *bbio = *bbio_ret;
5541         u64 srcdev_devid = dev_replace->srcdev->devid;
5542         int tgtdev_indexes = 0;
5543         int num_stripes = *num_stripes_ret;
5544         int max_errors = *max_errors_ret;
5545         int i;
5546
5547         if (op == BTRFS_MAP_WRITE) {
5548                 int index_where_to_add;
5549
5550                 /*
5551                  * duplicate the write operations while the dev replace
5552                  * procedure is running. Since the copying of the old disk to
5553                  * the new disk takes place at run time while the filesystem is
5554                  * mounted writable, the regular write operations to the old
5555                  * disk have to be duplicated to go to the new disk as well.
5556                  *
5557                  * Note that device->missing is handled by the caller, and that
5558                  * the write to the old disk is already set up in the stripes
5559                  * array.
5560                  */
5561                 index_where_to_add = num_stripes;
5562                 for (i = 0; i < num_stripes; i++) {
5563                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5564                                 /* write to new disk, too */
5565                                 struct btrfs_bio_stripe *new =
5566                                         bbio->stripes + index_where_to_add;
5567                                 struct btrfs_bio_stripe *old =
5568                                         bbio->stripes + i;
5569
5570                                 new->physical = old->physical;
5571                                 new->length = old->length;
5572                                 new->dev = dev_replace->tgtdev;
5573                                 bbio->tgtdev_map[i] = index_where_to_add;
5574                                 index_where_to_add++;
5575                                 max_errors++;
5576                                 tgtdev_indexes++;
5577                         }
5578                 }
5579                 num_stripes = index_where_to_add;
5580         } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5581                 int index_srcdev = 0;
5582                 int found = 0;
5583                 u64 physical_of_found = 0;
5584
5585                 /*
5586                  * During the dev-replace procedure, the target drive can also
5587                  * be used to read data in case it is needed to repair a corrupt
5588                  * block elsewhere. This is possible if the requested area is
5589                  * left of the left cursor. In this area, the target drive is a
5590                  * full copy of the source drive.
5591                  */
5592                 for (i = 0; i < num_stripes; i++) {
5593                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5594                                 /*
5595                                  * In case of DUP, in order to keep it simple,
5596                                  * only add the mirror with the lowest physical
5597                                  * address
5598                                  */
5599                                 if (found &&
5600                                     physical_of_found <=
5601                                      bbio->stripes[i].physical)
5602                                         continue;
5603                                 index_srcdev = i;
5604                                 found = 1;
5605                                 physical_of_found = bbio->stripes[i].physical;
5606                         }
5607                 }
5608                 if (found) {
5609                         struct btrfs_bio_stripe *tgtdev_stripe =
5610                                 bbio->stripes + num_stripes;
5611
5612                         tgtdev_stripe->physical = physical_of_found;
5613                         tgtdev_stripe->length =
5614                                 bbio->stripes[index_srcdev].length;
5615                         tgtdev_stripe->dev = dev_replace->tgtdev;
5616                         bbio->tgtdev_map[index_srcdev] = num_stripes;
5617
5618                         tgtdev_indexes++;
5619                         num_stripes++;
5620                 }
5621         }
5622
5623         *num_stripes_ret = num_stripes;
5624         *max_errors_ret = max_errors;
5625         bbio->num_tgtdevs = tgtdev_indexes;
5626         *bbio_ret = bbio;
5627 }
5628
5629 static bool need_full_stripe(enum btrfs_map_op op)
5630 {
5631         return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5632 }
5633
5634 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5635                              enum btrfs_map_op op,
5636                              u64 logical, u64 *length,
5637                              struct btrfs_bio **bbio_ret,
5638                              int mirror_num, int need_raid_map)
5639 {
5640         struct extent_map *em;
5641         struct map_lookup *map;
5642         u64 offset;
5643         u64 stripe_offset;
5644         u64 stripe_nr;
5645         u64 stripe_len;
5646         u32 stripe_index;
5647         int i;
5648         int ret = 0;
5649         int num_stripes;
5650         int max_errors = 0;
5651         int tgtdev_indexes = 0;
5652         struct btrfs_bio *bbio = NULL;
5653         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
5654         int dev_replace_is_ongoing = 0;
5655         int num_alloc_stripes;
5656         int patch_the_first_stripe_for_dev_replace = 0;
5657         u64 physical_to_patch_in_first_stripe = 0;
5658         u64 raid56_full_stripe_start = (u64)-1;
5659
5660         if (op == BTRFS_MAP_DISCARD)
5661                 return __btrfs_map_block_for_discard(fs_info, logical,
5662                                                      *length, bbio_ret);
5663
5664         em = get_chunk_map(fs_info, logical, *length);
5665         if (IS_ERR(em))
5666                 return PTR_ERR(em);
5667
5668         map = em->map_lookup;
5669         offset = logical - em->start;
5670
5671         stripe_len = map->stripe_len;
5672         stripe_nr = offset;
5673         /*
5674          * stripe_nr counts the total number of stripes we have to stride
5675          * to get to this block
5676          */
5677         stripe_nr = div64_u64(stripe_nr, stripe_len);
5678
5679         stripe_offset = stripe_nr * stripe_len;
5680         if (offset < stripe_offset) {
5681                 btrfs_crit(fs_info,
5682                            "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
5683                            stripe_offset, offset, em->start, logical,
5684                            stripe_len);
5685                 free_extent_map(em);
5686                 return -EINVAL;
5687         }
5688
5689         /* stripe_offset is the offset of this block in its stripe*/
5690         stripe_offset = offset - stripe_offset;
5691
5692         /* if we're here for raid56, we need to know the stripe aligned start */
5693         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5694                 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5695                 raid56_full_stripe_start = offset;
5696
5697                 /* allow a write of a full stripe, but make sure we don't
5698                  * allow straddling of stripes
5699                  */
5700                 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5701                                 full_stripe_len);
5702                 raid56_full_stripe_start *= full_stripe_len;
5703         }
5704
5705         if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5706                 u64 max_len;
5707                 /* For writes to RAID[56], allow a full stripeset across all disks.
5708                    For other RAID types and for RAID[56] reads, just allow a single
5709                    stripe (on a single disk). */
5710                 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5711                     (op == BTRFS_MAP_WRITE)) {
5712                         max_len = stripe_len * nr_data_stripes(map) -
5713                                 (offset - raid56_full_stripe_start);
5714                 } else {
5715                         /* we limit the length of each bio to what fits in a stripe */
5716                         max_len = stripe_len - stripe_offset;
5717                 }
5718                 *length = min_t(u64, em->len - offset, max_len);
5719         } else {
5720                 *length = em->len - offset;
5721         }
5722
5723         /* This is for when we're called from btrfs_merge_bio_hook() and all
5724            it cares about is the length */
5725         if (!bbio_ret)
5726                 goto out;
5727
5728         btrfs_dev_replace_lock(dev_replace, 0);
5729         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
5730         if (!dev_replace_is_ongoing)
5731                 btrfs_dev_replace_unlock(dev_replace, 0);
5732         else
5733                 btrfs_dev_replace_set_lock_blocking(dev_replace);
5734
5735         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
5736             !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
5737                 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
5738                                                     dev_replace->srcdev->devid,
5739                                                     &mirror_num,
5740                                             &physical_to_patch_in_first_stripe);
5741                 if (ret)
5742                         goto out;
5743                 else
5744                         patch_the_first_stripe_for_dev_replace = 1;
5745         } else if (mirror_num > map->num_stripes) {
5746                 mirror_num = 0;
5747         }
5748
5749         num_stripes = 1;
5750         stripe_index = 0;
5751         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5752                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5753                                 &stripe_index);
5754                 if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
5755                         mirror_num = 1;
5756         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
5757                 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
5758                         num_stripes = map->num_stripes;
5759                 else if (mirror_num)
5760                         stripe_index = mirror_num - 1;
5761                 else {
5762                         stripe_index = find_live_mirror(fs_info, map, 0,
5763                                             map->num_stripes,
5764                                             current->pid % map->num_stripes,
5765                                             dev_replace_is_ongoing);
5766                         mirror_num = stripe_index + 1;
5767                 }
5768
5769         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
5770                 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) {
5771                         num_stripes = map->num_stripes;
5772                 } else if (mirror_num) {
5773                         stripe_index = mirror_num - 1;
5774                 } else {
5775                         mirror_num = 1;
5776                 }
5777
5778         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5779                 u32 factor = map->num_stripes / map->sub_stripes;
5780
5781                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5782                 stripe_index *= map->sub_stripes;
5783
5784                 if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
5785                         num_stripes = map->sub_stripes;
5786                 else if (mirror_num)
5787                         stripe_index += mirror_num - 1;
5788                 else {
5789                         int old_stripe_index = stripe_index;
5790                         stripe_index = find_live_mirror(fs_info, map,
5791                                               stripe_index,
5792                                               map->sub_stripes, stripe_index +
5793                                               current->pid % map->sub_stripes,
5794                                               dev_replace_is_ongoing);
5795                         mirror_num = stripe_index - old_stripe_index + 1;
5796                 }
5797
5798         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5799                 if (need_raid_map &&
5800                     (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
5801                      mirror_num > 1)) {
5802                         /* push stripe_nr back to the start of the full stripe */
5803                         stripe_nr = div64_u64(raid56_full_stripe_start,
5804                                         stripe_len * nr_data_stripes(map));
5805
5806                         /* RAID[56] write or recovery. Return all stripes */
5807                         num_stripes = map->num_stripes;
5808                         max_errors = nr_parity_stripes(map);
5809
5810                         *length = map->stripe_len;
5811                         stripe_index = 0;
5812                         stripe_offset = 0;
5813                 } else {
5814                         /*
5815                          * Mirror #0 or #1 means the original data block.
5816                          * Mirror #2 is RAID5 parity block.
5817                          * Mirror #3 is RAID6 Q block.
5818                          */
5819                         stripe_nr = div_u64_rem(stripe_nr,
5820                                         nr_data_stripes(map), &stripe_index);
5821                         if (mirror_num > 1)
5822                                 stripe_index = nr_data_stripes(map) +
5823                                                 mirror_num - 2;
5824
5825                         /* We distribute the parity blocks across stripes */
5826                         div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
5827                                         &stripe_index);
5828                         if ((op != BTRFS_MAP_WRITE &&
5829                              op != BTRFS_MAP_GET_READ_MIRRORS) &&
5830                             mirror_num <= 1)
5831                                 mirror_num = 1;
5832                 }
5833         } else {
5834                 /*
5835                  * after this, stripe_nr is the number of stripes on this
5836                  * device we have to walk to find the data, and stripe_index is
5837                  * the number of our device in the stripe array
5838                  */
5839                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5840                                 &stripe_index);
5841                 mirror_num = stripe_index + 1;
5842         }
5843         if (stripe_index >= map->num_stripes) {
5844                 btrfs_crit(fs_info,
5845                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
5846                            stripe_index, map->num_stripes);
5847                 ret = -EINVAL;
5848                 goto out;
5849         }
5850
5851         num_alloc_stripes = num_stripes;
5852         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
5853                 if (op == BTRFS_MAP_WRITE)
5854                         num_alloc_stripes <<= 1;
5855                 if (op == BTRFS_MAP_GET_READ_MIRRORS)
5856                         num_alloc_stripes++;
5857                 tgtdev_indexes = num_stripes;
5858         }
5859
5860         bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
5861         if (!bbio) {
5862                 ret = -ENOMEM;
5863                 goto out;
5864         }
5865         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
5866                 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5867
5868         /* build raid_map */
5869         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
5870             (need_full_stripe(op) || mirror_num > 1)) {
5871                 u64 tmp;
5872                 unsigned rot;
5873
5874                 bbio->raid_map = (u64 *)((void *)bbio->stripes +
5875                                  sizeof(struct btrfs_bio_stripe) *
5876                                  num_alloc_stripes +
5877                                  sizeof(int) * tgtdev_indexes);
5878
5879                 /* Work out the disk rotation on this stripe-set */
5880                 div_u64_rem(stripe_nr, num_stripes, &rot);
5881
5882                 /* Fill in the logical address of each stripe */
5883                 tmp = stripe_nr * nr_data_stripes(map);
5884                 for (i = 0; i < nr_data_stripes(map); i++)
5885                         bbio->raid_map[(i+rot) % num_stripes] =
5886                                 em->start + (tmp + i) * map->stripe_len;
5887
5888                 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5889                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5890                         bbio->raid_map[(i+rot+1) % num_stripes] =
5891                                 RAID6_Q_STRIPE;
5892         }
5893
5894
5895         for (i = 0; i < num_stripes; i++) {
5896                 bbio->stripes[i].physical =
5897                         map->stripes[stripe_index].physical +
5898                         stripe_offset +
5899                         stripe_nr * map->stripe_len;
5900                 bbio->stripes[i].dev =
5901                         map->stripes[stripe_index].dev;
5902                 stripe_index++;
5903         }
5904
5905         if (need_full_stripe(op))
5906                 max_errors = btrfs_chunk_max_errors(map);
5907
5908         if (bbio->raid_map)
5909                 sort_parity_stripes(bbio, num_stripes);
5910
5911         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
5912             need_full_stripe(op)) {
5913                 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
5914                                           &max_errors);
5915         }
5916
5917         *bbio_ret = bbio;
5918         bbio->map_type = map->type;
5919         bbio->num_stripes = num_stripes;
5920         bbio->max_errors = max_errors;
5921         bbio->mirror_num = mirror_num;
5922
5923         /*
5924          * this is the case that REQ_READ && dev_replace_is_ongoing &&
5925          * mirror_num == num_stripes + 1 && dev_replace target drive is
5926          * available as a mirror
5927          */
5928         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
5929                 WARN_ON(num_stripes > 1);
5930                 bbio->stripes[0].dev = dev_replace->tgtdev;
5931                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5932                 bbio->mirror_num = map->num_stripes + 1;
5933         }
5934 out:
5935         if (dev_replace_is_ongoing) {
5936                 btrfs_dev_replace_clear_lock_blocking(dev_replace);
5937                 btrfs_dev_replace_unlock(dev_replace, 0);
5938         }
5939         free_extent_map(em);
5940         return ret;
5941 }
5942
5943 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5944                       u64 logical, u64 *length,
5945                       struct btrfs_bio **bbio_ret, int mirror_num)
5946 {
5947         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
5948                                  mirror_num, 0);
5949 }
5950
5951 /* For Scrub/replace */
5952 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5953                      u64 logical, u64 *length,
5954                      struct btrfs_bio **bbio_ret)
5955 {
5956         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
5957 }
5958
5959 int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
5960                      u64 chunk_start, u64 physical, u64 devid,
5961                      u64 **logical, int *naddrs, int *stripe_len)
5962 {
5963         struct extent_map *em;
5964         struct map_lookup *map;
5965         u64 *buf;
5966         u64 bytenr;
5967         u64 length;
5968         u64 stripe_nr;
5969         u64 rmap_len;
5970         int i, j, nr = 0;
5971
5972         em = get_chunk_map(fs_info, chunk_start, 1);
5973         if (IS_ERR(em))
5974                 return -EIO;
5975
5976         map = em->map_lookup;
5977         length = em->len;
5978         rmap_len = map->stripe_len;
5979
5980         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5981                 length = div_u64(length, map->num_stripes / map->sub_stripes);
5982         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5983                 length = div_u64(length, map->num_stripes);
5984         else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5985                 length = div_u64(length, nr_data_stripes(map));
5986                 rmap_len = map->stripe_len * nr_data_stripes(map);
5987         }
5988
5989         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
5990         BUG_ON(!buf); /* -ENOMEM */
5991
5992         for (i = 0; i < map->num_stripes; i++) {
5993                 if (devid && map->stripes[i].dev->devid != devid)
5994                         continue;
5995                 if (map->stripes[i].physical > physical ||
5996                     map->stripes[i].physical + length <= physical)
5997                         continue;
5998
5999                 stripe_nr = physical - map->stripes[i].physical;
6000                 stripe_nr = div64_u64(stripe_nr, map->stripe_len);
6001
6002                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6003                         stripe_nr = stripe_nr * map->num_stripes + i;
6004                         stripe_nr = div_u64(stripe_nr, map->sub_stripes);
6005                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6006                         stripe_nr = stripe_nr * map->num_stripes + i;
6007                 } /* else if RAID[56], multiply by nr_data_stripes().
6008                    * Alternatively, just use rmap_len below instead of
6009                    * map->stripe_len */
6010
6011                 bytenr = chunk_start + stripe_nr * rmap_len;
6012                 WARN_ON(nr >= map->num_stripes);
6013                 for (j = 0; j < nr; j++) {
6014                         if (buf[j] == bytenr)
6015                                 break;
6016                 }
6017                 if (j == nr) {
6018                         WARN_ON(nr >= map->num_stripes);
6019                         buf[nr++] = bytenr;
6020                 }
6021         }
6022
6023         *logical = buf;
6024         *naddrs = nr;
6025         *stripe_len = rmap_len;
6026
6027         free_extent_map(em);
6028         return 0;
6029 }
6030
6031 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6032 {
6033         bio->bi_private = bbio->private;
6034         bio->bi_end_io = bbio->end_io;
6035         bio_endio(bio);
6036
6037         btrfs_put_bbio(bbio);
6038 }
6039
6040 static void btrfs_end_bio(struct bio *bio)
6041 {
6042         struct btrfs_bio *bbio = bio->bi_private;
6043         int is_orig_bio = 0;
6044
6045         if (bio->bi_error) {
6046                 atomic_inc(&bbio->error);
6047                 if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) {
6048                         unsigned int stripe_index =
6049                                 btrfs_io_bio(bio)->stripe_index;
6050                         struct btrfs_device *dev;
6051
6052                         BUG_ON(stripe_index >= bbio->num_stripes);
6053                         dev = bbio->stripes[stripe_index].dev;
6054                         if (dev->bdev) {
6055                                 if (bio_op(bio) == REQ_OP_WRITE)
6056                                         btrfs_dev_stat_inc(dev,
6057                                                 BTRFS_DEV_STAT_WRITE_ERRS);
6058                                 else
6059                                         btrfs_dev_stat_inc(dev,
6060                                                 BTRFS_DEV_STAT_READ_ERRS);
6061                                 if (bio->bi_opf & REQ_PREFLUSH)
6062                                         btrfs_dev_stat_inc(dev,
6063                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
6064                                 btrfs_dev_stat_print_on_error(dev);
6065                         }
6066                 }
6067         }
6068
6069         if (bio == bbio->orig_bio)
6070                 is_orig_bio = 1;
6071
6072         btrfs_bio_counter_dec(bbio->fs_info);
6073
6074         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6075                 if (!is_orig_bio) {
6076                         bio_put(bio);
6077                         bio = bbio->orig_bio;
6078                 }
6079
6080                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6081                 /* only send an error to the higher layers if it is
6082                  * beyond the tolerance of the btrfs bio
6083                  */
6084                 if (atomic_read(&bbio->error) > bbio->max_errors) {
6085                         bio->bi_error = -EIO;
6086                 } else {
6087                         /*
6088                          * this bio is actually up to date, we didn't
6089                          * go over the max number of errors
6090                          */
6091                         bio->bi_error = 0;
6092                 }
6093
6094                 btrfs_end_bbio(bbio, bio);
6095         } else if (!is_orig_bio) {
6096                 bio_put(bio);
6097         }
6098 }
6099
6100 /*
6101  * see run_scheduled_bios for a description of why bios are collected for
6102  * async submit.
6103  *
6104  * This will add one bio to the pending list for a device and make sure
6105  * the work struct is scheduled.
6106  */
6107 static noinline void btrfs_schedule_bio(struct btrfs_device *device,
6108                                         struct bio *bio)
6109 {
6110         struct btrfs_fs_info *fs_info = device->fs_info;
6111         int should_queue = 1;
6112         struct btrfs_pending_bios *pending_bios;
6113
6114         if (device->missing || !device->bdev) {
6115                 bio_io_error(bio);
6116                 return;
6117         }
6118
6119         /* don't bother with additional async steps for reads, right now */
6120         if (bio_op(bio) == REQ_OP_READ) {
6121                 bio_get(bio);
6122                 btrfsic_submit_bio(bio);
6123                 bio_put(bio);
6124                 return;
6125         }
6126
6127         /*
6128          * nr_async_bios allows us to reliably return congestion to the
6129          * higher layers.  Otherwise, the async bio makes it appear we have
6130          * made progress against dirty pages when we've really just put it
6131          * on a queue for later
6132          */
6133         atomic_inc(&fs_info->nr_async_bios);
6134         WARN_ON(bio->bi_next);
6135         bio->bi_next = NULL;
6136
6137         spin_lock(&device->io_lock);
6138         if (op_is_sync(bio->bi_opf))
6139                 pending_bios = &device->pending_sync_bios;
6140         else
6141                 pending_bios = &device->pending_bios;
6142
6143         if (pending_bios->tail)
6144                 pending_bios->tail->bi_next = bio;
6145
6146         pending_bios->tail = bio;
6147         if (!pending_bios->head)
6148                 pending_bios->head = bio;
6149         if (device->running_pending)
6150                 should_queue = 0;
6151
6152         spin_unlock(&device->io_lock);
6153
6154         if (should_queue)
6155                 btrfs_queue_work(fs_info->submit_workers, &device->work);
6156 }
6157
6158 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6159                               u64 physical, int dev_nr, int async)
6160 {
6161         struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
6162         struct btrfs_fs_info *fs_info = bbio->fs_info;
6163
6164         bio->bi_private = bbio;
6165         btrfs_io_bio(bio)->stripe_index = dev_nr;
6166         bio->bi_end_io = btrfs_end_bio;
6167         bio->bi_iter.bi_sector = physical >> 9;
6168 #ifdef DEBUG
6169         {
6170                 struct rcu_string *name;
6171
6172                 rcu_read_lock();
6173                 name = rcu_dereference(dev->name);
6174                 btrfs_debug(fs_info,
6175                         "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6176                         bio_op(bio), bio->bi_opf,
6177                         (u64)bio->bi_iter.bi_sector,
6178                         (u_long)dev->bdev->bd_dev, name->str, dev->devid,
6179                         bio->bi_iter.bi_size);
6180                 rcu_read_unlock();
6181         }
6182 #endif
6183         bio->bi_bdev = dev->bdev;
6184
6185         btrfs_bio_counter_inc_noblocked(fs_info);
6186
6187         if (async)
6188                 btrfs_schedule_bio(dev, bio);
6189         else
6190                 btrfsic_submit_bio(bio);
6191 }
6192
6193 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6194 {
6195         atomic_inc(&bbio->error);
6196         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6197                 /* Should be the original bio. */
6198                 WARN_ON(bio != bbio->orig_bio);
6199
6200                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6201                 bio->bi_iter.bi_sector = logical >> 9;
6202                 bio->bi_error = -EIO;
6203                 btrfs_end_bbio(bbio, bio);
6204         }
6205 }
6206
6207 int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6208                   int mirror_num, int async_submit)
6209 {
6210         struct btrfs_device *dev;
6211         struct bio *first_bio = bio;
6212         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6213         u64 length = 0;
6214         u64 map_length;
6215         int ret;
6216         int dev_nr;
6217         int total_devs;
6218         struct btrfs_bio *bbio = NULL;
6219
6220         length = bio->bi_iter.bi_size;
6221         map_length = length;
6222
6223         btrfs_bio_counter_inc_blocked(fs_info);
6224         ret = __btrfs_map_block(fs_info, bio_op(bio), logical,
6225                                 &map_length, &bbio, mirror_num, 1);
6226         if (ret) {
6227                 btrfs_bio_counter_dec(fs_info);
6228                 return ret;
6229         }
6230
6231         total_devs = bbio->num_stripes;
6232         bbio->orig_bio = first_bio;
6233         bbio->private = first_bio->bi_private;
6234         bbio->end_io = first_bio->bi_end_io;
6235         bbio->fs_info = fs_info;
6236         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6237
6238         if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6239             ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6240                 /* In this case, map_length has been set to the length of
6241                    a single stripe; not the whole write */
6242                 if (bio_op(bio) == REQ_OP_WRITE) {
6243                         ret = raid56_parity_write(fs_info, bio, bbio,
6244                                                   map_length);
6245                 } else {
6246                         ret = raid56_parity_recover(fs_info, bio, bbio,
6247                                                     map_length, mirror_num, 1);
6248                 }
6249
6250                 btrfs_bio_counter_dec(fs_info);
6251                 return ret;
6252         }
6253
6254         if (map_length < length) {
6255                 btrfs_crit(fs_info,
6256                            "mapping failed logical %llu bio len %llu len %llu",
6257                            logical, length, map_length);
6258                 BUG();
6259         }
6260
6261         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6262                 dev = bbio->stripes[dev_nr].dev;
6263                 if (!dev || !dev->bdev ||
6264                     (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
6265                         bbio_error(bbio, first_bio, logical);
6266                         continue;
6267                 }
6268
6269                 if (dev_nr < total_devs - 1) {
6270                         bio = btrfs_bio_clone(first_bio, GFP_NOFS);
6271                         BUG_ON(!bio); /* -ENOMEM */
6272                 } else
6273                         bio = first_bio;
6274
6275                 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6276                                   dev_nr, async_submit);
6277         }
6278         btrfs_bio_counter_dec(fs_info);
6279         return 0;
6280 }
6281
6282 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
6283                                        u8 *uuid, u8 *fsid)
6284 {
6285         struct btrfs_device *device;
6286         struct btrfs_fs_devices *cur_devices;
6287
6288         cur_devices = fs_info->fs_devices;
6289         while (cur_devices) {
6290                 if (!fsid ||
6291                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
6292                         device = __find_device(&cur_devices->devices,
6293                                                devid, uuid);
6294                         if (device)
6295                                 return device;
6296                 }
6297                 cur_devices = cur_devices->seed;
6298         }
6299         return NULL;
6300 }
6301
6302 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6303                                             u64 devid, u8 *dev_uuid)
6304 {
6305         struct btrfs_device *device;
6306
6307         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6308         if (IS_ERR(device))
6309                 return NULL;
6310
6311         list_add(&device->dev_list, &fs_devices->devices);
6312         device->fs_devices = fs_devices;
6313         fs_devices->num_devices++;
6314
6315         device->missing = 1;
6316         fs_devices->missing_devices++;
6317
6318         return device;
6319 }
6320
6321 /**
6322  * btrfs_alloc_device - allocate struct btrfs_device
6323  * @fs_info:    used only for generating a new devid, can be NULL if
6324  *              devid is provided (i.e. @devid != NULL).
6325  * @devid:      a pointer to devid for this device.  If NULL a new devid
6326  *              is generated.
6327  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6328  *              is generated.
6329  *
6330  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6331  * on error.  Returned struct is not linked onto any lists and can be
6332  * destroyed with kfree() right away.
6333  */
6334 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6335                                         const u64 *devid,
6336                                         const u8 *uuid)
6337 {
6338         struct btrfs_device *dev;
6339         u64 tmp;
6340
6341         if (WARN_ON(!devid && !fs_info))
6342                 return ERR_PTR(-EINVAL);
6343
6344         dev = __alloc_device();
6345         if (IS_ERR(dev))
6346                 return dev;
6347
6348         if (devid)
6349                 tmp = *devid;
6350         else {
6351                 int ret;
6352
6353                 ret = find_next_devid(fs_info, &tmp);
6354                 if (ret) {
6355                         kfree(dev);
6356                         return ERR_PTR(ret);
6357                 }
6358         }
6359         dev->devid = tmp;
6360
6361         if (uuid)
6362                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6363         else
6364                 generate_random_uuid(dev->uuid);
6365
6366         btrfs_init_work(&dev->work, btrfs_submit_helper,
6367                         pending_bios_fn, NULL, NULL);
6368
6369         return dev;
6370 }
6371
6372 /* Return -EIO if any error, otherwise return 0. */
6373 static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
6374                                    struct extent_buffer *leaf,
6375                                    struct btrfs_chunk *chunk, u64 logical)
6376 {
6377         u64 length;
6378         u64 stripe_len;
6379         u16 num_stripes;
6380         u16 sub_stripes;
6381         u64 type;
6382
6383         length = btrfs_chunk_length(leaf, chunk);
6384         stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6385         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6386         sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6387         type = btrfs_chunk_type(leaf, chunk);
6388
6389         if (!num_stripes) {
6390                 btrfs_err(fs_info, "invalid chunk num_stripes: %u",
6391                           num_stripes);
6392                 return -EIO;
6393         }
6394         if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
6395                 btrfs_err(fs_info, "invalid chunk logical %llu", logical);
6396                 return -EIO;
6397         }
6398         if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
6399                 btrfs_err(fs_info, "invalid chunk sectorsize %u",
6400                           btrfs_chunk_sector_size(leaf, chunk));
6401                 return -EIO;
6402         }
6403         if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
6404                 btrfs_err(fs_info, "invalid chunk length %llu", length);
6405                 return -EIO;
6406         }
6407         if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
6408                 btrfs_err(fs_info, "invalid chunk stripe length: %llu",
6409                           stripe_len);
6410                 return -EIO;
6411         }
6412         if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
6413             type) {
6414                 btrfs_err(fs_info, "unrecognized chunk type: %llu",
6415                           ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
6416                             BTRFS_BLOCK_GROUP_PROFILE_MASK) &
6417                           btrfs_chunk_type(leaf, chunk));
6418                 return -EIO;
6419         }
6420         if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
6421             (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
6422             (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
6423             (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
6424             (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
6425             ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
6426              num_stripes != 1)) {
6427                 btrfs_err(fs_info,
6428                         "invalid num_stripes:sub_stripes %u:%u for profile %llu",
6429                         num_stripes, sub_stripes,
6430                         type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
6431                 return -EIO;
6432         }
6433
6434         return 0;
6435 }
6436
6437 static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6438                           struct extent_buffer *leaf,
6439                           struct btrfs_chunk *chunk)
6440 {
6441         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
6442         struct map_lookup *map;
6443         struct extent_map *em;
6444         u64 logical;
6445         u64 length;
6446         u64 stripe_len;
6447         u64 devid;
6448         u8 uuid[BTRFS_UUID_SIZE];
6449         int num_stripes;
6450         int ret;
6451         int i;
6452
6453         logical = key->offset;
6454         length = btrfs_chunk_length(leaf, chunk);
6455         stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6456         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6457
6458         ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
6459         if (ret)
6460                 return ret;
6461
6462         read_lock(&map_tree->map_tree.lock);
6463         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6464         read_unlock(&map_tree->map_tree.lock);
6465
6466         /* already mapped? */
6467         if (em && em->start <= logical && em->start + em->len > logical) {
6468                 free_extent_map(em);
6469                 return 0;
6470         } else if (em) {
6471                 free_extent_map(em);
6472         }
6473
6474         em = alloc_extent_map();
6475         if (!em)
6476                 return -ENOMEM;
6477         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6478         if (!map) {
6479                 free_extent_map(em);
6480                 return -ENOMEM;
6481         }
6482
6483         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6484         em->map_lookup = map;
6485         em->start = logical;
6486         em->len = length;
6487         em->orig_start = 0;
6488         em->block_start = 0;
6489         em->block_len = em->len;
6490
6491         map->num_stripes = num_stripes;
6492         map->io_width = btrfs_chunk_io_width(leaf, chunk);
6493         map->io_align = btrfs_chunk_io_align(leaf, chunk);
6494         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
6495         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6496         map->type = btrfs_chunk_type(leaf, chunk);
6497         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6498         for (i = 0; i < num_stripes; i++) {
6499                 map->stripes[i].physical =
6500                         btrfs_stripe_offset_nr(leaf, chunk, i);
6501                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6502                 read_extent_buffer(leaf, uuid, (unsigned long)
6503                                    btrfs_stripe_dev_uuid_nr(chunk, i),
6504                                    BTRFS_UUID_SIZE);
6505                 map->stripes[i].dev = btrfs_find_device(fs_info, devid,
6506                                                         uuid, NULL);
6507                 if (!map->stripes[i].dev &&
6508                     !btrfs_test_opt(fs_info, DEGRADED)) {
6509                         free_extent_map(em);
6510                         return -EIO;
6511                 }
6512                 if (!map->stripes[i].dev) {
6513                         map->stripes[i].dev =
6514                                 add_missing_dev(fs_info->fs_devices, devid,
6515                                                 uuid);
6516                         if (!map->stripes[i].dev) {
6517                                 free_extent_map(em);
6518                                 return -EIO;
6519                         }
6520                         btrfs_warn(fs_info, "devid %llu uuid %pU is missing",
6521                                    devid, uuid);
6522                 }
6523                 map->stripes[i].dev->in_fs_metadata = 1;
6524         }
6525
6526         write_lock(&map_tree->map_tree.lock);
6527         ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6528         write_unlock(&map_tree->map_tree.lock);
6529         BUG_ON(ret); /* Tree corruption */
6530         free_extent_map(em);
6531
6532         return 0;
6533 }
6534
6535 static void fill_device_from_item(struct extent_buffer *leaf,
6536                                  struct btrfs_dev_item *dev_item,
6537                                  struct btrfs_device *device)
6538 {
6539         unsigned long ptr;
6540
6541         device->devid = btrfs_device_id(leaf, dev_item);
6542         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6543         device->total_bytes = device->disk_total_bytes;
6544         device->commit_total_bytes = device->disk_total_bytes;
6545         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6546         device->commit_bytes_used = device->bytes_used;
6547         device->type = btrfs_device_type(leaf, dev_item);
6548         device->io_align = btrfs_device_io_align(leaf, dev_item);
6549         device->io_width = btrfs_device_io_width(leaf, dev_item);
6550         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6551         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6552         device->is_tgtdev_for_dev_replace = 0;
6553
6554         ptr = btrfs_device_uuid(dev_item);
6555         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6556 }
6557
6558 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6559                                                   u8 *fsid)
6560 {
6561         struct btrfs_fs_devices *fs_devices;
6562         int ret;
6563
6564         BUG_ON(!mutex_is_locked(&uuid_mutex));
6565
6566         fs_devices = fs_info->fs_devices->seed;
6567         while (fs_devices) {
6568                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
6569                         return fs_devices;
6570
6571                 fs_devices = fs_devices->seed;
6572         }
6573
6574         fs_devices = find_fsid(fsid);
6575         if (!fs_devices) {
6576                 if (!btrfs_test_opt(fs_info, DEGRADED))
6577                         return ERR_PTR(-ENOENT);
6578
6579                 fs_devices = alloc_fs_devices(fsid);
6580                 if (IS_ERR(fs_devices))
6581                         return fs_devices;
6582
6583                 fs_devices->seeding = 1;
6584                 fs_devices->opened = 1;
6585                 return fs_devices;
6586         }
6587
6588         fs_devices = clone_fs_devices(fs_devices);
6589         if (IS_ERR(fs_devices))
6590                 return fs_devices;
6591
6592         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
6593                                    fs_info->bdev_holder);
6594         if (ret) {
6595                 free_fs_devices(fs_devices);
6596                 fs_devices = ERR_PTR(ret);
6597                 goto out;
6598         }
6599
6600         if (!fs_devices->seeding) {
6601                 __btrfs_close_devices(fs_devices);
6602                 free_fs_devices(fs_devices);
6603                 fs_devices = ERR_PTR(-EINVAL);
6604                 goto out;
6605         }
6606
6607         fs_devices->seed = fs_info->fs_devices->seed;
6608         fs_info->fs_devices->seed = fs_devices;
6609 out:
6610         return fs_devices;
6611 }
6612
6613 static int read_one_dev(struct btrfs_fs_info *fs_info,
6614                         struct extent_buffer *leaf,
6615                         struct btrfs_dev_item *dev_item)
6616 {
6617         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6618         struct btrfs_device *device;
6619         u64 devid;
6620         int ret;
6621         u8 fs_uuid[BTRFS_UUID_SIZE];
6622         u8 dev_uuid[BTRFS_UUID_SIZE];
6623
6624         devid = btrfs_device_id(leaf, dev_item);
6625         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6626                            BTRFS_UUID_SIZE);
6627         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6628                            BTRFS_UUID_SIZE);
6629
6630         if (memcmp(fs_uuid, fs_info->fsid, BTRFS_UUID_SIZE)) {
6631                 fs_devices = open_seed_devices(fs_info, fs_uuid);
6632                 if (IS_ERR(fs_devices))
6633                         return PTR_ERR(fs_devices);
6634         }
6635
6636         device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
6637         if (!device) {
6638                 if (!btrfs_test_opt(fs_info, DEGRADED))
6639                         return -EIO;
6640
6641                 device = add_missing_dev(fs_devices, devid, dev_uuid);
6642                 if (!device)
6643                         return -ENOMEM;
6644                 btrfs_warn(fs_info, "devid %llu uuid %pU missing",
6645                                 devid, dev_uuid);
6646         } else {
6647                 if (!device->bdev && !btrfs_test_opt(fs_info, DEGRADED))
6648                         return -EIO;
6649
6650                 if(!device->bdev && !device->missing) {
6651                         /*
6652                          * this happens when a device that was properly setup
6653                          * in the device info lists suddenly goes bad.
6654                          * device->bdev is NULL, and so we have to set
6655                          * device->missing to one here
6656                          */
6657                         device->fs_devices->missing_devices++;
6658                         device->missing = 1;
6659                 }
6660
6661                 /* Move the device to its own fs_devices */
6662                 if (device->fs_devices != fs_devices) {
6663                         ASSERT(device->missing);
6664
6665                         list_move(&device->dev_list, &fs_devices->devices);
6666                         device->fs_devices->num_devices--;
6667                         fs_devices->num_devices++;
6668
6669                         device->fs_devices->missing_devices--;
6670                         fs_devices->missing_devices++;
6671
6672                         device->fs_devices = fs_devices;
6673                 }
6674         }
6675
6676         if (device->fs_devices != fs_info->fs_devices) {
6677                 BUG_ON(device->writeable);
6678                 if (device->generation !=
6679                     btrfs_device_generation(leaf, dev_item))
6680                         return -EINVAL;
6681         }
6682
6683         fill_device_from_item(leaf, dev_item, device);
6684         device->in_fs_metadata = 1;
6685         if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6686                 device->fs_devices->total_rw_bytes += device->total_bytes;
6687                 spin_lock(&fs_info->free_chunk_lock);
6688                 fs_info->free_chunk_space += device->total_bytes -
6689                         device->bytes_used;
6690                 spin_unlock(&fs_info->free_chunk_lock);
6691         }
6692         ret = 0;
6693         return ret;
6694 }
6695
6696 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6697 {
6698         struct btrfs_root *root = fs_info->tree_root;
6699         struct btrfs_super_block *super_copy = fs_info->super_copy;
6700         struct extent_buffer *sb;
6701         struct btrfs_disk_key *disk_key;
6702         struct btrfs_chunk *chunk;
6703         u8 *array_ptr;
6704         unsigned long sb_array_offset;
6705         int ret = 0;
6706         u32 num_stripes;
6707         u32 array_size;
6708         u32 len = 0;
6709         u32 cur_offset;
6710         u64 type;
6711         struct btrfs_key key;
6712
6713         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6714         /*
6715          * This will create extent buffer of nodesize, superblock size is
6716          * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6717          * overallocate but we can keep it as-is, only the first page is used.
6718          */
6719         sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
6720         if (IS_ERR(sb))
6721                 return PTR_ERR(sb);
6722         set_extent_buffer_uptodate(sb);
6723         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6724         /*
6725          * The sb extent buffer is artificial and just used to read the system array.
6726          * set_extent_buffer_uptodate() call does not properly mark all it's
6727          * pages up-to-date when the page is larger: extent does not cover the
6728          * whole page and consequently check_page_uptodate does not find all
6729          * the page's extents up-to-date (the hole beyond sb),
6730          * write_extent_buffer then triggers a WARN_ON.
6731          *
6732          * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
6733          * but sb spans only this function. Add an explicit SetPageUptodate call
6734          * to silence the warning eg. on PowerPC 64.
6735          */
6736         if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
6737                 SetPageUptodate(sb->pages[0]);
6738
6739         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6740         array_size = btrfs_super_sys_array_size(super_copy);
6741
6742         array_ptr = super_copy->sys_chunk_array;
6743         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6744         cur_offset = 0;
6745
6746         while (cur_offset < array_size) {
6747                 disk_key = (struct btrfs_disk_key *)array_ptr;
6748                 len = sizeof(*disk_key);
6749                 if (cur_offset + len > array_size)
6750                         goto out_short_read;
6751
6752                 btrfs_disk_key_to_cpu(&key, disk_key);
6753
6754                 array_ptr += len;
6755                 sb_array_offset += len;
6756                 cur_offset += len;
6757
6758                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6759                         chunk = (struct btrfs_chunk *)sb_array_offset;
6760                         /*
6761                          * At least one btrfs_chunk with one stripe must be
6762                          * present, exact stripe count check comes afterwards
6763                          */
6764                         len = btrfs_chunk_item_size(1);
6765                         if (cur_offset + len > array_size)
6766                                 goto out_short_read;
6767
6768                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6769                         if (!num_stripes) {
6770                                 btrfs_err(fs_info,
6771                                         "invalid number of stripes %u in sys_array at offset %u",
6772                                         num_stripes, cur_offset);
6773                                 ret = -EIO;
6774                                 break;
6775                         }
6776
6777                         type = btrfs_chunk_type(sb, chunk);
6778                         if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6779                                 btrfs_err(fs_info,
6780                             "invalid chunk type %llu in sys_array at offset %u",
6781                                         type, cur_offset);
6782                                 ret = -EIO;
6783                                 break;
6784                         }
6785
6786                         len = btrfs_chunk_item_size(num_stripes);
6787                         if (cur_offset + len > array_size)
6788                                 goto out_short_read;
6789
6790                         ret = read_one_chunk(fs_info, &key, sb, chunk);
6791                         if (ret)
6792                                 break;
6793                 } else {
6794                         btrfs_err(fs_info,
6795                             "unexpected item type %u in sys_array at offset %u",
6796                                   (u32)key.type, cur_offset);
6797                         ret = -EIO;
6798                         break;
6799                 }
6800                 array_ptr += len;
6801                 sb_array_offset += len;
6802                 cur_offset += len;
6803         }
6804         clear_extent_buffer_uptodate(sb);
6805         free_extent_buffer_stale(sb);
6806         return ret;
6807
6808 out_short_read:
6809         btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
6810                         len, cur_offset);
6811         clear_extent_buffer_uptodate(sb);
6812         free_extent_buffer_stale(sb);
6813         return -EIO;
6814 }
6815
6816 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
6817 {
6818         struct btrfs_root *root = fs_info->chunk_root;
6819         struct btrfs_path *path;
6820         struct extent_buffer *leaf;
6821         struct btrfs_key key;
6822         struct btrfs_key found_key;
6823         int ret;
6824         int slot;
6825         u64 total_dev = 0;
6826
6827         path = btrfs_alloc_path();
6828         if (!path)
6829                 return -ENOMEM;
6830
6831         mutex_lock(&uuid_mutex);
6832         mutex_lock(&fs_info->chunk_mutex);
6833
6834         /*
6835          * Read all device items, and then all the chunk items. All
6836          * device items are found before any chunk item (their object id
6837          * is smaller than the lowest possible object id for a chunk
6838          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
6839          */
6840         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
6841         key.offset = 0;
6842         key.type = 0;
6843         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6844         if (ret < 0)
6845                 goto error;
6846         while (1) {
6847                 leaf = path->nodes[0];
6848                 slot = path->slots[0];
6849                 if (slot >= btrfs_header_nritems(leaf)) {
6850                         ret = btrfs_next_leaf(root, path);
6851                         if (ret == 0)
6852                                 continue;
6853                         if (ret < 0)
6854                                 goto error;
6855                         break;
6856                 }
6857                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6858                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
6859                         struct btrfs_dev_item *dev_item;
6860                         dev_item = btrfs_item_ptr(leaf, slot,
6861                                                   struct btrfs_dev_item);
6862                         ret = read_one_dev(fs_info, leaf, dev_item);
6863                         if (ret)
6864                                 goto error;
6865                         total_dev++;
6866                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
6867                         struct btrfs_chunk *chunk;
6868                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6869                         ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
6870                         if (ret)
6871                                 goto error;
6872                 }
6873                 path->slots[0]++;
6874         }
6875
6876         /*
6877          * After loading chunk tree, we've got all device information,
6878          * do another round of validation checks.
6879          */
6880         if (total_dev != fs_info->fs_devices->total_devices) {
6881                 btrfs_err(fs_info,
6882            "super_num_devices %llu mismatch with num_devices %llu found here",
6883                           btrfs_super_num_devices(fs_info->super_copy),
6884                           total_dev);
6885                 ret = -EINVAL;
6886                 goto error;
6887         }
6888         if (btrfs_super_total_bytes(fs_info->super_copy) <
6889             fs_info->fs_devices->total_rw_bytes) {
6890                 btrfs_err(fs_info,
6891         "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
6892                           btrfs_super_total_bytes(fs_info->super_copy),
6893                           fs_info->fs_devices->total_rw_bytes);
6894                 ret = -EINVAL;
6895                 goto error;
6896         }
6897         ret = 0;
6898 error:
6899         mutex_unlock(&fs_info->chunk_mutex);
6900         mutex_unlock(&uuid_mutex);
6901
6902         btrfs_free_path(path);
6903         return ret;
6904 }
6905
6906 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
6907 {
6908         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6909         struct btrfs_device *device;
6910
6911         while (fs_devices) {
6912                 mutex_lock(&fs_devices->device_list_mutex);
6913                 list_for_each_entry(device, &fs_devices->devices, dev_list)
6914                         device->fs_info = fs_info;
6915                 mutex_unlock(&fs_devices->device_list_mutex);
6916
6917                 fs_devices = fs_devices->seed;
6918         }
6919 }
6920
6921 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
6922 {
6923         int i;
6924
6925         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6926                 btrfs_dev_stat_reset(dev, i);
6927 }
6928
6929 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6930 {
6931         struct btrfs_key key;
6932         struct btrfs_key found_key;
6933         struct btrfs_root *dev_root = fs_info->dev_root;
6934         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6935         struct extent_buffer *eb;
6936         int slot;
6937         int ret = 0;
6938         struct btrfs_device *device;
6939         struct btrfs_path *path = NULL;
6940         int i;
6941
6942         path = btrfs_alloc_path();
6943         if (!path) {
6944                 ret = -ENOMEM;
6945                 goto out;
6946         }
6947
6948         mutex_lock(&fs_devices->device_list_mutex);
6949         list_for_each_entry(device, &fs_devices->devices, dev_list) {
6950                 int item_size;
6951                 struct btrfs_dev_stats_item *ptr;
6952
6953                 key.objectid = BTRFS_DEV_STATS_OBJECTID;
6954                 key.type = BTRFS_PERSISTENT_ITEM_KEY;
6955                 key.offset = device->devid;
6956                 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6957                 if (ret) {
6958                         __btrfs_reset_dev_stats(device);
6959                         device->dev_stats_valid = 1;
6960                         btrfs_release_path(path);
6961                         continue;
6962                 }
6963                 slot = path->slots[0];
6964                 eb = path->nodes[0];
6965                 btrfs_item_key_to_cpu(eb, &found_key, slot);
6966                 item_size = btrfs_item_size_nr(eb, slot);
6967
6968                 ptr = btrfs_item_ptr(eb, slot,
6969                                      struct btrfs_dev_stats_item);
6970
6971                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6972                         if (item_size >= (1 + i) * sizeof(__le64))
6973                                 btrfs_dev_stat_set(device, i,
6974                                         btrfs_dev_stats_value(eb, ptr, i));
6975                         else
6976                                 btrfs_dev_stat_reset(device, i);
6977                 }
6978
6979                 device->dev_stats_valid = 1;
6980                 btrfs_dev_stat_print_on_load(device);
6981                 btrfs_release_path(path);
6982         }
6983         mutex_unlock(&fs_devices->device_list_mutex);
6984
6985 out:
6986         btrfs_free_path(path);
6987         return ret < 0 ? ret : 0;
6988 }
6989
6990 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6991                                 struct btrfs_fs_info *fs_info,
6992                                 struct btrfs_device *device)
6993 {
6994         struct btrfs_root *dev_root = fs_info->dev_root;
6995         struct btrfs_path *path;
6996         struct btrfs_key key;
6997         struct extent_buffer *eb;
6998         struct btrfs_dev_stats_item *ptr;
6999         int ret;
7000         int i;
7001
7002         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7003         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7004         key.offset = device->devid;
7005
7006         path = btrfs_alloc_path();
7007         if (!path)
7008                 return -ENOMEM;
7009         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7010         if (ret < 0) {
7011                 btrfs_warn_in_rcu(fs_info,
7012                         "error %d while searching for dev_stats item for device %s",
7013                               ret, rcu_str_deref(device->name));
7014                 goto out;
7015         }
7016
7017         if (ret == 0 &&
7018             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7019                 /* need to delete old one and insert a new one */
7020                 ret = btrfs_del_item(trans, dev_root, path);
7021                 if (ret != 0) {
7022                         btrfs_warn_in_rcu(fs_info,
7023                                 "delete too small dev_stats item for device %s failed %d",
7024                                       rcu_str_deref(device->name), ret);
7025                         goto out;
7026                 }
7027                 ret = 1;
7028         }
7029
7030         if (ret == 1) {
7031                 /* need to insert a new item */
7032                 btrfs_release_path(path);
7033                 ret = btrfs_insert_empty_item(trans, dev_root, path,
7034                                               &key, sizeof(*ptr));
7035                 if (ret < 0) {
7036                         btrfs_warn_in_rcu(fs_info,
7037                                 "insert dev_stats item for device %s failed %d",
7038                                 rcu_str_deref(device->name), ret);
7039                         goto out;
7040                 }
7041         }
7042
7043         eb = path->nodes[0];
7044         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7045         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7046                 btrfs_set_dev_stats_value(eb, ptr, i,
7047                                           btrfs_dev_stat_read(device, i));
7048         btrfs_mark_buffer_dirty(eb);
7049
7050 out:
7051         btrfs_free_path(path);
7052         return ret;
7053 }
7054
7055 /*
7056  * called from commit_transaction. Writes all changed device stats to disk.
7057  */
7058 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
7059                         struct btrfs_fs_info *fs_info)
7060 {
7061         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7062         struct btrfs_device *device;
7063         int stats_cnt;
7064         int ret = 0;
7065
7066         mutex_lock(&fs_devices->device_list_mutex);
7067         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7068                 if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
7069                         continue;
7070
7071                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7072                 ret = update_dev_stat_item(trans, fs_info, device);
7073                 if (!ret)
7074                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7075         }
7076         mutex_unlock(&fs_devices->device_list_mutex);
7077
7078         return ret;
7079 }
7080
/*
 * Increment the device statistics counter selected by @index on @dev via
 * btrfs_dev_stat_inc(), then hand @dev to btrfs_dev_stat_print_on_error()
 * so the device's error counters get logged.
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
        btrfs_dev_stat_inc(dev, index);
        btrfs_dev_stat_print_on_error(dev);
}
7086
7087 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7088 {
7089         if (!dev->dev_stats_valid)
7090                 return;
7091         btrfs_err_rl_in_rcu(dev->fs_info,
7092                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7093                            rcu_str_deref(dev->name),
7094                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7095                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7096                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7097                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7098                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7099 }
7100
7101 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7102 {
7103         int i;
7104
7105         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7106                 if (btrfs_dev_stat_read(dev, i) != 0)
7107                         break;
7108         if (i == BTRFS_DEV_STAT_VALUES_MAX)
7109                 return; /* all values == 0, suppress message */
7110
7111         btrfs_info_in_rcu(dev->fs_info,
7112                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7113                rcu_str_deref(dev->name),
7114                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7115                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7116                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7117                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7118                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7119 }
7120
7121 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7122                         struct btrfs_ioctl_get_dev_stats *stats)
7123 {
7124         struct btrfs_device *dev;
7125         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7126         int i;
7127
7128         mutex_lock(&fs_devices->device_list_mutex);
7129         dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
7130         mutex_unlock(&fs_devices->device_list_mutex);
7131
7132         if (!dev) {
7133                 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7134                 return -ENODEV;
7135         } else if (!dev->dev_stats_valid) {
7136                 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7137                 return -ENODEV;
7138         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7139                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7140                         if (stats->nr_items > i)
7141                                 stats->values[i] =
7142                                         btrfs_dev_stat_read_and_reset(dev, i);
7143                         else
7144                                 btrfs_dev_stat_reset(dev, i);
7145                 }
7146         } else {
7147                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7148                         if (stats->nr_items > i)
7149                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
7150         }
7151         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7152                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7153         return 0;
7154 }
7155
7156 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
7157 {
7158         struct buffer_head *bh;
7159         struct btrfs_super_block *disk_super;
7160         int copy_num;
7161
7162         if (!bdev)
7163                 return;
7164
7165         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
7166                 copy_num++) {
7167
7168                 if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
7169                         continue;
7170
7171                 disk_super = (struct btrfs_super_block *)bh->b_data;
7172
7173                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
7174                 set_buffer_dirty(bh);
7175                 sync_dirty_buffer(bh);
7176                 brelse(bh);
7177         }
7178
7179         /* Notify udev that device has changed */
7180         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
7181
7182         /* Update ctime/mtime for device path for libblkid */
7183         update_dev_time(device_path);
7184 }
7185
7186 /*
7187  * Update the size of all devices, which is used for writing out the
7188  * super blocks.
7189  */
7190 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
7191 {
7192         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7193         struct btrfs_device *curr, *next;
7194
7195         if (list_empty(&fs_devices->resized_devices))
7196                 return;
7197
7198         mutex_lock(&fs_devices->device_list_mutex);
7199         mutex_lock(&fs_info->chunk_mutex);
7200         list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
7201                                  resized_list) {
7202                 list_del_init(&curr->resized_list);
7203                 curr->commit_total_bytes = curr->disk_total_bytes;
7204         }
7205         mutex_unlock(&fs_info->chunk_mutex);
7206         mutex_unlock(&fs_devices->device_list_mutex);
7207 }
7208
7209 /* Must be invoked during the transaction commit */
7210 void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
7211                                         struct btrfs_transaction *transaction)
7212 {
7213         struct extent_map *em;
7214         struct map_lookup *map;
7215         struct btrfs_device *dev;
7216         int i;
7217
7218         if (list_empty(&transaction->pending_chunks))
7219                 return;
7220
7221         /* In order to kick the device replace finish process */
7222         mutex_lock(&fs_info->chunk_mutex);
7223         list_for_each_entry(em, &transaction->pending_chunks, list) {
7224                 map = em->map_lookup;
7225
7226                 for (i = 0; i < map->num_stripes; i++) {
7227                         dev = map->stripes[i].dev;
7228                         dev->commit_bytes_used = dev->bytes_used;
7229                 }
7230         }
7231         mutex_unlock(&fs_info->chunk_mutex);
7232 }
7233
7234 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
7235 {
7236         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7237         while (fs_devices) {
7238                 fs_devices->fs_info = fs_info;
7239                 fs_devices = fs_devices->seed;
7240         }
7241 }
7242
7243 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
7244 {
7245         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7246         while (fs_devices) {
7247                 fs_devices->fs_info = NULL;
7248                 fs_devices = fs_devices->seed;
7249         }
7250 }