/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

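/*
 * Free every btrfs_device hanging off @fs_devices and then the
 * btrfs_fs_devices struct itself.  The caller must guarantee the
 * devices are no longer opened (hence the WARN_ON below).
 */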
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                rcu_string_free(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

void btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

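/*
 * Splice a partially-processed bio chain (from @head to @tail) back onto
 * the front of @pending_bios so the bios are retried in their original
 * order.  The caller must hold device->io_lock.
 */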
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{

        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, set up a plug here and finish it off before we return.
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
                atomic_dec(&fs_info->nr_async_bios);

                if (atomic_read(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
                        cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                if (need_resched())
                                        cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_requeue_work(&device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

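/*
 * Record a device found by a scan in the global fs_uuids list.  A new
 * btrfs_fs_devices is allocated the first time a given filesystem UUID
 * is seen; later scans of the same device just refresh its path and the
 * latest generation bookkeeping.  Returns 0 on success, -EBUSY if a new
 * device shows up for an already-opened filesystem, or -ENOMEM.
 */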
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
                if (!fs_devices)
                        return -ENOMEM;
                INIT_LIST_HEAD(&fs_devices->devices);
                INIT_LIST_HEAD(&fs_devices->alloc_list);
                list_add(&fs_devices->list, &fs_uuids);
                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
                mutex_init(&fs_devices->device_list_mutex);
                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }
        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device) {
                        /* we can safely leave the fs_devices entry around */
                        return -ENOMEM;
                }
                device->devid = devid;
                device->dev_stats_valid = 0;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
                spin_lock_init(&device->io_lock);

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        kfree(device);
                        return -ENOMEM;
                }
                rcu_assign_pointer(device->name, name);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                /* init readahead state */
                spin_lock_init(&device->reada_lock);
                device->reada_curr_zone = NULL;
                atomic_set(&device->reada_in_flight, 0);
                device->reada_next = 0;
                INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
                INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        } else if (!device->name || strcmp(device->name->str, path)) {
                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
        }
        *fs_devices_ret = fs_devices;
        return 0;
}

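/*
 * Duplicate a btrfs_fs_devices and its device list, copying only the
 * identity fields (devid, uuid, name); the clone holds no open block
 * devices.  Used when sprouting from a seed filesystem.  Returns the
 * clone or ERR_PTR(-ENOMEM).
 */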
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&fs_devices->devices);
        INIT_LIST_HEAD(&fs_devices->alloc_list);
        INIT_LIST_HEAD(&fs_devices->list);
        mutex_init(&fs_devices->device_list_mutex);
        fs_devices->latest_devid = orig->latest_devid;
        fs_devices->latest_trans = orig->latest_trans;
        fs_devices->total_devices = orig->total_devices;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

        /* We hold the volume lock, so it is safe to walk the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
                        goto error;

                /*
                 * This is ok to do without the RCU read lock held because
                 * we hold the uuid_mutex, so nothing we touch in here is
                 * going to disappear.
                 */
                name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
                if (!name) {
                        kfree(device);
                        goto error;
                }
                rcu_assign_pointer(device->name, name);

                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        return fs_devices;
error:
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

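/*
 * Drop every device that is not referenced by the filesystem metadata:
 * close its block device and free it.  Whatever remains is used to
 * recompute the latest_* fields.  Seed device lists chained off
 * fs_devices->seed are pruned the same way.
 */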
void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *next;

        struct block_device *latest_bdev = NULL;
        u64 latest_devid = 0;
        u64 latest_transid = 0;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path; it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata) {
                        if (!latest_transid ||
                            device->generation > latest_transid) {
                                latest_devid = device->devid;
                                latest_transid = device->generation;
                                latest_bdev = device->bdev;
                        }
                        continue;
                }

                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                rcu_string_free(device->name);
                kfree(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;

        mutex_unlock(&uuid_mutex);
}

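/*
 * Device teardown happens in two stages: free_device() is the RCU
 * callback, which runs in softirq context where blkdev_put() must not
 * be called, so it only punts the real work to __free_device() on a
 * workqueue, where sleeping is allowed.
 */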
static void __free_device(struct work_struct *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, rcu_work);

        if (device->bdev)
                blkdev_put(device->bdev, device->mode);

        rcu_string_free(device->name);
        kfree(device);
}

static void free_device(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);

        INIT_WORK(&device->rcu_work, __free_device);
        schedule_work(&device->rcu_work);
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                struct btrfs_device *new_device;
                struct rcu_string *name;

                if (device->bdev)
                        fs_devices->open_devices--;

                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                if (device->can_discard)
                        fs_devices->num_can_discard--;

                new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
                BUG_ON(!new_device); /* -ENOMEM */
                memcpy(new_device, device, sizeof(*new_device));

                /* Safe because we are under uuid_mutex */
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(device->name && !name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
                new_device->can_discard = 0;
                list_replace_rcu(&device->dev_list, &new_device->dev_list);

                call_rcu(&device->rcu, free_device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

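/*
 * Open every device in the list by path, validate the on-disk super
 * block (devid and uuid must match), and fill in per-device state such
 * as writeable, can_discard and the generation.  Devices that fail to
 * open or to validate are skipped; -EINVAL is returned only when no
 * device could be opened at all.
 */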
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct block_device *latest_bdev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 latest_devid = 0;
        u64 latest_transid = 0;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                bdev = blkdev_get_by_path(device->name->str, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name->str);
                        goto error;
                }
                filemap_write_and_wait(bdev->bd_inode->i_mapping);
                invalidate_bdev(bdev);
                set_blocksize(bdev, 4096);

                bh = btrfs_read_dev_super(bdev);
                if (!bh)
                        goto error_close;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                q = bdev_get_queue(bdev);
                if (blk_queue_discard(q)) {
                        device->can_discard = 1;
                        fs_devices->num_can_discard++;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                brelse(bh);
                continue;

error_brelse:
                brelse(bh);
error_close:
                blkdev_put(bdev, flags);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

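/*
 * Probe a single block device for a btrfs super block and, if one is
 * found, register it via device_list_add().  A typical caller (e.g.
 * the mount or scan-ioctl path) does, roughly:
 *
 *	struct btrfs_fs_devices *fs_devices;
 *	ret = btrfs_scan_one_device(path, FMODE_READ, holder, &fs_devices);
 *	if (!ret)
 *		ret = btrfs_open_devices(fs_devices, mode, holder);
 *
 * The bdev is only held for the duration of the probe.
 */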
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;
        u64 total_devices;

        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        mutex_lock(&uuid_mutex);
        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        total_devices = btrfs_super_num_devices(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
        if (!ret && fs_devices_ret)
                (*fs_devices_ret)->total_devices = total_devices;
        brelse(bh);
error_close:
        mutex_unlock(&uuid_mutex);
        blkdev_put(bdev, flags);
error:
        return ret;
}

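/*
 * A dev extent can overlap the queried [start, end] range in four ways
 * (illustrative sketch):
 *
 *	extent:   |-------------|          covers the whole range
 *	extent: |------|                   overlaps the front
 *	extent:          |------|          lies inside
 *	extent:               |------|     overlaps the tail
 *
 * Each case below adds only the intersecting length to *length.
 */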
/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space.
 * @len:        the size of the free space that we find, or the size of
 *              the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search; the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But
 * if we don't find suitable free space, it will be used to store the
 * start position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;

        if (search_start >= search_end) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start)
                hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}

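/*
 * Remove the dev extent item that covers @start on @device and give the
 * space back to the free_chunk_space accounting.  The "again" retry
 * handles the case where the search lands past the extent and we have
 * to step back to the previous item first.
 */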
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                if (ret)
                        goto out;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                key = found_key;
                btrfs_release_path(path);
                goto again;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        } else {
                btrfs_error(root->fs_info, ret, "Slot search failed");
                goto out;
        }

        if (device->bytes_used > 0) {
                u64 len = btrfs_dev_extent_length(leaf, extent);
                device->bytes_used -= len;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += len;
                spin_unlock(&root->fs_info->free_chunk_lock);
        }
        ret = btrfs_del_item(trans, root, path);
        if (ret) {
                btrfs_error(root->fs_info, ret,
                            "Failed to remove dev extent item");
        }
out:
        btrfs_free_path(path);
        return ret;
}

int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
                    BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
out:
        btrfs_free_path(path);
        return ret;
}

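/*
 * Find the logical offset just past the last chunk item for @objectid.
 * Searching for key (objectid, BTRFS_CHUNK_ITEM_KEY, -1) always lands
 * past the final item, so stepping back one item yields the highest
 * existing chunk; *offset becomes its end, or 0 if none exist.
 */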
static noinline int find_next_chunk(struct btrfs_root *root,
                                    u64 objectid, u64 *offset)
{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_key found_key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0); /* Corruption */

        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
        if (ret) {
                *offset = 0;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                if (found_key.objectid != objectid)
                        *offset = 0;
                else {
                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                               struct btrfs_chunk);
                        *offset = found_key.offset +
                                btrfs_chunk_length(path->nodes[0], chunk);
                }
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

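/*
 * Pick the next free device id: one past the highest existing
 * BTRFS_DEV_ITEM_KEY offset in the chunk root, or 1 for the first
 * device.
 */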
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0); /* Corruption */

        ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *objectid = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *objectid = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * The device information is stored in the chunk root.  The btrfs_device
 * struct should be fully filled in before calling this.
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = (unsigned long)btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

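/*
 * Delete the dev item for @device from the chunk root in its own
 * transaction.  Counterpart of btrfs_add_device(); called from the
 * device-remove path once the device has been shrunk to zero.
 */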
1268 static int btrfs_rm_dev_item(struct btrfs_root *root,
1269                              struct btrfs_device *device)
1270 {
1271         int ret;
1272         struct btrfs_path *path;
1273         struct btrfs_key key;
1274         struct btrfs_trans_handle *trans;
1275
1276         root = root->fs_info->chunk_root;
1277
1278         path = btrfs_alloc_path();
1279         if (!path)
1280                 return -ENOMEM;
1281
1282         trans = btrfs_start_transaction(root, 0);
1283         if (IS_ERR(trans)) {
1284                 btrfs_free_path(path);
1285                 return PTR_ERR(trans);
1286         }
1287         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1288         key.type = BTRFS_DEV_ITEM_KEY;
1289         key.offset = device->devid;
1290         lock_chunks(root);
1291
1292         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1293         if (ret < 0)
1294                 goto out;
1295
1296         if (ret > 0) {
1297                 ret = -ENOENT;
1298                 goto out;
1299         }
1300
1301         ret = btrfs_del_item(trans, root, path);
1302         if (ret)
1303                 goto out;
1304 out:
1305         btrfs_free_path(path);
1306         unlock_chunks(root);
1307         btrfs_commit_transaction(trans, root);
1308         return ret;
1309 }
1310
1311 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1312 {
1313         struct btrfs_device *device;
1314         struct btrfs_device *next_device;
1315         struct block_device *bdev;
1316         struct buffer_head *bh = NULL;
1317         struct btrfs_super_block *disk_super;
1318         struct btrfs_fs_devices *cur_devices;
1319         u64 all_avail;
1320         u64 devid;
1321         u64 num_devices;
1322         u8 *dev_uuid;
1323         int ret = 0;
1324         bool clear_super = false;
1325
1326         mutex_lock(&uuid_mutex);
1327
1328         all_avail = root->fs_info->avail_data_alloc_bits |
1329                 root->fs_info->avail_system_alloc_bits |
1330                 root->fs_info->avail_metadata_alloc_bits;
1331
1332         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1333             root->fs_info->fs_devices->num_devices <= 4) {
1334                 printk(KERN_ERR "btrfs: unable to go below four devices "
1335                        "on raid10\n");
1336                 ret = -EINVAL;
1337                 goto out;
1338         }
1339
1340         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1341             root->fs_info->fs_devices->num_devices <= 2) {
1342                 printk(KERN_ERR "btrfs: unable to go below two "
1343                        "devices on raid1\n");
1344                 ret = -EINVAL;
1345                 goto out;
1346         }
1347
1348         if (strcmp(device_path, "missing") == 0) {
1349                 struct list_head *devices;
1350                 struct btrfs_device *tmp;
1351
1352                 device = NULL;
1353                 devices = &root->fs_info->fs_devices->devices;
1354                 /*
1355                  * It is safe to read the devices since the volume_mutex
1356                  * is held.
1357                  */
1358                 list_for_each_entry(tmp, devices, dev_list) {
1359                         if (tmp->in_fs_metadata && !tmp->bdev) {
1360                                 device = tmp;
1361                                 break;
1362                         }
1363                 }
1364                 bdev = NULL;
1365                 bh = NULL;
1366                 disk_super = NULL;
1367                 if (!device) {
1368                         printk(KERN_ERR "btrfs: no missing devices found to "
1369                                "remove\n");
1370                         goto out;
1371                 }
1372         } else {
1373                 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1374                                           root->fs_info->bdev_holder);
1375                 if (IS_ERR(bdev)) {
1376                         ret = PTR_ERR(bdev);
1377                         goto out;
1378                 }
1379
1380                 set_blocksize(bdev, 4096);
1381                 invalidate_bdev(bdev);
1382                 bh = btrfs_read_dev_super(bdev);
1383                 if (!bh) {
1384                         ret = -EINVAL;
1385                         goto error_close;
1386                 }
1387                 disk_super = (struct btrfs_super_block *)bh->b_data;
1388                 devid = btrfs_stack_device_id(&disk_super->dev_item);
1389                 dev_uuid = disk_super->dev_item.uuid;
1390                 device = btrfs_find_device(root, devid, dev_uuid,
1391                                            disk_super->fsid);
1392                 if (!device) {
1393                         ret = -ENOENT;
1394                         goto error_brelse;
1395                 }
1396         }
1397
1398         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1399                 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1400                        "device\n");
1401                 ret = -EINVAL;
1402                 goto error_brelse;
1403         }
1404
1405         if (device->writeable) {
1406                 lock_chunks(root);
1407                 list_del_init(&device->dev_alloc_list);
1408                 unlock_chunks(root);
1409                 root->fs_info->fs_devices->rw_devices--;
1410                 clear_super = true;
1411         }
1412
1413         ret = btrfs_shrink_device(device, 0);
1414         if (ret)
1415                 goto error_undo;
1416
1417         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1418         if (ret)
1419                 goto error_undo;
1420
1421         spin_lock(&root->fs_info->free_chunk_lock);
1422         root->fs_info->free_chunk_space = device->total_bytes -
1423                 device->bytes_used;
1424         spin_unlock(&root->fs_info->free_chunk_lock);
1425
1426         device->in_fs_metadata = 0;
1427         btrfs_scrub_cancel_dev(root, device);
1428
1429         /*
1430          * the device list mutex makes sure that we don't change
1431          * the device list while someone else is writing out all
1432          * the device supers.
1433          */
1434
1435         cur_devices = device->fs_devices;
1436         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1437         list_del_rcu(&device->dev_list);
1438
1439         device->fs_devices->num_devices--;
1440         device->fs_devices->total_devices--;
1441
1442         if (device->missing)
1443                 root->fs_info->fs_devices->missing_devices--;
1444
1445         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1446                                  struct btrfs_device, dev_list);
1447         if (device->bdev == root->fs_info->sb->s_bdev)
1448                 root->fs_info->sb->s_bdev = next_device->bdev;
1449         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1450                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1451
1452         if (device->bdev)
1453                 device->fs_devices->open_devices--;
1454
1455         call_rcu(&device->rcu, free_device);
1456         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1457
1458         num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1459         btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1460
1461         if (cur_devices->open_devices == 0) {
1462                 struct btrfs_fs_devices *fs_devices;
1463                 fs_devices = root->fs_info->fs_devices;
1464                 while (fs_devices) {
1465                         if (fs_devices->seed == cur_devices)
1466                                 break;
1467                         fs_devices = fs_devices->seed;
1468                 }
1469                 fs_devices->seed = cur_devices->seed;
1470                 cur_devices->seed = NULL;
1471                 lock_chunks(root);
1472                 __btrfs_close_devices(cur_devices);
1473                 unlock_chunks(root);
1474                 free_fs_devices(cur_devices);
1475         }
1476
1477         /*
1478          * at this point, the device is zero sized.  We want to
1479          * remove it from the devices list and zero out the old super.
1480          */
1481         if (clear_super) {
1482                 /* make sure this device isn't detected as part of
1483                  * the FS anymore
1484                  */
1485                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1486                 set_buffer_dirty(bh);
1487                 sync_dirty_buffer(bh);
1488         }
1489
1490         ret = 0;
1491
1492 error_brelse:
1493         brelse(bh);
1494 error_close:
1495         if (bdev)
1496                 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1497 out:
1498         mutex_unlock(&uuid_mutex);
1499         return ret;
1500 error_undo:
1501         if (device->writeable) {
1502                 lock_chunks(root);
1503                 list_add(&device->dev_alloc_list,
1504                          &root->fs_info->fs_devices->alloc_list);
1505                 unlock_chunks(root);
1506                 root->fs_info->fs_devices->rw_devices++;
1507         }
1508         goto error_brelse;
1509 }
1510
1511 /*
1512  * does all the dirty work required for changing the file system's UUID.
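 * The current devices are moved to a new, read-only seed fs_devices and
 * the mounted filesystem continues on top of it under a freshly
 * generated fsid.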
1513  */
1514 static int btrfs_prepare_sprout(struct btrfs_root *root)
1515 {
1516         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1517         struct btrfs_fs_devices *old_devices;
1518         struct btrfs_fs_devices *seed_devices;
1519         struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1520         struct btrfs_device *device;
1521         u64 super_flags;
1522
1523         BUG_ON(!mutex_is_locked(&uuid_mutex));
1524         if (!fs_devices->seeding)
1525                 return -EINVAL;
1526
1527         seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1528         if (!seed_devices)
1529                 return -ENOMEM;
1530
1531         old_devices = clone_fs_devices(fs_devices);
1532         if (IS_ERR(old_devices)) {
1533                 kfree(seed_devices);
1534                 return PTR_ERR(old_devices);
1535         }
1536
1537         list_add(&old_devices->list, &fs_uuids);
1538
1539         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1540         seed_devices->opened = 1;
1541         INIT_LIST_HEAD(&seed_devices->devices);
1542         INIT_LIST_HEAD(&seed_devices->alloc_list);
1543         mutex_init(&seed_devices->device_list_mutex);
1544
1545         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1546         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1547                               synchronize_rcu);
1548         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1549
1550         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1551         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1552                 device->fs_devices = seed_devices;
1553         }
1554
1555         fs_devices->seeding = 0;
1556         fs_devices->num_devices = 0;
1557         fs_devices->open_devices = 0;
1558         fs_devices->total_devices = 0;
1559         fs_devices->seed = seed_devices;
1560
1561         generate_random_uuid(fs_devices->fsid);
1562         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1563         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1564         super_flags = btrfs_super_flags(disk_super) &
1565                       ~BTRFS_SUPER_FLAG_SEEDING;
1566         btrfs_set_super_flags(disk_super, super_flags);
1567
1568         return 0;
1569 }
1570
1571 /*
1572  * store the expected generation for seed devices in device items.
1573  */
1574 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1575                                struct btrfs_root *root)
1576 {
1577         struct btrfs_path *path;
1578         struct extent_buffer *leaf;
1579         struct btrfs_dev_item *dev_item;
1580         struct btrfs_device *device;
1581         struct btrfs_key key;
1582         u8 fs_uuid[BTRFS_UUID_SIZE];
1583         u8 dev_uuid[BTRFS_UUID_SIZE];
1584         u64 devid;
1585         int ret;
1586
1587         path = btrfs_alloc_path();
1588         if (!path)
1589                 return -ENOMEM;
1590
1591         root = root->fs_info->chunk_root;
1592         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1593         key.offset = 0;
1594         key.type = BTRFS_DEV_ITEM_KEY;
1595
1596         while (1) {
1597                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1598                 if (ret < 0)
1599                         goto error;
1600
1601                 leaf = path->nodes[0];
1602 next_slot:
1603                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1604                         ret = btrfs_next_leaf(root, path);
1605                         if (ret > 0)
1606                                 break;
1607                         if (ret < 0)
1608                                 goto error;
1609                         leaf = path->nodes[0];
1610                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1611                         btrfs_release_path(path);
1612                         continue;
1613                 }
1614
1615                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1616                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1617                     key.type != BTRFS_DEV_ITEM_KEY)
1618                         break;
1619
1620                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1621                                           struct btrfs_dev_item);
1622                 devid = btrfs_device_id(leaf, dev_item);
1623                 read_extent_buffer(leaf, dev_uuid,
1624                                    (unsigned long)btrfs_device_uuid(dev_item),
1625                                    BTRFS_UUID_SIZE);
1626                 read_extent_buffer(leaf, fs_uuid,
1627                                    (unsigned long)btrfs_device_fsid(dev_item),
1628                                    BTRFS_UUID_SIZE);
1629                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1630                 BUG_ON(!device); /* Logic error */
1631
1632                 if (device->fs_devices->seeding) {
1633                         btrfs_set_device_generation(leaf, dev_item,
1634                                                     device->generation);
1635                         btrfs_mark_buffer_dirty(leaf);
1636                 }
1637
1638                 path->slots[0]++;
1639                 goto next_slot;
1640         }
1641         ret = 0;
1642 error:
1643         btrfs_free_path(path);
1644         return ret;
1645 }
1646
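/*
 * Add the device at @device_path to a mounted filesystem: open it
 * exclusively, give it a devid and uuid, hook it into the in-memory
 * fs_devices lists and record it in the chunk tree.  If the mounted
 * filesystem is a seed, this sprouts a new writable filesystem on top
 * of it instead.
 */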
1647 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1648 {
1649         struct request_queue *q;
1650         struct btrfs_trans_handle *trans;
1651         struct btrfs_device *device;
1652         struct block_device *bdev;
1653         struct list_head *devices;
1654         struct super_block *sb = root->fs_info->sb;
1655         struct rcu_string *name;
1656         u64 total_bytes;
1657         int seeding_dev = 0;
1658         int ret = 0;
1659
1660         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1661                 return -EROFS;
1662
1663         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1664                                   root->fs_info->bdev_holder);
1665         if (IS_ERR(bdev))
1666                 return PTR_ERR(bdev);
1667
1668         if (root->fs_info->fs_devices->seeding) {
1669                 seeding_dev = 1;
1670                 down_write(&sb->s_umount);
1671                 mutex_lock(&uuid_mutex);
1672         }
1673
1674         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1675
1676         devices = &root->fs_info->fs_devices->devices;
1677         /*
1678          * we have the volume lock, so we don't need the extra
1679          * device list mutex while reading the list here.
1680          */
1681         list_for_each_entry(device, devices, dev_list) {
1682                 if (device->bdev == bdev) {
1683                         ret = -EEXIST;
1684                         goto error;
1685                 }
1686         }
1687
1688         device = kzalloc(sizeof(*device), GFP_NOFS);
1689         if (!device) {
1690                 /* we can safely leave the fs_devices entry around */
1691                 ret = -ENOMEM;
1692                 goto error;
1693         }
1694
1695         name = rcu_string_strdup(device_path, GFP_NOFS);
1696         if (!name) {
1697                 kfree(device);
1698                 ret = -ENOMEM;
1699                 goto error;
1700         }
1701         rcu_assign_pointer(device->name, name);
1702
1703         ret = find_next_devid(root, &device->devid);
1704         if (ret) {
1705                 rcu_string_free(device->name);
1706                 kfree(device);
1707                 goto error;
1708         }
1709
1710         trans = btrfs_start_transaction(root, 0);
1711         if (IS_ERR(trans)) {
1712                 rcu_string_free(device->name);
1713                 kfree(device);
1714                 ret = PTR_ERR(trans);
1715                 goto error;
1716         }
1717
1718         lock_chunks(root);
1719
1720         q = bdev_get_queue(bdev);
1721         if (blk_queue_discard(q))
1722                 device->can_discard = 1;
1723         device->writeable = 1;
1724         device->work.func = pending_bios_fn;
1725         generate_random_uuid(device->uuid);
1726         spin_lock_init(&device->io_lock);
1727         device->generation = trans->transid;
1728         device->io_width = root->sectorsize;
1729         device->io_align = root->sectorsize;
1730         device->sector_size = root->sectorsize;
1731         device->total_bytes = i_size_read(bdev->bd_inode);
1732         device->disk_total_bytes = device->total_bytes;
1733         device->dev_root = root->fs_info->dev_root;
1734         device->bdev = bdev;
1735         device->in_fs_metadata = 1;
1736         device->mode = FMODE_EXCL;
1737         set_blocksize(device->bdev, 4096);
1738
1739         if (seeding_dev) {
1740                 sb->s_flags &= ~MS_RDONLY;
1741                 ret = btrfs_prepare_sprout(root);
1742                 BUG_ON(ret); /* -ENOMEM */
1743         }
1744
1745         device->fs_devices = root->fs_info->fs_devices;
1746
1747         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1748         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1749         list_add(&device->dev_alloc_list,
1750                  &root->fs_info->fs_devices->alloc_list);
1751         root->fs_info->fs_devices->num_devices++;
1752         root->fs_info->fs_devices->open_devices++;
1753         root->fs_info->fs_devices->rw_devices++;
1754         root->fs_info->fs_devices->total_devices++;
1755         if (device->can_discard)
1756                 root->fs_info->fs_devices->num_can_discard++;
1757         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1758
1759         spin_lock(&root->fs_info->free_chunk_lock);
1760         root->fs_info->free_chunk_space += device->total_bytes;
1761         spin_unlock(&root->fs_info->free_chunk_lock);
1762
1763         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1764                 root->fs_info->fs_devices->rotating = 1;
1765
1766         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1767         btrfs_set_super_total_bytes(root->fs_info->super_copy,
1768                                     total_bytes + device->total_bytes);
1769
1770         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1771         btrfs_set_super_num_devices(root->fs_info->super_copy,
1772                                     total_bytes + 1);
1773         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1774
1775         if (seeding_dev) {
1776                 ret = init_first_rw_device(trans, root, device);
1777                 if (ret)
1778                         goto error_trans;
1779                 ret = btrfs_finish_sprout(trans, root);
1780                 if (ret)
1781                         goto error_trans;
1782         } else {
1783                 ret = btrfs_add_device(trans, root, device);
1784                 if (ret)
1785                         goto error_trans;
1786         }
1787
1788         /*
1789          * we've got more storage, clear any full flags on the space
1790          * infos
1791          */
1792         btrfs_clear_space_info_full(root->fs_info);
1793
1794         unlock_chunks(root);
1795         ret = btrfs_commit_transaction(trans, root);
1796
1797         if (seeding_dev) {
1798                 mutex_unlock(&uuid_mutex);
1799                 up_write(&sb->s_umount);
1800
1801                 if (ret) /* transaction commit */
1802                         return ret;
1803
1804                 ret = btrfs_relocate_sys_chunks(root);
1805                 if (ret < 0)
1806                         btrfs_error(root->fs_info, ret,
1807                                     "Failed to relocate sys chunks after "
1808                                     "device initialization. This can be fixed "
1809                                     "using the \"btrfs balance\" command.");
1810         }
1811
1812         return ret;
1813
1814 error_trans:
1815         unlock_chunks(root);
1816         btrfs_abort_transaction(trans, root, ret);
1817         btrfs_end_transaction(trans, root);
1818         rcu_string_free(device->name);
1819         kfree(device);
1820 error:
1821         blkdev_put(bdev, FMODE_EXCL);
1822         if (seeding_dev) {
1823                 mutex_unlock(&uuid_mutex);
1824                 up_write(&sb->s_umount);
1825         }
1826         return ret;
1827 }
1828
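/*
 * Write the in-memory state of @device back into its dev item in the
 * chunk tree.  Must be called within a transaction.
 */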
1829 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1830                                         struct btrfs_device *device)
1831 {
1832         int ret;
1833         struct btrfs_path *path;
1834         struct btrfs_root *root;
1835         struct btrfs_dev_item *dev_item;
1836         struct extent_buffer *leaf;
1837         struct btrfs_key key;
1838
1839         root = device->dev_root->fs_info->chunk_root;
1840
1841         path = btrfs_alloc_path();
1842         if (!path)
1843                 return -ENOMEM;
1844
1845         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1846         key.type = BTRFS_DEV_ITEM_KEY;
1847         key.offset = device->devid;
1848
1849         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1850         if (ret < 0)
1851                 goto out;
1852
1853         if (ret > 0) {
1854                 ret = -ENOENT;
1855                 goto out;
1856         }
1857
1858         leaf = path->nodes[0];
1859         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1860
1861         btrfs_set_device_id(leaf, dev_item, device->devid);
1862         btrfs_set_device_type(leaf, dev_item, device->type);
1863         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1864         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1865         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1866         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1867         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1868         btrfs_mark_buffer_dirty(leaf);
1869
1870 out:
1871         btrfs_free_path(path);
1872         return ret;
1873 }
1874
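/*
 * Grow @device to @new_size and bump the superblock's total_bytes to
 * match.  Only growing is allowed here; the caller must hold the chunk
 * mutex (btrfs_grow_device() below is the locked wrapper).
 */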
1875 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1876                       struct btrfs_device *device, u64 new_size)
1877 {
1878         struct btrfs_super_block *super_copy =
1879                 device->dev_root->fs_info->super_copy;
1880         u64 old_total = btrfs_super_total_bytes(super_copy);
1881         u64 diff = new_size - device->total_bytes;
1882
1883         if (!device->writeable)
1884                 return -EACCES;
1885         if (new_size <= device->total_bytes)
1886                 return -EINVAL;
1887
1888         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1889         device->fs_devices->total_rw_bytes += diff;
1890
1891         device->total_bytes = new_size;
1892         device->disk_total_bytes = new_size;
1893         btrfs_clear_space_info_full(device->dev_root->fs_info);
1894
1895         return btrfs_update_device(trans, device);
1896 }
1897
1898 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1899                       struct btrfs_device *device, u64 new_size)
1900 {
1901         int ret;
1902         lock_chunks(device->dev_root);
1903         ret = __btrfs_grow_device(trans, device, new_size);
1904         unlock_chunks(device->dev_root);
1905         return ret;
1906 }
1907
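/*
 * Delete the chunk item at (@chunk_objectid, @chunk_offset) from the
 * chunk tree.
 */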
1908 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1909                             struct btrfs_root *root,
1910                             u64 chunk_tree, u64 chunk_objectid,
1911                             u64 chunk_offset)
1912 {
1913         int ret;
1914         struct btrfs_path *path;
1915         struct btrfs_key key;
1916
1917         root = root->fs_info->chunk_root;
1918         path = btrfs_alloc_path();
1919         if (!path)
1920                 return -ENOMEM;
1921
1922         key.objectid = chunk_objectid;
1923         key.offset = chunk_offset;
1924         key.type = BTRFS_CHUNK_ITEM_KEY;
1925
1926         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1927         if (ret < 0)
1928                 goto out;
1929         else if (ret > 0) { /* Logic error or corruption */
1930                 btrfs_error(root->fs_info, -ENOENT,
1931                             "Failed lookup while freeing chunk.");
1932                 ret = -ENOENT;
1933                 goto out;
1934         }
1935
1936         ret = btrfs_del_item(trans, root, path);
1937         if (ret < 0)
1938                 btrfs_error(root->fs_info, ret,
1939                             "Failed to delete chunk item.");
1940 out:
1941         btrfs_free_path(path);
1942         return ret;
1943 }
1944
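/*
 * Remove the given chunk from the superblock's copy of the system
 * chunk array.  The array is a packed sequence of (disk key, chunk
 * item) pairs, so we walk it entry by entry and memmove() the tail
 * down over the entry being removed.
 */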
1945 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1946                                u64 chunk_offset)
1947 {
1948         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1949         struct btrfs_disk_key *disk_key;
1950         struct btrfs_chunk *chunk;
1951         u8 *ptr;
1952         int ret = 0;
1953         u32 num_stripes;
1954         u32 array_size;
1955         u32 len = 0;
1956         u32 cur;
1957         struct btrfs_key key;
1958
1959         array_size = btrfs_super_sys_array_size(super_copy);
1960
1961         ptr = super_copy->sys_chunk_array;
1962         cur = 0;
1963
1964         while (cur < array_size) {
1965                 disk_key = (struct btrfs_disk_key *)ptr;
1966                 btrfs_disk_key_to_cpu(&key, disk_key);
1967
1968                 len = sizeof(*disk_key);
1969
1970                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1971                         chunk = (struct btrfs_chunk *)(ptr + len);
1972                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1973                         len += btrfs_chunk_item_size(num_stripes);
1974                 } else {
1975                         ret = -EIO;
1976                         break;
1977                 }
1978                 if (key.objectid == chunk_objectid &&
1979                     key.offset == chunk_offset) {
1980                         memmove(ptr, ptr + len, array_size - (cur + len));
1981                         array_size -= len;
1982                         btrfs_set_super_sys_array_size(super_copy, array_size);
1983                 } else {
1984                         ptr += len;
1985                         cur += len;
1986                 }
1987         }
1988         return ret;
1989 }
1990
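/*
 * Relocate everything stored in a chunk and then delete it: move the
 * extents out via the block group relocation code, free the device
 * extents and the chunk item (plus the superblock array entry for
 * system chunks), and finally drop the block group and extent mapping.
 */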
1991 static int btrfs_relocate_chunk(struct btrfs_root *root,
1992                          u64 chunk_tree, u64 chunk_objectid,
1993                          u64 chunk_offset)
1994 {
1995         struct extent_map_tree *em_tree;
1996         struct btrfs_root *extent_root;
1997         struct btrfs_trans_handle *trans;
1998         struct extent_map *em;
1999         struct map_lookup *map;
2000         int ret;
2001         int i;
2002
2003         root = root->fs_info->chunk_root;
2004         extent_root = root->fs_info->extent_root;
2005         em_tree = &root->fs_info->mapping_tree.map_tree;
2006
2007         ret = btrfs_can_relocate(extent_root, chunk_offset);
2008         if (ret)
2009                 return -ENOSPC;
2010
2011         /* step one, relocate all the extents inside this chunk */
2012         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2013         if (ret)
2014                 return ret;
2015
2016         trans = btrfs_start_transaction(root, 0);
2017         BUG_ON(IS_ERR(trans));
2018
2019         lock_chunks(root);
2020
2021         /*
2022          * step two, delete the device extents and the
2023          * chunk tree entries
2024          */
2025         read_lock(&em_tree->lock);
2026         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2027         read_unlock(&em_tree->lock);
2028
2029         BUG_ON(!em || em->start > chunk_offset ||
2030                em->start + em->len < chunk_offset);
2031         map = (struct map_lookup *)em->bdev;
2032
2033         for (i = 0; i < map->num_stripes; i++) {
2034                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
2035                                             map->stripes[i].physical);
2036                 BUG_ON(ret);
2037
2038                 if (map->stripes[i].dev) {
2039                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2040                         BUG_ON(ret);
2041                 }
2042         }
2043         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2044                                chunk_offset);
2045
2046         BUG_ON(ret);
2047
2048         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2049
2050         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2051                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2052                 BUG_ON(ret);
2053         }
2054
2055         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2056         BUG_ON(ret);
2057
2058         write_lock(&em_tree->lock);
2059         remove_extent_mapping(em_tree, em);
2060         write_unlock(&em_tree->lock);
2061
2062         kfree(map);
2063         em->bdev = NULL;
2064
2065         /* once for the tree */
2066         free_extent_map(em);
2067         /* once for us */
2068         free_extent_map(em);
2069
2070         unlock_chunks(root);
2071         btrfs_end_transaction(trans, root);
2072         return 0;
2073 }
2074
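/*
 * Walk the chunk tree from the highest offset down and relocate every
 * SYSTEM chunk.  Chunks that fail with -ENOSPC are counted and the
 * whole pass is retried once before giving up.
 */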
2075 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2076 {
2077         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2078         struct btrfs_path *path;
2079         struct extent_buffer *leaf;
2080         struct btrfs_chunk *chunk;
2081         struct btrfs_key key;
2082         struct btrfs_key found_key;
2083         u64 chunk_tree = chunk_root->root_key.objectid;
2084         u64 chunk_type;
2085         bool retried = false;
2086         int failed = 0;
2087         int ret;
2088
2089         path = btrfs_alloc_path();
2090         if (!path)
2091                 return -ENOMEM;
2092
2093 again:
2094         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2095         key.offset = (u64)-1;
2096         key.type = BTRFS_CHUNK_ITEM_KEY;
2097
2098         while (1) {
2099                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2100                 if (ret < 0)
2101                         goto error;
2102                 BUG_ON(ret == 0); /* Corruption */
2103
2104                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2105                                           key.type);
2106                 if (ret < 0)
2107                         goto error;
2108                 if (ret > 0)
2109                         break;
2110
2111                 leaf = path->nodes[0];
2112                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2113
2114                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2115                                        struct btrfs_chunk);
2116                 chunk_type = btrfs_chunk_type(leaf, chunk);
2117                 btrfs_release_path(path);
2118
2119                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2120                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2121                                                    found_key.objectid,
2122                                                    found_key.offset);
2123                         if (ret == -ENOSPC)
2124                                 failed++;
2125                         else if (ret)
2126                                 BUG();
2127                 }
2128
2129                 if (found_key.offset == 0)
2130                         break;
2131                 key.offset = found_key.offset - 1;
2132         }
2133         ret = 0;
2134         if (failed && !retried) {
2135                 failed = 0;
2136                 retried = true;
2137                 goto again;
2138         } else if (failed && retried) {
2139                 WARN_ON(1);
2140                 ret = -ENOSPC;
2141         }
2142 error:
2143         btrfs_free_path(path);
2144         return ret;
2145 }
2146
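/*
 * Persist the balance state: write the control flags and the per-type
 * balance args into the balance item so an interrupted balance can be
 * resumed after a remount.
 */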
2147 static int insert_balance_item(struct btrfs_root *root,
2148                                struct btrfs_balance_control *bctl)
2149 {
2150         struct btrfs_trans_handle *trans;
2151         struct btrfs_balance_item *item;
2152         struct btrfs_disk_balance_args disk_bargs;
2153         struct btrfs_path *path;
2154         struct extent_buffer *leaf;
2155         struct btrfs_key key;
2156         int ret, err;
2157
2158         path = btrfs_alloc_path();
2159         if (!path)
2160                 return -ENOMEM;
2161
2162         trans = btrfs_start_transaction(root, 0);
2163         if (IS_ERR(trans)) {
2164                 btrfs_free_path(path);
2165                 return PTR_ERR(trans);
2166         }
2167
2168         key.objectid = BTRFS_BALANCE_OBJECTID;
2169         key.type = BTRFS_BALANCE_ITEM_KEY;
2170         key.offset = 0;
2171
2172         ret = btrfs_insert_empty_item(trans, root, path, &key,
2173                                       sizeof(*item));
2174         if (ret)
2175                 goto out;
2176
2177         leaf = path->nodes[0];
2178         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2179
2180         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2181
2182         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2183         btrfs_set_balance_data(leaf, item, &disk_bargs);
2184         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2185         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2186         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2187         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2188
2189         btrfs_set_balance_flags(leaf, item, bctl->flags);
2190
2191         btrfs_mark_buffer_dirty(leaf);
2192 out:
2193         btrfs_free_path(path);
2194         err = btrfs_commit_transaction(trans, root);
2195         if (err && !ret)
2196                 ret = err;
2197         return ret;
2198 }
2199
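/* Remove the balance item once a balance completes or is canceled. */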
2200 static int del_balance_item(struct btrfs_root *root)
2201 {
2202         struct btrfs_trans_handle *trans;
2203         struct btrfs_path *path;
2204         struct btrfs_key key;
2205         int ret, err;
2206
2207         path = btrfs_alloc_path();
2208         if (!path)
2209                 return -ENOMEM;
2210
2211         trans = btrfs_start_transaction(root, 0);
2212         if (IS_ERR(trans)) {
2213                 btrfs_free_path(path);
2214                 return PTR_ERR(trans);
2215         }
2216
2217         key.objectid = BTRFS_BALANCE_OBJECTID;
2218         key.type = BTRFS_BALANCE_ITEM_KEY;
2219         key.offset = 0;
2220
2221         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2222         if (ret < 0)
2223                 goto out;
2224         if (ret > 0) {
2225                 ret = -ENOENT;
2226                 goto out;
2227         }
2228
2229         ret = btrfs_del_item(trans, root, path);
2230 out:
2231         btrfs_free_path(path);
2232         err = btrfs_commit_transaction(trans, root);
2233         if (err && !ret)
2234                 ret = err;
2235         return ret;
2236 }
2237
2238 /*
2239  * This is a heuristic used to reduce the number of chunks balanced on
2240  * resume after balance was interrupted.
2241  */
2242 static void update_balance_args(struct btrfs_balance_control *bctl)
2243 {
2244         /*
2245          * Turn on soft mode for chunk types that were being converted.
2246          */
2247         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2248                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2249         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2250                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2251         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2252                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2253
2254         /*
2255          * Turn on the usage filter if it is not already in use.  The idea is
2256          * that chunks that we have already balanced should be
2257          * reasonably full.  Don't do it for chunks that are being
2258          * converted - that will keep us from relocating unconverted
2259          * (albeit full) chunks.
2260          */
2261         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2262             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2263                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2264                 bctl->data.usage = 90;
2265         }
2266         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2267             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2268                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2269                 bctl->sys.usage = 90;
2270         }
2271         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2272             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2273                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2274                 bctl->meta.usage = 90;
2275         }
2276 }
2277
2278 /*
2279  * Should be called with both balance and volume mutexes held to
2280  * serialize other volume operations (add_dev/rm_dev/resize) with
2281  * restriper.  Same goes for unset_balance_control.
2282  */
2283 static void set_balance_control(struct btrfs_balance_control *bctl)
2284 {
2285         struct btrfs_fs_info *fs_info = bctl->fs_info;
2286
2287         BUG_ON(fs_info->balance_ctl);
2288
2289         spin_lock(&fs_info->balance_lock);
2290         fs_info->balance_ctl = bctl;
2291         spin_unlock(&fs_info->balance_lock);
2292 }
2293
2294 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2295 {
2296         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2297
2298         BUG_ON(!fs_info->balance_ctl);
2299
2300         spin_lock(&fs_info->balance_lock);
2301         fs_info->balance_ctl = NULL;
2302         spin_unlock(&fs_info->balance_lock);
2303
2304         kfree(bctl);
2305 }
2306
2307 /*
2308  * Balance filters.  Return 1 if chunk should be filtered out
2309  * (should not be balanced).
2310  */
2311 static int chunk_profiles_filter(u64 chunk_type,
2312                                  struct btrfs_balance_args *bargs)
2313 {
2314         chunk_type = chunk_to_extended(chunk_type) &
2315                                 BTRFS_EXTENDED_PROFILE_MASK;
2316
2317         if (bargs->profiles & chunk_type)
2318                 return 0;
2319
2320         return 1;
2321 }
2322
2323 static u64 div_factor_fine(u64 num, int factor)
2324 {
2325         if (factor <= 0)
2326                 return 0;
2327         if (factor >= 100)
2328                 return num;
2329
2330         num *= factor;
2331         do_div(num, 100);
2332         return num;
2333 }
2334
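/*
 * Filter on how full the chunk is.  user_thresh is bargs->usage percent
 * of the chunk size; e.g. with usage=90 a 1GiB chunk is balanced only
 * while less than ~922MiB of it is used.
 */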
2335 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2336                               struct btrfs_balance_args *bargs)
2337 {
2338         struct btrfs_block_group_cache *cache;
2339         u64 chunk_used, user_thresh;
2340         int ret = 1;
2341
2342         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2343         chunk_used = btrfs_block_group_used(&cache->item);
2344
2345         user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2346         if (chunk_used < user_thresh)
2347                 ret = 0;
2348
2349         btrfs_put_block_group(cache);
2350         return ret;
2351 }
2352
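/* Keep (i.e. balance) any chunk that has a stripe on bargs->devid. */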
2353 static int chunk_devid_filter(struct extent_buffer *leaf,
2354                               struct btrfs_chunk *chunk,
2355                               struct btrfs_balance_args *bargs)
2356 {
2357         struct btrfs_stripe *stripe;
2358         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2359         int i;
2360
2361         for (i = 0; i < num_stripes; i++) {
2362                 stripe = btrfs_stripe_nr(chunk, i);
2363                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2364                         return 0;
2365         }
2366
2367         return 1;
2368 }
2369
2370 /* [pstart, pend): physical byte range on the device given by bargs->devid */
2371 static int chunk_drange_filter(struct extent_buffer *leaf,
2372                                struct btrfs_chunk *chunk,
2373                                u64 chunk_offset,
2374                                struct btrfs_balance_args *bargs)
2375 {
2376         struct btrfs_stripe *stripe;
2377         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2378         u64 stripe_offset;
2379         u64 stripe_length;
2380         int factor;
2381         int i;
2382
2383         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2384                 return 0;
2385
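        /*
         * The per-device stripe length is approximated as chunk length
         * divided by the number of data stripes; DUP/RAID1/RAID10 keep
         * two copies, so only half of their stripes hold distinct data.
         */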
2386         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2387              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2388                 factor = 2;
2389         else
2390                 factor = 1;
2391         factor = num_stripes / factor;
2392
2393         for (i = 0; i < num_stripes; i++) {
2394                 stripe = btrfs_stripe_nr(chunk, i);
2395                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2396                         continue;
2397
2398                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2399                 stripe_length = btrfs_chunk_length(leaf, chunk);
2400                 do_div(stripe_length, factor);
2401
2402                 if (stripe_offset < bargs->pend &&
2403                     stripe_offset + stripe_length > bargs->pstart)
2404                         return 0;
2405         }
2406
2407         return 1;
2408 }
2409
2410 /* [vstart, vend): logical byte range in the chunk address space */
2411 static int chunk_vrange_filter(struct extent_buffer *leaf,
2412                                struct btrfs_chunk *chunk,
2413                                u64 chunk_offset,
2414                                struct btrfs_balance_args *bargs)
2415 {
2416         if (chunk_offset < bargs->vend &&
2417             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2418                 /* at least part of the chunk is inside this vrange */
2419                 return 0;
2420
2421         return 1;
2422 }
2423
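/* Soft mode: skip chunks that already use the conversion target profile. */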
2424 static int chunk_soft_convert_filter(u64 chunk_type,
2425                                      struct btrfs_balance_args *bargs)
2426 {
2427         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2428                 return 0;
2429
2430         chunk_type = chunk_to_extended(chunk_type) &
2431                                 BTRFS_EXTENDED_PROFILE_MASK;
2432
2433         if (bargs->target == chunk_type)
2434                 return 1;
2435
2436         return 0;
2437 }
2438
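/*
 * Decide whether a single chunk should be relocated by the running
 * balance: pick the balance args matching the chunk's type, then apply
 * each enabled filter in turn.  Returns 1 to balance the chunk, 0 to
 * skip it.
 */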
2439 static int should_balance_chunk(struct btrfs_root *root,
2440                                 struct extent_buffer *leaf,
2441                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2442 {
2443         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2444         struct btrfs_balance_args *bargs = NULL;
2445         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2446
2447         /* type filter */
2448         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2449               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2450                 return 0;
2451         }
2452
2453         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2454                 bargs = &bctl->data;
2455         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2456                 bargs = &bctl->sys;
2457         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2458                 bargs = &bctl->meta;
2459
2460         /* profiles filter */
2461         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2462             chunk_profiles_filter(chunk_type, bargs)) {
2463                 return 0;
2464         }
2465
2466         /* usage filter */
2467         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2468             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2469                 return 0;
2470         }
2471
2472         /* devid filter */
2473         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2474             chunk_devid_filter(leaf, chunk, bargs)) {
2475                 return 0;
2476         }
2477
2478         /* drange filter, makes sense only with devid filter */
2479         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2480             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2481                 return 0;
2482         }
2483
2484         /* vrange filter */
2485         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2486             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2487                 return 0;
2488         }
2489
2490         /* soft profile changing mode */
2491         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2492             chunk_soft_convert_filter(chunk_type, bargs)) {
2493                 return 0;
2494         }
2495
2496         return 1;
2497 }
2498
2499 static u64 div_factor(u64 num, int factor)
2500 {
2501         if (factor == 10)
2502                 return num;
2503         num *= factor;
2504         do_div(num, 10);
2505         return num;
2506 }
2507
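/*
 * The main balance loop.  The chunk tree is walked twice, from the
 * highest chunk offset down: a first counting pass only fills in the
 * "expected" statistic, then a second pass actually relocates every
 * chunk that passes the filters.
 */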
2508 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2509 {
2510         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2511         struct btrfs_root *chunk_root = fs_info->chunk_root;
2512         struct btrfs_root *dev_root = fs_info->dev_root;
2513         struct list_head *devices;
2514         struct btrfs_device *device;
2515         u64 old_size;
2516         u64 size_to_free;
2517         struct btrfs_chunk *chunk;
2518         struct btrfs_path *path;
2519         struct btrfs_key key;
2520         struct btrfs_key found_key;
2521         struct btrfs_trans_handle *trans;
2522         struct extent_buffer *leaf;
2523         int slot;
2524         int ret;
2525         int enospc_errors = 0;
2526         bool counting = true;
2527
2528         /* step one, make some room on all the devices */
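        /*
         * (each writable device is shrunk by min(10% of its size, 1MiB)
         * and immediately regrown, so the tail of the device ends up free)
         */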
2529         devices = &fs_info->fs_devices->devices;
2530         list_for_each_entry(device, devices, dev_list) {
2531                 old_size = device->total_bytes;
2532                 size_to_free = div_factor(old_size, 1);
2533                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2534                 if (!device->writeable ||
2535                     device->total_bytes - device->bytes_used > size_to_free)
2536                         continue;
2537
2538                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2539                 if (ret == -ENOSPC)
2540                         break;
2541                 BUG_ON(ret);
2542
2543                 trans = btrfs_start_transaction(dev_root, 0);
2544                 BUG_ON(IS_ERR(trans));
2545
2546                 ret = btrfs_grow_device(trans, device, old_size);
2547                 BUG_ON(ret);
2548
2549                 btrfs_end_transaction(trans, dev_root);
2550         }
2551
2552         /* step two, relocate all the chunks */
2553         path = btrfs_alloc_path();
2554         if (!path) {
2555                 ret = -ENOMEM;
2556                 goto error;
2557         }
2558
2559         /* zero out stat counters */
2560         spin_lock(&fs_info->balance_lock);
2561         memset(&bctl->stat, 0, sizeof(bctl->stat));
2562         spin_unlock(&fs_info->balance_lock);
2563 again:
2564         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2565         key.offset = (u64)-1;
2566         key.type = BTRFS_CHUNK_ITEM_KEY;
2567
2568         while (1) {
2569                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2570                     atomic_read(&fs_info->balance_cancel_req)) {
2571                         ret = -ECANCELED;
2572                         goto error;
2573                 }
2574
2575                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2576                 if (ret < 0)
2577                         goto error;
2578
2579                 /*
2580                  * this shouldn't happen, it means the last relocate
2581                  * failed
2582                  */
2583                 if (ret == 0)
2584                         BUG(); /* FIXME break ? */
2585
2586                 ret = btrfs_previous_item(chunk_root, path, 0,
2587                                           BTRFS_CHUNK_ITEM_KEY);
2588                 if (ret) {
2589                         ret = 0;
2590                         break;
2591                 }
2592
2593                 leaf = path->nodes[0];
2594                 slot = path->slots[0];
2595                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2596
2597                 if (found_key.objectid != key.objectid)
2598                         break;
2599
2600                 /* chunk zero is special */
2601                 if (found_key.offset == 0)
2602                         break;
2603
2604                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2605
2606                 if (!counting) {
2607                         spin_lock(&fs_info->balance_lock);
2608                         bctl->stat.considered++;
2609                         spin_unlock(&fs_info->balance_lock);
2610                 }
2611
2612                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2613                                            found_key.offset);
2614                 btrfs_release_path(path);
2615                 if (!ret)
2616                         goto loop;
2617
2618                 if (counting) {
2619                         spin_lock(&fs_info->balance_lock);
2620                         bctl->stat.expected++;
2621                         spin_unlock(&fs_info->balance_lock);
2622                         goto loop;
2623                 }
2624
2625                 ret = btrfs_relocate_chunk(chunk_root,
2626                                            chunk_root->root_key.objectid,
2627                                            found_key.objectid,
2628                                            found_key.offset);
2629                 if (ret && ret != -ENOSPC)
2630                         goto error;
2631                 if (ret == -ENOSPC) {
2632                         enospc_errors++;
2633                 } else {
2634                         spin_lock(&fs_info->balance_lock);
2635                         bctl->stat.completed++;
2636                         spin_unlock(&fs_info->balance_lock);
2637                 }
2638 loop:
2639                 key.offset = found_key.offset - 1;
2640         }
2641
2642         if (counting) {
2643                 btrfs_release_path(path);
2644                 counting = false;
2645                 goto again;
2646         }
2647 error:
2648         btrfs_free_path(path);
2649         if (enospc_errors) {
2650                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2651                        enospc_errors);
2652                 if (!ret)
2653                         ret = -ENOSPC;
2654         }
2655
2656         return ret;
2657 }
2658
2659 /**
2660  * alloc_profile_is_valid - see if a given profile is valid and reduced
2661  * @flags: profile to validate
2662  * @extended: if true @flags is treated as an extended profile
2663  */
2664 static int alloc_profile_is_valid(u64 flags, int extended)
2665 {
2666         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
2667                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
2668
2669         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2670
2671         /* 1) check that all other bits are zeroed */
2672         if (flags & ~mask)
2673                 return 0;
2674
2675         /* 2) see if profile is reduced */
2676         if (flags == 0)
2677                 return !extended; /* "0" is valid for usual profiles */
2678
2679         /* true if exactly one bit set */
2680         return (flags & (flags - 1)) == 0;
2681 }
2682
2683 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2684 {
2685         /* cancel requested || normal exit path */
2686         return atomic_read(&fs_info->balance_cancel_req) ||
2687                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2688                  atomic_read(&fs_info->balance_cancel_req) == 0);
2689 }
2690
2691 static void __cancel_balance(struct btrfs_fs_info *fs_info)
2692 {
2693         int ret;
2694
2695         unset_balance_control(fs_info);
2696         ret = del_balance_item(fs_info->tree_root);
2697         BUG_ON(ret);
2698 }
2699
2700 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2701                                struct btrfs_ioctl_balance_args *bargs);
2702
2703 /*
2704  * Should be called with both balance and volume mutexes held
2705  */
2706 int btrfs_balance(struct btrfs_balance_control *bctl,
2707                   struct btrfs_ioctl_balance_args *bargs)
2708 {
2709         struct btrfs_fs_info *fs_info = bctl->fs_info;
2710         u64 allowed;
2711         int mixed = 0;
2712         int ret;
2713
2714         if (btrfs_fs_closing(fs_info) ||
2715             atomic_read(&fs_info->balance_pause_req) ||
2716             atomic_read(&fs_info->balance_cancel_req)) {
2717                 ret = -EINVAL;
2718                 goto out;
2719         }
2720
2721         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2722         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
2723                 mixed = 1;
2724
2725         /*
2726          * In case of mixed groups both data and meta should be picked,
2727          * and identical options should be given for both of them.
2728          */
2729         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
2730         if (mixed && (bctl->flags & allowed)) {
2731                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2732                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2733                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2734                         printk(KERN_ERR "btrfs: with mixed groups data and "
2735                                "metadata balance options must be the same\n");
2736                         ret = -EINVAL;
2737                         goto out;
2738                 }
2739         }
2740
2741         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2742         if (fs_info->fs_devices->num_devices == 1)
2743                 allowed |= BTRFS_BLOCK_GROUP_DUP;
2744         else if (fs_info->fs_devices->num_devices < 4)
2745                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2746         else
2747                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2748                                 BTRFS_BLOCK_GROUP_RAID10);
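        /* e.g. a two-device fs may convert to RAID0/RAID1 but not RAID10 */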
2749
2750         if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2751             (!alloc_profile_is_valid(bctl->data.target, 1) ||
2752              (bctl->data.target & ~allowed))) {
2753                 printk(KERN_ERR "btrfs: unable to start balance with target "
2754                        "data profile %llu\n",
2755                        (unsigned long long)bctl->data.target);
2756                 ret = -EINVAL;
2757                 goto out;
2758         }
2759         if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2760             (!alloc_profile_is_valid(bctl->meta.target, 1) ||
2761              (bctl->meta.target & ~allowed))) {
2762                 printk(KERN_ERR "btrfs: unable to start balance with target "
2763                        "metadata profile %llu\n",
2764                        (unsigned long long)bctl->meta.target);
2765                 ret = -EINVAL;
2766                 goto out;
2767         }
2768         if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2769             (!alloc_profile_is_valid(bctl->sys.target, 1) ||
2770              (bctl->sys.target & ~allowed))) {
2771                 printk(KERN_ERR "btrfs: unable to start balance with target "
2772                        "system profile %llu\n",
2773                        (unsigned long long)bctl->sys.target);
2774                 ret = -EINVAL;
2775                 goto out;
2776         }
2777
2778         /* allow dup'ed data chunks only in mixed mode */
2779         if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2780             (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
2781                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2782                 ret = -EINVAL;
2783                 goto out;
2784         }
2785
2786         /* allow to reduce meta or sys integrity only if force set */
2787         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2788                         BTRFS_BLOCK_GROUP_RAID10;
2789         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2790              (fs_info->avail_system_alloc_bits & allowed) &&
2791              !(bctl->sys.target & allowed)) ||
2792             ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2793              (fs_info->avail_metadata_alloc_bits & allowed) &&
2794              !(bctl->meta.target & allowed))) {
2795                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2796                         printk(KERN_INFO "btrfs: force reducing metadata "
2797                                "integrity\n");
2798                 } else {
2799                         printk(KERN_ERR "btrfs: balance will reduce metadata "
2800                                "integrity, use force if you want this\n");
2801                         ret = -EINVAL;
2802                         goto out;
2803                 }
2804         }
2805
2806         ret = insert_balance_item(fs_info->tree_root, bctl);
2807         if (ret && ret != -EEXIST)
2808                 goto out;
2809
2810         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2811                 BUG_ON(ret == -EEXIST);
2812                 set_balance_control(bctl);
2813         } else {
2814                 BUG_ON(ret != -EEXIST);
2815                 spin_lock(&fs_info->balance_lock);
2816                 update_balance_args(bctl);
2817                 spin_unlock(&fs_info->balance_lock);
2818         }
2819
2820         atomic_inc(&fs_info->balance_running);
2821         mutex_unlock(&fs_info->balance_mutex);
2822
2823         ret = __btrfs_balance(fs_info);
2824
2825         mutex_lock(&fs_info->balance_mutex);
2826         atomic_dec(&fs_info->balance_running);
2827
2828         if (bargs) {
2829                 memset(bargs, 0, sizeof(*bargs));
2830                 update_ioctl_balance_args(fs_info, 0, bargs);
2831         }
2832
2833         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2834             balance_need_close(fs_info)) {
2835                 __cancel_balance(fs_info);
2836         }
2837
2838         wake_up(&fs_info->balance_wait_q);
2839
2840         return ret;
2841 out:
2842         if (bctl->flags & BTRFS_BALANCE_RESUME)
2843                 __cancel_balance(fs_info);
2844         else
2845                 kfree(bctl);
2846         return ret;
2847 }
2848
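/* Kthread that resumes an interrupted balance found at mount time. */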
2849 static int balance_kthread(void *data)
2850 {
2851         struct btrfs_fs_info *fs_info = data;
2852         int ret = 0;
2853
2854         mutex_lock(&fs_info->volume_mutex);
2855         mutex_lock(&fs_info->balance_mutex);
2856
2857         if (fs_info->balance_ctl) {
2858                 printk(KERN_INFO "btrfs: continuing balance\n");
2859                 ret = btrfs_balance(fs_info->balance_ctl, NULL);
2860         }
2861
2862         mutex_unlock(&fs_info->balance_mutex);
2863         mutex_unlock(&fs_info->volume_mutex);
2864
2865         return ret;
2866 }
2867
2868 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
2869 {
2870         struct task_struct *tsk;
2871
2872         spin_lock(&fs_info->balance_lock);
2873         if (!fs_info->balance_ctl) {
2874                 spin_unlock(&fs_info->balance_lock);
2875                 return 0;
2876         }
2877         spin_unlock(&fs_info->balance_lock);
2878
2879         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2880                 printk(KERN_INFO "btrfs: force skipping balance\n");
2881                 return 0;
2882         }
2883
2884         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
2885         if (IS_ERR(tsk))
2886                 return PTR_ERR(tsk);
2887
2888         return 0;
2889 }
2890
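/*
 * Read the balance item left over from a previous mount and
 * re-establish the in-memory balance control with the RESUME flag set.
 */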
2891 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
2892 {
2893         struct btrfs_balance_control *bctl;
2894         struct btrfs_balance_item *item;
2895         struct btrfs_disk_balance_args disk_bargs;
2896         struct btrfs_path *path;
2897         struct extent_buffer *leaf;
2898         struct btrfs_key key;
2899         int ret;
2900
2901         path = btrfs_alloc_path();
2902         if (!path)
2903                 return -ENOMEM;
2904
2905         key.objectid = BTRFS_BALANCE_OBJECTID;
2906         key.type = BTRFS_BALANCE_ITEM_KEY;
2907         key.offset = 0;
2908
2909         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
2910         if (ret < 0)
2911                 goto out;
2912         if (ret > 0) { /* ret = -ENOENT; */
2913                 ret = 0;
2914                 goto out;
2915         }
2916
2917         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2918         if (!bctl) {
2919                 ret = -ENOMEM;
2920                 goto out;
2921         }
2922
2923         leaf = path->nodes[0];
2924         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2925
2926         bctl->fs_info = fs_info;
2927         bctl->flags = btrfs_balance_flags(leaf, item);
2928         bctl->flags |= BTRFS_BALANCE_RESUME;
2929
2930         btrfs_balance_data(leaf, item, &disk_bargs);
2931         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2932         btrfs_balance_meta(leaf, item, &disk_bargs);
2933         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2934         btrfs_balance_sys(leaf, item, &disk_bargs);
2935         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2936
2937         mutex_lock(&fs_info->volume_mutex);
2938         mutex_lock(&fs_info->balance_mutex);
2939
2940         set_balance_control(bctl);
2941
2942         mutex_unlock(&fs_info->balance_mutex);
2943         mutex_unlock(&fs_info->volume_mutex);
2944 out:
2945         btrfs_free_path(path);
2946         return ret;
2947 }
2948
2949 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2950 {
2951         int ret = 0;
2952
2953         mutex_lock(&fs_info->balance_mutex);
2954         if (!fs_info->balance_ctl) {
2955                 mutex_unlock(&fs_info->balance_mutex);
2956                 return -ENOTCONN;
2957         }
2958
2959         if (atomic_read(&fs_info->balance_running)) {
2960                 atomic_inc(&fs_info->balance_pause_req);
2961                 mutex_unlock(&fs_info->balance_mutex);
2962
2963                 wait_event(fs_info->balance_wait_q,
2964                            atomic_read(&fs_info->balance_running) == 0);
2965
2966                 mutex_lock(&fs_info->balance_mutex);
2967                 /* balance_ctl may have been freed while we waited; that's fine */
2968                 BUG_ON(atomic_read(&fs_info->balance_running));
2969                 atomic_dec(&fs_info->balance_pause_req);
2970         } else {
2971                 ret = -ENOTCONN;
2972         }
2973
2974         mutex_unlock(&fs_info->balance_mutex);
2975         return ret;
2976 }
2977
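/*
 * Illustrative sketch (not in the original source) of the request
 * handshake shared by btrfs_pause_balance() above and
 * btrfs_cancel_balance() below.  Both merely raise a request counter
 * and wait; the balance loop is assumed to poll these counters, stop
 * relocating, clear balance_running and wake balance_wait_q:
 *
 *   requester                             balance thread
 *   ---------                             --------------
 *   atomic_inc(&..._req)
 *   wait_event(balance_wait_q,       ...  sees the request, unwinds,
 *              balance_running == 0)      sets balance_running = 0,
 *   atomic_dec(&..._req)                  wakes balance_wait_q
 */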
2978 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2979 {
2980         mutex_lock(&fs_info->balance_mutex);
2981         if (!fs_info->balance_ctl) {
2982                 mutex_unlock(&fs_info->balance_mutex);
2983                 return -ENOTCONN;
2984         }
2985
2986         atomic_inc(&fs_info->balance_cancel_req);
2987         /*
2988          * if a balance is running, just wait for it to finish and return;
2989          * the balance item is deleted by btrfs_balance in that case
2990          */
2991         if (atomic_read(&fs_info->balance_running)) {
2992                 mutex_unlock(&fs_info->balance_mutex);
2993                 wait_event(fs_info->balance_wait_q,
2994                            atomic_read(&fs_info->balance_running) == 0);
2995                 mutex_lock(&fs_info->balance_mutex);
2996         } else {
2997                 /* __cancel_balance needs volume_mutex */
2998                 mutex_unlock(&fs_info->balance_mutex);
2999                 mutex_lock(&fs_info->volume_mutex);
3000                 mutex_lock(&fs_info->balance_mutex);
3001
3002                 if (fs_info->balance_ctl)
3003                         __cancel_balance(fs_info);
3004
3005                 mutex_unlock(&fs_info->volume_mutex);
3006         }
3007
3008         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3009         atomic_dec(&fs_info->balance_cancel_req);
3010         mutex_unlock(&fs_info->balance_mutex);
3011         return 0;
3012 }
3013
3014 /*
3015  * shrinking a device means finding all of the device extents past
3016  * the new size, and then following the back refs to the chunks.
3017  * The chunk relocation code actually frees the device extents.
3018  */
3019 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3020 {
3021         struct btrfs_trans_handle *trans;
3022         struct btrfs_root *root = device->dev_root;
3023         struct btrfs_dev_extent *dev_extent = NULL;
3024         struct btrfs_path *path;
3025         u64 length;
3026         u64 chunk_tree;
3027         u64 chunk_objectid;
3028         u64 chunk_offset;
3029         int ret;
3030         int slot;
3031         int failed = 0;
3032         bool retried = false;
3033         struct extent_buffer *l;
3034         struct btrfs_key key;
3035         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3036         u64 old_total = btrfs_super_total_bytes(super_copy);
3037         u64 old_size = device->total_bytes;
3038         u64 diff = device->total_bytes - new_size;
3039
3040         if (new_size >= device->total_bytes)
3041                 return -EINVAL;
3042
3043         path = btrfs_alloc_path();
3044         if (!path)
3045                 return -ENOMEM;
3046
3047         path->reada = 2;
3048
3049         lock_chunks(root);
3050
3051         device->total_bytes = new_size;
3052         if (device->writeable) {
3053                 device->fs_devices->total_rw_bytes -= diff;
3054                 spin_lock(&root->fs_info->free_chunk_lock);
3055                 root->fs_info->free_chunk_space -= diff;
3056                 spin_unlock(&root->fs_info->free_chunk_lock);
3057         }
3058         unlock_chunks(root);
3059
3060 again:
3061         key.objectid = device->devid;
3062         key.offset = (u64)-1;
3063         key.type = BTRFS_DEV_EXTENT_KEY;
3064
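        /*
         * Illustrative note (not in the original source): dev extent keys
         * are (devid, BTRFS_DEV_EXTENT_KEY, offset), so searching from
         * offset (u64)-1 and stepping back via btrfs_previous_item()
         * visits this device's extents in descending offset order, which
         * relocates the extents beyond new_size first.
         */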
3065         do {
3066                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3067                 if (ret < 0)
3068                         goto done;
3069
3070                 ret = btrfs_previous_item(root, path, 0, key.type);
3071                 if (ret < 0)
3072                         goto done;
3073                 if (ret) {
3074                         ret = 0;
3075                         btrfs_release_path(path);
3076                         break;
3077                 }
3078
3079                 l = path->nodes[0];
3080                 slot = path->slots[0];
3081                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
3082
3083                 if (key.objectid != device->devid) {
3084                         btrfs_release_path(path);
3085                         break;
3086                 }
3087
3088                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3089                 length = btrfs_dev_extent_length(l, dev_extent);
3090
3091                 if (key.offset + length <= new_size) {
3092                         btrfs_release_path(path);
3093                         break;
3094                 }
3095
3096                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3097                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3098                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3099                 btrfs_release_path(path);
3100
3101                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
3102                                            chunk_offset);
3103                 if (ret && ret != -ENOSPC)
3104                         goto done;
3105                 if (ret == -ENOSPC)
3106                         failed++;
3107         } while (key.offset-- > 0);
3108
3109         if (failed && !retried) {
3110                 failed = 0;
3111                 retried = true;
3112                 goto again;
3113         } else if (failed && retried) {
3114                 ret = -ENOSPC;
3115                 lock_chunks(root);
3116
3117                 device->total_bytes = old_size;
3118                 if (device->writeable)
3119                         device->fs_devices->total_rw_bytes += diff;
3120                 spin_lock(&root->fs_info->free_chunk_lock);
3121                 root->fs_info->free_chunk_space += diff;
3122                 spin_unlock(&root->fs_info->free_chunk_lock);
3123                 unlock_chunks(root);
3124                 goto done;
3125         }
3126
3127         /* Shrinking succeeded, else we would be at "done". */
3128         trans = btrfs_start_transaction(root, 0);
3129         if (IS_ERR(trans)) {
3130                 ret = PTR_ERR(trans);
3131                 goto done;
3132         }
3133
3134         lock_chunks(root);
3135
3136         device->disk_total_bytes = new_size;
3137         /* Now btrfs_update_device() will change the on-disk size. */
3138         ret = btrfs_update_device(trans, device);
3139         if (ret) {
3140                 unlock_chunks(root);
3141                 btrfs_end_transaction(trans, root);
3142                 goto done;
3143         }
3144         WARN_ON(diff > old_total);
3145         btrfs_set_super_total_bytes(super_copy, old_total - diff);
3146         unlock_chunks(root);
3147         btrfs_end_transaction(trans, root);
3148 done:
3149         btrfs_free_path(path);
3150         return ret;
3151 }
3152
3153 static int btrfs_add_system_chunk(struct btrfs_root *root,
3154                            struct btrfs_key *key,
3155                            struct btrfs_chunk *chunk, int item_size)
3156 {
3157         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3158         struct btrfs_disk_key disk_key;
3159         u32 array_size;
3160         u8 *ptr;
3161
3162         array_size = btrfs_super_sys_array_size(super_copy);
3163         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3164                 return -EFBIG;
3165
3166         ptr = super_copy->sys_chunk_array + array_size;
3167         btrfs_cpu_key_to_disk(&disk_key, key);
3168         memcpy(ptr, &disk_key, sizeof(disk_key));
3169         ptr += sizeof(disk_key);
3170         memcpy(ptr, chunk, item_size);
3171         item_size += sizeof(disk_key);
3172         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3173         return 0;
3174 }
3175
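/*
 * Illustrative layout (a sketch, not from the original source) of
 * super_copy->sys_chunk_array as btrfs_add_system_chunk() appends
 * entries; item_size already includes the stripes of the chunk:
 *
 *   [disk_key 0][chunk 0 + stripes] ... [disk_key N][chunk N + stripes]
 *
 * sys_array_size grows by sizeof(struct btrfs_disk_key) + item_size per
 * call, and -EFBIG is returned once the array would exceed
 * BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
 */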
3176 /*
3177  * sort the devices in descending order by max_avail, total_avail
3178  */
3179 static int btrfs_cmp_device_info(const void *a, const void *b)
3180 {
3181         const struct btrfs_device_info *di_a = a;
3182         const struct btrfs_device_info *di_b = b;
3183
3184         if (di_a->max_avail > di_b->max_avail)
3185                 return -1;
3186         if (di_a->max_avail < di_b->max_avail)
3187                 return 1;
3188         if (di_a->total_avail > di_b->total_avail)
3189                 return -1;
3190         if (di_a->total_avail < di_b->total_avail)
3191                 return 1;
3192         return 0;
3193 }
3194
3195 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3196                                struct btrfs_root *extent_root,
3197                                struct map_lookup **map_ret,
3198                                u64 *num_bytes_out, u64 *stripe_size_out,
3199                                u64 start, u64 type)
3200 {
3201         struct btrfs_fs_info *info = extent_root->fs_info;
3202         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3203         struct list_head *cur;
3204         struct map_lookup *map = NULL;
3205         struct extent_map_tree *em_tree;
3206         struct extent_map *em;
3207         struct btrfs_device_info *devices_info = NULL;
3208         u64 total_avail;
3209         int num_stripes;        /* total number of stripes to allocate */
3210         int sub_stripes;        /* sub_stripes info for map */
3211         int dev_stripes;        /* stripes per dev */
3212         int devs_max;           /* max devs to use */
3213         int devs_min;           /* min devs needed */
3214         int devs_increment;     /* ndevs has to be a multiple of this */
3215         int ncopies;            /* how many copies of the data there are */
3216         int ret;
3217         u64 max_stripe_size;
3218         u64 max_chunk_size;
3219         u64 stripe_size;
3220         u64 num_bytes;
3221         int ndevs;
3222         int i;
3223         int j;
3224
3225         BUG_ON(!alloc_profile_is_valid(type, 0));
3226
3227         if (list_empty(&fs_devices->alloc_list))
3228                 return -ENOSPC;
3229
3230         sub_stripes = 1;
3231         dev_stripes = 1;
3232         devs_increment = 1;
3233         ncopies = 1;
3234         devs_max = 0;   /* 0 == as many as possible */
3235         devs_min = 1;
3236
3237         /*
3238          * define the properties of each RAID type.
3239          * FIXME: move this to a global table and use it in all RAID
3240          * calculation code
3241          */
3242         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
3243                 dev_stripes = 2;
3244                 ncopies = 2;
3245                 devs_max = 1;
3246         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3247                 devs_min = 2;
3248         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3249                 devs_increment = 2;
3250                 ncopies = 2;
3251                 devs_max = 2;
3252                 devs_min = 2;
3253         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3254                 sub_stripes = 2;
3255                 devs_increment = 2;
3256                 ncopies = 2;
3257                 devs_min = 4;
3258         } else {
3259                 devs_max = 1;
3260         }
3261
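        /*
         * Illustrative summary (not in the original source) of the values
         * chosen above; devs_max == 0 means "as many as possible":
         *
         *   profile  dev_stripes  sub_stripes  devs_increment  ncopies  devs_min  devs_max
         *   DUP           2            1             1            2        1         1
         *   RAID0         1            1             1            1        2         0
         *   RAID1         1            1             2            2        2         2
         *   RAID10        1            2             2            2        4         0
         *   single        1            1             1            1        1         1
         */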
3262         if (type & BTRFS_BLOCK_GROUP_DATA) {
3263                 max_stripe_size = 1024 * 1024 * 1024;
3264                 max_chunk_size = 10 * max_stripe_size;
3265         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3266                 /* for larger filesystems, use larger metadata chunks */
3267                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3268                         max_stripe_size = 1024 * 1024 * 1024;
3269                 else
3270                         max_stripe_size = 256 * 1024 * 1024;
3271                 max_chunk_size = max_stripe_size;
3272         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3273                 max_stripe_size = 32 * 1024 * 1024;
3274                 max_chunk_size = 2 * max_stripe_size;
3275         } else {
3276                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3277                        type);
3278                 BUG_ON(1);
3279         }
3280
3281         /* we don't want a chunk larger than 10% of writeable space */
3282         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3283                              max_chunk_size);
3284
3285         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3286                                GFP_NOFS);
3287         if (!devices_info)
3288                 return -ENOMEM;
3289
3290         cur = fs_devices->alloc_list.next;
3291
3292         /*
3293          * in the first pass through the devices list, we gather information
3294          * about the available holes on each device.
3295          */
3296         ndevs = 0;
3297         while (cur != &fs_devices->alloc_list) {
3298                 struct btrfs_device *device;
3299                 u64 max_avail;
3300                 u64 dev_offset;
3301
3302                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3303
3304                 cur = cur->next;
3305
3306                 if (!device->writeable) {
3307                         printk(KERN_ERR
3308                                "btrfs: read-only device in alloc_list\n");
3309                         WARN_ON(1);
3310                         continue;
3311                 }
3312
3313                 if (!device->in_fs_metadata)
3314                         continue;
3315
3316                 if (device->total_bytes > device->bytes_used)
3317                         total_avail = device->total_bytes - device->bytes_used;
3318                 else
3319                         total_avail = 0;
3320
3321                 /* If there is no space on this device, skip it. */
3322                 if (total_avail == 0)
3323                         continue;
3324
3325                 ret = find_free_dev_extent(device,
3326                                            max_stripe_size * dev_stripes,
3327                                            &dev_offset, &max_avail);
3328                 if (ret && ret != -ENOSPC)
3329                         goto error;
3330
3331                 if (ret == 0)
3332                         max_avail = max_stripe_size * dev_stripes;
3333
3334                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3335                         continue;
3336
3337                 devices_info[ndevs].dev_offset = dev_offset;
3338                 devices_info[ndevs].max_avail = max_avail;
3339                 devices_info[ndevs].total_avail = total_avail;
3340                 devices_info[ndevs].dev = device;
3341                 ++ndevs;
3342         }
3343
3344         /*
3345          * now sort the devices by hole size / available space
3346          */
3347         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3348              btrfs_cmp_device_info, NULL);
3349
3350         /* round down to number of usable stripes */
3351         ndevs -= ndevs % devs_increment;
3352
3353         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3354                 ret = -ENOSPC;
3355                 goto error;
3356         }
3357
3358         if (devs_max && ndevs > devs_max)
3359                 ndevs = devs_max;
3360         /*
3361          * the primary goal is to maximize the number of stripes, so use as many
3362          * devices as possible, even if the stripes are not maximum sized.
3363          */
3364         stripe_size = devices_info[ndevs-1].max_avail;
3365         num_stripes = ndevs * dev_stripes;
3366
3367         if (stripe_size * ndevs > max_chunk_size * ncopies) {
3368                 stripe_size = max_chunk_size * ncopies;
3369                 do_div(stripe_size, ndevs);
3370         }
3371
3372         do_div(stripe_size, dev_stripes);
3373
3374         /* align to BTRFS_STRIPE_LEN */
3375         do_div(stripe_size, BTRFS_STRIPE_LEN);
3376         stripe_size *= BTRFS_STRIPE_LEN;
3377
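        /*
         * Worked example (illustrative): a DATA chunk (ncopies = 1,
         * dev_stripes = 1) over 4 devices that each have at least 1GB
         * free.  stripe_size starts at devices_info[3].max_avail = 1GB
         * (capped by max_stripe_size); 1GB * 4 devs does not exceed
         * max_chunk_size * ncopies (10GB), so after alignment the chunk
         * covers num_stripes * stripe_size = 4GB of logical space.
         */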
3378         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3379         if (!map) {
3380                 ret = -ENOMEM;
3381                 goto error;
3382         }
3383         map->num_stripes = num_stripes;
3384
3385         for (i = 0; i < ndevs; ++i) {
3386                 for (j = 0; j < dev_stripes; ++j) {
3387                         int s = i * dev_stripes + j;
3388                         map->stripes[s].dev = devices_info[i].dev;
3389                         map->stripes[s].physical = devices_info[i].dev_offset +
3390                                                    j * stripe_size;
3391                 }
3392         }
3393         map->sector_size = extent_root->sectorsize;
3394         map->stripe_len = BTRFS_STRIPE_LEN;
3395         map->io_align = BTRFS_STRIPE_LEN;
3396         map->io_width = BTRFS_STRIPE_LEN;
3397         map->type = type;
3398         map->sub_stripes = sub_stripes;
3399
3400         *map_ret = map;
3401         num_bytes = stripe_size * (num_stripes / ncopies);
3402
3403         *stripe_size_out = stripe_size;
3404         *num_bytes_out = num_bytes;
3405
3406         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3407
3408         em = alloc_extent_map();
3409         if (!em) {
3410                 ret = -ENOMEM;
3411                 goto error;
3412         }
3413         em->bdev = (struct block_device *)map;
3414         em->start = start;
3415         em->len = num_bytes;
3416         em->block_start = 0;
3417         em->block_len = em->len;
3418
3419         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3420         write_lock(&em_tree->lock);
3421         ret = add_extent_mapping(em_tree, em);
3422         write_unlock(&em_tree->lock);
3423         free_extent_map(em);
3424         if (ret)
3425                 goto error;
3426
3427         ret = btrfs_make_block_group(trans, extent_root, 0, type,
3428                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3429                                      start, num_bytes);
3430         if (ret)
3431                 goto error;
3432
3433         for (i = 0; i < map->num_stripes; ++i) {
3434                 struct btrfs_device *device;
3435                 u64 dev_offset;
3436
3437                 device = map->stripes[i].dev;
3438                 dev_offset = map->stripes[i].physical;
3439
3440                 ret = btrfs_alloc_dev_extent(trans, device,
3441                                 info->chunk_root->root_key.objectid,
3442                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3443                                 start, dev_offset, stripe_size);
3444                 if (ret) {
3445                         btrfs_abort_transaction(trans, extent_root, ret);
3446                         goto error;
3447                 }
3448         }
3449
3450         kfree(devices_info);
3451         return 0;
3452
3453 error:
3454         kfree(map);
3455         kfree(devices_info);
3456         return ret;
3457 }
3458
3459 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
3460                                 struct btrfs_root *extent_root,
3461                                 struct map_lookup *map, u64 chunk_offset,
3462                                 u64 chunk_size, u64 stripe_size)
3463 {
3464         u64 dev_offset;
3465         struct btrfs_key key;
3466         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3467         struct btrfs_device *device;
3468         struct btrfs_chunk *chunk;
3469         struct btrfs_stripe *stripe;
3470         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
3471         int index = 0;
3472         int ret;
3473
3474         chunk = kzalloc(item_size, GFP_NOFS);
3475         if (!chunk)
3476                 return -ENOMEM;
3477
3478         index = 0;
3479         while (index < map->num_stripes) {
3480                 device = map->stripes[index].dev;
3481                 device->bytes_used += stripe_size;
3482                 ret = btrfs_update_device(trans, device);
3483                 if (ret)
3484                         goto out_free;
3485                 index++;
3486         }
3487
3488         spin_lock(&extent_root->fs_info->free_chunk_lock);
3489         extent_root->fs_info->free_chunk_space -= (stripe_size *
3490                                                    map->num_stripes);
3491         spin_unlock(&extent_root->fs_info->free_chunk_lock);
3492
3493         index = 0;
3494         stripe = &chunk->stripe;
3495         while (index < map->num_stripes) {
3496                 device = map->stripes[index].dev;
3497                 dev_offset = map->stripes[index].physical;
3498
3499                 btrfs_set_stack_stripe_devid(stripe, device->devid);
3500                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
3501                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
3502                 stripe++;
3503                 index++;
3504         }
3505
3506         btrfs_set_stack_chunk_length(chunk, chunk_size);
3507         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
3508         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
3509         btrfs_set_stack_chunk_type(chunk, map->type);
3510         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
3511         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
3512         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
3513         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
3514         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
3515
3516         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3517         key.type = BTRFS_CHUNK_ITEM_KEY;
3518         key.offset = chunk_offset;
3519
3520         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
3521
3522         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3523                 /*
3524                  * TODO: Cleanup of inserted chunk root in case of
3525                  * failure.
3526                  */
3527                 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
3528                                              item_size);
3529         }
3530
3531 out_free:
3532         kfree(chunk);
3533         return ret;
3534 }
3535
3536 /*
3537  * Chunk allocation falls into two parts. The first part does the work
3538  * that makes the newly allocated chunk usable, without performing any
3539  * operation that modifies the chunk tree. The second part does the work
3540  * that requires modifying the chunk tree. This division is important for
3541  * the bootstrap process of adding storage to a seed btrfs.
3542  */
3543 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3544                       struct btrfs_root *extent_root, u64 type)
3545 {
3546         u64 chunk_offset;
3547         u64 chunk_size;
3548         u64 stripe_size;
3549         struct map_lookup *map;
3550         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3551         int ret;
3552
3553         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3554                               &chunk_offset);
3555         if (ret)
3556                 return ret;
3557
3558         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3559                                   &stripe_size, chunk_offset, type);
3560         if (ret)
3561                 return ret;
3562
3563         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3564                                    chunk_size, stripe_size);
3565         if (ret)
3566                 return ret;
3567         return 0;
3568 }
3569
3570 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3571                                          struct btrfs_root *root,
3572                                          struct btrfs_device *device)
3573 {
3574         u64 chunk_offset;
3575         u64 sys_chunk_offset;
3576         u64 chunk_size;
3577         u64 sys_chunk_size;
3578         u64 stripe_size;
3579         u64 sys_stripe_size;
3580         u64 alloc_profile;
3581         struct map_lookup *map;
3582         struct map_lookup *sys_map;
3583         struct btrfs_fs_info *fs_info = root->fs_info;
3584         struct btrfs_root *extent_root = fs_info->extent_root;
3585         int ret;
3586
3587         ret = find_next_chunk(fs_info->chunk_root,
3588                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
3589         if (ret)
3590                 return ret;
3591
3592         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
3593                                 fs_info->avail_metadata_alloc_bits;
3594         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3595
3596         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3597                                   &stripe_size, chunk_offset, alloc_profile);
3598         if (ret)
3599                 return ret;
3600
3601         sys_chunk_offset = chunk_offset + chunk_size;
3602
3603         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
3604                                 fs_info->avail_system_alloc_bits;
3605         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3606
3607         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3608                                   &sys_chunk_size, &sys_stripe_size,
3609                                   sys_chunk_offset, alloc_profile);
3610         if (ret)
3611                 goto abort;
3612
3613         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3614         if (ret)
3615                 goto abort;
3616
3617         /*
3618          * Modifying the chunk tree requires allocating new blocks from
3619          * both the system block group and the metadata block group, so
3620          * we can only perform operations that modify the chunk tree
3621          * after both block groups have been created.
3622          */
3623         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3624                                    chunk_size, stripe_size);
3625         if (ret)
3626                 goto abort;
3627
3628         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3629                                    sys_chunk_offset, sys_chunk_size,
3630                                    sys_stripe_size);
3631         if (ret)
3632                 goto abort;
3633
3634         return 0;
3635
3636 abort:
3637         btrfs_abort_transaction(trans, root, ret);
3638         return ret;
3639 }
3640
3641 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
3642 {
3643         struct extent_map *em;
3644         struct map_lookup *map;
3645         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3646         int readonly = 0;
3647         int i;
3648
3649         read_lock(&map_tree->map_tree.lock);
3650         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3651         read_unlock(&map_tree->map_tree.lock);
3652         if (!em)
3653                 return 1;
3654
3655         if (btrfs_test_opt(root, DEGRADED)) {
3656                 free_extent_map(em);
3657                 return 0;
3658         }
3659
3660         map = (struct map_lookup *)em->bdev;
3661         for (i = 0; i < map->num_stripes; i++) {
3662                 if (!map->stripes[i].dev->writeable) {
3663                         readonly = 1;
3664                         break;
3665                 }
3666         }
3667         free_extent_map(em);
3668         return readonly;
3669 }
3670
3671 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
3672 {
3673         extent_map_tree_init(&tree->map_tree);
3674 }
3675
3676 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3677 {
3678         struct extent_map *em;
3679
3680         while (1) {
3681                 write_lock(&tree->map_tree.lock);
3682                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
3683                 if (em)
3684                         remove_extent_mapping(&tree->map_tree, em);
3685                 write_unlock(&tree->map_tree.lock);
3686                 if (!em)
3687                         break;
3688                 kfree(em->bdev);
3689                 /* once for us */
3690                 free_extent_map(em);
3691                 /* once for the tree */
3692                 free_extent_map(em);
3693         }
3694 }
3695
3696 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3697 {
3698         struct extent_map *em;
3699         struct map_lookup *map;
3700         struct extent_map_tree *em_tree = &map_tree->map_tree;
3701         int ret;
3702
3703         read_lock(&em_tree->lock);
3704         em = lookup_extent_mapping(em_tree, logical, len);
3705         read_unlock(&em_tree->lock);
3706         BUG_ON(!em);
3707
3708         BUG_ON(em->start > logical || em->start + em->len < logical);
3709         map = (struct map_lookup *)em->bdev;
3710         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
3711                 ret = map->num_stripes;
3712         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3713                 ret = map->sub_stripes;
3714         else
3715                 ret = 1;
3716         free_extent_map(em);
3717         return ret;
3718 }
3719
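/*
 * Usage sketch (illustrative, not from the original source): callers
 * such as the read-repair path treat the copy count as the number of
 * mirrors worth trying:
 *
 *   num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 *                                 logical, len);
 *   for (mirror = 1; mirror <= num_copies; mirror++)
 *           ...re-submit the read with mirror_num = mirror...
 *
 * RAID1 and DUP report num_stripes (2), RAID10 reports sub_stripes (2),
 * and everything else reports a single copy.
 */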
3720 static int find_live_mirror(struct map_lookup *map, int first, int num,
3721                             int optimal)
3722 {
3723         int i;
3724         if (map->stripes[optimal].dev->bdev)
3725                 return optimal;
3726         for (i = first; i < first + num; i++) {
3727                 if (map->stripes[i].dev->bdev)
3728                         return i;
3729         }
3730         /* we couldn't find a live mirror.  Just return something and
3731          * let the I/O error handling code clean up eventually
3732          */
3733         return optimal;
3734 }
3735
3736 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3737                              u64 logical, u64 *length,
3738                              struct btrfs_bio **bbio_ret,
3739                              int mirror_num)
3740 {
3741         struct extent_map *em;
3742         struct map_lookup *map;
3743         struct extent_map_tree *em_tree = &map_tree->map_tree;
3744         u64 offset;
3745         u64 stripe_offset;
3746         u64 stripe_end_offset;
3747         u64 stripe_nr;
3748         u64 stripe_nr_orig;
3749         u64 stripe_nr_end;
3750         int stripe_index;
3751         int i;
3752         int ret = 0;
3753         int num_stripes;
3754         int max_errors = 0;
3755         struct btrfs_bio *bbio = NULL;
3756
3757         read_lock(&em_tree->lock);
3758         em = lookup_extent_mapping(em_tree, logical, *length);
3759         read_unlock(&em_tree->lock);
3760
3761         if (!em) {
3762                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
3763                        (unsigned long long)logical,
3764                        (unsigned long long)*length);
3765                 BUG();
3766         }
3767
3768         BUG_ON(em->start > logical || em->start + em->len < logical);
3769         map = (struct map_lookup *)em->bdev;
3770         offset = logical - em->start;
3771
3772         if (mirror_num > map->num_stripes)
3773                 mirror_num = 0;
3774
3775         stripe_nr = offset;
3776         /*
3777          * stripe_nr counts the total number of stripes we have to stride
3778          * to get to this block
3779          */
3780         do_div(stripe_nr, map->stripe_len);
3781
3782         stripe_offset = stripe_nr * map->stripe_len;
3783         BUG_ON(offset < stripe_offset);
3784
3785         /* stripe_offset is the offset of this block in its stripe */
3786         stripe_offset = offset - stripe_offset;
3787
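        /*
         * Worked example (illustrative): with stripe_len = 64K and
         * offset = 200K into the chunk, do_div leaves stripe_nr = 3
         * and stripe_offset = 200K - 3 * 64K = 8K into that stripe.
         */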
3788         if (rw & REQ_DISCARD)
3789                 *length = min_t(u64, em->len - offset, *length);
3790         else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
3791                 /* we limit the length of each bio to what fits in a stripe */
3792                 *length = min_t(u64, em->len - offset,
3793                                 map->stripe_len - stripe_offset);
3794         } else {
3795                 *length = em->len - offset;
3796         }
3797
3798         if (!bbio_ret)
3799                 goto out;
3800
3801         num_stripes = 1;
3802         stripe_index = 0;
3803         stripe_nr_orig = stripe_nr;
3804         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3805                         (~(map->stripe_len - 1));
3806         do_div(stripe_nr_end, map->stripe_len);
3807         stripe_end_offset = stripe_nr_end * map->stripe_len -
3808                             (offset + *length);
3809         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3810                 if (rw & REQ_DISCARD)
3811                         num_stripes = min_t(u64, map->num_stripes,
3812                                             stripe_nr_end - stripe_nr_orig);
3813                 stripe_index = do_div(stripe_nr, map->num_stripes);
3814         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3815                 if (rw & (REQ_WRITE | REQ_DISCARD))
3816                         num_stripes = map->num_stripes;
3817                 else if (mirror_num)
3818                         stripe_index = mirror_num - 1;
3819                 else {
3820                         stripe_index = find_live_mirror(map, 0,
3821                                             map->num_stripes,
3822                                             current->pid % map->num_stripes);
3823                         mirror_num = stripe_index + 1;
3824                 }
3825
3826         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3827                 if (rw & (REQ_WRITE | REQ_DISCARD)) {
3828                         num_stripes = map->num_stripes;
3829                 } else if (mirror_num) {
3830                         stripe_index = mirror_num - 1;
3831                 } else {
3832                         mirror_num = 1;
3833                 }
3834
3835         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3836                 int factor = map->num_stripes / map->sub_stripes;
3837
3838                 stripe_index = do_div(stripe_nr, factor);
3839                 stripe_index *= map->sub_stripes;
3840
3841                 if (rw & REQ_WRITE)
3842                         num_stripes = map->sub_stripes;
3843                 else if (rw & REQ_DISCARD)
3844                         num_stripes = min_t(u64, map->sub_stripes *
3845                                             (stripe_nr_end - stripe_nr_orig),
3846                                             map->num_stripes);
3847                 else if (mirror_num)
3848                         stripe_index += mirror_num - 1;
3849                 else {
3850                         int old_stripe_index = stripe_index;
3851                         stripe_index = find_live_mirror(map, stripe_index,
3852                                               map->sub_stripes, stripe_index +
3853                                               current->pid % map->sub_stripes);
3854                         mirror_num = stripe_index - old_stripe_index + 1;
3855                 }
3856         } else {
3857                 /*
3858                  * after this do_div call, stripe_nr is the number of stripes
3859                  * on this device we have to walk to find the data, and
3860                  * stripe_index is the number of our device in the stripe array
3861                  */
3862                 stripe_index = do_div(stripe_nr, map->num_stripes);
3863                 mirror_num = stripe_index + 1;
3864         }
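        /*
         * Worked example (illustrative) for the RAID10 branch above:
         * with 4 stripes and sub_stripes = 2, factor = 2.  For
         * stripe_nr = 5, do_div leaves stripe_nr = 2 with remainder 1,
         * so stripe_index = 1 * 2 = 2: the block lives in the second
         * mirror pair, two full stripes down on each of its devices.
         */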
3865         BUG_ON(stripe_index >= map->num_stripes);
3866
3867         bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3868         if (!bbio) {
3869                 ret = -ENOMEM;
3870                 goto out;
3871         }
3872         atomic_set(&bbio->error, 0);
3873
3874         if (rw & REQ_DISCARD) {
3875                 int factor = 0;
3876                 int sub_stripes = 0;
3877                 u64 stripes_per_dev = 0;
3878                 u32 remaining_stripes = 0;
3879                 u32 last_stripe = 0;
3880
3881                 if (map->type &
3882                     (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3883                         if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3884                                 sub_stripes = 1;
3885                         else
3886                                 sub_stripes = map->sub_stripes;
3887
3888                         factor = map->num_stripes / sub_stripes;
3889                         stripes_per_dev = div_u64_rem(stripe_nr_end -
3890                                                       stripe_nr_orig,
3891                                                       factor,
3892                                                       &remaining_stripes);
3893                         div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
3894                         last_stripe *= sub_stripes;
3895                 }
3896
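                /*
                 * Worked example (illustrative): discarding 320K at
                 * offset 128K from a 3-device RAID0 chunk with 64K
                 * stripes gives stripe_nr_orig = 2 and stripe_nr_end = 7,
                 * so stripes_per_dev = (7 - 2) / 3 = 1 with 2 remaining:
                 * the first two stripes issued get 128K each and the
                 * third gets 64K, 320K in total.
                 */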
3897                 for (i = 0; i < num_stripes; i++) {
3898                         bbio->stripes[i].physical =
3899                                 map->stripes[stripe_index].physical +
3900                                 stripe_offset + stripe_nr * map->stripe_len;
3901                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3902
3903                         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3904                                          BTRFS_BLOCK_GROUP_RAID10)) {
3905                                 bbio->stripes[i].length = stripes_per_dev *
3906                                                           map->stripe_len;
3907
3908                                 if (i / sub_stripes < remaining_stripes)
3909                                         bbio->stripes[i].length +=
3910                                                 map->stripe_len;
3911
3912                                 /*
3913                                  * Special for the first stripe and
3914                                  * the last stripe:
3915                                  *
3916                                  * |-------|...|-------|
3917                                  *     |----------|
3918                                  *    off     end_off
3919                                  */
3920                                 if (i < sub_stripes)
3921                                         bbio->stripes[i].length -=
3922                                                 stripe_offset;
3923
3924                                 if (stripe_index >= last_stripe &&
3925                                     stripe_index <= (last_stripe +
3926                                                      sub_stripes - 1))
3927                                         bbio->stripes[i].length -=
3928                                                 stripe_end_offset;
3929
3930                                 if (i == sub_stripes - 1)
3931                                         stripe_offset = 0;
3932                         } else
3933                                 bbio->stripes[i].length = *length;
3934
3935                         stripe_index++;
3936                         if (stripe_index == map->num_stripes) {
3937                                 /* This could only happen for RAID0/10 */
3938                                 stripe_index = 0;
3939                                 stripe_nr++;
3940                         }
3941                 }
3942         } else {
3943                 for (i = 0; i < num_stripes; i++) {
3944                         bbio->stripes[i].physical =
3945                                 map->stripes[stripe_index].physical +
3946                                 stripe_offset +
3947                                 stripe_nr * map->stripe_len;
3948                         bbio->stripes[i].dev =
3949                                 map->stripes[stripe_index].dev;
3950                         stripe_index++;
3951                 }
3952         }
3953
3954         if (rw & REQ_WRITE) {
3955                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3956                                  BTRFS_BLOCK_GROUP_RAID10 |
3957                                  BTRFS_BLOCK_GROUP_DUP)) {
3958                         max_errors = 1;
3959                 }
3960         }
3961
3962         *bbio_ret = bbio;
3963         bbio->num_stripes = num_stripes;
3964         bbio->max_errors = max_errors;
3965         bbio->mirror_num = mirror_num;
3966 out:
3967         free_extent_map(em);
3968         return ret;
3969 }
3970
3971 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3972                       u64 logical, u64 *length,
3973                       struct btrfs_bio **bbio_ret, int mirror_num)
3974 {
3975         return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3976                                  mirror_num);
3977 }
3978
3979 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3980                      u64 chunk_start, u64 physical, u64 devid,
3981                      u64 **logical, int *naddrs, int *stripe_len)
3982 {
3983         struct extent_map_tree *em_tree = &map_tree->map_tree;
3984         struct extent_map *em;
3985         struct map_lookup *map;
3986         u64 *buf;
3987         u64 bytenr;
3988         u64 length;
3989         u64 stripe_nr;
3990         int i, j, nr = 0;
3991
3992         read_lock(&em_tree->lock);
3993         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3994         read_unlock(&em_tree->lock);
3995
3996         BUG_ON(!em || em->start != chunk_start);
3997         map = (struct map_lookup *)em->bdev;
3998
3999         length = em->len;
4000         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4001                 do_div(length, map->num_stripes / map->sub_stripes);
4002         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4003                 do_div(length, map->num_stripes);
4004
4005         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4006         BUG_ON(!buf); /* -ENOMEM */
4007
4008         for (i = 0; i < map->num_stripes; i++) {
4009                 if (devid && map->stripes[i].dev->devid != devid)
4010                         continue;
4011                 if (map->stripes[i].physical > physical ||
4012                     map->stripes[i].physical + length <= physical)
4013                         continue;
4014
4015                 stripe_nr = physical - map->stripes[i].physical;
4016                 do_div(stripe_nr, map->stripe_len);
4017
4018                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
4019                         stripe_nr = stripe_nr * map->num_stripes + i;
4020                         do_div(stripe_nr, map->sub_stripes);
4021                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4022                         stripe_nr = stripe_nr * map->num_stripes + i;
4023                 }
4024                 bytenr = chunk_start + stripe_nr * map->stripe_len;
4025                 WARN_ON(nr >= map->num_stripes);
4026                 for (j = 0; j < nr; j++) {
4027                         if (buf[j] == bytenr)
4028                                 break;
4029                 }
4030                 if (j == nr) {
4031                         WARN_ON(nr >= map->num_stripes);
4032                         buf[nr++] = bytenr;
4033                 }
4034         }
4035
4036         *logical = buf;
4037         *naddrs = nr;
4038         *stripe_len = map->stripe_len;
4039
4040         free_extent_map(em);
4041         return 0;
4042 }
4043
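/*
 * Worked example (illustrative) for btrfs_rmap_block() on RAID0 with
 * num_stripes = 2 and stripe_len = 64K: a physical address 3 stripes
 * into device i maps back to logical stripe 3 * 2 + i, i.e.
 * bytenr = chunk_start + (6 + i) * 64K.
 */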
4044 static void *merge_stripe_index_into_bio_private(void *bi_private,
4045                                                  unsigned int stripe_index)
4046 {
4047         /*
4048          * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
4049          * at most 1.
4050          * The alternative solution (instead of stealing bits from the
4051          * pointer) would be to allocate an intermediate structure
4052          * that contains the old private pointer plus the stripe_index.
4053          */
4054         BUG_ON((((uintptr_t)bi_private) & 3) != 0);
4055         BUG_ON(stripe_index > 3);
4056         return (void *)(((uintptr_t)bi_private) | stripe_index);
4057 }
4058
4059 static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
4060 {
4061         return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
4062 }
4063
4064 static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
4065 {
4066         return (unsigned int)((uintptr_t)bi_private) & 3;
4067 }
4068
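/*
 * Round-trip sketch (illustrative, not part of the original source):
 *
 *   void *p = merge_stripe_index_into_bio_private(bbio, 2);
 *   extract_bbio_from_bio_private(p)          == bbio
 *   extract_stripe_index_from_bio_private(p)  == 2
 *
 * This relies on bbio being at least 4-byte aligned (kzalloc guarantees
 * that), which leaves the low two pointer bits free for the stripe index.
 */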
4069 static void btrfs_end_bio(struct bio *bio, int err)
4070 {
4071         struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
4072         int is_orig_bio = 0;
4073
4074         if (err) {
4075                 atomic_inc(&bbio->error);
4076                 if (err == -EIO || err == -EREMOTEIO) {
4077                         unsigned int stripe_index =
4078                                 extract_stripe_index_from_bio_private(
4079                                         bio->bi_private);
4080                         struct btrfs_device *dev;
4081
4082                         BUG_ON(stripe_index >= bbio->num_stripes);
4083                         dev = bbio->stripes[stripe_index].dev;
4084                         if (dev->bdev) {
4085                                 if (bio->bi_rw & WRITE)
4086                                         btrfs_dev_stat_inc(dev,
4087                                                 BTRFS_DEV_STAT_WRITE_ERRS);
4088                                 else
4089                                         btrfs_dev_stat_inc(dev,
4090                                                 BTRFS_DEV_STAT_READ_ERRS);
4091                                 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
4092                                         btrfs_dev_stat_inc(dev,
4093                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
4094                                 btrfs_dev_stat_print_on_error(dev);
4095                         }
4096                 }
4097         }
4098
4099         if (bio == bbio->orig_bio)
4100                 is_orig_bio = 1;
4101
4102         if (atomic_dec_and_test(&bbio->stripes_pending)) {
4103                 if (!is_orig_bio) {
4104                         bio_put(bio);
4105                         bio = bbio->orig_bio;
4106                 }
4107                 bio->bi_private = bbio->private;
4108                 bio->bi_end_io = bbio->end_io;
4109                 bio->bi_bdev = (struct block_device *)
4110                                         (unsigned long)bbio->mirror_num;
4111                 /* only send an error to the higher layers if it is
4112                  * beyond the tolerance of the multi-bio
4113                  */
4114                 if (atomic_read(&bbio->error) > bbio->max_errors) {
4115                         err = -EIO;
4116                 } else {
4117                         /*
4118                          * this bio is actually up to date, we didn't
4119                          * go over the max number of errors
4120                          */
4121                         set_bit(BIO_UPTODATE, &bio->bi_flags);
4122                         err = 0;
4123                 }
4124                 kfree(bbio);
4125
4126                 bio_endio(bio, err);
4127         } else if (!is_orig_bio) {
4128                 bio_put(bio);
4129         }
4130 }
4131
4132 struct async_sched {
4133         struct bio *bio;
4134         int rw;
4135         struct btrfs_fs_info *info;
4136         struct btrfs_work work;
4137 };
4138
4139 /*
4140  * see run_scheduled_bios for a description of why bios are collected for
4141  * async submit.
4142  *
4143  * This will add one bio to the pending list for a device and make sure
4144  * the work struct is scheduled.
4145  */
4146 static noinline void schedule_bio(struct btrfs_root *root,
4147                                  struct btrfs_device *device,
4148                                  int rw, struct bio *bio)
4149 {
4150         int should_queue = 1;
4151         struct btrfs_pending_bios *pending_bios;
4152
4153         /* don't bother with additional async steps for reads, right now */
4154         if (!(rw & REQ_WRITE)) {
4155                 bio_get(bio);
4156                 btrfsic_submit_bio(rw, bio);
4157                 bio_put(bio);
4158                 return;
4159         }
4160
4161         /*
4162          * nr_async_bios allows us to reliably return congestion to the
4163          * higher layers.  Otherwise, the async bio makes it appear we have
4164          * made progress against dirty pages when we've really just put it
4165          * on a queue for later
4166          */
4167         atomic_inc(&root->fs_info->nr_async_bios);
4168         WARN_ON(bio->bi_next);
4169         bio->bi_next = NULL;
4170         bio->bi_rw |= rw;
4171
4172         spin_lock(&device->io_lock);
4173         if (bio->bi_rw & REQ_SYNC)
4174                 pending_bios = &device->pending_sync_bios;
4175         else
4176                 pending_bios = &device->pending_bios;
4177
4178         if (pending_bios->tail)
4179                 pending_bios->tail->bi_next = bio;
4180
4181         pending_bios->tail = bio;
4182         if (!pending_bios->head)
4183                 pending_bios->head = bio;
4184         if (device->running_pending)
4185                 should_queue = 0;
4186
4187         spin_unlock(&device->io_lock);
4188
4189         if (should_queue)
4190                 btrfs_queue_worker(&root->fs_info->submit_workers,
4191                                    &device->work);
4192 }
4193
4194 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4195                   int mirror_num, int async_submit)
4196 {
4197         struct btrfs_mapping_tree *map_tree;
4198         struct btrfs_device *dev;
4199         struct bio *first_bio = bio;
4200         u64 logical = (u64)bio->bi_sector << 9;
4201         u64 length = 0;
4202         u64 map_length;
4203         int ret;
4204         int dev_nr = 0;
4205         int total_devs = 1;
4206         struct btrfs_bio *bbio = NULL;
4207
4208         length = bio->bi_size;
4209         map_tree = &root->fs_info->mapping_tree;
4210         map_length = length;
4211
4212         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
4213                               mirror_num);
4214         if (ret) /* -ENOMEM */
4215                 return ret;
4216
4217         total_devs = bbio->num_stripes;
4218         if (map_length < length) {
4219                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
4220                        "len %llu\n", (unsigned long long)logical,
4221                        (unsigned long long)length,
4222                        (unsigned long long)map_length);
4223                 BUG();
4224         }
4225
4226         bbio->orig_bio = first_bio;
4227         bbio->private = first_bio->bi_private;
4228         bbio->end_io = first_bio->bi_end_io;
4229         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4230
4231         while (dev_nr < total_devs) {
4232                 if (dev_nr < total_devs - 1) {
4233                         bio = bio_clone(first_bio, GFP_NOFS);
4234                         BUG_ON(!bio); /* -ENOMEM */
4235                 } else {
4236                         bio = first_bio;
4237                 }
4238                 bio->bi_private = bbio;
4239                 bio->bi_private = merge_stripe_index_into_bio_private(
4240                                 bio->bi_private, (unsigned int)dev_nr);
4241                 bio->bi_end_io = btrfs_end_bio;
4242                 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4243                 dev = bbio->stripes[dev_nr].dev;
4244                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4245 #ifdef DEBUG
4246                         struct rcu_string *name;
4247
4248                         rcu_read_lock();
4249                         name = rcu_dereference(dev->name);
4250                         pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4251                                  "(%s id %llu), size=%u\n", rw,
4252                                  (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4253                                  name->str, dev->devid, bio->bi_size);
4254                         rcu_read_unlock();
4255 #endif
4256                         bio->bi_bdev = dev->bdev;
4257                         if (async_submit)
4258                                 schedule_bio(root, dev, rw, bio);
4259                         else
4260                                 btrfsic_submit_bio(rw, bio);
4261                 } else {
4262                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4263                         bio->bi_sector = logical >> 9;
4264                         bio_endio(bio, -EIO);
4265                 }
4266                 dev_nr++;
4267         }
4268         return 0;
4269 }
4270
4271 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
4272                                        u8 *uuid, u8 *fsid)
4273 {
4274         struct btrfs_device *device;
4275         struct btrfs_fs_devices *cur_devices;
4276
4277         cur_devices = root->fs_info->fs_devices;
4278         while (cur_devices) {
4279                 if (!fsid ||
4280                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4281                         device = __find_device(&cur_devices->devices,
4282                                                devid, uuid);
4283                         if (device)
4284                                 return device;
4285                 }
4286                 cur_devices = cur_devices->seed;
4287         }
4288         return NULL;
4289 }
4290
4291 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
4292                                             u64 devid, u8 *dev_uuid)
4293 {
4294         struct btrfs_device *device;
4295         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4296
4297         device = kzalloc(sizeof(*device), GFP_NOFS);
4298         if (!device)
4299                 return NULL;
4300         list_add(&device->dev_list,
4301                  &fs_devices->devices);
4302         device->dev_root = root->fs_info->dev_root;
4303         device->devid = devid;
4304         device->work.func = pending_bios_fn;
4305         device->fs_devices = fs_devices;
4306         device->missing = 1;
4307         fs_devices->num_devices++;
4308         fs_devices->missing_devices++;
4309         spin_lock_init(&device->io_lock);
4310         INIT_LIST_HEAD(&device->dev_alloc_list);
4311         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
4312         return device;
4313 }
4314
4315 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4316                           struct extent_buffer *leaf,
4317                           struct btrfs_chunk *chunk)
4318 {
4319         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4320         struct map_lookup *map;
4321         struct extent_map *em;
4322         u64 logical;
4323         u64 length;
4324         u64 devid;
4325         u8 uuid[BTRFS_UUID_SIZE];
4326         int num_stripes;
4327         int ret;
4328         int i;
4329
4330         logical = key->offset;
4331         length = btrfs_chunk_length(leaf, chunk);
4332
4333         read_lock(&map_tree->map_tree.lock);
4334         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
4335         read_unlock(&map_tree->map_tree.lock);
4336
4337         /* already mapped? */
4338         if (em && em->start <= logical && em->start + em->len > logical) {
4339                 free_extent_map(em);
4340                 return 0;
4341         } else if (em) {
4342                 free_extent_map(em);
4343         }
4344
4345         em = alloc_extent_map();
4346         if (!em)
4347                 return -ENOMEM;
4348         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
4349         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4350         if (!map) {
4351                 free_extent_map(em);
4352                 return -ENOMEM;
4353         }
4354
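             /*
              * em->bdev does not point at a block device here: the chunk
              * mapping code reuses the field to stash the map_lookup for
              * this extent map.
              */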
4355         em->bdev = (struct block_device *)map;
4356         em->start = logical;
4357         em->len = length;
4358         em->block_start = 0;
4359         em->block_len = em->len;
4360
4361         map->num_stripes = num_stripes;
4362         map->io_width = btrfs_chunk_io_width(leaf, chunk);
4363         map->io_align = btrfs_chunk_io_align(leaf, chunk);
4364         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
4365         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
4366         map->type = btrfs_chunk_type(leaf, chunk);
4367         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
4368         for (i = 0; i < num_stripes; i++) {
4369                 map->stripes[i].physical =
4370                         btrfs_stripe_offset_nr(leaf, chunk, i);
4371                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
4372                 read_extent_buffer(leaf, uuid, (unsigned long)
4373                                    btrfs_stripe_dev_uuid_nr(chunk, i),
4374                                    BTRFS_UUID_SIZE);
4375                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
4376                                                         NULL);
4377                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4378                         kfree(map);
4379                         free_extent_map(em);
4380                         return -EIO;
4381                 }
4382                 if (!map->stripes[i].dev) {
4383                         map->stripes[i].dev =
4384                                 add_missing_dev(root, devid, uuid);
4385                         if (!map->stripes[i].dev) {
4386                                 kfree(map);
4387                                 free_extent_map(em);
4388                                 return -EIO;
4389                         }
4390                 }
4391                 map->stripes[i].dev->in_fs_metadata = 1;
4392         }
4393
4394         write_lock(&map_tree->map_tree.lock);
4395         ret = add_extent_mapping(&map_tree->map_tree, em);
4396         write_unlock(&map_tree->map_tree.lock);
4397         BUG_ON(ret); /* Tree corruption */
4398         free_extent_map(em);
4399
4400         return 0;
4401 }
4402
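     /* Copy the persistent fields of an on-disk dev item into *device. */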
4403 static void fill_device_from_item(struct extent_buffer *leaf,
4404                                  struct btrfs_dev_item *dev_item,
4405                                  struct btrfs_device *device)
4406 {
4407         unsigned long ptr;
4408
4409         device->devid = btrfs_device_id(leaf, dev_item);
4410         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
4411         device->total_bytes = device->disk_total_bytes;
4412         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
4413         device->type = btrfs_device_type(leaf, dev_item);
4414         device->io_align = btrfs_device_io_align(leaf, dev_item);
4415         device->io_width = btrfs_device_io_width(leaf, dev_item);
4416         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
4417
4418         ptr = (unsigned long)btrfs_device_uuid(dev_item);
4419         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
4420 }
4421
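     /*
      * Ensure the seed filesystem with the given fsid is open: if it is not
      * already on the ->seed chain, look it up among the scanned filesystems,
      * clone its device list, open it read-only and splice it onto the chain.
      */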
4422 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
4423 {
4424         struct btrfs_fs_devices *fs_devices;
4425         int ret;
4426
4427         BUG_ON(!mutex_is_locked(&uuid_mutex));
4428
4429         fs_devices = root->fs_info->fs_devices->seed;
4430         while (fs_devices) {
4431                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4432                         ret = 0;
4433                         goto out;
4434                 }
4435                 fs_devices = fs_devices->seed;
4436         }
4437
4438         fs_devices = find_fsid(fsid);
4439         if (!fs_devices) {
4440                 ret = -ENOENT;
4441                 goto out;
4442         }
4443
4444         fs_devices = clone_fs_devices(fs_devices);
4445         if (IS_ERR(fs_devices)) {
4446                 ret = PTR_ERR(fs_devices);
4447                 goto out;
4448         }
4449
4450         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
4451                                    root->fs_info->bdev_holder);
4452         if (ret) {
4453                 free_fs_devices(fs_devices);
4454                 goto out;
4455         }
4456
4457         if (!fs_devices->seeding) {
4458                 __btrfs_close_devices(fs_devices);
4459                 free_fs_devices(fs_devices);
4460                 ret = -EINVAL;
4461                 goto out;
4462         }
4463
4464         fs_devices->seed = root->fs_info->fs_devices->seed;
4465         root->fs_info->fs_devices->seed = fs_devices;
4466 out:
4467         return ret;
4468 }
4469
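     /*
      * Match one on-disk dev item against the devices found during the scan,
      * opening seed filesystems and creating missing-device placeholders as
      * needed, then refresh the in-memory device from the item and account
      * its free space.
      */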
4470 static int read_one_dev(struct btrfs_root *root,
4471                         struct extent_buffer *leaf,
4472                         struct btrfs_dev_item *dev_item)
4473 {
4474         struct btrfs_device *device;
4475         u64 devid;
4476         int ret;
4477         u8 fs_uuid[BTRFS_UUID_SIZE];
4478         u8 dev_uuid[BTRFS_UUID_SIZE];
4479
4480         devid = btrfs_device_id(leaf, dev_item);
4481         read_extent_buffer(leaf, dev_uuid,
4482                            (unsigned long)btrfs_device_uuid(dev_item),
4483                            BTRFS_UUID_SIZE);
4484         read_extent_buffer(leaf, fs_uuid,
4485                            (unsigned long)btrfs_device_fsid(dev_item),
4486                            BTRFS_UUID_SIZE);
4487
4488         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
4489                 ret = open_seed_devices(root, fs_uuid);
4490                 if (ret && !btrfs_test_opt(root, DEGRADED))
4491                         return ret;
4492         }
4493
4494         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
4495         if (!device || !device->bdev) {
4496                 if (!btrfs_test_opt(root, DEGRADED))
4497                         return -EIO;
4498
4499                 if (!device) {
4500                         printk(KERN_WARNING "btrfs: devid %llu missing\n",
4501                                (unsigned long long)devid);
4502                         device = add_missing_dev(root, devid, dev_uuid);
4503                         if (!device)
4504                                 return -ENOMEM;
4505                 } else if (!device->missing) {
4506                         /*
4507                          * this happens when a device that was properly set up
4508                          * in the device info lists suddenly goes bad.
4509                          * device->bdev is NULL, and so we have to set
4510                          * device->missing to one here
4511                          */
4512                         root->fs_info->fs_devices->missing_devices++;
4513                         device->missing = 1;
4514                 }
4515         }
4516
4517         if (device->fs_devices != root->fs_info->fs_devices) {
4518                 BUG_ON(device->writeable);
4519                 if (device->generation !=
4520                     btrfs_device_generation(leaf, dev_item))
4521                         return -EINVAL;
4522         }
4523
4524         fill_device_from_item(leaf, dev_item, device);
4525         device->dev_root = root->fs_info->dev_root;
4526         device->in_fs_metadata = 1;
4527         if (device->writeable) {
4528                 device->fs_devices->total_rw_bytes += device->total_bytes;
4529                 spin_lock(&root->fs_info->free_chunk_lock);
4530                 root->fs_info->free_chunk_space += device->total_bytes -
4531                         device->bytes_used;
4532                 spin_unlock(&root->fs_info->free_chunk_lock);
4533         }
4534         return 0;
4536 }
4537
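     /*
      * Parse the sys_chunk_array embedded in the superblock: a packed list
      * of (disk key, chunk item) pairs describing the SYSTEM chunks, which
      * must be mapped before the chunk tree itself can be read.
      */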
4538 int btrfs_read_sys_array(struct btrfs_root *root)
4539 {
4540         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4541         struct extent_buffer *sb;
4542         struct btrfs_disk_key *disk_key;
4543         struct btrfs_chunk *chunk;
4544         u8 *ptr;
4545         unsigned long sb_ptr;
4546         int ret = 0;
4547         u32 num_stripes;
4548         u32 array_size;
4549         u32 len = 0;
4550         u32 cur;
4551         struct btrfs_key key;
4552
4553         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
4554                                           BTRFS_SUPER_INFO_SIZE);
4555         if (!sb)
4556                 return -ENOMEM;
4557         btrfs_set_buffer_uptodate(sb);
4558         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
4559         /*
4560          * The sb extent buffer is artificial and just used to read the system array.
4561          * The btrfs_set_buffer_uptodate() call does not properly mark all its
4562          * pages up-to-date when the page is larger: the extent does not cover the
4563          * whole page and consequently check_page_uptodate does not find all
4564          * the page's extents up-to-date (the hole beyond sb),
4565          * write_extent_buffer then triggers a WARN_ON.
4566          *
4567          * Regular short extents go through the mark_extent_buffer_dirty/writeback
4568          * cycle, but sb spans only this function. Add an explicit SetPageUptodate
4569          * call to silence the warning, e.g. on PowerPC 64.
4570          */
4571         if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
4572                 SetPageUptodate(sb->pages[0]);
4573
4574         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
4575         array_size = btrfs_super_sys_array_size(super_copy);
4576
4577         ptr = super_copy->sys_chunk_array;
4578         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
4579         cur = 0;
4580
4581         while (cur < array_size) {
4582                 disk_key = (struct btrfs_disk_key *)ptr;
4583                 btrfs_disk_key_to_cpu(&key, disk_key);
4584
4585                 len = sizeof(*disk_key);
                     ptr += len;
4586                 sb_ptr += len;
4587                 cur += len;
4588
4589                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
4590                         chunk = (struct btrfs_chunk *)sb_ptr;
4591                         ret = read_one_chunk(root, &key, sb, chunk);
4592                         if (ret)
4593                                 break;
4594                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
4595                         len = btrfs_chunk_item_size(num_stripes);
4596                 } else {
4597                         ret = -EIO;
4598                         break;
4599                 }
4600                 ptr += len;
4601                 sb_ptr += len;
4602                 cur += len;
4603         }
4604         free_extent_buffer(sb);
4605         return ret;
4606 }
4607
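     /*
      * Resolve (logical, mirror_num) to the device backing that mirror via
      * a WRITE-mode btrfs_map_block() lookup; mirror numbers are 1-based,
      * hence the stripes[mirror_num - 1] index.
      */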
4608 struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
4609                                                    u64 logical, int mirror_num)
4610 {
4611         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4612         int ret;
4613         u64 map_length = 0;
4614         struct btrfs_bio *bbio = NULL;
4615         struct btrfs_device *device;
4616
4617         BUG_ON(mirror_num == 0);
4618         ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
4619                               mirror_num);
4620         if (ret) {
4621                 BUG_ON(bbio != NULL);
4622                 return NULL;
4623         }
4624         BUG_ON(mirror_num != bbio->mirror_num);
4625         device = bbio->stripes[mirror_num - 1].dev;
4626         kfree(bbio);
4627         return device;
4628 }
4629
4630 int btrfs_read_chunk_tree(struct btrfs_root *root)
4631 {
4632         struct btrfs_path *path;
4633         struct extent_buffer *leaf;
4634         struct btrfs_key key;
4635         struct btrfs_key found_key;
4636         int ret;
4637         int slot;
4638
4639         root = root->fs_info->chunk_root;
4640
4641         path = btrfs_alloc_path();
4642         if (!path)
4643                 return -ENOMEM;
4644
4645         mutex_lock(&uuid_mutex);
4646         lock_chunks(root);
4647
4648         /* first we search for all of the device items, and then we
4649          * read in all of the chunk items.  This way we can create chunk
4650          * mappings that reference all of the devices that are around
4651          */
4652         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
4653         key.offset = 0;
4654         key.type = 0;
4655 again:
4656         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4657         if (ret < 0)
4658                 goto error;
4659         while (1) {
4660                 leaf = path->nodes[0];
4661                 slot = path->slots[0];
4662                 if (slot >= btrfs_header_nritems(leaf)) {
4663                         ret = btrfs_next_leaf(root, path);
4664                         if (ret == 0)
4665                                 continue;
4666                         if (ret < 0)
4667                                 goto error;
4668                         break;
4669                 }
4670                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4671                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4672                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
4673                                 break;
4674                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
4675                                 struct btrfs_dev_item *dev_item;
4676                                 dev_item = btrfs_item_ptr(leaf, slot,
4677                                                   struct btrfs_dev_item);
4678                                 ret = read_one_dev(root, leaf, dev_item);
4679                                 if (ret)
4680                                         goto error;
4681                         }
4682                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
4683                         struct btrfs_chunk *chunk;
4684                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4685                         ret = read_one_chunk(root, &found_key, leaf, chunk);
4686                         if (ret)
4687                                 goto error;
4688                 }
4689                 path->slots[0]++;
4690         }
4691         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4692                 key.objectid = 0;
4693                 btrfs_release_path(path);
4694                 goto again;
4695         }
4696         ret = 0;
4697 error:
4698         unlock_chunks(root);
4699         mutex_unlock(&uuid_mutex);
4700
4701         btrfs_free_path(path);
4702         return ret;
4703 }
4704
4705 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
4706 {
4707         int i;
4708
4709         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4710                 btrfs_dev_stat_reset(dev, i);
4711 }
4712
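     /*
      * Read the per-device error counters from their DEV_STATS items at
      * mount time. Devices without an item, or with a short item from an
      * older format, get the missing counters reset to zero.
      */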
4713 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
4714 {
4715         struct btrfs_key key;
4716         struct btrfs_key found_key;
4717         struct btrfs_root *dev_root = fs_info->dev_root;
4718         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4719         struct extent_buffer *eb;
4720         int slot;
4721         int ret = 0;
4722         struct btrfs_device *device;
4723         struct btrfs_path *path = NULL;
4724         int i;
4725
4726         path = btrfs_alloc_path();
4727         if (!path) {
4728                 ret = -ENOMEM;
4729                 goto out;
4730         }
4731
4732         mutex_lock(&fs_devices->device_list_mutex);
4733         list_for_each_entry(device, &fs_devices->devices, dev_list) {
4734                 int item_size;
4735                 struct btrfs_dev_stats_item *ptr;
4736
4737                 key.objectid = 0;
4738                 key.type = BTRFS_DEV_STATS_KEY;
4739                 key.offset = device->devid;
4740                 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
4741                 if (ret) {
4742                         __btrfs_reset_dev_stats(device);
4743                         device->dev_stats_valid = 1;
4744                         btrfs_release_path(path);
4745                         continue;
4746                 }
4747                 slot = path->slots[0];
4748                 eb = path->nodes[0];
4749                 btrfs_item_key_to_cpu(eb, &found_key, slot);
4750                 item_size = btrfs_item_size_nr(eb, slot);
4751
4752                 ptr = btrfs_item_ptr(eb, slot,
4753                                      struct btrfs_dev_stats_item);
4754
4755                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4756                         if (item_size >= (1 + i) * sizeof(__le64))
4757                                 btrfs_dev_stat_set(device, i,
4758                                         btrfs_dev_stats_value(eb, ptr, i));
4759                         else
4760                                 btrfs_dev_stat_reset(device, i);
4761                 }
4762
4763                 device->dev_stats_valid = 1;
4764                 btrfs_dev_stat_print_on_load(device);
4765                 btrfs_release_path(path);
4766         }
4767         mutex_unlock(&fs_devices->device_list_mutex);
4768
4769 out:
4770         btrfs_free_path(path);
4771         return ret < 0 ? ret : 0;
4772 }
4773
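     /*
      * Write one device's counters back to its DEV_STATS item. An existing
      * item smaller than the current format is deleted and re-inserted at
      * full size before the values are copied in.
      */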
4774 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
4775                                 struct btrfs_root *dev_root,
4776                                 struct btrfs_device *device)
4777 {
4778         struct btrfs_path *path;
4779         struct btrfs_key key;
4780         struct extent_buffer *eb;
4781         struct btrfs_dev_stats_item *ptr;
4782         int ret;
4783         int i;
4784
4785         key.objectid = 0;
4786         key.type = BTRFS_DEV_STATS_KEY;
4787         key.offset = device->devid;
4788
4789         path = btrfs_alloc_path();
4790         if (!path)
                     return -ENOMEM;
4791         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
4792         if (ret < 0) {
4793                 printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
4794                               ret, rcu_str_deref(device->name));
4795                 goto out;
4796         }
4797
4798         if (ret == 0 &&
4799             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
4800                 /* need to delete old one and insert a new one */
4801                 ret = btrfs_del_item(trans, dev_root, path);
4802                 if (ret != 0) {
4803                         printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
4804                                       rcu_str_deref(device->name), ret);
4805                         goto out;
4806                 }
4807                 ret = 1;
4808         }
4809
4810         if (ret == 1) {
4811                 /* need to insert a new item */
4812                 btrfs_release_path(path);
4813                 ret = btrfs_insert_empty_item(trans, dev_root, path,
4814                                               &key, sizeof(*ptr));
4815                 if (ret < 0) {
4816                         printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
4817                                       rcu_str_deref(device->name), ret);
4818                         goto out;
4819                 }
4820         }
4821
4822         eb = path->nodes[0];
4823         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
4824         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4825                 btrfs_set_dev_stats_value(eb, ptr, i,
4826                                           btrfs_dev_stat_read(device, i));
4827         btrfs_mark_buffer_dirty(eb);
4828
4829 out:
4830         btrfs_free_path(path);
4831         return ret;
4832 }
4833
4834 /*
4835  * called from commit_transaction. Writes all changed device stats to disk.
4836  */
4837 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
4838                         struct btrfs_fs_info *fs_info)
4839 {
4840         struct btrfs_root *dev_root = fs_info->dev_root;
4841         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4842         struct btrfs_device *device;
4843         int ret = 0;
4844
4845         mutex_lock(&fs_devices->device_list_mutex);
4846         list_for_each_entry(device, &fs_devices->devices, dev_list) {
4847                 if (!device->dev_stats_valid || !device->dev_stats_dirty)
4848                         continue;
4849
4850                 ret = update_dev_stat_item(trans, dev_root, device);
4851                 if (!ret)
4852                         device->dev_stats_dirty = 0;
4853         }
4854         mutex_unlock(&fs_devices->device_list_mutex);
4855
4856         return ret;
4857 }
4858
4859 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
4860 {
4861         btrfs_dev_stat_inc(dev, index);
4862         btrfs_dev_stat_print_on_error(dev);
4863 }
4864
4865 void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
4866 {
4867         if (!dev->dev_stats_valid)
4868                 return;
4869         printk_ratelimited_in_rcu(KERN_ERR
4870                            "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4871                            rcu_str_deref(dev->name),
4872                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4873                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4874                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4875                            btrfs_dev_stat_read(dev,
4876                                                BTRFS_DEV_STAT_CORRUPTION_ERRS),
4877                            btrfs_dev_stat_read(dev,
4878                                                BTRFS_DEV_STAT_GENERATION_ERRS));
4879 }
4880
4881 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4882 {
4883         int i;
4884
4885         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4886                 if (btrfs_dev_stat_read(dev, i) != 0)
4887                         break;
4888         if (i == BTRFS_DEV_STAT_VALUES_MAX)
4889                 return; /* all values == 0, suppress message */
4890
4891         printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4892                rcu_str_deref(dev->name),
4893                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4894                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4895                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4896                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
4897                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
4898 }
4899
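     /*
      * Back end of the dev-stats ioctl: copy the counters for stats->devid
      * into stats->values, resetting them on the way out when the caller set
      * BTRFS_DEV_STATS_RESET in stats->flags.
      *
      * Illustrative userspace call (a sketch only; it assumes the
      * BTRFS_IOC_GET_DEV_STATS ioctl and struct layout from the btrfs
      * userspace headers of this era):
      *
      *     struct btrfs_ioctl_get_dev_stats s = {
      *             .devid = 1,
      *             .nr_items = BTRFS_DEV_STAT_VALUES_MAX,
      *     };
      *     ioctl(fs_fd, BTRFS_IOC_GET_DEV_STATS, &s);
      */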
4900 int btrfs_get_dev_stats(struct btrfs_root *root,
4901                         struct btrfs_ioctl_get_dev_stats *stats)
4902 {
4903         struct btrfs_device *dev;
4904         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4905         int i;
4906
4907         mutex_lock(&fs_devices->device_list_mutex);
4908         dev = btrfs_find_device(root, stats->devid, NULL, NULL);
4909         mutex_unlock(&fs_devices->device_list_mutex);
4910
4911         if (!dev) {
4912                 printk(KERN_WARNING
4913                        "btrfs: get dev_stats failed, device not found\n");
4914                 return -ENODEV;
4915         } else if (!dev->dev_stats_valid) {
4916                 printk(KERN_WARNING
4917                        "btrfs: get dev_stats failed, not yet valid\n");
4918                 return -ENODEV;
4919         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
4920                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4921                         if (stats->nr_items > i)
4922                                 stats->values[i] =
4923                                         btrfs_dev_stat_read_and_reset(dev, i);
4924                         else
4925                                 btrfs_dev_stat_reset(dev, i);
4926                 }
4927         } else {
4928                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4929                         if (stats->nr_items > i)
4930                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
4931         }
4932         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
4933                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4934         return 0;
4935 }