Btrfs: keep processing bios for a given bdev if our proc is batching

[linux-beck.git] fs/btrfs/volumes.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

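/*
 * in-memory description of how a logical chunk is laid out across the
 * physical devices: one btrfs_bio_stripe per copy/stripe of the chunk
 */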
struct map_lookup {
        u64 type;
        int io_align;
        int io_width;
        int stripe_len;
        int sector_size;
        int num_stripes;
        int sub_stripes;
        struct btrfs_bio_stripe stripes[];
};

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

#define map_lookup_size(n) (sizeof(struct map_lookup) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

void btrfs_lock_volumes(void)
{
        mutex_lock(&uuid_mutex);
}

void btrfs_unlock_volumes(void)
{
        mutex_unlock(&uuid_mutex);
}

static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                kfree(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
        return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;

        bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        pending = device->pending_bios;
        tail = device->pending_bio_tail;
        WARN_ON(pending && !tail);
        device->pending_bios = NULL;
        device->pending_bio_tail = NULL;

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (pending) {
                again = 1;
                device->running_pending = 1;
        } else {
                again = 0;
                device->running_pending = 0;
        }
        spin_unlock(&device->io_lock);

        while (pending) {
                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
                atomic_dec(&fs_info->nr_async_bios);

                if (atomic_read(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);
                bio_get(cur);
                submit_bio(cur->bi_rw, cur);
                bio_put(cur);
                num_run++;

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && num_run > 16 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct bio *old_head;
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                continue;
                        }
                        spin_lock(&device->io_lock);

                        old_head = device->pending_bios;
                        device->pending_bios = pending;
                        if (device->pending_bio_tail)
                                tail->bi_next = old_head;
                        else
                                device->pending_bio_tail = tail;

                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_requeue_work(&device->work);
                        goto done;
                }
        }
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios)
                goto loop_lock;
        spin_unlock(&device->io_lock);
done:
        return 0;
}

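/*
 * worker thread entry point: recover the device from the work struct
 * and flush its queued bios
 */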
static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

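/*
 * record a scanned device in the global fs_uuids list.  A new
 * btrfs_fs_devices entry is created the first time a given fsid is
 * seen; later devices with the same fsid are attached to it.
 */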
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        u64 found_transid = btrfs_super_generation(disk_super);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
                if (!fs_devices)
                        return -ENOMEM;
                INIT_LIST_HEAD(&fs_devices->devices);
                INIT_LIST_HEAD(&fs_devices->alloc_list);
                list_add(&fs_devices->list, &fs_uuids);
                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }
        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device) {
                        /* we can safely leave the fs_devices entry around */
                        return -ENOMEM;
                }
                device->devid = devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
                device->barriers = 1;
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        return -ENOMEM;
                }
                INIT_LIST_HEAD(&device->dev_alloc_list);
                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }

        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
        }
        *fs_devices_ret = fs_devices;
        return 0;
}

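/*
 * duplicate an fs_devices list, copying the names, devids and uuids of
 * every device but none of the open bdev state; used when sprouting a
 * new filesystem from a seed
 */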
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&fs_devices->devices);
        INIT_LIST_HEAD(&fs_devices->alloc_list);
        INIT_LIST_HEAD(&fs_devices->list);
        fs_devices->latest_devid = orig->latest_devid;
        fs_devices->latest_trans = orig->latest_trans;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
                        goto error;

                device->name = kstrdup(orig_dev->name, GFP_NOFS);
                if (!device->name) {
                        /* not on the list yet, so free it here */
                        kfree(device);
                        goto error;
                }

                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
                device->barriers = 1;
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        return fs_devices;
error:
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

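/*
 * drop (and close) any devices that were scanned but are not referenced
 * by the filesystem metadata, walking the seed filesystems too
 */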
int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *next;

        mutex_lock(&uuid_mutex);
again:
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata)
                        continue;

                if (device->bdev) {
                        close_bdev_exclusive(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                kfree(device->name);
                kfree(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        mutex_unlock(&uuid_mutex);
        return 0;
}

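/*
 * drop one opener reference; once the last one is gone, close every
 * block device and clear the cached open state
 */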
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->bdev) {
                        close_bdev_exclusive(device->bdev, device->mode);
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                device->bdev = NULL;
                device->writeable = 0;
                device->in_fs_metadata = 0;
        }
        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

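/*
 * open every device on the list, validate each super block against the
 * devid/uuid we scanned earlier and remember which device carries the
 * highest generation; devices that fail to open are simply skipped
 */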
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct block_device *latest_bdev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 latest_devid = 0;
        u64 latest_transid = 0;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                bdev = open_bdev_exclusive(device->name, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name);
                        goto error;
                }
                set_blocksize(bdev, 4096);

                bh = btrfs_read_dev_super(bdev);
                if (!bh)
                        goto error_close;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = le64_to_cpu(disk_super->dev_item.devid);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                continue;

error_brelse:
                brelse(bh);
error_close:
                close_bdev_exclusive(bdev, FMODE_READ);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EIO;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

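/*
 * read the super block from a single device and, if it is a valid
 * btrfs super, register the device with device_list_add
 */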
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;

        mutex_lock(&uuid_mutex);

        bdev = open_bdev_exclusive(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EIO;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = le64_to_cpu(disk_super->dev_item.devid);
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else {
                /* FIXME, make a real uuid parser */
                printk(KERN_INFO "device fsid %llx-%llx ",
                       *(unsigned long long *)disk_super->fsid,
                       *(unsigned long long *)(disk_super->fsid + 8));
        }
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);

        brelse(bh);
error_close:
        close_bdev_exclusive(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}

/*
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 */
static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
                                         struct btrfs_device *device,
                                         u64 num_bytes, u64 *start)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent = NULL;
        struct btrfs_path *path;
        u64 hole_size = 0;
        u64 last_byte = 0;
        u64 search_start = 0;
        u64 search_end = device->total_bytes;
        int ret;
        int slot = 0;
        int start_found;
        struct extent_buffer *l;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;
        start_found = 0;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max((u64)1024 * 1024, search_start);

        if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
                search_start = max(root->fs_info->alloc_start, search_start);

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;
        ret = btrfs_previous_item(root, path, 0, key.type);
        if (ret < 0)
                goto error;
        l = path->nodes[0];
        btrfs_item_key_to_cpu(l, &key, path->slots[0]);
        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto error;
no_more_items:
                        if (!start_found) {
                                if (search_start >= search_end) {
                                        ret = -ENOSPC;
                                        goto error;
                                }
                                *start = search_start;
                                start_found = 1;
                                goto check_pending;
                        }
                        *start = last_byte > search_start ?
                                last_byte : search_start;
                        if (search_end <= *start) {
                                ret = -ENOSPC;
                                goto error;
                        }
                        goto check_pending;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        goto no_more_items;

                if (key.offset >= search_start && key.offset > last_byte &&
                    start_found) {
                        if (last_byte < search_start)
                                last_byte = search_start;
                        hole_size = key.offset - last_byte;
                        if (key.offset > last_byte &&
                            hole_size >= num_bytes) {
                                *start = last_byte;
                                goto check_pending;
                        }
                }
                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                start_found = 1;
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
next:
                path->slots[0]++;
                cond_resched();
        }
check_pending:
        /* we have to make sure we didn't find an extent that has already
         * been allocated by the map tree or the original allocation
         */
        BUG_ON(*start < search_start);

        if (*start + num_bytes > search_end) {
                ret = -ENOSPC;
                goto error;
        }
        /* check for pending inserts here */
        ret = 0;

error:
        btrfs_free_path(path);
        return ret;
}

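/*
 * remove the dev extent item that covers @start and credit its length
 * back against the device's bytes_used
 */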
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                BUG_ON(ret);
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                ret = 0;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        }
        BUG_ON(ret);

        if (device->bytes_used > 0)
                device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
        ret = btrfs_del_item(trans, root, path);
        BUG_ON(ret);

        btrfs_free_path(path);
        return ret;
}

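/*
 * insert a dev extent item recording that the byte range
 * [start, start + num_bytes) on this device backs the given chunk
 */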
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        BUG_ON(ret);

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
                    BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
        return ret;
}

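/*
 * find the logical offset just past the last chunk owned by @objectid,
 * i.e. the first free offset for a new chunk
 */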
static noinline int find_next_chunk(struct btrfs_root *root,
                                    u64 objectid, u64 *offset)
{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_key found_key;

        path = btrfs_alloc_path();
        BUG_ON(!path);

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
        if (ret) {
                *offset = 0;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                if (found_key.objectid != objectid)
                        *offset = 0;
                else {
                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                               struct btrfs_chunk);
                        *offset = found_key.offset +
                                btrfs_chunk_length(path->nodes[0], chunk);
                }
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

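/*
 * allocate the next unused devid by looking at the highest existing
 * dev item in the chunk tree
 */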
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0);

        ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *objectid = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *objectid = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * the device information is stored in the chunk root;
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = (unsigned long)btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

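/*
 * delete the dev item for @device from the chunk tree, in its own
 * short transaction
 */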
static int btrfs_rm_dev_item(struct btrfs_root *root,
                             struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 1);
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
        lock_chunks(root);

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret)
                goto out;
out:
        btrfs_free_path(path);
        unlock_chunks(root);
        btrfs_commit_transaction(trans, root);
        return ret;
}

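/*
 * remove a device from a mounted filesystem: refuse if redundancy would
 * drop below what the raid profiles require, migrate all of its extents
 * away with btrfs_shrink_device, then drop its metadata and wipe the
 * btrfs magic from its super block
 */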
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
        struct btrfs_device *device;
        struct btrfs_device *next_device;
        struct block_device *bdev;
        struct buffer_head *bh = NULL;
        struct btrfs_super_block *disk_super;
        u64 all_avail;
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
        int ret = 0;

        mutex_lock(&uuid_mutex);
        mutex_lock(&root->fs_info->volume_mutex);

        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
            root->fs_info->fs_devices->rw_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
            root->fs_info->fs_devices->rw_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
                goto out;
        }

        if (strcmp(device_path, "missing") == 0) {
                struct list_head *devices;
                struct btrfs_device *tmp;

                device = NULL;
                devices = &root->fs_info->fs_devices->devices;
                list_for_each_entry(tmp, devices, dev_list) {
                        if (tmp->in_fs_metadata && !tmp->bdev) {
                                device = tmp;
                                break;
                        }
                }
                bdev = NULL;
                bh = NULL;
                disk_super = NULL;
                if (!device) {
                        printk(KERN_ERR "btrfs: no missing devices found to "
                               "remove\n");
                        ret = -ENOENT;
                        goto out;
                }
        } else {
                bdev = open_bdev_exclusive(device_path, FMODE_READ,
                                      root->fs_info->bdev_holder);
                if (IS_ERR(bdev)) {
                        ret = PTR_ERR(bdev);
                        goto out;
                }

                set_blocksize(bdev, 4096);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
                        ret = -EIO;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = le64_to_cpu(disk_super->dev_item.devid);
                dev_uuid = disk_super->dev_item.uuid;
                device = btrfs_find_device(root, devid, dev_uuid,
                                           disk_super->fsid);
                if (!device) {
                        ret = -ENOENT;
                        goto error_brelse;
                }
        }

        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                printk(KERN_ERR "btrfs: unable to remove the only writeable "
                       "device\n");
                ret = -EINVAL;
                goto error_brelse;
        }

        if (device->writeable) {
                list_del_init(&device->dev_alloc_list);
                root->fs_info->fs_devices->rw_devices--;
        }

        ret = btrfs_shrink_device(device, 0);
        if (ret)
                goto error_brelse;

        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
                goto error_brelse;

        device->in_fs_metadata = 0;
        list_del_init(&device->dev_list);
        device->fs_devices->num_devices--;

        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
                root->fs_info->sb->s_bdev = next_device->bdev;
        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;

        if (device->bdev) {
                close_bdev_exclusive(device->bdev, device->mode);
                device->bdev = NULL;
                device->fs_devices->open_devices--;
        }

        num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);

        if (device->fs_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
                fs_devices = root->fs_info->fs_devices;
                while (fs_devices) {
                        if (fs_devices->seed == device->fs_devices)
                                break;
                        fs_devices = fs_devices->seed;
                }
                fs_devices->seed = device->fs_devices->seed;
                device->fs_devices->seed = NULL;
                __btrfs_close_devices(device->fs_devices);
                free_fs_devices(device->fs_devices);
        }

        /*
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
        if (device->writeable) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
                set_buffer_dirty(bh);
                sync_dirty_buffer(bh);
        }

        kfree(device->name);
        kfree(device);
        ret = 0;

error_brelse:
        brelse(bh);
error_close:
        if (bdev)
                close_bdev_exclusive(bdev, FMODE_READ);
out:
        mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
}

/*
 * does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root)
{
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
        struct btrfs_fs_devices *seed_devices;
        struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;

        BUG_ON(!mutex_is_locked(&uuid_mutex));
        if (!fs_devices->seeding)
                return -EINVAL;

        seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!seed_devices)
                return -ENOMEM;

        old_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(old_devices)) {
                kfree(seed_devices);
                return PTR_ERR(old_devices);
        }

        list_add(&old_devices->list, &fs_uuids);

        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
        seed_devices->opened = 1;
        INIT_LIST_HEAD(&seed_devices->devices);
        INIT_LIST_HEAD(&seed_devices->alloc_list);
        list_splice_init(&fs_devices->devices, &seed_devices->devices);
        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
        list_for_each_entry(device, &seed_devices->devices, dev_list) {
                device->fs_devices = seed_devices;
        }

        fs_devices->seeding = 0;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
        fs_devices->seed = seed_devices;

        generate_random_uuid(fs_devices->fsid);
        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
        super_flags = btrfs_super_flags(disk_super) &
                      ~BTRFS_SUPER_FLAG_SEEDING;
        btrfs_set_super_flags(disk_super, super_flags);

        return 0;
}

/*
 * store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_dev_item *dev_item;
        struct btrfs_device *device;
        struct btrfs_key key;
        u8 fs_uuid[BTRFS_UUID_SIZE];
        u8 dev_uuid[BTRFS_UUID_SIZE];
        u64 devid;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        root = root->fs_info->chunk_root;
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = BTRFS_DEV_ITEM_KEY;

        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
                if (ret < 0)
                        goto error;

                leaf = path->nodes[0];
next_slot:
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret > 0)
                                break;
                        if (ret < 0)
                                goto error;
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                        btrfs_release_path(root, path);
                        continue;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
                    key.type != BTRFS_DEV_ITEM_KEY)
                        break;

                dev_item = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_dev_item);
                devid = btrfs_device_id(leaf, dev_item);
                read_extent_buffer(leaf, dev_uuid,
                                   (unsigned long)btrfs_device_uuid(dev_item),
                                   BTRFS_UUID_SIZE);
                read_extent_buffer(leaf, fs_uuid,
                                   (unsigned long)btrfs_device_fsid(dev_item),
                                   BTRFS_UUID_SIZE);
                device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
                BUG_ON(!device);

                if (device->fs_devices->seeding) {
                        btrfs_set_device_generation(leaf, dev_item,
                                                    device->generation);
                        btrfs_mark_buffer_dirty(leaf);
                }

                path->slots[0]++;
                goto next_slot;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

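/*
 * add a new device to a mounted filesystem, sprouting a fresh fsid
 * first if we are currently mounted on a read-only seed
 */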
int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device;
        struct block_device *bdev;
        struct list_head *devices;
        struct super_block *sb = root->fs_info->sb;
        u64 total_bytes;
        int seeding_dev = 0;
        int ret = 0;

        if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
                return -EINVAL;

        bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
        /* open_bdev_exclusive() returns an ERR_PTR on failure, not NULL */
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        if (root->fs_info->fs_devices->seeding) {
                seeding_dev = 1;
                down_write(&sb->s_umount);
                mutex_lock(&uuid_mutex);
        }

        filemap_write_and_wait(bdev->bd_inode->i_mapping);
        mutex_lock(&root->fs_info->volume_mutex);

        devices = &root->fs_info->fs_devices->devices;
        list_for_each_entry(device, devices, dev_list) {
                if (device->bdev == bdev) {
                        ret = -EEXIST;
                        goto error;
                }
        }

        device = kzalloc(sizeof(*device), GFP_NOFS);
        if (!device) {
                /* we can safely leave the fs_devices entry around */
                ret = -ENOMEM;
                goto error;
        }

        device->name = kstrdup(device_path, GFP_NOFS);
        if (!device->name) {
                kfree(device);
                ret = -ENOMEM;
                goto error;
        }

        ret = find_next_devid(root, &device->devid);
        if (ret) {
                kfree(device->name);
                kfree(device);
                goto error;
        }

        trans = btrfs_start_transaction(root, 1);
        lock_chunks(root);

        device->barriers = 1;
        device->writeable = 1;
        device->work.func = pending_bios_fn;
        generate_random_uuid(device->uuid);
        spin_lock_init(&device->io_lock);
        device->generation = trans->transid;
        device->io_width = root->sectorsize;
        device->io_align = root->sectorsize;
        device->sector_size = root->sectorsize;
        device->total_bytes = i_size_read(bdev->bd_inode);
        device->dev_root = root->fs_info->dev_root;
        device->bdev = bdev;
        device->in_fs_metadata = 1;
        device->mode = 0;
        set_blocksize(device->bdev, 4096);

        if (seeding_dev) {
                sb->s_flags &= ~MS_RDONLY;
                ret = btrfs_prepare_sprout(trans, root);
                BUG_ON(ret);
        }

        device->fs_devices = root->fs_info->fs_devices;
        list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
        list_add(&device->dev_alloc_list,
                 &root->fs_info->fs_devices->alloc_list);
        root->fs_info->fs_devices->num_devices++;
        root->fs_info->fs_devices->open_devices++;
        root->fs_info->fs_devices->rw_devices++;
        root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;

        total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
        btrfs_set_super_total_bytes(&root->fs_info->super_copy,
                                    total_bytes + device->total_bytes);

        total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
        btrfs_set_super_num_devices(&root->fs_info->super_copy,
                                    total_bytes + 1);

        if (seeding_dev) {
                ret = init_first_rw_device(trans, root, device);
                BUG_ON(ret);
                ret = btrfs_finish_sprout(trans, root);
                BUG_ON(ret);
        } else {
                ret = btrfs_add_device(trans, root, device);
        }

        /*
         * we've got more storage, clear any full flags on the space
         * infos
         */
        btrfs_clear_space_info_full(root->fs_info);

        unlock_chunks(root);
        btrfs_commit_transaction(trans, root);

        if (seeding_dev) {
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);

                ret = btrfs_relocate_sys_chunks(root);
                BUG_ON(ret);
        }
out:
        mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
error:
        close_bdev_exclusive(bdev, 0);
        if (seeding_dev) {
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
        }
        goto out;
}

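/*
 * write the current in-memory device sizes and geometry back into the
 * dev item in the chunk tree
 */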
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
                                        struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        root = device->dev_root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_mark_buffer_dirty(leaf);

out:
        btrfs_free_path(path);
        return ret;
}

1478 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1479                       struct btrfs_device *device, u64 new_size)
1480 {
1481         struct btrfs_super_block *super_copy =
1482                 &device->dev_root->fs_info->super_copy;
1483         u64 old_total = btrfs_super_total_bytes(super_copy);
1484         u64 diff = new_size - device->total_bytes;
1485
1486         if (!device->writeable)
1487                 return -EACCES;
1488         if (new_size <= device->total_bytes)
1489                 return -EINVAL;
1490
1491         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1492         device->fs_devices->total_rw_bytes += diff;
1493
1494         device->total_bytes = new_size;
1495         btrfs_clear_space_info_full(device->dev_root->fs_info);
1496
1497         return btrfs_update_device(trans, device);
1498 }
1499
1500 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1501                       struct btrfs_device *device, u64 new_size)
1502 {
1503         int ret;
1504         lock_chunks(device->dev_root);
1505         ret = __btrfs_grow_device(trans, device, new_size);
1506         unlock_chunks(device->dev_root);
1507         return ret;
1508 }
1509
1510 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1511                             struct btrfs_root *root,
1512                             u64 chunk_tree, u64 chunk_objectid,
1513                             u64 chunk_offset)
1514 {
1515         int ret;
1516         struct btrfs_path *path;
1517         struct btrfs_key key;
1518
1519         root = root->fs_info->chunk_root;
1520         path = btrfs_alloc_path();
1521         if (!path)
1522                 return -ENOMEM;
1523
1524         key.objectid = chunk_objectid;
1525         key.offset = chunk_offset;
1526         key.type = BTRFS_CHUNK_ITEM_KEY;
1527
1528         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1529         BUG_ON(ret);
1530
1531         ret = btrfs_del_item(trans, root, path);
1532         BUG_ON(ret);
1533
1534         btrfs_free_path(path);
1535         return 0;
1536 }
1537
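/*
 * The super block's sys_chunk_array is a packed list of
 * (struct btrfs_disk_key, struct btrfs_chunk) pairs; deleting an entry
 * means memmove()ing the tail down over it and shrinking the recorded
 * array size.
 */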
1538 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1539                                u64 chunk_offset)
1540 {
1541         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1542         struct btrfs_disk_key *disk_key;
1543         struct btrfs_chunk *chunk;
1544         u8 *ptr;
1545         int ret = 0;
1546         u32 num_stripes;
1547         u32 array_size;
1548         u32 len = 0;
1549         u32 cur;
1550         struct btrfs_key key;
1551
1552         array_size = btrfs_super_sys_array_size(super_copy);
1553
1554         ptr = super_copy->sys_chunk_array;
1555         cur = 0;
1556
1557         while (cur < array_size) {
1558                 disk_key = (struct btrfs_disk_key *)ptr;
1559                 btrfs_disk_key_to_cpu(&key, disk_key);
1560
1561                 len = sizeof(*disk_key);
1562
1563                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1564                         chunk = (struct btrfs_chunk *)(ptr + len);
1565                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1566                         len += btrfs_chunk_item_size(num_stripes);
1567                 } else {
1568                         ret = -EIO;
1569                         break;
1570                 }
1571                 if (key.objectid == chunk_objectid &&
1572                     key.offset == chunk_offset) {
1573                         memmove(ptr, ptr + len, array_size - (cur + len));
1574                         array_size -= len;
1575                         btrfs_set_super_sys_array_size(super_copy, array_size);
1576                 } else {
1577                         ptr += len;
1578                         cur += len;
1579                 }
1580         }
1581         return ret;
1582 }
1583
1584 static int btrfs_relocate_chunk(struct btrfs_root *root,
1585                          u64 chunk_tree, u64 chunk_objectid,
1586                          u64 chunk_offset)
1587 {
1588         struct extent_map_tree *em_tree;
1589         struct btrfs_root *extent_root;
1590         struct btrfs_trans_handle *trans;
1591         struct extent_map *em;
1592         struct map_lookup *map;
1593         int ret;
1594         int i;
1595
1596         printk(KERN_INFO "btrfs relocating chunk %llu\n",
1597                (unsigned long long)chunk_offset);
1598         root = root->fs_info->chunk_root;
1599         extent_root = root->fs_info->extent_root;
1600         em_tree = &root->fs_info->mapping_tree.map_tree;
1601
1602         /* step one, relocate all the extents inside this chunk */
1603         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1604         BUG_ON(ret);
1605
1606         trans = btrfs_start_transaction(root, 1);
1607         BUG_ON(!trans);
1608
1609         lock_chunks(root);
1610
1611         /*
1612          * step two, delete the device extents and the
1613          * chunk tree entries
1614          */
1615         spin_lock(&em_tree->lock);
1616         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1617         spin_unlock(&em_tree->lock);
1618
1619         BUG_ON(em->start > chunk_offset ||
1620                em->start + em->len < chunk_offset);
1621         map = (struct map_lookup *)em->bdev;
1622
1623         for (i = 0; i < map->num_stripes; i++) {
1624                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1625                                             map->stripes[i].physical);
1626                 BUG_ON(ret);
1627
1628                 if (map->stripes[i].dev) {
1629                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1630                         BUG_ON(ret);
1631                 }
1632         }
1633         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1634                                chunk_offset);
1635
1636         BUG_ON(ret);
1637
1638         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1639                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1640                 BUG_ON(ret);
1641         }
1642
1643         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1644         BUG_ON(ret);
1645
1646         spin_lock(&em_tree->lock);
1647         remove_extent_mapping(em_tree, em);
1648         spin_unlock(&em_tree->lock);
1649
1650         kfree(map);
1651         em->bdev = NULL;
1652
1653         /* once for the tree */
1654         free_extent_map(em);
1655         /* once for us */
1656         free_extent_map(em);
1657
1658         unlock_chunks(root);
1659         btrfs_end_transaction(trans, root);
1660         return 0;
1661 }
1662
1663 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1664 {
1665         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1666         struct btrfs_path *path;
1667         struct extent_buffer *leaf;
1668         struct btrfs_chunk *chunk;
1669         struct btrfs_key key;
1670         struct btrfs_key found_key;
1671         u64 chunk_tree = chunk_root->root_key.objectid;
1672         u64 chunk_type;
1673         int ret;
1674
1675         path = btrfs_alloc_path();
1676         if (!path)
1677                 return -ENOMEM;
1678
1679         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1680         key.offset = (u64)-1;
1681         key.type = BTRFS_CHUNK_ITEM_KEY;
1682
1683         while (1) {
1684                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1685                 if (ret < 0)
1686                         goto error;
1687                 BUG_ON(ret == 0);
1688
1689                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1690                                           key.type);
1691                 if (ret < 0)
1692                         goto error;
1693                 if (ret > 0)
1694                         break;
1695
1696                 leaf = path->nodes[0];
1697                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1698
1699                 chunk = btrfs_item_ptr(leaf, path->slots[0],
1700                                        struct btrfs_chunk);
1701                 chunk_type = btrfs_chunk_type(leaf, chunk);
1702                 btrfs_release_path(chunk_root, path);
1703
1704                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1705                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1706                                                    found_key.objectid,
1707                                                    found_key.offset);
1708                         BUG_ON(ret);
1709                 }
1710
1711                 if (found_key.offset == 0)
1712                         break;
1713                 key.offset = found_key.offset - 1;
1714         }
1715         ret = 0;
1716 error:
1717         btrfs_free_path(path);
1718         return ret;
1719 }
1720
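/*
 * scale num by tenths: e.g. div_factor(1000, 3) == 300, and a factor
 * of 10 returns num unchanged.
 */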
1721 static u64 div_factor(u64 num, int factor)
1722 {
1723         if (factor == 10)
1724                 return num;
1725         num *= factor;
1726         do_div(num, 10);
1727         return num;
1728 }
1729
1730 int btrfs_balance(struct btrfs_root *dev_root)
1731 {
1732         int ret;
1733         struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1734         struct btrfs_device *device;
1735         u64 old_size;
1736         u64 size_to_free;
1737         struct btrfs_path *path;
1738         struct btrfs_key key;
1739         struct btrfs_chunk *chunk;
1740         struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1741         struct btrfs_trans_handle *trans;
1742         struct btrfs_key found_key;
1743
1744         if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1745                 return -EROFS;
1746
1747         mutex_lock(&dev_root->fs_info->volume_mutex);
1748         dev_root = dev_root->fs_info->dev_root;
1749
1750         /* step one, make some room on all the devices */
1751         list_for_each_entry(device, devices, dev_list) {
1752                 old_size = device->total_bytes;
1753                 size_to_free = div_factor(old_size, 1);
1754                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
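                /*
                 * e.g. for a 100GB device this is min(10GB, 1MB) == 1MB;
                 * only nearly-full devices (at most that much free) are
                 * shrunk and then regrown below.
                 */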
1755                 if (!device->writeable ||
1756                     device->total_bytes - device->bytes_used > size_to_free)
1757                         continue;
1758
1759                 ret = btrfs_shrink_device(device, old_size - size_to_free);
1760                 BUG_ON(ret);
1761
1762                 trans = btrfs_start_transaction(dev_root, 1);
1763                 BUG_ON(!trans);
1764
1765                 ret = btrfs_grow_device(trans, device, old_size);
1766                 BUG_ON(ret);
1767
1768                 btrfs_end_transaction(trans, dev_root);
1769         }
1770
1771         /* step two, relocate all the chunks */
1772         path = btrfs_alloc_path();
1773         BUG_ON(!path);
1774
1775         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1776         key.offset = (u64)-1;
1777         key.type = BTRFS_CHUNK_ITEM_KEY;
1778
1779         while (1) {
1780                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1781                 if (ret < 0)
1782                         goto error;
1783
1784                 /*
1785                  * this shouldn't happen, it means the last relocate
1786                  * failed
1787                  */
1788                 if (ret == 0)
1789                         break;
1790
1791                 ret = btrfs_previous_item(chunk_root, path, 0,
1792                                           BTRFS_CHUNK_ITEM_KEY);
1793                 if (ret)
1794                         break;
1795
1796                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1797                                       path->slots[0]);
1798                 if (found_key.objectid != key.objectid)
1799                         break;
1800
1801                 chunk = btrfs_item_ptr(path->nodes[0],
1802                                        path->slots[0],
1803                                        struct btrfs_chunk);
1804                 key.offset = found_key.offset;
1805                 /* chunk zero is special */
1806                 if (key.offset == 0)
1807                         break;
1808
1809                 btrfs_release_path(chunk_root, path);
1810                 ret = btrfs_relocate_chunk(chunk_root,
1811                                            chunk_root->root_key.objectid,
1812                                            found_key.objectid,
1813                                            found_key.offset);
1814                 BUG_ON(ret);
1815         }
1816         ret = 0;
1817 error:
1818         btrfs_free_path(path);
1819         mutex_unlock(&dev_root->fs_info->volume_mutex);
1820         return ret;
1821 }
1822
1823 /*
1824  * shrinking a device means finding all of the device extents past
1825  * the new size, and then following the back refs to the chunks.
1826  * The chunk relocation code actually frees the device extents.
1827  */
1828 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1829 {
1830         struct btrfs_trans_handle *trans;
1831         struct btrfs_root *root = device->dev_root;
1832         struct btrfs_dev_extent *dev_extent = NULL;
1833         struct btrfs_path *path;
1834         u64 length;
1835         u64 chunk_tree;
1836         u64 chunk_objectid;
1837         u64 chunk_offset;
1838         int ret;
1839         int slot;
1840         struct extent_buffer *l;
1841         struct btrfs_key key;
1842         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1843         u64 old_total = btrfs_super_total_bytes(super_copy);
1844         u64 diff = device->total_bytes - new_size;
1845
1846         if (new_size >= device->total_bytes)
1847                 return -EINVAL;
1848
1849         path = btrfs_alloc_path();
1850         if (!path)
1851                 return -ENOMEM;
1852
1853         trans = btrfs_start_transaction(root, 1);
1854         if (!trans) {
1855                 ret = -ENOMEM;
1856                 goto done;
1857         }
1858
1859         path->reada = 2;
1860
1861         lock_chunks(root);
1862
1863         device->total_bytes = new_size;
1864         if (device->writeable)
1865                 device->fs_devices->total_rw_bytes -= diff;
1866         ret = btrfs_update_device(trans, device);
1867         if (ret) {
1868                 unlock_chunks(root);
1869                 btrfs_end_transaction(trans, root);
1870                 goto done;
1871         }
1872         WARN_ON(diff > old_total);
1873         btrfs_set_super_total_bytes(super_copy, old_total - diff);
1874         unlock_chunks(root);
1875         btrfs_end_transaction(trans, root);
1876
1877         key.objectid = device->devid;
1878         key.offset = (u64)-1;
1879         key.type = BTRFS_DEV_EXTENT_KEY;
1880
1881         while (1) {
1882                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1883                 if (ret < 0)
1884                         goto done;
1885
1886                 ret = btrfs_previous_item(root, path, 0, key.type);
1887                 if (ret < 0)
1888                         goto done;
1889                 if (ret) {
1890                         ret = 0;
1891                         goto done;
1892                 }
1893
1894                 l = path->nodes[0];
1895                 slot = path->slots[0];
1896                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1897
1898                 if (key.objectid != device->devid)
1899                         goto done;
1900
1901                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1902                 length = btrfs_dev_extent_length(l, dev_extent);
1903
1904                 if (key.offset + length <= new_size)
1905                         goto done;
1906
1907                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1908                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1909                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1910                 btrfs_release_path(root, path);
1911
1912                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1913                                            chunk_offset);
1914                 if (ret)
1915                         goto done;
1916         }
1917
1918 done:
1919         btrfs_free_path(path);
1920         return ret;
1921 }
1922
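/*
 * Append one (struct btrfs_disk_key, struct btrfs_chunk) pair to the
 * packed sys_chunk_array in the super block, refusing with -EFBIG once
 * the array would overflow BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
 */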
1923 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1924                            struct btrfs_root *root,
1925                            struct btrfs_key *key,
1926                            struct btrfs_chunk *chunk, int item_size)
1927 {
1928         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1929         struct btrfs_disk_key disk_key;
1930         u32 array_size;
1931         u8 *ptr;
1932
1933         array_size = btrfs_super_sys_array_size(super_copy);
1934         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1935                 return -EFBIG;
1936
1937         ptr = super_copy->sys_chunk_array + array_size;
1938         btrfs_cpu_key_to_disk(&disk_key, key);
1939         memcpy(ptr, &disk_key, sizeof(disk_key));
1940         ptr += sizeof(disk_key);
1941         memcpy(ptr, chunk, item_size);
1942         item_size += sizeof(disk_key);
1943         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1944         return 0;
1945 }
1946
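/*
 * Convert a stripe size into the logical bytes a chunk covers: e.g.
 * with calc_size 1GB, RAID1/DUP still map 1GB, RAID0 on 4 stripes maps
 * 4GB, and RAID10 on 4 stripes with sub_stripes 2 maps 2GB (each byte
 * is mirrored once).
 */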
1947 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
1948                                         int num_stripes, int sub_stripes)
1949 {
1950         if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1951                 return calc_size;
1952         else if (type & BTRFS_BLOCK_GROUP_RAID10)
1953                 return calc_size * (num_stripes / sub_stripes);
1954         else
1955                 return calc_size * num_stripes;
1956 }
1957
1958 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1959                                struct btrfs_root *extent_root,
1960                                struct map_lookup **map_ret,
1961                                u64 *num_bytes, u64 *stripe_size,
1962                                u64 start, u64 type)
1963 {
1964         struct btrfs_fs_info *info = extent_root->fs_info;
1965         struct btrfs_device *device = NULL;
1966         struct btrfs_fs_devices *fs_devices = info->fs_devices;
1967         struct list_head *cur;
1968         struct map_lookup *map = NULL;
1969         struct extent_map_tree *em_tree;
1970         struct extent_map *em;
1971         struct list_head private_devs;
1972         int min_stripe_size = 1 * 1024 * 1024;
1973         u64 calc_size = 1024 * 1024 * 1024;
1974         u64 max_chunk_size = calc_size;
1975         u64 min_free;
1976         u64 avail;
1977         u64 max_avail = 0;
1978         u64 dev_offset;
1979         int num_stripes = 1;
1980         int min_stripes = 1;
1981         int sub_stripes = 0;
1982         int looped = 0;
1983         int ret;
1984         int index;
1985         int stripe_len = 64 * 1024;
1986
1987         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1988             (type & BTRFS_BLOCK_GROUP_DUP)) {
1989                 WARN_ON(1);
1990                 type &= ~BTRFS_BLOCK_GROUP_DUP;
1991         }
1992         if (list_empty(&fs_devices->alloc_list))
1993                 return -ENOSPC;
1994
1995         if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1996                 num_stripes = fs_devices->rw_devices;
1997                 min_stripes = 2;
1998         }
1999         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2000                 num_stripes = 2;
2001                 min_stripes = 2;
2002         }
2003         if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2004                 num_stripes = min_t(u64, 2, fs_devices->rw_devices);
2005                 if (num_stripes < 2)
2006                         return -ENOSPC;
2007                 min_stripes = 2;
2008         }
2009         if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2010                 num_stripes = fs_devices->rw_devices;
2011                 if (num_stripes < 4)
2012                         return -ENOSPC;
2013                 num_stripes &= ~(u32)1;
2014                 sub_stripes = 2;
2015                 min_stripes = 4;
2016         }
2017
2018         if (type & BTRFS_BLOCK_GROUP_DATA) {
2019                 max_chunk_size = 10 * calc_size;
2020                 min_stripe_size = 64 * 1024 * 1024;
2021         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2022                 max_chunk_size = 4 * calc_size;
2023                 min_stripe_size = 32 * 1024 * 1024;
2024         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2025                 calc_size = 8 * 1024 * 1024;
2026                 max_chunk_size = calc_size * 2;
2027                 min_stripe_size = 1 * 1024 * 1024;
2028         }
2029
2030         /* we don't want a chunk larger than 10% of writeable space */
2031         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2032                              max_chunk_size);
2033
2034 again:
2035         if (!map || map->num_stripes != num_stripes) {
2036                 kfree(map);
2037                 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2038                 if (!map)
2039                         return -ENOMEM;
2040                 map->num_stripes = num_stripes;
2041         }
2042
2043         if (calc_size * num_stripes > max_chunk_size) {
2044                 calc_size = max_chunk_size;
2045                 do_div(calc_size, num_stripes);
2046                 do_div(calc_size, stripe_len);
2047                 calc_size *= stripe_len;
2048         }
2049         /* we don't want tiny stripes */
2050         calc_size = max_t(u64, min_stripe_size, calc_size);
2051
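        /* round calc_size down to a whole multiple of stripe_len */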
2052         do_div(calc_size, stripe_len);
2053         calc_size *= stripe_len;
2054
2055         cur = fs_devices->alloc_list.next;
2056         index = 0;
2057
2058         if (type & BTRFS_BLOCK_GROUP_DUP)
2059                 min_free = calc_size * 2;
2060         else
2061                 min_free = calc_size;
2062
2063         /*
2064          * we add 1MB because we never use the first 1MB of the device, unless
2065          * we've looped, then we are likely allocating the maximum amount of
2066          * space left already
2067          */
2068         if (!looped)
2069                 min_free += 1024 * 1024;
2070
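        /*
         * walk the rotor list of rw devices; each device that donates a
         * stripe is parked on private_devs so it isn't picked twice,
         * and the list is spliced back once the stripes are placed.
         */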
2071         INIT_LIST_HEAD(&private_devs);
2072         while (index < num_stripes) {
2073                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2074                 BUG_ON(!device->writeable);
2075                 if (device->total_bytes > device->bytes_used)
2076                         avail = device->total_bytes - device->bytes_used;
2077                 else
2078                         avail = 0;
2079                 cur = cur->next;
2080
2081                 if (device->in_fs_metadata && avail >= min_free) {
2082                         ret = find_free_dev_extent(trans, device,
2083                                                    min_free, &dev_offset);
2084                         if (ret == 0) {
2085                                 list_move_tail(&device->dev_alloc_list,
2086                                                &private_devs);
2087                                 map->stripes[index].dev = device;
2088                                 map->stripes[index].physical = dev_offset;
2089                                 index++;
2090                                 if (type & BTRFS_BLOCK_GROUP_DUP) {
2091                                         map->stripes[index].dev = device;
2092                                         map->stripes[index].physical =
2093                                                 dev_offset + calc_size;
2094                                         index++;
2095                                 }
2096                         }
2097                 } else if (device->in_fs_metadata && avail > max_avail)
2098                         max_avail = avail;
2099                 if (cur == &fs_devices->alloc_list)
2100                         break;
2101         }
2102         list_splice(&private_devs, &fs_devices->alloc_list);
2103         if (index < num_stripes) {
2104                 if (index >= min_stripes) {
2105                         num_stripes = index;
2106                         if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2107                                 num_stripes /= sub_stripes;
2108                                 num_stripes *= sub_stripes;
2109                         }
2110                         looped = 1;
2111                         goto again;
2112                 }
2113                 if (!looped && max_avail > 0) {
2114                         looped = 1;
2115                         calc_size = max_avail;
2116                         goto again;
2117                 }
2118                 kfree(map);
2119                 return -ENOSPC;
2120         }
2121         map->sector_size = extent_root->sectorsize;
2122         map->stripe_len = stripe_len;
2123         map->io_align = stripe_len;
2124         map->io_width = stripe_len;
2125         map->type = type;
2126         map->num_stripes = num_stripes;
2127         map->sub_stripes = sub_stripes;
2128
2129         *map_ret = map;
2130         *stripe_size = calc_size;
2131         *num_bytes = chunk_bytes_by_type(type, calc_size,
2132                                          num_stripes, sub_stripes);
2133
2134         em = alloc_extent_map(GFP_NOFS);
2135         if (!em) {
2136                 kfree(map);
2137                 return -ENOMEM;
2138         }
2139         em->bdev = (struct block_device *)map;
2140         em->start = start;
2141         em->len = *num_bytes;
2142         em->block_start = 0;
2143         em->block_len = em->len;
2144
2145         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2146         spin_lock(&em_tree->lock);
2147         ret = add_extent_mapping(em_tree, em);
2148         spin_unlock(&em_tree->lock);
2149         BUG_ON(ret);
2150         free_extent_map(em);
2151
2152         ret = btrfs_make_block_group(trans, extent_root, 0, type,
2153                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2154                                      start, *num_bytes);
2155         BUG_ON(ret);
2156
2157         index = 0;
2158         while (index < map->num_stripes) {
2159                 device = map->stripes[index].dev;
2160                 dev_offset = map->stripes[index].physical;
2161
2162                 ret = btrfs_alloc_dev_extent(trans, device,
2163                                 info->chunk_root->root_key.objectid,
2164                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2165                                 start, dev_offset, calc_size);
2166                 BUG_ON(ret);
2167                 index++;
2168         }
2169
2170         return 0;
2171 }
2172
2173 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2174                                 struct btrfs_root *extent_root,
2175                                 struct map_lookup *map, u64 chunk_offset,
2176                                 u64 chunk_size, u64 stripe_size)
2177 {
2178         u64 dev_offset;
2179         struct btrfs_key key;
2180         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2181         struct btrfs_device *device;
2182         struct btrfs_chunk *chunk;
2183         struct btrfs_stripe *stripe;
2184         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2185         int index = 0;
2186         int ret;
2187
2188         chunk = kzalloc(item_size, GFP_NOFS);
2189         if (!chunk)
2190                 return -ENOMEM;
2191
2192         index = 0;
2193         while (index < map->num_stripes) {
2194                 device = map->stripes[index].dev;
2195                 device->bytes_used += stripe_size;
2196                 ret = btrfs_update_device(trans, device);
2197                 BUG_ON(ret);
2198                 index++;
2199         }
2200
2201         index = 0;
2202         stripe = &chunk->stripe;
2203         while (index < map->num_stripes) {
2204                 device = map->stripes[index].dev;
2205                 dev_offset = map->stripes[index].physical;
2206
2207                 btrfs_set_stack_stripe_devid(stripe, device->devid);
2208                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2209                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2210                 stripe++;
2211                 index++;
2212         }
2213
2214         btrfs_set_stack_chunk_length(chunk, chunk_size);
2215         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2216         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2217         btrfs_set_stack_chunk_type(chunk, map->type);
2218         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2219         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2220         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2221         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2222         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2223
2224         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2225         key.type = BTRFS_CHUNK_ITEM_KEY;
2226         key.offset = chunk_offset;
2227
2228         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2229         BUG_ON(ret);
2230
2231         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2232                 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2233                                              item_size);
2234                 BUG_ON(ret);
2235         }
2236         kfree(chunk);
2237         return 0;
2238 }
2239
2240 /*
2241  * Chunk allocation falls into two parts. The first part does the work
2242  * that makes the newly allocated chunk usable, but does not modify
2243  * the chunk tree. The second part does the work that requires
2244  * modifying the chunk tree. This division is important for the
2245  * bootstrap process of adding storage to a seed btrfs.
2246  */
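/*
 * btrfs_alloc_chunk() runs both parts back to back: find_next_chunk()
 * picks the logical start, __btrfs_alloc_chunk() does part one and
 * __finish_chunk_alloc() does part two.
 */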
2247 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2248                       struct btrfs_root *extent_root, u64 type)
2249 {
2250         u64 chunk_offset;
2251         u64 chunk_size;
2252         u64 stripe_size;
2253         struct map_lookup *map;
2254         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2255         int ret;
2256
2257         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2258                               &chunk_offset);
2259         if (ret)
2260                 return ret;
2261
2262         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2263                                   &stripe_size, chunk_offset, type);
2264         if (ret)
2265                 return ret;
2266
2267         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2268                                    chunk_size, stripe_size);
2269         BUG_ON(ret);
2270         return 0;
2271 }
2272
2273 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2274                                          struct btrfs_root *root,
2275                                          struct btrfs_device *device)
2276 {
2277         u64 chunk_offset;
2278         u64 sys_chunk_offset;
2279         u64 chunk_size;
2280         u64 sys_chunk_size;
2281         u64 stripe_size;
2282         u64 sys_stripe_size;
2283         u64 alloc_profile;
2284         struct map_lookup *map;
2285         struct map_lookup *sys_map;
2286         struct btrfs_fs_info *fs_info = root->fs_info;
2287         struct btrfs_root *extent_root = fs_info->extent_root;
2288         int ret;
2289
2290         ret = find_next_chunk(fs_info->chunk_root,
2291                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2292         BUG_ON(ret);
2293
2294         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2295                         (fs_info->metadata_alloc_profile &
2296                          fs_info->avail_metadata_alloc_bits);
2297         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2298
2299         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2300                                   &stripe_size, chunk_offset, alloc_profile);
2301         BUG_ON(ret);
2302
2303         sys_chunk_offset = chunk_offset + chunk_size;
2304
2305         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2306                         (fs_info->system_alloc_profile &
2307                          fs_info->avail_system_alloc_bits);
2308         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2309
2310         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2311                                   &sys_chunk_size, &sys_stripe_size,
2312                                   sys_chunk_offset, alloc_profile);
2313         BUG_ON(ret);
2314
2315         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2316         BUG_ON(ret);
2317
2318         /*
2319          * Modifying the chunk tree needs to allocate new blocks from
2320          * both the system and the metadata block groups, so we can
2321          * only do operations that modify the chunk tree after both
2322          * block groups have been created.
2323          */
2324         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2325                                    chunk_size, stripe_size);
2326         BUG_ON(ret);
2327
2328         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2329                                    sys_chunk_offset, sys_chunk_size,
2330                                    sys_stripe_size);
2331         BUG_ON(ret);
2332         return 0;
2333 }
2334
2335 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2336 {
2337         struct extent_map *em;
2338         struct map_lookup *map;
2339         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2340         int readonly = 0;
2341         int i;
2342
2343         spin_lock(&map_tree->map_tree.lock);
2344         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2345         spin_unlock(&map_tree->map_tree.lock);
2346         if (!em)
2347                 return 1;
2348
2349         map = (struct map_lookup *)em->bdev;
2350         for (i = 0; i < map->num_stripes; i++) {
2351                 if (!map->stripes[i].dev->writeable) {
2352                         readonly = 1;
2353                         break;
2354                 }
2355         }
2356         free_extent_map(em);
2357         return readonly;
2358 }
2359
2360 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2361 {
2362         extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2363 }
2364
2365 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2366 {
2367         struct extent_map *em;
2368
2369         while (1) {
2370                 spin_lock(&tree->map_tree.lock);
2371                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2372                 if (em)
2373                         remove_extent_mapping(&tree->map_tree, em);
2374                 spin_unlock(&tree->map_tree.lock);
2375                 if (!em)
2376                         break;
2377                 kfree(em->bdev);
2378                 /* once for us */
2379                 free_extent_map(em);
2380                 /* once for the tree */
2381                 free_extent_map(em);
2382         }
2383 }
2384
2385 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2386 {
2387         struct extent_map *em;
2388         struct map_lookup *map;
2389         struct extent_map_tree *em_tree = &map_tree->map_tree;
2390         int ret;
2391
2392         spin_lock(&em_tree->lock);
2393         em = lookup_extent_mapping(em_tree, logical, len);
2394         spin_unlock(&em_tree->lock);
2395         BUG_ON(!em);
2396
2397         BUG_ON(em->start > logical || em->start + em->len < logical);
2398         map = (struct map_lookup *)em->bdev;
2399         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2400                 ret = map->num_stripes;
2401         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2402                 ret = map->sub_stripes;
2403         else
2404                 ret = 1;
2405         free_extent_map(em);
2406         return ret;
2407 }
2408
2409 static int find_live_mirror(struct map_lookup *map, int first, int num,
2410                             int optimal)
2411 {
2412         int i;
2413         if (map->stripes[optimal].dev->bdev)
2414                 return optimal;
2415         for (i = first; i < first + num; i++) {
2416                 if (map->stripes[i].dev->bdev)
2417                         return i;
2418         }
2419         /* we couldn't find one that doesn't fail.  Just return something
2420          * and the io error handling code will clean up eventually
2421          */
2422         return optimal;
2423 }
2424
2425 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2426                              u64 logical, u64 *length,
2427                              struct btrfs_multi_bio **multi_ret,
2428                              int mirror_num, struct page *unplug_page)
2429 {
2430         struct extent_map *em;
2431         struct map_lookup *map;
2432         struct extent_map_tree *em_tree = &map_tree->map_tree;
2433         u64 offset;
2434         u64 stripe_offset;
2435         u64 stripe_nr;
2436         int stripes_allocated = 8;
2437         int stripes_required = 1;
2438         int stripe_index;
2439         int i;
2440         int num_stripes;
2441         int max_errors = 0;
2442         struct btrfs_multi_bio *multi = NULL;
2443
2444         if (multi_ret && !(rw & (1 << BIO_RW)))
2445                 stripes_allocated = 1;
2446 again:
2447         if (multi_ret) {
2448                 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2449                                 GFP_NOFS);
2450                 if (!multi)
2451                         return -ENOMEM;
2452
2453                 atomic_set(&multi->error, 0);
2454         }
2455
2456         spin_lock(&em_tree->lock);
2457         em = lookup_extent_mapping(em_tree, logical, *length);
2458         spin_unlock(&em_tree->lock);
2459
2460         if (!em && unplug_page)
2461                 return 0;
2462
2463         if (!em) {
2464                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2465                        (unsigned long long)logical,
2466                        (unsigned long long)*length);
2467                 BUG();
2468         }
2469
2470         BUG_ON(em->start > logical || em->start + em->len < logical);
2471         map = (struct map_lookup *)em->bdev;
2472         offset = logical - em->start;
2473
2474         if (mirror_num > map->num_stripes)
2475                 mirror_num = 0;
2476
2477         /* if our multi bio struct is too small, back off and try again */
2478         if (rw & (1 << BIO_RW)) {
2479                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2480                                  BTRFS_BLOCK_GROUP_DUP)) {
2481                         stripes_required = map->num_stripes;
2482                         max_errors = 1;
2483                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2484                         stripes_required = map->sub_stripes;
2485                         max_errors = 1;
2486                 }
2487         }
2488         if (multi_ret && (rw & (1 << BIO_RW)) &&
2489             stripes_allocated < stripes_required) {
2490                 stripes_allocated = map->num_stripes;
2491                 free_extent_map(em);
2492                 kfree(multi);
2493                 goto again;
2494         }
2495         stripe_nr = offset;
2496         /*
2497          * stripe_nr counts the total number of stripes we have to stride
2498          * to get to this block
2499          */
2500         do_div(stripe_nr, map->stripe_len);
2501
2502         stripe_offset = stripe_nr * map->stripe_len;
2503         BUG_ON(offset < stripe_offset);
2504
2505         /* stripe_offset is the offset of this block in its stripe */
2506         stripe_offset = offset - stripe_offset;
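        /*
         * e.g. with a 64KB stripe_len, offset 200KB gives stripe_nr 3
         * and stripe_offset 8KB into that stripe.
         */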
2507
2508         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2509                          BTRFS_BLOCK_GROUP_RAID10 |
2510                          BTRFS_BLOCK_GROUP_DUP)) {
2511                 /* we limit the length of each bio to what fits in a stripe */
2512                 *length = min_t(u64, em->len - offset,
2513                               map->stripe_len - stripe_offset);
2514         } else {
2515                 *length = em->len - offset;
2516         }
2517
2518         if (!multi_ret && !unplug_page)
2519                 goto out;
2520
2521         num_stripes = 1;
2522         stripe_index = 0;
2523         if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2524                 if (unplug_page || (rw & (1 << BIO_RW)))
2525                         num_stripes = map->num_stripes;
2526                 else if (mirror_num)
2527                         stripe_index = mirror_num - 1;
2528                 else {
2529                         stripe_index = find_live_mirror(map, 0,
2530                                             map->num_stripes,
2531                                             current->pid % map->num_stripes);
2532                 }
2533
2534         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2535                 if (rw & (1 << BIO_RW))
2536                         num_stripes = map->num_stripes;
2537                 else if (mirror_num)
2538                         stripe_index = mirror_num - 1;
2539
2540         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2541                 int factor = map->num_stripes / map->sub_stripes;
2542
2543                 stripe_index = do_div(stripe_nr, factor);
2544                 stripe_index *= map->sub_stripes;
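                /*
                 * e.g. num_stripes 4 and sub_stripes 2 give factor 2:
                 * logical stripe 5 becomes stripe_nr 2 within mirror
                 * pair 1, so stripe_index starts at 2.
                 */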
2545
2546                 if (unplug_page || (rw & (1 << BIO_RW)))
2547                         num_stripes = map->sub_stripes;
2548                 else if (mirror_num)
2549                         stripe_index += mirror_num - 1;
2550                 else {
2551                         stripe_index = find_live_mirror(map, stripe_index,
2552                                               map->sub_stripes, stripe_index +
2553                                               current->pid % map->sub_stripes);
2554                 }
2555         } else {
2556                 /*
2557                  * after this do_div call, stripe_nr is the number of stripes
2558                  * on this device we have to walk to find the data, and
2559                  * stripe_index is the number of our device in the stripe array
2560                  */
2561                 stripe_index = do_div(stripe_nr, map->num_stripes);
2562         }
2563         BUG_ON(stripe_index >= map->num_stripes);
2564
2565         for (i = 0; i < num_stripes; i++) {
2566                 if (unplug_page) {
2567                         struct btrfs_device *device;
2568                         struct backing_dev_info *bdi;
2569
2570                         device = map->stripes[stripe_index].dev;
2571                         if (device->bdev) {
2572                                 bdi = blk_get_backing_dev_info(device->bdev);
2573                                 if (bdi->unplug_io_fn)
2574                                         bdi->unplug_io_fn(bdi, unplug_page);
2575                         }
2576                 } else {
2577                         multi->stripes[i].physical =
2578                                 map->stripes[stripe_index].physical +
2579                                 stripe_offset + stripe_nr * map->stripe_len;
2580                         multi->stripes[i].dev = map->stripes[stripe_index].dev;
2581                 }
2582                 stripe_index++;
2583         }
2584         if (multi_ret) {
2585                 *multi_ret = multi;
2586                 multi->num_stripes = num_stripes;
2587                 multi->max_errors = max_errors;
2588         }
2589 out:
2590         free_extent_map(em);
2591         return 0;
2592 }
2593
2594 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2595                       u64 logical, u64 *length,
2596                       struct btrfs_multi_bio **multi_ret, int mirror_num)
2597 {
2598         return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2599                                  mirror_num, NULL);
2600 }
2601
2602 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2603                      u64 chunk_start, u64 physical, u64 devid,
2604                      u64 **logical, int *naddrs, int *stripe_len)
2605 {
2606         struct extent_map_tree *em_tree = &map_tree->map_tree;
2607         struct extent_map *em;
2608         struct map_lookup *map;
2609         u64 *buf;
2610         u64 bytenr;
2611         u64 length;
2612         u64 stripe_nr;
2613         int i, j, nr = 0;
2614
2615         spin_lock(&em_tree->lock);
2616         em = lookup_extent_mapping(em_tree, chunk_start, 1);
2617         spin_unlock(&em_tree->lock);
2618
2619         BUG_ON(!em || em->start != chunk_start);
2620         map = (struct map_lookup *)em->bdev;
2621
2622         length = em->len;
2623         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2624                 do_div(length, map->num_stripes / map->sub_stripes);
2625         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
2626                 do_div(length, map->num_stripes);
2627
2628         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
2629         BUG_ON(!buf);
2630
2631         for (i = 0; i < map->num_stripes; i++) {
2632                 if (devid && map->stripes[i].dev->devid != devid)
2633                         continue;
2634                 if (map->stripes[i].physical > physical ||
2635                     map->stripes[i].physical + length <= physical)
2636                         continue;
2637
2638                 stripe_nr = physical - map->stripes[i].physical;
2639                 do_div(stripe_nr, map->stripe_len);
2640
2641                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2642                         stripe_nr = stripe_nr * map->num_stripes + i;
2643                         do_div(stripe_nr, map->sub_stripes);
2644                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2645                         stripe_nr = stripe_nr * map->num_stripes + i;
2646                 }
2647                 bytenr = chunk_start + stripe_nr * map->stripe_len;
2648                 WARN_ON(nr >= map->num_stripes);
2649                 for (j = 0; j < nr; j++) {
2650                         if (buf[j] == bytenr)
2651                                 break;
2652                 }
2653                 if (j == nr) {
2654                         WARN_ON(nr >= map->num_stripes);
2655                         buf[nr++] = bytenr;
2656                 }
2657         }
2658
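        /*
         * sanity check: every logical address collected above must map
         * back to a stripe that contains the physical offset we were
         * given.
         */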
2659         for (i = 0; i < nr; i++) {
2660                 struct btrfs_multi_bio *multi;
2661                 struct btrfs_bio_stripe *stripe;
2662                 int ret;
2663
2664                 length = 1;
2665                 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2666                                       &length, &multi, 0);
2667                 BUG_ON(ret);
2668
2669                 stripe = multi->stripes;
2670                 for (j = 0; j < multi->num_stripes; j++, stripe++) {
2671                         if (physical >= stripe->physical &&
2672                             physical < stripe->physical + length)
2673                                 break;
2674                 }
2675                 BUG_ON(j >= multi->num_stripes);
2676                 kfree(multi);
2677         }
2678
2679         *logical = buf;
2680         *naddrs = nr;
2681         *stripe_len = map->stripe_len;
2682
2683         free_extent_map(em);
2684         return 0;
2685 }
2686
2687 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2688                       u64 logical, struct page *page)
2689 {
2690         u64 length = PAGE_CACHE_SIZE;
2691         return __btrfs_map_block(map_tree, READ, logical, &length,
2692                                  NULL, 0, page);
2693 }
2694
2695 static void end_bio_multi_stripe(struct bio *bio, int err)
2696 {
2697         struct btrfs_multi_bio *multi = bio->bi_private;
2698         int is_orig_bio = 0;
2699
2700         if (err)
2701                 atomic_inc(&multi->error);
2702
2703         if (bio == multi->orig_bio)
2704                 is_orig_bio = 1;
2705
2706         if (atomic_dec_and_test(&multi->stripes_pending)) {
2707                 if (!is_orig_bio) {
2708                         bio_put(bio);
2709                         bio = multi->orig_bio;
2710                 }
2711                 bio->bi_private = multi->private;
2712                 bio->bi_end_io = multi->end_io;
2713                 /* only send an error to the higher layers if it is
2714                  * beyond the tolerance of the multi-bio
2715                  */
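                /*
                 * e.g. a RAID1 write sets max_errors to 1, so a single
                 * failed mirror still completes the bio successfully.
                 */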
2716                 if (atomic_read(&multi->error) > multi->max_errors) {
2717                         err = -EIO;
2718                 } else if (err) {
2719                         /*
2720                          * this bio is actually up to date, we didn't
2721                          * go over the max number of errors
2722                          */
2723                         set_bit(BIO_UPTODATE, &bio->bi_flags);
2724                         err = 0;
2725                 }
2726                 kfree(multi);
2727
2728                 bio_endio(bio, err);
2729         } else if (!is_orig_bio) {
2730                 bio_put(bio);
2731         }
2732 }
2733
2734 struct async_sched {
2735         struct bio *bio;
2736         int rw;
2737         struct btrfs_fs_info *info;
2738         struct btrfs_work work;
2739 };
2740
2741 /*
2742  * see run_scheduled_bios for a description of why bios are collected for
2743  * async submit.
2744  *
2745  * This will add one bio to the pending list for a device and make sure
2746  * the work struct is scheduled.
2747  */
2748 static noinline int schedule_bio(struct btrfs_root *root,
2749                                  struct btrfs_device *device,
2750                                  int rw, struct bio *bio)
2751 {
2752         int should_queue = 1;
2753
2754         /* don't bother with additional async steps for reads, right now */
2755         if (!(rw & (1 << BIO_RW))) {
2756                 bio_get(bio);
2757                 submit_bio(rw, bio);
2758                 bio_put(bio);
2759                 return 0;
2760         }
2761
2762         /*
2763          * nr_async_bios allows us to reliably return congestion to the
2764          * higher layers.  Otherwise, the async bio makes it appear we have
2765          * made progress against dirty pages when we've really just put it
2766          * on a queue for later
2767          */
2768         atomic_inc(&root->fs_info->nr_async_bios);
2769         WARN_ON(bio->bi_next);
2770         bio->bi_next = NULL;
2771         bio->bi_rw |= rw;
2772
2773         spin_lock(&device->io_lock);
2774
2775         if (device->pending_bio_tail)
2776                 device->pending_bio_tail->bi_next = bio;
2777
2778         device->pending_bio_tail = bio;
2779         if (!device->pending_bios)
2780                 device->pending_bios = bio;
2781         if (device->running_pending)
2782                 should_queue = 0;
2783
2784         spin_unlock(&device->io_lock);
2785
2786         if (should_queue)
2787                 btrfs_queue_worker(&root->fs_info->submit_workers,
2788                                    &device->work);
2789         return 0;
2790 }
2791
2792 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2793                   int mirror_num, int async_submit)
2794 {
2795         struct btrfs_mapping_tree *map_tree;
2796         struct btrfs_device *dev;
2797         struct bio *first_bio = bio;
2798         u64 logical = (u64)bio->bi_sector << 9;
2799         u64 length = 0;
2800         u64 map_length;
2801         struct btrfs_multi_bio *multi = NULL;
2802         int ret;
2803         int dev_nr = 0;
2804         int total_devs = 1;
2805
2806         length = bio->bi_size;
2807         map_tree = &root->fs_info->mapping_tree;
2808         map_length = length;
2809
2810         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2811                               mirror_num);
2812         BUG_ON(ret);
2813
2814         total_devs = multi->num_stripes;
2815         if (map_length < length) {
2816                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
2817                        "map len %llu\n", (unsigned long long)logical,
2818                        (unsigned long long)length,
2819                        (unsigned long long)map_length);
2820                 BUG();
2821         }
2822         multi->end_io = first_bio->bi_end_io;
2823         multi->private = first_bio->bi_private;
2824         multi->orig_bio = first_bio;
2825         atomic_set(&multi->stripes_pending, multi->num_stripes);
2826
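        /*
         * clone the bio once per extra stripe; the last stripe reuses
         * first_bio, and end_bio_multi_stripe() completes the original
         * bio only after every copy has finished.
         */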
2827         while (dev_nr < total_devs) {
2828                 if (total_devs > 1) {
2829                         if (dev_nr < total_devs - 1) {
2830                                 bio = bio_clone(first_bio, GFP_NOFS);
2831                                 BUG_ON(!bio);
2832                         } else {
2833                                 bio = first_bio;
2834                         }
2835                         bio->bi_private = multi;
2836                         bio->bi_end_io = end_bio_multi_stripe;
2837                 }
2838                 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2839                 dev = multi->stripes[dev_nr].dev;
2840                 BUG_ON(rw == WRITE && !dev->writeable);
2841                 if (dev && dev->bdev) {
2842                         bio->bi_bdev = dev->bdev;
2843                         if (async_submit)
2844                                 schedule_bio(root, dev, rw, bio);
2845                         else
2846                                 submit_bio(rw, bio);
2847                 } else {
2848                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2849                         bio->bi_sector = logical >> 9;
2850                         bio_endio(bio, -EIO);
2851                 }
2852                 dev_nr++;
2853         }
2854         if (total_devs == 1)
2855                 kfree(multi);
2856         return 0;
2857 }
2858
2859 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2860                                        u8 *uuid, u8 *fsid)
2861 {
2862         struct btrfs_device *device;
2863         struct btrfs_fs_devices *cur_devices;
2864
2865         cur_devices = root->fs_info->fs_devices;
2866         while (cur_devices) {
2867                 if (!fsid ||
2868                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2869                         device = __find_device(&cur_devices->devices,
2870                                                devid, uuid);
2871                         if (device)
2872                                 return device;
2873                 }
2874                 cur_devices = cur_devices->seed;
2875         }
2876         return NULL;
2877 }
2878
2879 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2880                                             u64 devid, u8 *dev_uuid)
2881 {
2882         struct btrfs_device *device;
2883         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2884
2885         device = kzalloc(sizeof(*device), GFP_NOFS);
2886         if (!device)
2887                 return NULL;
2888         list_add(&device->dev_list,
2889                  &fs_devices->devices);
2890         device->barriers = 1;
2891         device->dev_root = root->fs_info->dev_root;
2892         device->devid = devid;
2893         device->work.func = pending_bios_fn;
2894         device->fs_devices = fs_devices;
2895         fs_devices->num_devices++;
2896         spin_lock_init(&device->io_lock);
2897         INIT_LIST_HEAD(&device->dev_alloc_list);
2898         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2899         return device;
2900 }
2901
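/*
 * Build the in-memory mapping for one on-disk chunk item: an
 * extent_map whose map_lookup payload describes the stripe layout,
 * inserted into the logical->physical mapping tree.  Calling this
 * twice for the same chunk is harmless; an existing mapping is
 * detected and left in place.
 */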
2902 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2903                           struct extent_buffer *leaf,
2904                           struct btrfs_chunk *chunk)
2905 {
2906         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2907         struct map_lookup *map;
2908         struct extent_map *em;
2909         u64 logical;
2910         u64 length;
2911         u64 devid;
2912         u8 uuid[BTRFS_UUID_SIZE];
2913         int num_stripes;
2914         int ret;
2915         int i;
2916
2917         logical = key->offset;
2918         length = btrfs_chunk_length(leaf, chunk);
2919
2920         spin_lock(&map_tree->map_tree.lock);
2921         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2922         spin_unlock(&map_tree->map_tree.lock);
2923
2924         /* already mapped? */
2925         if (em && em->start <= logical && em->start + em->len > logical) {
2926                 free_extent_map(em);
2927                 return 0;
2928         } else if (em) {
2929                 free_extent_map(em);
2930         }
2931
2932         em = alloc_extent_map(GFP_NOFS);
2933         if (!em)
2934                 return -ENOMEM;
2935         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2936         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2937         if (!map) {
2938                 free_extent_map(em);
2939                 return -ENOMEM;
2940         }
2941
2942         em->bdev = (struct block_device *)map;
2943         em->start = logical;
2944         em->len = length;
2945         em->block_start = 0;
2946         em->block_len = em->len;
2947
2948         map->num_stripes = num_stripes;
2949         map->io_width = btrfs_chunk_io_width(leaf, chunk);
2950         map->io_align = btrfs_chunk_io_align(leaf, chunk);
2951         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2952         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2953         map->type = btrfs_chunk_type(leaf, chunk);
2954         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
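        /*
         * Resolve each stripe's devid/uuid to a btrfs_device.  A
         * missing device is fatal without -o degraded; with it, a stub
         * from add_missing_dev() stands in so the mapping can still be
         * built.
         */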
2955         for (i = 0; i < num_stripes; i++) {
2956                 map->stripes[i].physical =
2957                         btrfs_stripe_offset_nr(leaf, chunk, i);
2958                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2959                 read_extent_buffer(leaf, uuid, (unsigned long)
2960                                    btrfs_stripe_dev_uuid_nr(chunk, i),
2961                                    BTRFS_UUID_SIZE);
2962                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
2963                                                         NULL);
2964                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2965                         kfree(map);
2966                         free_extent_map(em);
2967                         return -EIO;
2968                 }
2969                 if (!map->stripes[i].dev) {
2970                         map->stripes[i].dev =
2971                                 add_missing_dev(root, devid, uuid);
2972                         if (!map->stripes[i].dev) {
2973                                 kfree(map);
2974                                 free_extent_map(em);
2975                                 return -EIO;
2976                         }
2977                 }
2978                 map->stripes[i].dev->in_fs_metadata = 1;
2979         }
2980
2981         spin_lock(&map_tree->map_tree.lock);
2982         ret = add_extent_mapping(&map_tree->map_tree, em);
2983         spin_unlock(&map_tree->map_tree.lock);
2984         BUG_ON(ret);
2985         free_extent_map(em);
2986
2987         return 0;
2988 }
2989
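/*
 * Copy the persistent fields of an on-disk dev item, including the
 * device uuid, into the in-memory btrfs_device.
 */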
2990 static int fill_device_from_item(struct extent_buffer *leaf,
2991                                  struct btrfs_dev_item *dev_item,
2992                                  struct btrfs_device *device)
2993 {
2994         unsigned long ptr;
2995
2996         device->devid = btrfs_device_id(leaf, dev_item);
2997         device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2998         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2999         device->type = btrfs_device_type(leaf, dev_item);
3000         device->io_align = btrfs_device_io_align(leaf, dev_item);
3001         device->io_width = btrfs_device_io_width(leaf, dev_item);
3002         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
3003
3004         ptr = (unsigned long)btrfs_device_uuid(dev_item);
3005         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
3006
3007         return 0;
3008 }
3009
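/*
 * Ensure the seed filesystem with the given fsid is open and linked
 * into fs_info->fs_devices->seed.  Seed devices are opened read-only,
 * and a device set that does not mark itself as seeding is rejected
 * with -EINVAL.
 */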
3010 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3011 {
3012         struct btrfs_fs_devices *fs_devices;
3013         int ret;
3014
3015         mutex_lock(&uuid_mutex);
3016
3017         fs_devices = root->fs_info->fs_devices->seed;
3018         while (fs_devices) {
3019                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3020                         ret = 0;
3021                         goto out;
3022                 }
3023                 fs_devices = fs_devices->seed;
3024         }
3025
3026         fs_devices = find_fsid(fsid);
3027         if (!fs_devices) {
3028                 ret = -ENOENT;
3029                 goto out;
3030         }
3031
3032         fs_devices = clone_fs_devices(fs_devices);
3033         if (IS_ERR(fs_devices)) {
3034                 ret = PTR_ERR(fs_devices);
3035                 goto out;
3036         }
3037
3038         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3039                                    root->fs_info->bdev_holder);
3040         if (ret)
3041                 goto out;
3042
3043         if (!fs_devices->seeding) {
3044                 __btrfs_close_devices(fs_devices);
3045                 free_fs_devices(fs_devices);
3046                 ret = -EINVAL;
3047                 goto out;
3048         }
3049
3050         fs_devices->seed = root->fs_info->fs_devices->seed;
3051         root->fs_info->fs_devices->seed = fs_devices;
3052 out:
3053         mutex_unlock(&uuid_mutex);
3054         return ret;
3055 }
3056
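/*
 * Process one dev item.  A dev item carrying a foreign fsid belongs to
 * a seed filesystem, which is opened first.  A device we cannot find
 * is an error unless -o degraded is set, in which case a stub device
 * stands in for it.
 */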
3057 static int read_one_dev(struct btrfs_root *root,
3058                         struct extent_buffer *leaf,
3059                         struct btrfs_dev_item *dev_item)
3060 {
3061         struct btrfs_device *device;
3062         u64 devid;
3063         int ret;
3064         u8 fs_uuid[BTRFS_UUID_SIZE];
3065         u8 dev_uuid[BTRFS_UUID_SIZE];
3066
3067         devid = btrfs_device_id(leaf, dev_item);
3068         read_extent_buffer(leaf, dev_uuid,
3069                            (unsigned long)btrfs_device_uuid(dev_item),
3070                            BTRFS_UUID_SIZE);
3071         read_extent_buffer(leaf, fs_uuid,
3072                            (unsigned long)btrfs_device_fsid(dev_item),
3073                            BTRFS_UUID_SIZE);
3074
3075         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3076                 ret = open_seed_devices(root, fs_uuid);
3077                 if (ret && !btrfs_test_opt(root, DEGRADED))
3078                         return ret;
3079         }
3080
3081         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3082         if (!device || !device->bdev) {
3083                 if (!btrfs_test_opt(root, DEGRADED))
3084                         return -EIO;
3085
3086                 if (!device) {
3087                         printk(KERN_WARNING "devid %llu missing\n",
3088                                (unsigned long long)devid);
3089                         device = add_missing_dev(root, devid, dev_uuid);
3090                         if (!device)
3091                                 return -ENOMEM;
3092                 }
3093         }
3094
3095         if (device->fs_devices != root->fs_info->fs_devices) {
3096                 BUG_ON(device->writeable);
3097                 if (device->generation !=
3098                     btrfs_device_generation(leaf, dev_item))
3099                         return -EINVAL;
3100         }
3101
3102         fill_device_from_item(leaf, dev_item, device);
3103         device->dev_root = root->fs_info->dev_root;
3104         device->in_fs_metadata = 1;
3105         if (device->writeable)
3106                 device->fs_devices->total_rw_bytes += device->total_bytes;
3107         ret = 0;
3108         return ret;
3109 }
3110
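/*
 * Read the single dev item embedded in the superblock.
 */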
3111 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3112 {
3113         struct btrfs_dev_item *dev_item;
3114
3115         dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3116                                                      dev_item);
3117         return read_one_dev(root, buf, dev_item);
3118 }
3119
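/*
 * The superblock embeds the chunk items needed to bootstrap reading
 * the chunk tree.  Walk sys_chunk_array, handing each chunk to
 * read_one_chunk; anything other than a chunk item here means the
 * array is corrupt (-EIO).
 */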
3120 int btrfs_read_sys_array(struct btrfs_root *root)
3121 {
3122         struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3123         struct extent_buffer *sb;
3124         struct btrfs_disk_key *disk_key;
3125         struct btrfs_chunk *chunk;
3126         u8 *ptr;
3127         unsigned long sb_ptr;
3128         int ret = 0;
3129         u32 num_stripes;
3130         u32 array_size;
3131         u32 len = 0;
3132         u32 cur;
3133         struct btrfs_key key;
3134
3135         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3136                                           BTRFS_SUPER_INFO_SIZE);
3137         if (!sb)
3138                 return -ENOMEM;
3139         btrfs_set_buffer_uptodate(sb);
3140         btrfs_set_buffer_lockdep_class(sb, 0);
3141
3142         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3143         array_size = btrfs_super_sys_array_size(super_copy);
3144
3145         ptr = super_copy->sys_chunk_array;
3146         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3147         cur = 0;
3148
3149         while (cur < array_size) {
3150                 disk_key = (struct btrfs_disk_key *)ptr;
3151                 btrfs_disk_key_to_cpu(&key, disk_key);
3152
3153                 len = sizeof(*disk_key);
                     ptr += len;
3154                 sb_ptr += len;
3155                 cur += len;
3156
3157                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3158                         chunk = (struct btrfs_chunk *)sb_ptr;
3159                         ret = read_one_chunk(root, &key, sb, chunk);
3160                         if (ret)
3161                                 break;
3162                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3163                         len = btrfs_chunk_item_size(num_stripes);
3164                 } else {
3165                         ret = -EIO;
3166                         break;
3167                 }
3168                 ptr += len;
3169                 sb_ptr += len;
3170                 cur += len;
3171         }
3172         free_extent_buffer(sb);
3173         return ret;
3174 }
3175
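/*
 * Populate devices and chunk mappings from the chunk tree in two
 * passes: all dev items first, then all chunk items, so every chunk
 * mapping can resolve its devices.
 */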
3176 int btrfs_read_chunk_tree(struct btrfs_root *root)
3177 {
3178         struct btrfs_path *path;
3179         struct extent_buffer *leaf;
3180         struct btrfs_key key;
3181         struct btrfs_key found_key;
3182         int ret;
3183         int slot;
3184
3185         root = root->fs_info->chunk_root;
3186
3187         path = btrfs_alloc_path();
3188         if (!path)
3189                 return -ENOMEM;
3190
3191         /* first we search for all of the device items, and then we
3192          * read in all of the chunk items.  This way we can create chunk
3193          * mappings that reference all of the devices that are around
3194          */
3195         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3196         key.offset = 0;
3197         key.type = 0;
3198 again:
3199         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3200         while (1) {
3201                 leaf = path->nodes[0];
3202                 slot = path->slots[0];
3203                 if (slot >= btrfs_header_nritems(leaf)) {
3204                         ret = btrfs_next_leaf(root, path);
3205                         if (ret == 0)
3206                                 continue;
3207                         if (ret < 0)
3208                                 goto error;
3209                         break;
3210                 }
3211                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3212                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3213                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3214                                 break;
3215                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3216                                 struct btrfs_dev_item *dev_item;
3217                                 dev_item = btrfs_item_ptr(leaf, slot,
3218                                                   struct btrfs_dev_item);
3219                                 ret = read_one_dev(root, leaf, dev_item);
3220                                 if (ret)
3221                                         goto error;
3222                         }
3223                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3224                         struct btrfs_chunk *chunk;
3225                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3226                         ret = read_one_chunk(root, &found_key, leaf, chunk);
3227                         if (ret)
3228                                 goto error;
3229                 }
3230                 path->slots[0]++;
3231         }
3232         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3233                 key.objectid = 0;
3234                 btrfs_release_path(root, path);
3235                 goto again;
3236         }
3237         ret = 0;
3238 error:
3239         btrfs_free_path(path);
3240         return ret;
3241 }