git.karo-electronics.de Git - mv-sheeva.git/blobdiff - fs/btrfs/inode.c
Merge branch 'for-2.6.35' into for-2.6.36
[mv-sheeva.git] / fs / btrfs / inode.c
index bef69bedf3cf716f8c826532dd2c6034d1d1f58e..fa6ccc1bfe2a9a73e2de97ed04cac8708efa99f9 100644 (file)
@@ -698,6 +698,38 @@ retry:
        return 0;
 }
 
+/* Return a disk-byte allocation hint for [start, start + num_bytes) of
+ * inode, taken from the cached extent maps under the extent tree read
+ * lock; 0 is returned when no usable block_start is found. */
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+                                     u64 num_bytes)
+{
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct extent_map *em;
+       u64 alloc_hint = 0;
+
+       read_lock(&em_tree->lock);
+       em = search_extent_mapping(em_tree, start, num_bytes);
+       if (em) {
+               /* block_start isn't an actual block number: use the first
+                * block in this inode instead, or give up if it is bogus too. */
+               if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+                       free_extent_map(em);
+                       em = search_extent_mapping(em_tree, 0, 0);
+                       if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+                               alloc_hint = em->block_start;
+                       if (em)
+                               free_extent_map(em);
+               } else {
+                       alloc_hint = em->block_start;
+                       free_extent_map(em);
+               }
+       }
+       read_unlock(&em_tree->lock);
+
+       return alloc_hint;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
@@ -770,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-
-       read_lock(&BTRFS_I(inode)->extent_tree.lock);
-       em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-                                  start, num_bytes);
-       if (em) {
-               /*
-                * if block start isn't an actual block number then find the
-                * first block in this inode and use that as a hint.  If that
-                * block is also bogus then just don't worry about it.
-                */
-               if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-                       free_extent_map(em);
-                       em = search_extent_mapping(em_tree, 0, 0);
-                       if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
-                               alloc_hint = em->block_start;
-                       if (em)
-                               free_extent_map(em);
-               } else {
-                       alloc_hint = em->block_start;
-                       free_extent_map(em);
-               }
-       }
-       read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+       alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
        while (disk_num_bytes > 0) {
@@ -1175,6 +1185,13 @@ out_check:
                                               num_bytes, num_bytes, type);
                BUG_ON(ret);
 
+               if (root->root_key.objectid ==
+                   BTRFS_DATA_RELOC_TREE_OBJECTID) {
+                       ret = btrfs_reloc_clone_csums(inode, cur_offset,
+                                                     num_bytes);
+                       BUG_ON(ret);
+               }
+
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                cur_offset, cur_offset + num_bytes - 1,
                                locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1368,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  */
 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
                                    struct bio *bio, int mirror_num,
-                                   unsigned long bio_flags)
+                                   unsigned long bio_flags,
+                                   u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
@@ -1387,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
  * are inserted into the btree
  */
 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num, unsigned long bio_flags)
+                         int mirror_num, unsigned long bio_flags,
+                         u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1398,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
  * on write, or reading the csums from the tree before a read
  */
 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num, unsigned long bio_flags)
+                         int mirror_num, unsigned long bio_flags,
+                         u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
@@ -1423,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                /* we're doing a write, do the async checksumming */
                return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                   inode, rw, bio, mirror_num,
-                                  bio_flags, __btrfs_submit_bio_start,
+                                  bio_flags, bio_offset,
+                                  __btrfs_submit_bio_start,
                                   __btrfs_submit_bio_done);
        }
 
@@ -1827,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 
        BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
                                                      failrec->last_mirror,
-                                                     failrec->bio_flags);
+                                                     failrec->bio_flags, 0);
        return 0;
 }
 
@@ -4287,10 +4308,29 @@ void btrfs_dirty_inode(struct inode *inode)
        btrfs_set_trans_block_group(trans, inode);
 
        ret = btrfs_update_inode(trans, root, inode);
-       if (ret)
-               printk(KERN_ERR"btrfs: fail to dirty inode %lu error %d\n",
-                       inode->i_ino, ret);
+       if (ret && ret == -ENOSPC) {
+               /* whoops, let's try again with the full transaction */
+               btrfs_end_transaction(trans, root);
+               trans = btrfs_start_transaction(root, 1);
+               if (IS_ERR(trans)) {
+                       if (printk_ratelimit()) {
+                               printk(KERN_ERR "btrfs: fail to "
+                                      "dirty  inode %lu error %ld\n",
+                                      inode->i_ino, PTR_ERR(trans));
+                       }
+                       return;
+               }
+               btrfs_set_trans_block_group(trans, inode);
 
+               ret = btrfs_update_inode(trans, root, inode);
+               if (ret) {
+                       if (printk_ratelimit()) {
+                               printk(KERN_ERR "btrfs: fail to "
+                                      "dirty  inode %lu error %d\n",
+                                      inode->i_ino, ret);
+                       }
+               }
+       }
        btrfs_end_transaction(trans, root);
 }
 
@@ -4436,16 +4476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        if (ret != 0)
                goto fail;
 
-       inode->i_uid = current_fsuid();
-
-       if (dir && (dir->i_mode & S_ISGID)) {
-               inode->i_gid = dir->i_gid;
-               if (S_ISDIR(mode))
-                       mode |= S_ISGID;
-       } else
-               inode->i_gid = current_fsgid();
-
-       inode->i_mode = mode;
+       inode_init_owner(inode, dir, mode);
        inode->i_ino = objectid;
        inode_set_bytes(inode, 0);
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -5164,11 +5195,651 @@ out:
        return em;
 }
 
+/* Reserve a new extent for a DIO write at [start, start + len), cache a pinned
+ * extent map for it and add the ordered extent; returns the map or ERR_PTR. */
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+                                                 u64 start, u64 len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_trans_handle *trans;
+       struct extent_map *em;
+       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct btrfs_key ins;
+       u64 alloc_hint;
+       int ret;
+
+       btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+       trans = btrfs_join_transaction(root, 0);
+       if (!trans)
+               return ERR_PTR(-ENOMEM);
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+       alloc_hint = get_extent_allocation_hint(inode, start, len);
+       ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+                                  alloc_hint, (u64)-1, &ins, 1);
+       if (ret) {
+               em = ERR_PTR(ret);
+               goto out;
+       }
+
+       em = alloc_extent_map(GFP_NOFS);
+       if (!em) {
+               btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+               em = ERR_PTR(-ENOMEM);
+               goto out;
+       }
+
+       em->start = start;
+       em->orig_start = em->start;
+       em->len = ins.offset;
+       em->block_start = ins.objectid;
+       em->block_len = ins.offset;
+       em->bdev = root->fs_info->fs_devices->latest_bdev;
+       set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+       while (1) {
+               write_lock(&em_tree->lock);
+               ret = add_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+               if (ret != -EEXIST)
+                       break;
+               btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+       }
+       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+                                          ins.offset, ins.offset, 0);
+       if (ret) {
+               btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+               btrfs_drop_extent_cache(inode, start, start + ins.offset - 1, 0);
+               free_extent_map(em);    /* drop our ref before losing the pointer */
+               em = ERR_PTR(ret);
+       }
+out:
+       btrfs_end_transaction(trans, root);
+       return em;
+}
+
+/*
+ * returns 1 when the nocow is safe, < 0 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+                                     struct inode *inode, u64 offset, u64 len)
+{
+       struct btrfs_path *path;
+       int ret;
+       struct extent_buffer *leaf;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       u64 disk_bytenr;
+       u64 backref_offset;
+       u64 extent_end;
+       u64 num_bytes;
+       int slot;
+       int found_type;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                                      offset, 0);
+       if (ret < 0)
+               goto out;
+
+       slot = path->slots[0];
+       if (ret == 1) {
+               if (slot == 0) {
+                       /* can't find the item, must cow */
+                       ret = 0;
+                       goto out;
+               }
+               slot--;
+       }
+       ret = 0;        /* every "goto out" below means "must cow" */
+       leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &key, slot);
+       if (key.objectid != inode->i_ino ||
+           key.type != BTRFS_EXTENT_DATA_KEY) {
+               /* not our file or wrong item type, must cow */
+               goto out;
+       }
+
+       if (key.offset > offset) {
+               /* Wrong offset, must cow */
+               goto out;
+       }
+
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+       found_type = btrfs_file_extent_type(leaf, fi);
+       if (found_type != BTRFS_FILE_EXTENT_REG &&
+           found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+               /* not a regular extent, must cow */
+               goto out;
+       }
+       disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+       backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+       extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+       if (extent_end < offset + len) {
+               /* extent doesn't include our full range, must cow */
+               goto out;
+       }
+
+       if (btrfs_extent_readonly(root, disk_bytenr))
+               goto out;
+
+       /*
+        * look for other files referencing this extent, if we
+        * find any we must cow
+        */
+       if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+                                 key.offset - backref_offset, disk_bytenr))
+               goto out;
+
+       /*
+        * adjust disk_bytenr and num_bytes to cover just the bytes
+        * in this extent we are about to write.  If there
+        * are any csums in that range we have to cow in order
+        * to keep the csums correct
+        */
+       disk_bytenr += backref_offset;
+       disk_bytenr += offset - key.offset;
+       num_bytes = min(offset + len, extent_end) - offset;
+       if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+                               goto out;
+       /*
+        * all of the above have passed, it is safe to overwrite this extent
+        * without cow
+        */
+       ret = 1;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/* get_block callback handed to __blockdev_direct_IO(): maps the range that
+ * starts at iblock (bh_result->b_size bytes) onto disk in bh_result, taking a
+ * nocow/prealloc extent or allocating a new one when create is set. */
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+       struct extent_map *em;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 start = iblock << inode->i_blkbits;
+       u64 len = bh_result->b_size;
+       struct btrfs_trans_handle *trans;
+
+       em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+
+       /*
+        * INLINE and COMPRESSED extents fall back to buffered io.  INLINE is
+        * special, and we could probably kludge it in here, but it's still
+        * buffered so for safety let's just use the generic buffered path.
+        *
+        * For COMPRESSED we _have_ to read the entire extent in so we can
+        * decompress it, so buffering is required no matter what we do.
+        *
+        * We return -ENOTBLK because that's what makes DIO go back to buffered
+        * IO; this is the price we pay for using the generic code.
+        */
+       if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+           em->block_start == EXTENT_MAP_INLINE) {
+               free_extent_map(em);
+               return -ENOTBLK;
+       }
+
+       /* Just a good old fashioned hole, return */
+       if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+                       test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+               free_extent_map(em);
+               /* DIO will do one hole at a time, so just unlock a sector */
+               unlock_extent(&BTRFS_I(inode)->io_tree, start,
+                             start + root->sectorsize - 1, GFP_NOFS);
+               return 0;
+       }
+
+       /*
+        * We don't allocate a new extent in the following cases
+        *
+        * 1) The inode is marked as NODATACOW.  In this case we'll just use the
+        * existing extent.
+        * 2) The extent is marked as PREALLOC.  We're good to go here and can
+        * just use the extent.
+        *
+        */
+       if (!create) {
+               len = em->len - (start - em->start);
+               goto map;
+       }
+
+       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+           ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+            em->block_start != EXTENT_MAP_HOLE)) {
+               int type;
+               int ret;
+               u64 block_start;
+
+               if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                       type = BTRFS_ORDERED_PREALLOC;
+               else
+                       type = BTRFS_ORDERED_NOCOW;
+               len = min(len, em->len - (start - em->start));
+               block_start = em->block_start + (start - em->start);
+
+               /*
+                * we're not going to log anything, but we do need
+                * to make sure the current transaction stays open
+                * while we look for nocow cross refs
+                */
+               trans = btrfs_join_transaction(root, 0);
+               if (!trans)
+                       goto must_cow;
+
+               if (can_nocow_odirect(trans, inode, start, len) == 1) {
+                       ret = btrfs_add_ordered_extent_dio(inode, start,
+                                          block_start, len, len, type);
+                       btrfs_end_transaction(trans, root);
+                       if (ret) {
+                               free_extent_map(em);
+                               return ret;
+                       }
+                       goto unlock;
+               }
+               btrfs_end_transaction(trans, root);
+       }
+must_cow:
+       /*
+        * this will cow the extent, reset the len in case we changed
+        * it above
+        */
+       len = bh_result->b_size;
+       free_extent_map(em);
+       em = btrfs_new_extent_direct(inode, start, len);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+       len = min(len, em->len - (start - em->start));
+unlock:
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+                         EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+                         0, NULL, GFP_NOFS);
+map:
+       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+               inode->i_blkbits;
+       bh_result->b_size = len;
+       bh_result->b_bdev = em->bdev;
+       set_buffer_mapped(bh_result);
+       if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+               set_buffer_new(bh_result);
+
+       free_extent_map(em);
+
+       return 0;
+}
+
+struct btrfs_dio_private {      /* per-bio state for btrfs direct IO */
+       struct inode *inode;    /* inode the bio is doing IO for */
+       u64 logical_offset;     /* file offset the bio starts at */
+       u64 disk_bytenr;        /* bio->bi_sector << 9, set at submit time */
+       u64 bytes;              /* sum of bv_len over the bio's vecs */
+       u32 *csums;             /* one u32 per bio vec; NULL for NODATASUM inodes */
+       void *private;          /* original bio->bi_private, restored in end_io */
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+       struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+       struct bio_vec *bvec = bio->bi_io_vec;
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct inode *inode = dip->inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 start;
+       u32 *private = dip->csums;      /* walks the csum array, one per bvec */
+
+       start = dip->logical_offset;
+       do {    /* verify each bvec against its expected csum */
+               if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+                       struct page *page = bvec->bv_page;
+                       char *kaddr;
+                       u32 csum = ~(u32)0;
+                       unsigned long flags;
+
+                       local_irq_save(flags);
+                       kaddr = kmap_atomic(page, KM_IRQ0);
+                       csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+                                              csum, bvec->bv_len);
+                       btrfs_csum_final(csum, (char *)&csum);
+                       kunmap_atomic(kaddr, KM_IRQ0);
+                       local_irq_restore(flags);
+
+                       flush_dcache_page(bvec->bv_page);
+                       if (csum != *private) {
+                               printk(KERN_ERR "btrfs csum failed ino %lu off"
+                                     " %llu csum %u private %u\n",
+                                     inode->i_ino, (unsigned long long)start,
+                                     csum, *private);
+                               err = -EIO;     /* keep checking the remaining bvecs */
+                       }
+               }
+
+               start += bvec->bv_len;
+               private++;
+               bvec++;
+       } while (bvec <= bvec_end);
+
+       unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+                     dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+       bio->bi_private = dip->private; /* restore the submitter's private data */
+
+       kfree(dip->csums);
+       kfree(dip);
+       dio_end_io(bio, err);
+}
+
+/* End-io for DIO write bios: if btrfs_dec_test_ordered_pending() hands back
+ * the ordered extent, record it in the inode's metadata, then end the dio. */
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+       struct btrfs_dio_private *dip = bio->bi_private;
+       struct inode *inode = dip->inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_trans_handle *trans;
+       struct btrfs_ordered_extent *ordered = NULL;
+       struct extent_state *cached_state = NULL;
+       int ret;
+
+       if (err)
+               goto out_done;
+
+       ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+                                            dip->logical_offset, dip->bytes);
+       if (!ret)
+               goto out_done;
+
+       BUG_ON(!ordered);
+
+       trans = btrfs_join_transaction(root, 1);
+       if (!trans) {
+               err = -ENOMEM;
+               goto out;
+       }
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+       if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+               ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, inode);
+               err = ret;
+               goto out;
+       }
+
+       lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+                        ordered->file_offset + ordered->len - 1, 0,
+                        &cached_state, GFP_NOFS);
+
+       if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+               ret = btrfs_mark_extent_written(trans, inode,
+                                               ordered->file_offset,
+                                               ordered->file_offset + ordered->len);
+               if (ret) {
+                       err = ret;
+                       goto out_unlock;
+               }
+       } else {
+               ret = insert_reserved_file_extent(trans, inode,
+                                                 ordered->file_offset,
+                                                 ordered->start,
+                                                 ordered->disk_len,
+                                                 ordered->len, ordered->len,
+                                                 0, 0, 0, BTRFS_FILE_EXTENT_REG);
+               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+                                  ordered->file_offset, ordered->len);
+               if (ret) {
+                       err = ret;
+                       WARN_ON(1);
+                       goto out_unlock;
+               }
+       }
+
+       add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+       btrfs_ordered_update_i_size(inode, 0, ordered);
+       btrfs_update_inode(trans, root, inode);
+out_unlock:
+       unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+                            ordered->file_offset + ordered->len - 1,
+                            &cached_state, GFP_NOFS);
+out:
+       btrfs_delalloc_release_metadata(inode, ordered->len);
+       if (trans)      /* NULL when joining the transaction failed above */
+               btrfs_end_transaction(trans, root);
+       btrfs_put_ordered_extent(ordered);
+       btrfs_put_ordered_extent(ordered);
+out_done:
+       bio->bi_private = dip->private;
+
+       kfree(dip->csums);
+       kfree(dip);
+       dio_end_io(bio, err);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+                                   struct bio *bio, int mirror_num,
+                                   unsigned long bio_flags, u64 offset)
+{
+       int ret;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); /* csum the bio at file offset */
+       BUG_ON(ret);    /* a checksumming failure is treated as fatal */
+       return 0;
+}
+
+/*
+ * Submit hook passed to __blockdev_direct_IO(): attach a btrfs_dio_private,
+ * install the btrfs completion handler and send the bio down.  On failure the
+ * bio is ended with the error after cleaning up a write's ordered extent.
+ */
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+                               loff_t file_offset)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_dio_private *dip;
+       struct bio_vec *bvec = bio->bi_io_vec;
+       int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+       int write = rw & (1 << BIO_RW);
+       int ret = 0;
+
+       dip = kmalloc(sizeof(*dip), GFP_NOFS);
+       if (!dip) {
+               ret = -ENOMEM;
+               goto free_ordered;
+       }
+       dip->csums = NULL;
+
+       if (!skip_sum) {
+               dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+               if (!dip->csums) {
+                       ret = -ENOMEM;
+                       goto out_err;   /* dip itself must be freed too */
+               }
+       }
+
+       dip->private = bio->bi_private;
+       dip->inode = inode;
+       dip->logical_offset = file_offset;
+
+       dip->bytes = 0;
+       do {
+               dip->bytes += bvec->bv_len;
+               bvec++;
+       } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+       dip->disk_bytenr = (u64)bio->bi_sector << 9;
+       bio->bi_private = dip;
+
+       if (write)
+               bio->bi_end_io = btrfs_endio_direct_write;
+       else
+               bio->bi_end_io = btrfs_endio_direct_read;
+
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+       if (ret)
+               goto out_err;
+
+       if (write && !skip_sum) {
+               ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+                                  inode, rw, bio, 0, 0,
+                                  dip->logical_offset,
+                                  __btrfs_submit_bio_start_direct_io,
+                                  __btrfs_submit_bio_done);
+               if (ret)
+                       goto out_err;
+               return;
+       } else if (!skip_sum)
+               btrfs_lookup_bio_sums_dio(root, inode, bio,
+                                         dip->logical_offset, dip->csums);
+
+       ret = btrfs_map_bio(root, rw, bio, 0, 1);
+       if (ret)
+               goto out_err;
+       return;
+out_err:
+       kfree(dip->csums);
+       kfree(dip);
+free_ordered:
+       /*
+        * If this is a write, clean up the reserved space and kill the ordered
+        * extent.  dip is NULL or already freed here, so use file_offset.
+        */
+       if (write) {
+               struct btrfs_ordered_extent *ordered;
+               ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+               if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+                   !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+                       btrfs_free_reserved_extent(root, ordered->start,
+                                                  ordered->disk_len);
+               btrfs_put_ordered_extent(ordered);
+               btrfs_put_ordered_extent(ordered);
+       }
+       bio_endio(bio, ret);
+}
+
+/* Alignment check for direct IO: returns 0 when offset and every iovec
+ * base address and length are multiples of root->sectorsize, -EINVAL
+ * otherwise.  rw and iocb are currently unused. */
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+                       const struct iovec *iov, loff_t offset,
+                       unsigned long nr_segs)
+{
+       unsigned long seg;
+       size_t size;
+       unsigned long addr;
+       unsigned blocksize_mask = root->sectorsize - 1;
+       ssize_t retval = -EINVAL;
+
+       if (offset & blocksize_mask)
+               goto out;
+
+       /* Check the memory alignment.  Blocks cannot straddle pages */
+       for (seg = 0; seg < nr_segs; seg++) {
+               addr = (unsigned long)iov[seg].iov_base;
+               size = iov[seg].iov_len;
+               if ((addr & blocksize_mask) || (size & blocksize_mask))
+                       goto out;
+       }
+       retval = 0;
+out:
+       return retval;
+}
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
                        unsigned long nr_segs)
 {
-       return -EINVAL;
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       struct btrfs_ordered_extent *ordered;
+       struct extent_state *cached_state = NULL;
+       u64 lockstart, lockend;
+       ssize_t ret;
+       int writing = rw & WRITE;
+       int write_bits = 0;
+       size_t count = iov_length(iov, nr_segs);
+
+       if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+                           offset, nr_segs)) {
+               return 0;       /* misaligned: report 0 bytes done, presumably so the caller falls back to buffered IO */
+       }
+
+       lockstart = offset;
+       lockend = offset + count - 1;
+
+       if (writing) {
+               ret = btrfs_delalloc_reserve_space(inode, count);
+               if (ret)
+                       goto out;
+       }
+
+       while (1) {
+               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                0, &cached_state, GFP_NOFS);
+               /*
+                * We're concerned with the entire range that we're going to be
+                * doing DIO to, so we need to make sure there's no ordered
+                * extents in this range.
+                */
+               ordered = btrfs_lookup_ordered_range(inode, lockstart,
+                                                    lockend - lockstart + 1);
+               if (!ordered)
+                       break;
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                    &cached_state, GFP_NOFS);
+               btrfs_start_ordered_extent(inode, ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+               cond_resched();
+       }
+
+       /*
+        * we don't use btrfs_set_extent_delalloc because we don't want
+        * the dirty or uptodate bits
+        */
+       if (writing) {
+               write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                    EXTENT_DELALLOC, 0, NULL, &cached_state,
+                                    GFP_NOFS);
+               if (ret) {
+                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                        lockend, EXTENT_LOCKED | write_bits,
+                                        1, 0, &cached_state, GFP_NOFS);
+                       goto out;
+               }
+       }
+
+       free_extent_state(cached_state);
+       cached_state = NULL;
+
+       ret = __blockdev_direct_IO(rw, iocb, inode,
+                  BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+                  iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+                  btrfs_submit_direct, 0);
+
+       if (ret < 0 && ret != -EIOCBQUEUED) {
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+                             offset + iov_length(iov, nr_segs) - 1,
+                             EXTENT_LOCKED | write_bits, 1, 0,
+                             &cached_state, GFP_NOFS);
+       } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+               /*
+                * We're falling back to buffered, unlock the section we didn't
+                * do IO on.
+                */
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+                             offset + iov_length(iov, nr_segs) - 1,
+                             EXTENT_LOCKED | write_bits, 1, 0,
+                             &cached_state, GFP_NOFS);
+       }
+out:
+       free_extent_state(cached_state);
+       return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -6080,16 +6751,15 @@ out_unlock:
        return err;
 }
 
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-                       u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+                             u64 start, u64 num_bytes, u64 min_size,
+                             loff_t actual_len, u64 *alloc_hint)
 {
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
-       u64 num_bytes = end - start;
        int ret = 0;
-       u64 i_size;
 
        while (num_bytes > 0) {
                trans = btrfs_start_transaction(root, 3);
@@ -6098,9 +6768,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
                        break;
                }
 
-               ret = btrfs_reserve_extent(trans, root, num_bytes,
-                                          root->sectorsize, 0, alloc_hint,
-                                          (u64)-1, &ins, 1);
+               ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+                                          0, *alloc_hint, (u64)-1, &ins, 1);
                if (ret) {
                        btrfs_end_transaction(trans, root);
                        break;
@@ -6117,20 +6786,19 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
-               alloc_hint = ins.objectid + ins.offset;
+               *alloc_hint = ins.objectid + ins.offset;
 
                inode->i_ctime = CURRENT_TIME;
                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-                       (actual_len > inode->i_size) &&
-                       (cur_offset > inode->i_size)) {
-
+                   (actual_len > inode->i_size) &&
+                   (cur_offset > inode->i_size)) {
                        if (cur_offset > actual_len)
-                               i_size  = actual_len;
+                               i_size_write(inode, actual_len);
                        else
-                               i_size = cur_offset;
-                       i_size_write(inode, i_size);
-                       btrfs_ordered_update_i_size(inode, i_size, NULL);
+                               i_size_write(inode, cur_offset);
+                       i_size_write(inode, cur_offset);
+                       btrfs_ordered_update_i_size(inode, cur_offset, NULL);
                }
 
                ret = btrfs_update_inode(trans, root, inode);
@@ -6216,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-                       ret = prealloc_file_range(inode,
-                                                 cur_offset, last_byte,
-                                               alloc_hint, mode, offset+len);
+                       ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+                                                       last_byte - cur_offset,
+                                                       1 << inode->i_blkbits,
+                                                       offset + len,
+                                                       &alloc_hint);
                        if (ret < 0) {
                                free_extent_map(em);
                                break;
                        }
                }
-               if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-                       alloc_hint = em->block_start;
                free_extent_map(em);
 
                cur_offset = last_byte;