]> git.karo-electronics.de Git - mv-sheeva.git/blobdiff - fs/btrfs/inode.c
Btrfs: fix preallocation and nodatacow checks in O_DIRECT
[mv-sheeva.git] / fs / btrfs / inode.c
index 1695440a59a49d73642602995f140881930bfc28..5ab120d544bc6c2f44269cf709e7556df040e4cf 100644 (file)
@@ -1385,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  */
 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
                                    struct bio *bio, int mirror_num,
-                                   unsigned long bio_flags)
+                                   unsigned long bio_flags,
+                                   u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
@@ -1404,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
  * are inserted into the btree
  */
 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num, unsigned long bio_flags)
+                         int mirror_num, unsigned long bio_flags,
+                         u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1415,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
  * on write, or reading the csums from the tree before a read
  */
 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                         int mirror_num, unsigned long bio_flags)
+                         int mirror_num, unsigned long bio_flags,
+                         u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
@@ -1440,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                /* we're doing a write, do the async checksumming */
                return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                   inode, rw, bio, mirror_num,
-                                  bio_flags, __btrfs_submit_bio_start,
+                                  bio_flags, bio_offset,
+                                  __btrfs_submit_bio_start,
                                   __btrfs_submit_bio_done);
        }
 
@@ -1844,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 
        BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
                                                      failrec->last_mirror,
-                                                     failrec->bio_flags);
+                                                     failrec->bio_flags, 0);
        return 0;
 }
 
@@ -4304,10 +4308,18 @@ void btrfs_dirty_inode(struct inode *inode)
        btrfs_set_trans_block_group(trans, inode);
 
        ret = btrfs_update_inode(trans, root, inode);
-       if (ret)
-               printk(KERN_ERR"btrfs: fail to dirty inode %lu error %d\n",
-                       inode->i_ino, ret);
+       if (ret && ret == -ENOSPC) {
+               /* whoops, lets try again with the full transaction */
+               btrfs_end_transaction(trans, root);
+               trans = btrfs_start_transaction(root, 1);
+               btrfs_set_trans_block_group(trans, inode);
 
+               ret = btrfs_update_inode(trans, root, inode);
+               if (ret) {
+                       printk(KERN_ERR"btrfs: fail to dirty inode %lu error %d\n",
+                               inode->i_ino, ret);
+               }
+       }
        btrfs_end_transaction(trans, root);
 }
 
@@ -5243,6 +5255,106 @@ out:
        return em;
 }
 
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+                                     struct inode *inode, u64 offset, u64 len)
+{
+       struct btrfs_path *path;
+       int ret;
+       struct extent_buffer *leaf;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+       u64 disk_bytenr;
+       u64 backref_offset;
+       u64 extent_end;
+       u64 num_bytes;
+       int slot;
+       int found_type;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                                      offset, 0);
+       if (ret < 0)
+               goto out;
+
+       slot = path->slots[0];
+       if (ret == 1) {
+               if (slot == 0) {
+                       /* can't find the item, must cow */
+                       ret = 0;
+                       goto out;
+               }
+               slot--;
+       }
+       ret = 0;
+       leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &key, slot);
+       if (key.objectid != inode->i_ino ||
+           key.type != BTRFS_EXTENT_DATA_KEY) {
+               /* not our file or wrong item type, must cow */
+               goto out;
+       }
+
+       if (key.offset > offset) {
+               /* Wrong offset, must cow */
+               goto out;
+       }
+
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+       found_type = btrfs_file_extent_type(leaf, fi);
+       if (found_type != BTRFS_FILE_EXTENT_REG &&
+           found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+               /* not a regular extent, must cow */
+               goto out;
+       }
+       disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+       backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+       extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+       if (extent_end < offset + len) {
+               /* extent doesn't include our full range, must cow */
+               goto out;
+       }
+
+       if (btrfs_extent_readonly(root, disk_bytenr))
+               goto out;
+
+       /*
+        * look for other files referencing this extent, if we
+        * find any we must cow
+        */
+       if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+                                 key.offset - backref_offset, disk_bytenr))
+               goto out;
+
+       /*
+        * adjust disk_bytenr and num_bytes to cover just the bytes
+        * in this extent we are about to write.  If there
+        * are any csums in that range we have to cow in order
+        * to keep the csums correct
+        */
+       disk_bytenr += backref_offset;
+       disk_bytenr += offset - key.offset;
+       num_bytes = min(offset + len, extent_end) - offset;
+       if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+                               goto out;
+       /*
+        * all of the above have passed, it is safe to overwrite this extent
+        * without cow
+        */
+       ret = 1;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
 {
@@ -5250,6 +5362,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 start = iblock << inode->i_blkbits;
        u64 len = bh_result->b_size;
+       struct btrfs_trans_handle *trans;
 
        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
        if (IS_ERR(em))
@@ -5294,41 +5407,65 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
         * just use the extent.
         *
         */
-       if (!create)
+       if (!create) {
+               len = em->len - (start - em->start);
                goto map;
+       }
 
        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
             em->block_start != EXTENT_MAP_HOLE)) {
-               u64 block_start;
                int type;
                int ret;
+               u64 block_start;
 
                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        type = BTRFS_ORDERED_PREALLOC;
                else
                        type = BTRFS_ORDERED_NOCOW;
-               len = min(len, em->block_len - (start - em->start));
+               len = min(len, em->len - (start - em->start));
                block_start = em->block_start + (start - em->start);
-               ret = btrfs_add_ordered_extent_dio(inode, start,
-                                                  start, len, len, type);
-               if (ret) {
-                       free_extent_map(em);
-                       return ret;
+
+               /*
+                * we're not going to log anything, but we do need
+                * to make sure the current transaction stays open
+                * while we look for nocow cross refs
+                */
+               trans = btrfs_join_transaction(root, 0);
+               if (!trans)
+                       goto must_cow;
+
+               if (can_nocow_odirect(trans, inode, start, len) == 1) {
+                       ret = btrfs_add_ordered_extent_dio(inode, start,
+                                          block_start, len, len, type);
+                       btrfs_end_transaction(trans, root);
+                       if (ret) {
+                               free_extent_map(em);
+                               return ret;
+                       }
+                       goto unlock;
                }
-       } else {
-               free_extent_map(em);
-               em = btrfs_new_extent_direct(inode, start, len);
-               if (IS_ERR(em))
-                       return PTR_ERR(em);
-               len = min(len, em->block_len);
+               btrfs_end_transaction(trans, root);
        }
-       unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
-                     GFP_NOFS);
+must_cow:
+       /*
+        * this will cow the extent, reset the len in case we changed
+        * it above
+        */
+       len = bh_result->b_size;
+       free_extent_map(em);
+       em = btrfs_new_extent_direct(inode, start, len);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+       len = min(len, em->len - (start - em->start));
+unlock:
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+                         EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+                         0, NULL, GFP_NOFS);
 map:
        bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
                inode->i_blkbits;
-       bh_result->b_size = em->len - (start - em->start);
+       bh_result->b_size = len;
        bh_result->b_bdev = em->bdev;
        set_buffer_mapped(bh_result);
        if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@ -5484,6 +5621,17 @@ out_done:
        dio_end_io(bio, err);
 }
 
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+                                   struct bio *bio, int mirror_num,
+                                   unsigned long bio_flags, u64 offset)
+{
+       int ret;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+       BUG_ON(ret);
+       return 0;
+}
+
 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
                                loff_t file_offset)
 {
@@ -5523,7 +5671,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
                bvec++;
        } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
 
-       dip->disk_bytenr = bio->bi_sector << 9;
+       dip->disk_bytenr = (u64)bio->bi_sector << 9;
        bio->bi_private = dip;
 
        if (write)
@@ -5535,13 +5683,20 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
        if (ret)
                goto out_err;
 
-       if (write && !skip_sum)
-               btrfs_csum_one_bio(root, inode, bio, dip->logical_offset, 1);
-       else if (!skip_sum)
+       if (write && !skip_sum) {
+               ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+                                  inode, rw, bio, 0, 0,
+                                  dip->logical_offset,
+                                  __btrfs_submit_bio_start_direct_io,
+                                  __btrfs_submit_bio_done);
+               if (ret)
+                       goto out_err;
+               return;
+       } else if (!skip_sum)
                btrfs_lookup_bio_sums_dio(root, inode, bio,
                                          dip->logical_offset, dip->csums);
 
-       ret = btrfs_map_bio(root, rw, bio, 0, 0);
+       ret = btrfs_map_bio(root, rw, bio, 0, 1);
        if (ret)
                goto out_err;
        return;
@@ -5574,14 +5729,25 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct btrfs_ordered_extent *ordered;
+       struct extent_state *cached_state = NULL;
        u64 lockstart, lockend;
        ssize_t ret;
+       int writing = rw & WRITE;
+       int write_bits = 0;
+       size_t count = iov_length(iov, nr_segs);
 
        lockstart = offset;
-       lockend = offset + iov_length(iov, nr_segs) - 1;
+       lockend = offset + count - 1;
+
+       if (writing) {
+               ret = btrfs_delalloc_reserve_space(inode, count);
+               if (ret)
+                       goto out;
+       }
+
        while (1) {
-               lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                           GFP_NOFS);
+               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                0, &cached_state, GFP_NOFS);
                /*
                 * We're concerned with the entire range that we're going to be
                 * doing DIO to, so we need to make sure theres no ordered
@@ -5591,29 +5757,54 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                                                     lockend - lockstart + 1);
                if (!ordered)
                        break;
-               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                             GFP_NOFS);
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                    &cached_state, GFP_NOFS);
                btrfs_start_ordered_extent(inode, ordered, 1);
                btrfs_put_ordered_extent(ordered);
                cond_resched();
        }
 
+       /*
+        * we don't use btrfs_set_extent_delalloc because we don't want
+        * the dirty or uptodate bits
+        */
+       if (writing) {
+               write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                    EXTENT_DELALLOC, 0, NULL, &cached_state,
+                                    GFP_NOFS);
+               if (ret) {
+                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                        lockend, EXTENT_LOCKED | write_bits,
+                                        1, 0, &cached_state, GFP_NOFS);
+                       goto out;
+               }
+       }
+
+       free_extent_state(cached_state);
+       cached_state = NULL;
+
        ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs,
                                   btrfs_get_blocks_direct, NULL,
                                   btrfs_submit_direct, 0);
 
        if (ret < 0 && ret != -EIOCBQUEUED) {
-               unlock_extent(&BTRFS_I(inode)->io_tree, offset,
-                             offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+                             offset + iov_length(iov, nr_segs) - 1,
+                             EXTENT_LOCKED | write_bits, 1, 0,
+                             &cached_state, GFP_NOFS);
        } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
                /*
                 * We're falling back to buffered, unlock the section we didn't
                 * do IO on.
                 */
-               unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret,
-                             offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+                             offset + iov_length(iov, nr_segs) - 1,
+                             EXTENT_LOCKED | write_bits, 1, 0,
+                             &cached_state, GFP_NOFS);
        }
-
+out:
+       free_extent_state(cached_state);
        return ret;
 }