Merge branch 'for-chris-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/fdmanan...

author Chris Mason <clm@fb.com>
Tue, 17 May 2016 21:43:19 +0000 (14:43 -0700)
committer Chris Mason <clm@fb.com>
Tue, 17 May 2016 21:43:19 +0000 (14:43 -0700)
diff --combined fs/btrfs/ctree.h

index e613e48d71220ff5aea1a36b546e6ca01ff7c6f2,7ae758685c7b8be19d3c96fcc5bbaaf2133553d9..ddcc58f03c79c229314bec522ef1aab8d2b570a7
--- 1/fs/btrfs/ctree.h
--- 2/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@@ -33,7 -33,6 +33,7 @@@
   #include <asm/kmap_types.h>
   #include <linux/pagemap.h>
   #include <linux/btrfs.h>
+ +#include <linux/btrfs_tree.h>
   #include <linux/workqueue.h>
   #include <linux/security.h>
   #include <linux/sizes.h>
@@@ -65,6 -64,98 +65,6 @@@ struct btrfs_ordered_sum
   
   #define BTRFS_COMPAT_EXTENT_TREE_V0
   
- -/* holds pointers to all of the tree roots */
- -#define BTRFS_ROOT_TREE_OBJECTID 1ULL
- -
- -/* stores information about which extents are in use, and reference counts */
- -#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
- -
- -/*
- - * chunk tree stores translations from logical -> physical block numbering
- - * the super block points to the chunk tree
- - */
- -#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
- -
- -/*
- - * stores information about which areas of a given device are in use.
- - * one per device.  The tree of tree roots points to the device tree
- - */
- -#define BTRFS_DEV_TREE_OBJECTID 4ULL
- -
- -/* one per subvolume, storing files and directories */
- -#define BTRFS_FS_TREE_OBJECTID 5ULL
- -
- -/* directory objectid inside the root tree */
- -#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
- -
- -/* holds checksums of all the data extents */
- -#define BTRFS_CSUM_TREE_OBJECTID 7ULL
- -
- -/* holds quota configuration and tracking */
- -#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
- -
- -/* for storing items that use the BTRFS_UUID_KEY* types */
- -#define BTRFS_UUID_TREE_OBJECTID 9ULL
- -
- -/* tracks free space in block groups. */
- -#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
- -
- -/* device stats in the device tree */
- -#define BTRFS_DEV_STATS_OBJECTID 0ULL
- -
- -/* for storing balance parameters in the root tree */
- -#define BTRFS_BALANCE_OBJECTID -4ULL
- -
- -/* orhpan objectid for tracking unlinked/truncated files */
- -#define BTRFS_ORPHAN_OBJECTID -5ULL
- -
- -/* does write ahead logging to speed up fsyncs */
- -#define BTRFS_TREE_LOG_OBJECTID -6ULL
- -#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
- -
- -/* for space balancing */
- -#define BTRFS_TREE_RELOC_OBJECTID -8ULL
- -#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
- -
- -/*
- - * extent checksums all have this objectid
- - * this allows them to share the logging tree
- - * for fsyncs
- - */
- -#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
- -
- -/* For storing free space cache */
- -#define BTRFS_FREE_SPACE_OBJECTID -11ULL
- -
- -/*
- - * The inode number assigned to the special inode for storing
- - * free ino cache
- - */
- -#define BTRFS_FREE_INO_OBJECTID -12ULL
- -
- -/* dummy objectid represents multiple objectids */
- -#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
- -
- -/*
- - * All files have objectids in this range.
- - */
- -#define BTRFS_FIRST_FREE_OBJECTID 256ULL
- -#define BTRFS_LAST_FREE_OBJECTID -256ULL
- -#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
- -
- -
- -/*
- - * the device items go into the chunk tree.  The key is in the form
- - * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
- - */
- -#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
- -
- -#define BTRFS_BTREE_INODE_OBJECTID 1
- -
- -#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
- -
- -#define BTRFS_DEV_REPLACE_DEVID 0ULL
- -
   /*
    * the max metadata block size.  This limit is somewhat artificial,
    * but the memmove costs go through the roof for larger blocks.
@@@ -84,6 -175,12 +84,6 @@@
    */
   #define BTRFS_LINK_MAX 65535U
   
- -/* 32 bytes in various csum fields */
- -#define BTRFS_CSUM_SIZE 32
- -
- -/* csum types */
- -#define BTRFS_CSUM_TYPE_CRC32 0
- -
   static const int btrfs_csum_sizes[] = { 4 };
   
   /* four bytes for CRC32 */
@@@ -92,6 -189,17 +92,6 @@@
   /* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
   #define REQ_GET_READ_MIRRORS  (1 << 30)
   
- -#define BTRFS_FT_UNKNOWN      0
- -#define BTRFS_FT_REG_FILE     1
- -#define BTRFS_FT_DIR          2
- -#define BTRFS_FT_CHRDEV               3
- -#define BTRFS_FT_BLKDEV               4
- -#define BTRFS_FT_FIFO         5
- -#define BTRFS_FT_SOCK         6
- -#define BTRFS_FT_SYMLINK      7
- -#define BTRFS_FT_XATTR                8
- -#define BTRFS_FT_MAX          9
- -
   /* ioprio of readahead is set to idle */
   #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
   
@@@ -99,10 -207,138 +99,10 @@@
   
   #define BTRFS_MAX_EXTENT_SIZE SZ_128M
   
- -/*
- - * The key defines the order in the tree, and so it also defines (optimal)
- - * block layout.
- - *
- - * objectid corresponds to the inode number.
- - *
- - * type tells us things about the object, and is a kind of stream selector.
- - * so for a given inode, keys with type of 1 might refer to the inode data,
- - * type of 2 may point to file data in the btree and type == 3 may point to
- - * extents.
- - *
- - * offset is the starting byte offset for this key in the stream.
- - *
- - * btrfs_disk_key is in disk byte order.  struct btrfs_key is always
- - * in cpu native order.  Otherwise they are identical and their sizes
- - * should be the same (ie both packed)
- - */
- -struct btrfs_disk_key {
- -      __le64 objectid;
- -      u8 type;
- -      __le64 offset;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_key {
- -      u64 objectid;
- -      u8 type;
- -      u64 offset;
- -} __attribute__ ((__packed__));
- -
   struct btrfs_mapping_tree {
         struct extent_map_tree map_tree;
   };
   
- -struct btrfs_dev_item {
- -      /* the internal btrfs device id */
- -      __le64 devid;
- -
- -      /* size of the device */
- -      __le64 total_bytes;
- -
- -      /* bytes used */
- -      __le64 bytes_used;
- -
- -      /* optimal io alignment for this device */
- -      __le32 io_align;
- -
- -      /* optimal io width for this device */
- -      __le32 io_width;
- -
- -      /* minimal io size for this device */
- -      __le32 sector_size;
- -
- -      /* type and info about this device */
- -      __le64 type;
- -
- -      /* expected generation for this device */
- -      __le64 generation;
- -
- -      /*
- -       * starting byte of this partition on the device,
- -       * to allow for stripe alignment in the future
- -       */
- -      __le64 start_offset;
- -
- -      /* grouping information for allocation decisions */
- -      __le32 dev_group;
- -
- -      /* seek speed 0-100 where 100 is fastest */
- -      u8 seek_speed;
- -
- -      /* bandwidth 0-100 where 100 is fastest */
- -      u8 bandwidth;
- -
- -      /* btrfs generated uuid for this device */
- -      u8 uuid[BTRFS_UUID_SIZE];
- -
- -      /* uuid of FS who owns this device */
- -      u8 fsid[BTRFS_UUID_SIZE];
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_stripe {
- -      __le64 devid;
- -      __le64 offset;
- -      u8 dev_uuid[BTRFS_UUID_SIZE];
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_chunk {
- -      /* size of this chunk in bytes */
- -      __le64 length;
- -
- -      /* objectid of the root referencing this chunk */
- -      __le64 owner;
- -
- -      __le64 stripe_len;
- -      __le64 type;
- -
- -      /* optimal io alignment for this chunk */
- -      __le32 io_align;
- -
- -      /* optimal io width for this chunk */
- -      __le32 io_width;
- -
- -      /* minimal io size for this chunk */
- -      __le32 sector_size;
- -
- -      /* 2^16 stripes is quite a lot, a second limit is the size of a single
- -       * item in the btree
- -       */
- -      __le16 num_stripes;
- -
- -      /* sub stripes only matter for raid10 */
- -      __le16 sub_stripes;
- -      struct btrfs_stripe stripe;
- -      /* additional stripes go here */
- -} __attribute__ ((__packed__));
- -
- -#define BTRFS_FREE_SPACE_EXTENT       1
- -#define BTRFS_FREE_SPACE_BITMAP       2
- -
- -struct btrfs_free_space_entry {
- -      __le64 offset;
- -      __le64 bytes;
- -      u8 type;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_free_space_header {
- -      struct btrfs_disk_key location;
- -      __le64 generation;
- -      __le64 num_entries;
- -      __le64 num_bitmaps;
- -} __attribute__ ((__packed__));
- -
   static inline unsigned long btrfs_chunk_item_size(int num_stripes)
   {
         BUG_ON(num_stripes == 0);
@@@ -110,6 -346,9 +110,6 @@@
                 sizeof(struct btrfs_stripe) * (num_stripes - 1);
   }
   
- -#define BTRFS_HEADER_FLAG_WRITTEN     (1ULL << 0)
- -#define BTRFS_HEADER_FLAG_RELOC               (1ULL << 1)
- -
   /*
    * File system states
    */
@@@ -118,6 -357,13 +118,6 @@@
   #define BTRFS_FS_STATE_TRANS_ABORTED  2
   #define BTRFS_FS_STATE_DEV_REPLACING  3
   
- -/* Super block flags */
- -/* Errors detected */
- -#define BTRFS_SUPER_FLAG_ERROR                (1ULL << 2)
- -
- -#define BTRFS_SUPER_FLAG_SEEDING      (1ULL << 32)
- -#define BTRFS_SUPER_FLAG_METADUMP     (1ULL << 33)
- -
   #define BTRFS_BACKREF_REV_MAX         256
   #define BTRFS_BACKREF_REV_SHIFT               56
   #define BTRFS_BACKREF_REV_MASK                (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
@@@ -164,6 -410,7 +164,6 @@@ struct btrfs_header 
    * room to translate 14 chunks with 3 stripes each.
    */
   #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
- -#define BTRFS_LABEL_SIZE 256
   
   /*
    * just in case we somehow lose the roots and are not able to mount,
@@@ -260,6 -507,31 +260,6 @@@ struct btrfs_super_block 
    * Compat flags that we support.  If any incompat flags are set other than the
    * ones specified below then we will fail to mount
    */
- -#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE       (1ULL << 0)
- -
- -#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF  (1ULL << 0)
- -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
- -#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS   (1ULL << 2)
- -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO   (1ULL << 3)
- -/*
- - * some patches floated around with a second compression method
- - * lets save that incompat here for when they do get in
- - * Note we don't actually support it, we're just reserving the
- - * number
- - */
- -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
- -
- -/*
- - * older kernels tried to do bigger metadata blocks, but the
- - * code was pretty buggy.  Lets not let them try anymore.
- - */
- -#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA   (1ULL << 5)
- -
- -#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF  (1ULL << 6)
- -#define BTRFS_FEATURE_INCOMPAT_RAID56         (1ULL << 7)
- -#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA        (1ULL << 8)
- -#define BTRFS_FEATURE_INCOMPAT_NO_HOLES               (1ULL << 9)
- -
   #define BTRFS_FEATURE_COMPAT_SUPP             0ULL
   #define BTRFS_FEATURE_COMPAT_SAFE_SET         0ULL
   #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR               0ULL
@@@ -352,8 -624,357 +352,8 @@@ struct btrfs_path 
         unsigned int need_commit_sem:1;
         unsigned int skip_release_on_error:1;
   };
- -
- -/*
- - * items in the extent btree are used to record the objectid of the
- - * owner of the block and the number of references
- - */
- -
- -struct btrfs_extent_item {
- -      __le64 refs;
- -      __le64 generation;
- -      __le64 flags;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_extent_item_v0 {
- -      __le32 refs;
- -} __attribute__ ((__packed__));
- -
   #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
                                         sizeof(struct btrfs_item))
- -
- -#define BTRFS_EXTENT_FLAG_DATA                (1ULL << 0)
- -#define BTRFS_EXTENT_FLAG_TREE_BLOCK  (1ULL << 1)
- -
- -/* following flags only apply to tree blocks */
- -
- -/* use full backrefs for extent pointers in the block */
- -#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
- -
- -/*
- - * this flag is only used internally by scrub and may be changed at any time
- - * it is only declared here to avoid collisions
- - */
- -#define BTRFS_EXTENT_FLAG_SUPER               (1ULL << 48)
- -
- -struct btrfs_tree_block_info {
- -      struct btrfs_disk_key key;
- -      u8 level;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_extent_data_ref {
- -      __le64 root;
- -      __le64 objectid;
- -      __le64 offset;
- -      __le32 count;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_shared_data_ref {
- -      __le32 count;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_extent_inline_ref {
- -      u8 type;
- -      __le64 offset;
- -} __attribute__ ((__packed__));
- -
- -/* old style backrefs item */
- -struct btrfs_extent_ref_v0 {
- -      __le64 root;
- -      __le64 generation;
- -      __le64 objectid;
- -      __le32 count;
- -} __attribute__ ((__packed__));
- -
- -
- -/* dev extents record free space on individual devices.  The owner
- - * field points back to the chunk allocation mapping tree that allocated
- - * the extent.  The chunk tree uuid field is a way to double check the owner
- - */
- -struct btrfs_dev_extent {
- -      __le64 chunk_tree;
- -      __le64 chunk_objectid;
- -      __le64 chunk_offset;
- -      __le64 length;
- -      u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_inode_ref {
- -      __le64 index;
- -      __le16 name_len;
- -      /* name goes here */
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_inode_extref {
- -      __le64 parent_objectid;
- -      __le64 index;
- -      __le16 name_len;
- -      __u8   name[0];
- -      /* name goes here */
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_timespec {
- -      __le64 sec;
- -      __le32 nsec;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_inode_item {
- -      /* nfs style generation number */
- -      __le64 generation;
- -      /* transid that last touched this inode */
- -      __le64 transid;
- -      __le64 size;
- -      __le64 nbytes;
- -      __le64 block_group;
- -      __le32 nlink;
- -      __le32 uid;
- -      __le32 gid;
- -      __le32 mode;
- -      __le64 rdev;
- -      __le64 flags;
- -
- -      /* modification sequence number for NFS */
- -      __le64 sequence;
- -
- -      /*
- -       * a little future expansion, for more than this we can
- -       * just grow the inode item and version it
- -       */
- -      __le64 reserved[4];
- -      struct btrfs_timespec atime;
- -      struct btrfs_timespec ctime;
- -      struct btrfs_timespec mtime;
- -      struct btrfs_timespec otime;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_dir_log_item {
- -      __le64 end;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_dir_item {
- -      struct btrfs_disk_key location;
- -      __le64 transid;
- -      __le16 data_len;
- -      __le16 name_len;
- -      u8 type;
- -} __attribute__ ((__packed__));
- -
- -#define BTRFS_ROOT_SUBVOL_RDONLY      (1ULL << 0)
- -
- -/*
- - * Internal in-memory flag that a subvolume has been marked for deletion but
- - * still visible as a directory
- - */
- -#define BTRFS_ROOT_SUBVOL_DEAD                (1ULL << 48)
- -
- -struct btrfs_root_item {
- -      struct btrfs_inode_item inode;
- -      __le64 generation;
- -      __le64 root_dirid;
- -      __le64 bytenr;
- -      __le64 byte_limit;
- -      __le64 bytes_used;
- -      __le64 last_snapshot;
- -      __le64 flags;
- -      __le32 refs;
- -      struct btrfs_disk_key drop_progress;
- -      u8 drop_level;
- -      u8 level;
- -
- -      /*
- -       * The following fields appear after subvol_uuids+subvol_times
- -       * were introduced.
- -       */
- -
- -      /*
- -       * This generation number is used to test if the new fields are valid
- -       * and up to date while reading the root item. Every time the root item
- -       * is written out, the "generation" field is copied into this field. If
- -       * anyone ever mounted the fs with an older kernel, we will have
- -       * mismatching generation values here and thus must invalidate the
- -       * new fields. See btrfs_update_root and btrfs_find_last_root for
- -       * details.
- -       * the offset of generation_v2 is also used as the start for the memset
- -       * when invalidating the fields.
- -       */
- -      __le64 generation_v2;
- -      u8 uuid[BTRFS_UUID_SIZE];
- -      u8 parent_uuid[BTRFS_UUID_SIZE];
- -      u8 received_uuid[BTRFS_UUID_SIZE];
- -      __le64 ctransid; /* updated when an inode changes */
- -      __le64 otransid; /* trans when created */
- -      __le64 stransid; /* trans when sent. non-zero for received subvol */
- -      __le64 rtransid; /* trans when received. non-zero for received subvol */
- -      struct btrfs_timespec ctime;
- -      struct btrfs_timespec otime;
- -      struct btrfs_timespec stime;
- -      struct btrfs_timespec rtime;
- -      __le64 reserved[8]; /* for future */
- -} __attribute__ ((__packed__));
- -
- -/*
- - * this is used for both forward and backward root refs
- - */
- -struct btrfs_root_ref {
- -      __le64 dirid;
- -      __le64 sequence;
- -      __le16 name_len;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_disk_balance_args {
- -      /*
- -       * profiles to operate on, single is denoted by
- -       * BTRFS_AVAIL_ALLOC_BIT_SINGLE
- -       */
- -      __le64 profiles;
- -
- -      /*
- -       * usage filter
- -       * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
- -       * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
- -       */
- -      union {
- -              __le64 usage;
- -              struct {
- -                      __le32 usage_min;
- -                      __le32 usage_max;
- -              };
- -      };
- -
- -      /* devid filter */
- -      __le64 devid;
- -
- -      /* devid subset filter [pstart..pend) */
- -      __le64 pstart;
- -      __le64 pend;
- -
- -      /* btrfs virtual address space subset filter [vstart..vend) */
- -      __le64 vstart;
- -      __le64 vend;
- -
- -      /*
- -       * profile to convert to, single is denoted by
- -       * BTRFS_AVAIL_ALLOC_BIT_SINGLE
- -       */
- -      __le64 target;
- -
- -      /* BTRFS_BALANCE_ARGS_* */
- -      __le64 flags;
- -
- -      /*
- -       * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
- -       * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
- -       * and maximum
- -       */
- -      union {
- -              __le64 limit;
- -              struct {
- -                      __le32 limit_min;
- -                      __le32 limit_max;
- -              };
- -      };
- -
- -      /*
- -       * Process chunks that cross stripes_min..stripes_max devices,
- -       * BTRFS_BALANCE_ARGS_STRIPES_RANGE
- -       */
- -      __le32 stripes_min;
- -      __le32 stripes_max;
- -
- -      __le64 unused[6];
- -} __attribute__ ((__packed__));
- -
- -/*
- - * store balance parameters to disk so that balance can be properly
- - * resumed after crash or unmount
- - */
- -struct btrfs_balance_item {
- -      /* BTRFS_BALANCE_* */
- -      __le64 flags;
- -
- -      struct btrfs_disk_balance_args data;
- -      struct btrfs_disk_balance_args meta;
- -      struct btrfs_disk_balance_args sys;
- -
- -      __le64 unused[4];
- -} __attribute__ ((__packed__));
- -
- -#define BTRFS_FILE_EXTENT_INLINE 0
- -#define BTRFS_FILE_EXTENT_REG 1
- -#define BTRFS_FILE_EXTENT_PREALLOC 2
- -
- -struct btrfs_file_extent_item {
- -      /*
- -       * transaction id that created this extent
- -       */
- -      __le64 generation;
- -      /*
- -       * max number of bytes to hold this extent in ram
- -       * when we split a compressed extent we can't know how big
- -       * each of the resulting pieces will be.  So, this is
- -       * an upper limit on the size of the extent in ram instead of
- -       * an exact limit.
- -       */
- -      __le64 ram_bytes;
- -
- -      /*
- -       * 32 bits for the various ways we might encode the data,
- -       * including compression and encryption.  If any of these
- -       * are set to something a given disk format doesn't understand
- -       * it is treated like an incompat flag for reading and writing,
- -       * but not for stat.
- -       */
- -      u8 compression;
- -      u8 encryption;
- -      __le16 other_encoding; /* spare for later use */
- -
- -      /* are we inline data or a real extent? */
- -      u8 type;
- -
- -      /*
- -       * disk space consumed by the extent, checksum blocks are included
- -       * in these numbers
- -       *
- -       * At this offset in the structure, the inline extent data start.
- -       */
- -      __le64 disk_bytenr;
- -      __le64 disk_num_bytes;
- -      /*
- -       * the logical offset in file blocks (no csums)
- -       * this extent record is for.  This allows a file extent to point
- -       * into the middle of an existing extent on disk, sharing it
- -       * between two snapshots (useful if some bytes in the middle of the
- -       * extent have changed
- -       */
- -      __le64 offset;
- -      /*
- -       * the logical number of file blocks (no csums included).  This
- -       * always reflects the size uncompressed and without encoding.
- -       */
- -      __le64 num_bytes;
- -
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_csum_item {
- -      u8 csum;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_dev_stats_item {
- -      /*
- -       * grow this item struct at the end for future enhancements and keep
- -       * the existing values unchanged
- -       */
- -      __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
- -} __attribute__ ((__packed__));
- -
- -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS   0
- -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID    1
- -#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED    0
- -#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED          1
- -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED                2
- -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED         3
- -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED         4
- -
   struct btrfs_dev_replace {
         u64 replace_state;      /* see #define above */
         u64 time_started;       /* seconds since 1-Jan-1970 */
@@@ -384,6 -1005,175 +384,6 @@@
         struct btrfs_scrub_progress scrub_progress;
   };
   
- -struct btrfs_dev_replace_item {
- -      /*
- -       * grow this item struct at the end for future enhancements and keep
- -       * the existing values unchanged
- -       */
- -      __le64 src_devid;
- -      __le64 cursor_left;
- -      __le64 cursor_right;
- -      __le64 cont_reading_from_srcdev_mode;
- -
- -      __le64 replace_state;
- -      __le64 time_started;
- -      __le64 time_stopped;
- -      __le64 num_write_errors;
- -      __le64 num_uncorrectable_read_errors;
- -} __attribute__ ((__packed__));
- -
- -/* different types of block groups (and chunks) */
- -#define BTRFS_BLOCK_GROUP_DATA                (1ULL << 0)
- -#define BTRFS_BLOCK_GROUP_SYSTEM      (1ULL << 1)
- -#define BTRFS_BLOCK_GROUP_METADATA    (1ULL << 2)
- -#define BTRFS_BLOCK_GROUP_RAID0               (1ULL << 3)
- -#define BTRFS_BLOCK_GROUP_RAID1               (1ULL << 4)
- -#define BTRFS_BLOCK_GROUP_DUP         (1ULL << 5)
- -#define BTRFS_BLOCK_GROUP_RAID10      (1ULL << 6)
- -#define BTRFS_BLOCK_GROUP_RAID5         (1ULL << 7)
- -#define BTRFS_BLOCK_GROUP_RAID6         (1ULL << 8)
- -#define BTRFS_BLOCK_GROUP_RESERVED    (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
- -                                       BTRFS_SPACE_INFO_GLOBAL_RSV)
- -
- -enum btrfs_raid_types {
- -      BTRFS_RAID_RAID10,
- -      BTRFS_RAID_RAID1,
- -      BTRFS_RAID_DUP,
- -      BTRFS_RAID_RAID0,
- -      BTRFS_RAID_SINGLE,
- -      BTRFS_RAID_RAID5,
- -      BTRFS_RAID_RAID6,
- -      BTRFS_NR_RAID_TYPES
- -};
- -
- -#define BTRFS_BLOCK_GROUP_TYPE_MASK   (BTRFS_BLOCK_GROUP_DATA |    \
- -                                       BTRFS_BLOCK_GROUP_SYSTEM |  \
- -                                       BTRFS_BLOCK_GROUP_METADATA)
- -
- -#define BTRFS_BLOCK_GROUP_PROFILE_MASK        (BTRFS_BLOCK_GROUP_RAID0 |   \
- -                                       BTRFS_BLOCK_GROUP_RAID1 |   \
- -                                       BTRFS_BLOCK_GROUP_RAID5 |   \
- -                                       BTRFS_BLOCK_GROUP_RAID6 |   \
- -                                       BTRFS_BLOCK_GROUP_DUP |     \
- -                                       BTRFS_BLOCK_GROUP_RAID10)
- -#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 |   \
- -                                       BTRFS_BLOCK_GROUP_RAID6)
- -
- -/*
- - * We need a bit for restriper to be able to tell when chunks of type
- - * SINGLE are available.  This "extended" profile format is used in
- - * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
- - * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
- - * to avoid remappings between two formats in future.
- - */
- -#define BTRFS_AVAIL_ALLOC_BIT_SINGLE  (1ULL << 48)
- -
- -/*
- - * A fake block group type that is used to communicate global block reserve
- - * size to userspace via the SPACE_INFO ioctl.
- - */
- -#define BTRFS_SPACE_INFO_GLOBAL_RSV   (1ULL << 49)
- -
- -#define BTRFS_EXTENDED_PROFILE_MASK   (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
- -                                       BTRFS_AVAIL_ALLOC_BIT_SINGLE)
- -
- -static inline u64 chunk_to_extended(u64 flags)
- -{
- -      if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
- -              flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
- -
- -      return flags;
- -}
- -static inline u64 extended_to_chunk(u64 flags)
- -{
- -      return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
- -}
- -
- -struct btrfs_block_group_item {
- -      __le64 used;
- -      __le64 chunk_objectid;
- -      __le64 flags;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_free_space_info {
- -      __le32 extent_count;
- -      __le32 flags;
- -} __attribute__ ((__packed__));
- -
- -#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
- -
- -#define BTRFS_QGROUP_LEVEL_SHIFT              48
- -static inline u64 btrfs_qgroup_level(u64 qgroupid)
- -{
- -      return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
- -}
- -
- -/*
- - * is subvolume quota turned on?
- - */
- -#define BTRFS_QGROUP_STATUS_FLAG_ON           (1ULL << 0)
- -/*
- - * RESCAN is set during the initialization phase
- - */
- -#define BTRFS_QGROUP_STATUS_FLAG_RESCAN               (1ULL << 1)
- -/*
- - * Some qgroup entries are known to be out of date,
- - * either because the configuration has changed in a way that
- - * makes a rescan necessary, or because the fs has been mounted
- - * with a non-qgroup-aware version.
- - * Turning qouta off and on again makes it inconsistent, too.
- - */
- -#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
- -
- -#define BTRFS_QGROUP_STATUS_VERSION        1
- -
- -struct btrfs_qgroup_status_item {
- -      __le64 version;
- -      /*
- -       * the generation is updated during every commit. As older
- -       * versions of btrfs are not aware of qgroups, it will be
- -       * possible to detect inconsistencies by checking the
- -       * generation on mount time
- -       */
- -      __le64 generation;
- -
- -      /* flag definitions see above */
- -      __le64 flags;
- -
- -      /*
- -       * only used during scanning to record the progress
- -       * of the scan. It contains a logical address
- -       */
- -      __le64 rescan;
- -} __attribute__ ((__packed__));
- -
- -struct btrfs_qgroup_info_item {
- -      __le64 generation;
- -      __le64 rfer;
- -      __le64 rfer_cmpr;
- -      __le64 excl;
- -      __le64 excl_cmpr;
- -} __attribute__ ((__packed__));
- -
- -/* flags definition for qgroup limits */
- -#define BTRFS_QGROUP_LIMIT_MAX_RFER   (1ULL << 0)
- -#define BTRFS_QGROUP_LIMIT_MAX_EXCL   (1ULL << 1)
- -#define BTRFS_QGROUP_LIMIT_RSV_RFER   (1ULL << 2)
- -#define BTRFS_QGROUP_LIMIT_RSV_EXCL   (1ULL << 3)
- -#define BTRFS_QGROUP_LIMIT_RFER_CMPR  (1ULL << 4)
- -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR  (1ULL << 5)
- -
- -struct btrfs_qgroup_limit_item {
- -      /*
- -       * only updated when any of the other values change
- -       */
- -      __le64 flags;
- -      __le64 max_rfer;
- -      __le64 max_excl;
- -      __le64 rsv_rfer;
- -      __le64 rsv_excl;
- -} __attribute__ ((__packed__));
- -
   /* For raid type sysfs entries */
   struct raid_kobject {
         int raid_type;
@@@ -618,6 -1408,27 +618,27 @@@ struct btrfs_block_group_cache 
   
         struct btrfs_io_ctl io_ctl;
   
+       /*
+        * Incremented when doing extent allocations and holding a read lock
+        * on the space_info's groups_sem semaphore.
+        * Decremented when an ordered extent that represents an IO against this
+        * block group's range is created (after it's added to its inode's
+        * root's list of ordered extents) or immediately after the allocation
+        * if it's a metadata extent or fallocate extent (for these cases we
+        * don't create ordered extents).
+        */
+       atomic_t reservations;
+ 
+       /*
+        * Incremented while holding the spinlock *lock* by a task checking if
+        * it can perform a nocow write (incremented if the value for the *ro*
+        * field is 0). Decremented by such tasks once they create an ordered
+        * extent or before that if some error happens before reaching that step.
+        * This is to prevent races between block group relocation and nocow
+        * writes through direct IO.
+        */
+       atomic_t nocow_writers;
+ 
         /* Lock for free space tree operations. */
         struct mutex free_space_lock;
   
@@@ -1236,6 -2047,228 +1257,6 @@@ struct btrfs_root 
         atomic_t qgroup_meta_rsv;
   };
   
- -struct btrfs_ioctl_defrag_range_args {
- -      /* start of the defrag operation */
- -      __u64 start;
- -
- -      /* number of bytes to defrag, use (u64)-1 to say all */
- -      __u64 len;
- -
- -      /*
- -       * flags for the operation, which can include turning
- -       * on compression for this one defrag
- -       */
- -      __u64 flags;
- -
- -      /*
- -       * any extent bigger than this will be considered
- -       * already defragged.  Use 0 to take the kernel default
- -       * Use 1 to say every single extent must be rewritten
- -       */
- -      __u32 extent_thresh;
- -
- -      /*
- -       * which compression method to use if turning on compression
- -       * for this defrag operation.  If unspecified, zlib will
- -       * be used
- -       */
- -      __u32 compress_type;
- -
- -      /* spare for later */
- -      __u32 unused[4];
- -};
- -
- -
- -/*
- - * inode items have the data typically returned from stat and store other
- - * info about object characteristics.  There is one for every file and dir in
- - * the FS
- - */
- -#define BTRFS_INODE_ITEM_KEY          1
- -#define BTRFS_INODE_REF_KEY           12
- -#define BTRFS_INODE_EXTREF_KEY                13
- -#define BTRFS_XATTR_ITEM_KEY          24
- -#define BTRFS_ORPHAN_ITEM_KEY         48
- -/* reserve 2-15 close to the inode for later flexibility */
- -
- -/*
- - * dir items are the name -> inode pointers in a directory.  There is one
- - * for every name in a directory.
- - */
- -#define BTRFS_DIR_LOG_ITEM_KEY  60
- -#define BTRFS_DIR_LOG_INDEX_KEY 72
- -#define BTRFS_DIR_ITEM_KEY    84
- -#define BTRFS_DIR_INDEX_KEY   96
- -/*
- - * extent data is for file data
- - */
- -#define BTRFS_EXTENT_DATA_KEY 108
- -
- -/*
- - * extent csums are stored in a separate tree and hold csums for
- - * an entire extent on disk.
- - */
- -#define BTRFS_EXTENT_CSUM_KEY 128
- -
- -/*
- - * root items point to tree roots.  They are typically in the root
- - * tree used by the super block to find all the other trees
- - */
- -#define BTRFS_ROOT_ITEM_KEY   132
- -
- -/*
- - * root backrefs tie subvols and snapshots to the directory entries that
- - * reference them
- - */
- -#define BTRFS_ROOT_BACKREF_KEY        144
- -
- -/*
- - * root refs make a fast index for listing all of the snapshots and
- - * subvolumes referenced by a given root.  They point directly to the
- - * directory item in the root that references the subvol
- - */
- -#define BTRFS_ROOT_REF_KEY    156
- -
- -/*
- - * extent items are in the extent map tree.  These record which blocks
- - * are used, and how many references there are to each block
- - */
- -#define BTRFS_EXTENT_ITEM_KEY 168
- -
- -/*
- - * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know
- - * the length, so we save the level in key->offset instead of the length.
- - */
- -#define BTRFS_METADATA_ITEM_KEY       169
- -
- -#define BTRFS_TREE_BLOCK_REF_KEY      176
- -
- -#define BTRFS_EXTENT_DATA_REF_KEY     178
- -
- -#define BTRFS_EXTENT_REF_V0_KEY               180
- -
- -#define BTRFS_SHARED_BLOCK_REF_KEY    182
- -
- -#define BTRFS_SHARED_DATA_REF_KEY     184
- -
- -/*
- - * block groups give us hints into the extent allocation trees.  Which
- - * blocks are free etc etc
- - */
- -#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
- -
- -/*
- - * Every block group is represented in the free space tree by a free space info
- - * item, which stores some accounting information. It is keyed on
- - * (block_group_start, FREE_SPACE_INFO, block_group_length).
- - */
- -#define BTRFS_FREE_SPACE_INFO_KEY 198
- -
- -/*
- - * A free space extent tracks an extent of space that is free in a block group.
- - * It is keyed on (start, FREE_SPACE_EXTENT, length).
- - */
- -#define BTRFS_FREE_SPACE_EXTENT_KEY 199
- -
- -/*
- - * When a block group becomes very fragmented, we convert it to use bitmaps
- - * instead of extents. A free space bitmap is keyed on
- - * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
- - * (length / sectorsize) bits.
- - */
- -#define BTRFS_FREE_SPACE_BITMAP_KEY 200
- -
- -#define BTRFS_DEV_EXTENT_KEY  204
- -#define BTRFS_DEV_ITEM_KEY    216
- -#define BTRFS_CHUNK_ITEM_KEY  228
- -
- -/*
- - * Records the overall state of the qgroups.
- - * There's only one instance of this key present,
- - * (0, BTRFS_QGROUP_STATUS_KEY, 0)
- - */
- -#define BTRFS_QGROUP_STATUS_KEY         240
- -/*
- - * Records the currently used space of the qgroup.
- - * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
- - */
- -#define BTRFS_QGROUP_INFO_KEY           242
- -/*
- - * Contains the user configured limits for the qgroup.
- - * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
- - */
- -#define BTRFS_QGROUP_LIMIT_KEY          244
- -/*
- - * Records the child-parent relationship of qgroups. For
- - * each relation, 2 keys are present:
- - * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
- - * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
- - */
- -#define BTRFS_QGROUP_RELATION_KEY       246
- -
- -/*
- - * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
- - */
- -#define BTRFS_BALANCE_ITEM_KEY        248
- -
- -/*
- - * The key type for tree items that are stored persistently, but do not need to
- - * exist for extended period of time. The items can exist in any tree.
- - *
- - * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
- - *
- - * Existing items:
- - *
- - * - balance status item
- - *   (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
- - */
- -#define BTRFS_TEMPORARY_ITEM_KEY      248
- -
- -/*
- - * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
- - */
- -#define BTRFS_DEV_STATS_KEY           249
- -
- -/*
- - * The key type for tree items that are stored persistently and usually exist
- - * for a long period, eg. filesystem lifetime. The item kinds can be status
- - * information, stats or preference values. The item can exist in any tree.
- - *
- - * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
- - *
- - * Existing items:
- - *
- - * - device statistics, store IO stats in the device tree, one key for all
- - *   stats
- - *   (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
- - */
- -#define BTRFS_PERSISTENT_ITEM_KEY     249
- -
- -/*
- - * Persistantly stores the device replace state in the device tree.
- - * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
- - */
- -#define BTRFS_DEV_REPLACE_KEY 250
- -
- -/*
- - * Stores items that allow to quickly map UUIDs to something else.
- - * These items are part of the filesystem UUID tree.
- - * The key is built like this:
- - * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
- - */
- -#if BTRFS_UUID_SIZE != 16
- -#error "UUID items require BTRFS_UUID_SIZE == 16!"
- -#endif
- -#define BTRFS_UUID_KEY_SUBVOL 251     /* for UUIDs assigned to subvols */
- -#define BTRFS_UUID_KEY_RECEIVED_SUBVOL        252     /* for UUIDs assigned to
- -                                               * received subvols */
- -
- -/*
- - * string items are for debugging.  They just store a short string of
- - * data in the FS
- - */
- -#define BTRFS_STRING_ITEM_KEY 253
- -
   /*
    * Flags for mount options.
    *
@@@ -2487,6 -3520,12 +2508,12 @@@ int btrfs_should_throttle_delayed_refs(
                                        struct btrfs_root *root);
   int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root);
+ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+                                        const u64 start);
+ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
+ bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
   void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
   int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, unsigned long count);
@@@ -3110,7 -4149,6 +3137,7 @@@ void btrfs_test_inode_set_ops(struct in
   
   /* ioctl.c */
   long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+ +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
   int btrfs_ioctl_get_supported_features(void __user *arg);
   void btrfs_update_iflags(struct inode *inode);
   void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
@@@ -3315,9 -4353,10 +3342,9 @@@ static inline void assfail(char *expr, 
   #define ASSERT(expr)  ((void)0)
   #endif
   
- -#define btrfs_assert()
   __printf(5, 6)
   __cold
- -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
                      unsigned int line, int errno, const char *fmt, ...);
   
   const char *btrfs_decode_error(int errno);
@@@ -3327,46 -4366,6 +3354,46 @@@ void __btrfs_abort_transaction(struct b
                                struct btrfs_root *root, const char *function,
                                unsigned int line, int errno);
   
+ +/*
+ + * Call btrfs_abort_transaction as early as possible when an error condition is
+ + * detected, that way the exact line number is reported.
+ + *
+ + * The macro sets BTRFS_FS_STATE_TRANS_ABORTED in the fs_info state bits and
+ + * emits the WARN only when that bit was not already set, then always calls
+ + * __btrfs_abort_transaction() with the caller's __func__ and __LINE__.
+ + */
+ +#define btrfs_abort_transaction(trans, root, errno)           \
+ +do {                                                          \
+ +      /* Report first abort since mount */                    \
+ +      if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,     \
+ +                      &((root)->fs_info->fs_state))) {        \
+ +              WARN(1, KERN_DEBUG                              \
+ +              "BTRFS: Transaction aborted (error %d)\n",      \
+ +              (errno));                                       \
+ +      }                                                       \
+ +      __btrfs_abort_transaction((trans), (root), __func__,    \
+ +                                __LINE__, (errno));           \
+ +} while (0)
+ +
+ +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...)           \
+ +do {  /* forward the call site's __func__ and __LINE__ */    \
+ +      __btrfs_handle_fs_error((fs_info), __func__, __LINE__,  \
+ +                        (errno), fmt, ##args);                \
+ +} while (0)
+ +
+ +__printf(5, 6)
+ +__cold
+ +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+ +                 unsigned int line, int errno, const char *fmt, ...);
+ +/*
+ + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ + * will panic().  Otherwise we BUG() here.
+ + *
+ + * The call site's __func__ and __LINE__ are handed to __btrfs_panic()
+ + * together with errno and the printf-style message.
+ + */
+ +#define btrfs_panic(fs_info, errno, fmt, args...)                     \
+ +do {                                                                  \
+ +      __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+ +      BUG();                                                          \
+ +} while (0)
+ +
+ +
+ +/* compatibility and incompatibility defines */
+ +
   #define btrfs_set_fs_incompat(__fs_info, opt) \
         __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
   
@@@ -3483,6 -4482,44 +3510,6 @@@ static inline int __btrfs_fs_compat_ro(
         return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
   }
   
- -/*
- - * Call btrfs_abort_transaction as early as possible when an error condition is
- - * detected, that way the exact line number is reported.
- - */
- -#define btrfs_abort_transaction(trans, root, errno)           \
- -do {                                                          \
- -      /* Report first abort since mount */                    \
- -      if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,     \
- -                      &((root)->fs_info->fs_state))) {        \
- -              WARN(1, KERN_DEBUG                              \
- -              "BTRFS: Transaction aborted (error %d)\n",      \
- -              (errno));                                       \
- -      }                                                       \
- -      __btrfs_abort_transaction((trans), (root), __func__,    \
- -                                __LINE__, (errno));           \
- -} while (0)
- -
- -#define btrfs_std_error(fs_info, errno, fmt, args...)         \
- -do {                                                          \
- -      __btrfs_std_error((fs_info), __func__, __LINE__,        \
- -                        (errno), fmt, ##args);                \
- -} while (0)
- -
- -__printf(5, 6)
- -__cold
- -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
- -                 unsigned int line, int errno, const char *fmt, ...);
- -
- -/*
- - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
- - * will panic().  Otherwise we BUG() here.
- - */
- -#define btrfs_panic(fs_info, errno, fmt, args...)                     \
- -do {                                                                  \
- -      __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
- -      BUG();                                                          \
- -} while (0)
- -
   /* acl.c */
   #ifdef CONFIG_BTRFS_FS_POSIX_ACL
   struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
diff --combined fs/btrfs/dev-replace.c

index 1f193f742f21cf35783b8b6f1e5e3d9c6f40ec55,3371f9e546d96885e8f52dd629d7337b422300f7..85f12e6e28d212368af44fbafc1e22442f89dccf
--- 1/fs/btrfs/dev-replace.c
--- 2/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@@ -44,6 -44,9 +44,6 @@@ static void btrfs_dev_replace_update_de
                                                 struct btrfs_fs_info *fs_info,
                                                 struct btrfs_device *srcdev,
                                                 struct btrfs_device *tgtdev);
- -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
- -                                       char *srcdev_name,
- -                                       struct btrfs_device **device);
   static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
   static int btrfs_dev_replace_kthread(void *data);
   static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
@@@ -302,8 -305,8 +302,8 @@@ void btrfs_after_dev_replace_commit(str
                 dev_replace->cursor_left_last_write_of_item;
   }
   
- -int btrfs_dev_replace_start(struct btrfs_root *root,
- -                          struct btrfs_ioctl_dev_replace_args *args)
+ +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
+ +                              u64 srcdevid, char *srcdev_name, int read_src)
   {
         struct btrfs_trans_handle *trans;
         struct btrfs_fs_info *fs_info = root->fs_info;
@@@ -312,16 -315,29 +312,16 @@@
         struct btrfs_device *tgt_device = NULL;
         struct btrfs_device *src_device = NULL;
   
- -      switch (args->start.cont_reading_from_srcdev_mode) {
- -      case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
- -      case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
- -              break;
- -      default:
- -              return -EINVAL;
- -      }
- -
- -      if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- -          args->start.tgtdev_name[0] == '\0')
- -              return -EINVAL;
- -
         /* the disk copy procedure reuses the scrub code */
         mutex_lock(&fs_info->volume_mutex);
- -      ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
- -                                          args->start.srcdev_name,
- -                                          &src_device);
+ +      ret = btrfs_find_device_by_devspec(root, srcdevid,
+ +                                          srcdev_name, &src_device);
         if (ret) {
                 mutex_unlock(&fs_info->volume_mutex);
                 return ret;
         }
   
- -      ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+ +      ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name,
                                             src_device, &tgt_device);
         mutex_unlock(&fs_info->volume_mutex);
         if (ret)
@@@ -348,17 -364,18 +348,17 @@@
                 break;
         case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
         case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- -              args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+ +              ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
                 goto leave;
         }
   
- -      dev_replace->cont_reading_from_srcdev_mode =
- -              args->start.cont_reading_from_srcdev_mode;
+ +      dev_replace->cont_reading_from_srcdev_mode = read_src;
         WARN_ON(!src_device);
         dev_replace->srcdev = src_device;
         WARN_ON(!tgt_device);
         dev_replace->tgtdev = tgt_device;
   
- -      btrfs_info_in_rcu(root->fs_info,
+ +      btrfs_info_in_rcu(fs_info,
                       "dev_replace from %s (devid %llu) to %s started",
                       src_device->missing ? "<missing disk>" :
                         rcu_str_deref(src_device->name),
@@@ -379,13 -396,14 +379,13 @@@
         dev_replace->item_needs_writeback = 1;
         atomic64_set(&dev_replace->num_write_errors, 0);
         atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
- -      args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
         btrfs_dev_replace_unlock(dev_replace, 1);
   
         ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
         if (ret)
- -              btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+ +              btrfs_err(fs_info, "kobj add dev failed %d\n", ret);
   
-       btrfs_wait_ordered_roots(fs_info, -1);
+       btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
   
         /* force writing the updated state information to disk */
         trans = btrfs_start_transaction(root, 0);
@@@ -403,9 -421,11 +403,9 @@@
                               btrfs_device_get_total_bytes(src_device),
                               &dev_replace->scrub_progress, 0, 1);
   
- -      ret = btrfs_dev_replace_finishing(root->fs_info, ret);
- -      /* don't warn if EINPROGRESS, someone else might be running scrub */
+ +      ret = btrfs_dev_replace_finishing(fs_info, ret);
         if (ret == -EINPROGRESS) {
- -              args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
- -              ret = 0;
+ +              ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
         } else {
                 WARN_ON(ret);
         }
@@@ -420,35 -440,6 +420,35 @@@ leave
         return ret;
   }
   
+ +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
+ +                          struct btrfs_ioctl_dev_replace_args *args)
+ +{
+ +      int ret;
+ +
+ +      switch (args->start.cont_reading_from_srcdev_mode) {  /* validate mode */
+ +      case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+ +      case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+ +              break;
+ +      default:
+ +              return -EINVAL;
+ +      }
+ +
+ +      if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+ +          args->start.tgtdev_name[0] == '\0')
+ +              return -EINVAL; /* need a source (id or name) and a target name */
+ +
+ +      ret = btrfs_dev_replace_start(root, args->start.tgtdev_name,
+ +                                      args->start.srcdevid,
+ +                                      args->start.srcdev_name,
+ +                                      args->start.cont_reading_from_srcdev_mode);
+ +      args->result = ret;
+ +      /*
+ +       * args->result receives whatever btrfs_dev_replace_start() returned,
+ +       * unmodified (this includes negative error values).
+ +       * A scrub-in-progress result is reported through args->result only and
+ +       * the function's own return value is reset to 0 for it: don't warn if
+ +       * EINPROGRESS, someone else might be running scrub.
+ +       * Every other value is returned to the caller as is.
+ +       */
+ +      if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
+ +              ret = 0;
+ +
+ +      return ret;
+ +}
+ +
   /*
    * blocked until all flighting bios are finished.
    */
@@@ -504,7 -495,7 +504,7 @@@ static int btrfs_dev_replace_finishing(
                 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                 return ret;
         }
-       btrfs_wait_ordered_roots(root->fs_info, -1);
+       btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
   
         trans = btrfs_start_transaction(root, 0);
         if (IS_ERR(trans)) {
@@@ -569,9 -560,10 +569,9 @@@
         ASSERT(list_empty(&src_device->resized_list));
         tgt_device->commit_total_bytes = src_device->commit_total_bytes;
         tgt_device->commit_bytes_used = src_device->bytes_used;
- -      if (fs_info->sb->s_bdev == src_device->bdev)
- -              fs_info->sb->s_bdev = tgt_device->bdev;
- -      if (fs_info->fs_devices->latest_bdev == src_device->bdev)
- -              fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+ +
+ +      btrfs_assign_next_active_device(fs_info, src_device, tgt_device);
+ +
         list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
         fs_info->fs_devices->rw_devices++;
   
@@@ -634,6 -626,25 +634,6 @@@ static void btrfs_dev_replace_update_de
         write_unlock(&em_tree->lock);
   }
   
- -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
- -                                       char *srcdev_name,
- -                                       struct btrfs_device **device)
- -{
- -      int ret;
- -
- -      if (srcdevid) {
- -              ret = 0;
- -              *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
- -                                          NULL);
- -              if (!*device)
- -                      ret = -ENOENT;
- -      } else {
- -              ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
- -                                                         device);
- -      }
- -      return ret;
- -}
- -
   void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
                               struct btrfs_ioctl_dev_replace_args *args)
   {
diff --combined fs/btrfs/extent-tree.c

index 290e05671ee44c67a7a43ce3c952317ed4d9c927,dcf89bfa990da427a9b74805c87e4a88e51afdab..9424864fd01ae1041bc173d5e4ac2e0eefd24423
--- 1/fs/btrfs/extent-tree.c
--- 2/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@@ -3824,6 -3824,59 +3824,59 @@@ int btrfs_extent_readonly(struct btrfs_
         return readonly;
   }
   
+ bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+ {
+       struct btrfs_block_group_cache *bg;
+       bool ret = true;
+ 
+       bg = btrfs_lookup_block_group(fs_info, bytenr);
+       if (!bg)
+               return false;   /* lookup found no block group */
+ 
+       spin_lock(&bg->lock);   /* ro is checked and the counter bumped under bg->lock */
+       if (bg->ro)
+               ret = false;    /* read-only block group: do not count a writer */
+       else
+               atomic_inc(&bg->nocow_writers);
+       spin_unlock(&bg->lock);
+ 
+       /*
+        * When the counter was incremented the block group reference taken
+        * by the lookup above is kept: no put on block group, done by
+        * btrfs_dec_nocow_writers. The reference is only dropped here when
+        * we return false.
+        */
+       if (!ret)
+               btrfs_put_block_group(bg);
+ 
+       return ret;
+ 
+ }
+ 
+ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+ {
+       struct btrfs_block_group_cache *bg;
+ 
+       bg = btrfs_lookup_block_group(fs_info, bytenr);
+       ASSERT(bg);
+       if (atomic_dec_and_test(&bg->nocow_writers))
+               wake_up_atomic_t(&bg->nocow_writers);
+       /*
+        * The wake up above only happens when the counter dropped to zero;
+        * btrfs_wait_nocow_writers() waits on the same counter.
+        *
+        * Two references are dropped below:
+        * Once for our lookup and once for the lookup done by a previous call
+        * to btrfs_inc_nocow_writers()
+        *
+        * NOTE(review): ASSERT() can be defined as a no-op, in which case a
+        * failed lookup is dereferenced above - confirm callers always pass
+        * a bytenr that btrfs_inc_nocow_writers() succeeded on.
+        */
+       btrfs_put_block_group(bg);
+       btrfs_put_block_group(bg);
+ }
+ 
+ static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
+ {
+       schedule();     /* wait action handed to wait_on_atomic_t(); a is unused */
+       return 0;
+ }
+ 
+ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+ {
+       wait_on_atomic_t(&bg->nocow_writers,    /* woken by btrfs_dec_nocow_writers() */
+                        btrfs_wait_nocow_writers_atomic_t,
+                        TASK_UNINTERRUPTIBLE);
+ }
+ 
   static const char *alloc_name(u64 flags)
   {
         switch (flags) {
@@@ -4141,7 -4194,7 +4194,7 @@@ commit_trans
   
                         if (need_commit > 0) {
                                 btrfs_start_delalloc_roots(fs_info, 0, -1);
-                               btrfs_wait_ordered_roots(fs_info, -1);
+                               btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
                         }
   
                         trans = btrfs_join_transaction(root);
@@@ -4583,7 -4636,8 +4636,8 @@@ static void btrfs_writeback_inodes_sb_n
                  */
                 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
                 if (!current->journal_info)
-                       btrfs_wait_ordered_roots(root->fs_info, nr_items);
+                       btrfs_wait_ordered_roots(root->fs_info, nr_items,
+                                                0, (u64)-1);
         }
   }
   
@@@ -4620,7 -4674,7 +4674,7 @@@ static void shrink_delalloc(struct btrf
   
         /* Calc the number of the pages we need flush for space reservation */
         items = calc_reclaim_items_nr(root, to_reclaim);
- -      to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ +      to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
   
         trans = (struct btrfs_trans_handle *)current->journal_info;
         block_rsv = &root->fs_info->delalloc_block_rsv;
@@@ -4632,7 -4686,8 +4686,8 @@@
                 if (trans)
                         return;
                 if (wait_ordered)
-                       btrfs_wait_ordered_roots(root->fs_info, items);
+                       btrfs_wait_ordered_roots(root->fs_info, items,
+                                                0, (u64)-1);
                 return;
         }
   
@@@ -4671,7 -4726,8 +4726,8 @@@ skip_async
   
                 loops++;
                 if (wait_ordered && !trans) {
-                       btrfs_wait_ordered_roots(root->fs_info, items);
+                       btrfs_wait_ordered_roots(root->fs_info, items,
+                                                0, (u64)-1);
                 } else {
                         time_left = schedule_timeout_killable(1);
                         if (time_left)
@@@ -6172,6 -6228,57 +6228,57 @@@ int btrfs_exclude_logged_extents(struc
         return 0;
   }
   
+ static void
+ btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+ {
+       atomic_inc(&bg->reservations);  /* undone by btrfs_dec_block_group_reservations() */
+ }
+ 
+ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
+                                       const u64 start)
+ {
+       struct btrfs_block_group_cache *bg;
+ 
+       bg = btrfs_lookup_block_group(fs_info, start);
+       ASSERT(bg);     /* NOTE(review): may be a no-op build-wise; bg is used regardless */
+       if (atomic_dec_and_test(&bg->reservations))
+               wake_up_atomic_t(&bg->reservations);    /* counter hit zero: wake waiters */
+       btrfs_put_block_group(bg);      /* drop the reference taken by the lookup above */
+ }
+ 
+ static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
+ {
+       schedule();     /* wait action handed to wait_on_atomic_t(); a is unused */
+       return 0;
+ }
+ 
+ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+ {
+       struct btrfs_space_info *space_info = bg->space_info;
+ 
+       ASSERT(bg->ro); /* expected to be called only on a read-only block group */
+ 
+       if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
+               return; /* only data block groups are waited on */
+ 
+       /*
+        * Our block group is read only, but before it was set to read only
+        * some task might have already allocated an extent from it without
+        * having yet created a respective ordered extent (and added it to a
+        * root's list of ordered extents).
+        * Therefore wait for any task currently allocating extents, since the
+        * block group's reservations counter is incremented while a read lock
+        * on the groups' semaphore is held and decremented after releasing
+        * the read access on that semaphore and creating the ordered extent.
+        *
+        * The write lock is taken and released right away: nothing is done
+        * under it, acquiring it is the wait for the current read lock
+        * holders. The counter is then waited on until it reaches zero.
+        */
+       down_write(&space_info->groups_sem);
+       up_write(&space_info->groups_sem);
+ 
+       wait_on_atomic_t(&bg->reservations,
+                        btrfs_wait_bg_reservations_atomic_t,
+                        TASK_UNINTERRUPTIBLE);
+ }
+ 
   /**
    * btrfs_update_reserved_bytes - update the block_group and space info counters
    * @cache:    The cache we are manipulating
@@@ -7025,35 -7132,36 +7132,35 @@@ btrfs_lock_cluster(struct btrfs_block_g
                    int delalloc)
   {
         struct btrfs_block_group_cache *used_bg = NULL;
- -      bool locked = false;
- -again:
+ +
         spin_lock(&cluster->refill_lock);
- -      if (locked) {
- -              if (used_bg == cluster->block_group)
+ +      while (1) {
+ +              used_bg = cluster->block_group;
+ +              if (!used_bg)
+ +                      return NULL;
+ +
+ +              if (used_bg == block_group)
                         return used_bg;
   
- -              up_read(&used_bg->data_rwsem);
- -              btrfs_put_block_group(used_bg);
- -      }
+ +              btrfs_get_block_group(used_bg);
   
- -      used_bg = cluster->block_group;
- -      if (!used_bg)
- -              return NULL;
+ +              if (!delalloc)
+ +                      return used_bg;
   
- -      if (used_bg == block_group)
- -              return used_bg;
+ +              if (down_read_trylock(&used_bg->data_rwsem))
+ +                      return used_bg;
   
- -      btrfs_get_block_group(used_bg);
+ +              spin_unlock(&cluster->refill_lock);
   
- -      if (!delalloc)
- -              return used_bg;
+ +              down_read(&used_bg->data_rwsem);
   
- -      if (down_read_trylock(&used_bg->data_rwsem))
- -              return used_bg;
+ +              spin_lock(&cluster->refill_lock);
+ +              if (used_bg == cluster->block_group)
+ +                      return used_bg;
   
- -      spin_unlock(&cluster->refill_lock);
- -      down_read(&used_bg->data_rwsem);
- -      locked = true;
- -      goto again;
+ +              up_read(&used_bg->data_rwsem);
+ +              btrfs_put_block_group(used_bg);
+ +      }
   }
   
   static inline void
@@@ -7430,6 -7538,7 +7537,7 @@@ checks
                         btrfs_add_free_space(block_group, offset, num_bytes);
                         goto loop;
                 }
+               btrfs_inc_block_group_reservations(block_group);
   
                 /* we are all good, lets return */
                 ins->objectid = search_start;
@@@ -7611,8 -7720,10 +7719,10 @@@ again
         WARN_ON(num_bytes < root->sectorsize);
         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
                                flags, delalloc);
- 
-       if (ret == -ENOSPC) {
+       if (!ret && !is_data) {
+               btrfs_dec_block_group_reservations(root->fs_info,
+                                                  ins->objectid);
+       } else if (ret == -ENOSPC) {
                 if (!final_tried && ins->offset) {
                         num_bytes = min(num_bytes >> 1, ins->offset);
                         num_bytes = round_down(num_bytes, root->sectorsize);
@@@ -9057,7 -9168,7 +9167,7 @@@ out
         if (!for_reloc && root_dropped == false)
                 btrfs_add_dead_root(root);
         if (err && err != -EAGAIN)
- -              btrfs_std_error(root->fs_info, err, NULL);
+ +              btrfs_handle_fs_error(root->fs_info, err, NULL);
         return err;
   }
   
diff --combined fs/btrfs/inode.c

index 167fc3d49450f87881abb3cf7e59a0b68a7180c2,c1ee4ade2d8718ac6283452716f9a7a34a201089..8fc99fb0c0aaf7857e73160ef40d407e66e1d016
--- 1/fs/btrfs/inode.c
--- 2/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -824,6 -824,7 +824,7 @@@ retry
                                                 async_extent->ram_size - 1, 0);
                         goto out_free_reserve;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
   
                 /*
                  * clear dirty, set writeback and unlock the pages.
@@@ -861,6 -862,7 +862,7 @@@
         }
         return;
   out_free_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
   out_free:
         extent_clear_unlock_delalloc(inode, async_extent->start,
@@@ -1038,6 -1040,8 +1040,8 @@@ static noinline int cow_file_range(stru
                                 goto out_drop_extent_cache;
                 }
   
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+ 
                 if (disk_num_bytes < cur_alloc_size)
                         break;
   
@@@ -1066,6 -1070,7 +1070,7 @@@ out
   out_drop_extent_cache:
         btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
   out_reserve:
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
   out_unlock:
         extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@@ -1377,6 -1382,9 +1382,9 @@@ next_slot
                          */
                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
                                 goto out_check;
+                       if (!btrfs_inc_nocow_writers(root->fs_info,
+                                                    disk_bytenr))
+                               goto out_check;
                         nocow = 1;
                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                         extent_end = found_key.offset +
@@@ -1391,6 -1399,9 +1399,9 @@@ out_check
                         path->slots[0]++;
                         if (!nolock && nocow)
                                 btrfs_end_write_no_snapshoting(root);
+                       if (nocow)
+                               btrfs_dec_nocow_writers(root->fs_info,
+                                                       disk_bytenr);
                         goto next_slot;
                 }
                 if (!nocow) {
@@@ -1411,6 -1422,9 +1422,9 @@@
                         if (ret) {
                                 if (!nolock && nocow)
                                         btrfs_end_write_no_snapshoting(root);
+                               if (nocow)
+                                       btrfs_dec_nocow_writers(root->fs_info,
+                                                               disk_bytenr);
                                 goto error;
                         }
                         cow_start = (u64)-1;
@@@ -1453,6 -1467,8 +1467,8 @@@
   
                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
                                                num_bytes, num_bytes, type);
+               if (nocow)
+                       btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
                 BUG_ON(ret); /* -ENOMEM */
   
                 if (root->root_key.objectid ==
@@@ -7129,6 -7145,43 +7145,43 @@@ out
         return em;
   }
   
+ /*
+  * Set up the bookkeeping for a direct IO write covering [start, start + len).
+  *
+  * With dio_sem held for read, a pinned extent map is created first (skipped
+  * when type is BTRFS_ORDERED_NOCOW) and then the ordered extent is added.
+  *
+  * Returns the pinned extent map, NULL when type is BTRFS_ORDERED_NOCOW and
+  * the ordered extent was added, or an ERR_PTR() if either step failed.
+  */
+ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
+                                                 const u64 start,
+                                                 const u64 len,
+                                                 const u64 orig_start,
+                                                 const u64 block_start,
+                                                 const u64 block_len,
+                                                 const u64 orig_block_len,
+                                                 const u64 ram_bytes,
+                                                 const int type)
+ {
+       struct extent_map *map = NULL;
+       int err;
+ 
+       down_read(&BTRFS_I(inode)->dio_sem);
+ 
+       if (type != BTRFS_ORDERED_NOCOW) {
+               map = create_pinned_em(inode, start, len, orig_start,
+                                      block_start, block_len, orig_block_len,
+                                      ram_bytes, type);
+               if (IS_ERR(map)) {
+                       /* Nothing was inserted, just hand the error back. */
+                       up_read(&BTRFS_I(inode)->dio_sem);
+                       return map;
+               }
+       }
+ 
+       err = btrfs_add_ordered_extent_dio(inode, start, block_start,
+                                          len, block_len, type);
+       if (err && map) {
+               /* Undo the pinned extent map created above. */
+               free_extent_map(map);
+               btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+       }
+       if (err)
+               map = ERR_PTR(err);
+ 
+       up_read(&BTRFS_I(inode)->dio_sem);
+ 
+       return map;
+ }
+ 
+ 
   static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                                                   u64 start, u64 len)
   {
@@@ -7144,41 -7197,13 +7197,13 @@@
         if (ret)
                 return ERR_PTR(ret);
   
-       /*
-        * Create the ordered extent before the extent map. This is to avoid
-        * races with the fast fsync path that would lead to it logging file
-        * extent items that point to disk extents that were not yet written to.
-        * The fast fsync path collects ordered extents into a local list and
-        * then collects all the new extent maps, so we must create the ordered
-        * extent first and make sure the fast fsync path collects any new
-        * ordered extents after collecting new extent maps as well.
-        * The fsync path simply can not rely on inode_dio_wait() because it
-        * causes deadlock with AIO.
-        */
-       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
-                                          ins.offset, ins.offset, 0);
-       if (ret) {
+       em = btrfs_create_dio_extent(inode, start, ins.offset, start,
+                                    ins.objectid, ins.offset, ins.offset,
+                                    ins.offset, 0);
+       btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+       if (IS_ERR(em))
                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               return ERR_PTR(ret);
-       }
- 
-       em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-                             ins.offset, ins.offset, ins.offset, 0);
-       if (IS_ERR(em)) {
-               struct btrfs_ordered_extent *oe;
   
-               btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-               oe = btrfs_lookup_ordered_extent(inode, start);
-               ASSERT(oe);
-               if (WARN_ON(!oe))
-                       return em;
-               set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
-               set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
-               btrfs_remove_ordered_extent(inode, oe);
-               /* Once for our lookup and once for the ordered extents tree. */
-               btrfs_put_ordered_extent(oe);
-               btrfs_put_ordered_extent(oe);
-       }
         return em;
   }
   
@@@ -7650,24 -7675,21 +7675,21 @@@ static int btrfs_get_blocks_direct(stru
                 block_start = em->block_start + (start - em->start);
   
                 if (can_nocow_extent(inode, start, &len, &orig_start,
-                                    &orig_block_len, &ram_bytes) == 1) {
+                                    &orig_block_len, &ram_bytes) == 1 &&
+                   btrfs_inc_nocow_writers(root->fs_info, block_start)) {
+                       struct extent_map *em2;
+ 
+                       em2 = btrfs_create_dio_extent(inode, start, len,
+                                                     orig_start, block_start,
+                                                     len, orig_block_len,
+                                                     ram_bytes, type);
+                       btrfs_dec_nocow_writers(root->fs_info, block_start);
                         if (type == BTRFS_ORDERED_PREALLOC) {
                                 free_extent_map(em);
-                               em = create_pinned_em(inode, start, len,
-                                                      orig_start,
-                                                      block_start, len,
-                                                      orig_block_len,
-                                                      ram_bytes, type);
-                               if (IS_ERR(em)) {
-                                       ret = PTR_ERR(em);
-                                       goto unlock_err;
-                               }
+                               em = em2;
                         }
- 
-                       ret = btrfs_add_ordered_extent_dio(inode, start,
-                                          block_start, len, len, type);
-                       if (ret) {
-                               free_extent_map(em);
+                       if (em2 && IS_ERR(em2)) {
+                               ret = PTR_ERR(em2);
                                 goto unlock_err;
                         }
                         goto unlock;
@@@ -9230,6 -9252,7 +9252,7 @@@ struct inode *btrfs_alloc_inode(struct 
         INIT_LIST_HEAD(&ei->delalloc_inodes);
         INIT_LIST_HEAD(&ei->delayed_iput);
         RB_CLEAR_NODE(&ei->rb_node);
+       init_rwsem(&ei->dio_sem);
   
         return inode;
   }
@@@ -9387,10 -9410,281 +9410,281 @@@ static int btrfs_getattr(struct vfsmoun
         return 0;
   }
   
+ /*
+  * Swap two existing directory entries (RENAME_EXCHANGE) within a single
+  * transaction: old_dentry in old_dir and new_dentry in new_dir trade places.
+  *
+  * Returns 0 on success or a negative errno. -EXDEV is returned when the
+  * source is not a subvolume root and the two directories belong to different
+  * roots. An error hit while the transaction is open takes precedence over
+  * the result of ending the transaction.
+  */
+ static int btrfs_rename_exchange(struct inode *old_dir,
+                             struct dentry *old_dentry,
+                             struct inode *new_dir,
+                             struct dentry *new_dentry)
+ {
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(old_dir)->root;
+       struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+       struct inode *new_inode = new_dentry->d_inode;
+       struct inode *old_inode = old_dentry->d_inode;
+       struct timespec ctime = CURRENT_TIME;
+       struct dentry *parent;
+       u64 old_ino = btrfs_ino(old_inode);
+       u64 new_ino = btrfs_ino(new_inode);
+       u64 old_idx = 0;
+       u64 new_idx = 0;
+       u64 root_objectid;
+       int ret;
+       int ret2;
+       bool root_log_pinned = false;
+       bool dest_log_pinned = false;
+ 
+       /* we only allow rename subvolume link between subvolumes */
+       if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+               return -EXDEV;
+ 
+       /* close the race window with snapshot create/destroy ioctl */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&root->fs_info->subvol_sem);
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               down_read(&dest->fs_info->subvol_sem);
+ 
+       /*
+        * We want to reserve the absolute worst case amount of items.  So if
+        * both inodes are subvols and we need to unlink them then that would
+        * require 4 item modifications, but if they are both normal inodes it
+        * would require 5 item modifications, so we'll assume they are normal
+        * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+        * should cover the worst case number of items we'll modify.
+        */
+       trans = btrfs_start_transaction(root, 12);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
+ 
+       /*
+        * We need to find a free sequence number both in the source and
+        * in the destination directory for the exchange.
+        */
+       ret = btrfs_set_inode_index(new_dir, &old_idx);
+       if (ret)
+               goto out_fail;
+       ret = btrfs_set_inode_index(old_dir, &new_idx);
+       if (ret)
+               goto out_fail;
+ 
+       BTRFS_I(old_inode)->dir_index = 0ULL;
+       BTRFS_I(new_inode)->dir_index = 0ULL;
+ 
+       /* Reference for the source. */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(root->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(root);
+               root_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, dest,
+                                            new_dentry->d_name.name,
+                                            new_dentry->d_name.len,
+                                            old_ino,
+                                            btrfs_ino(new_dir), old_idx);
+               if (ret)
+                       goto out_fail;
+       }
+ 
+       /* And now for the dest. */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               /* force full log commit if subvolume involved. */
+               btrfs_set_log_full_commit(dest->fs_info, trans);
+       } else {
+               btrfs_pin_log_trans(dest);
+               dest_log_pinned = true;
+               ret = btrfs_insert_inode_ref(trans, root,
+                                            old_dentry->d_name.name,
+                                            old_dentry->d_name.len,
+                                            new_ino,
+                                            btrfs_ino(old_dir), new_idx);
+               if (ret)
+                       goto out_fail;
+       }
+ 
+       /* Update inode version and ctime/mtime. */
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
+       inode_inc_iversion(new_inode);
+       old_dir->i_ctime = old_dir->i_mtime = ctime;
+       new_dir->i_ctime = new_dir->i_mtime = ctime;
+       old_inode->i_ctime = ctime;
+       new_inode->i_ctime = ctime;
+ 
+       if (old_dentry->d_parent != new_dentry->d_parent) {
+               btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+               btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+       }
+ 
+       /* src is a subvolume */
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, root, old_dir,
+                                         root_objectid,
+                                         old_dentry->d_name.name,
+                                         old_dentry->d_name.len);
+       } else { /* src is an inode */
+               ret = __btrfs_unlink_inode(trans, root, old_dir,
+                                          old_dentry->d_inode,
+                                          old_dentry->d_name.name,
+                                          old_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, root, old_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+ 
+       /* dest is a subvolume */
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
+               root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+               ret = btrfs_unlink_subvol(trans, dest, new_dir,
+                                         root_objectid,
+                                         new_dentry->d_name.name,
+                                         new_dentry->d_name.len);
+       } else { /* dest is an inode */
+               ret = __btrfs_unlink_inode(trans, dest, new_dir,
+                                          new_dentry->d_inode,
+                                          new_dentry->d_name.name,
+                                          new_dentry->d_name.len);
+               if (!ret)
+                       ret = btrfs_update_inode(trans, dest, new_inode);
+       }
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+ 
+       ret = btrfs_add_link(trans, new_dir, old_inode,
+                            new_dentry->d_name.name,
+                            new_dentry->d_name.len, 0, old_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+ 
+       ret = btrfs_add_link(trans, old_dir, new_inode,
+                            old_dentry->d_name.name,
+                            old_dentry->d_name.len, 0, new_idx);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
+ 
+       if (old_inode->i_nlink == 1)
+               BTRFS_I(old_inode)->dir_index = old_idx;
+       if (new_inode->i_nlink == 1)
+               BTRFS_I(new_inode)->dir_index = new_idx;
+ 
+       if (root_log_pinned) {
+               parent = new_dentry->d_parent;
+               btrfs_log_new_name(trans, old_inode, old_dir, parent);
+               btrfs_end_log_trans(root);
+               root_log_pinned = false;
+       }
+       if (dest_log_pinned) {
+               parent = old_dentry->d_parent;
+               btrfs_log_new_name(trans, new_inode, new_dir, parent);
+               btrfs_end_log_trans(dest);
+               dest_log_pinned = false;
+       }
+ out_fail:
+       /*
+        * If we have pinned a log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && (root_log_pinned || dest_log_pinned)) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                       btrfs_set_log_full_commit(root->fs_info, trans);
+ 
+               if (root_log_pinned) {
+                       btrfs_end_log_trans(root);
+                       root_log_pinned = false;
+               }
+               if (dest_log_pinned) {
+                       btrfs_end_log_trans(dest);
+                       dest_log_pinned = false;
+               }
+       }
+       /*
+        * Do not let the result of ending the transaction hide an error that
+        * brought us here: report the first failure, and only fall back to
+        * the end-transaction status when everything before it succeeded.
+        */
+       ret2 = btrfs_end_transaction(trans, root);
+       ret = ret ? ret : ret2;
+ out_notrans:
+       if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&dest->fs_info->subvol_sem);
+       if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+               up_read(&root->fs_info->subvol_sem);
+ 
+       return ret;
+ }
+ 
+ 
+ /*
+  * Create a whiteout entry named after @dentry inside @dir.
+  *
+  * A new character-device inode (S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV) is
+  * allocated, given its security attributes, linked into @dir and written
+  * back through btrfs_update_inode().
+  *
+  * Returns 0 on success or the error of the first step that failed. If a
+  * step fails after the inode was allocated, its link count is dropped
+  * before the reference is released.
+  */
+ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct inode *dir,
+                                    struct dentry *dentry)
+ {
+       struct inode *whiteout;
+       u64 ino;
+       u64 dir_index;
+       int err;
+ 
+       err = btrfs_find_free_ino(root, &ino);
+       if (err)
+               return err;
+ 
+       whiteout = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
+                                  dentry->d_name.len, btrfs_ino(dir), ino,
+                                  S_IFCHR | WHITEOUT_MODE, &dir_index);
+       if (IS_ERR(whiteout))
+               return PTR_ERR(whiteout);
+ 
+       whiteout->i_op = &btrfs_special_inode_operations;
+       init_special_inode(whiteout, whiteout->i_mode, WHITEOUT_DEV);
+ 
+       /* Each step below runs only if every earlier one succeeded. */
+       err = btrfs_init_inode_security(trans, whiteout, dir,
+                                       &dentry->d_name);
+       if (!err)
+               err = btrfs_add_nondir(trans, dir, dentry, whiteout, 0,
+                                      dir_index);
+       if (!err)
+               err = btrfs_update_inode(trans, root, whiteout);
+ 
+       unlock_new_inode(whiteout);
+       if (err)
+               inode_dec_link_count(whiteout);
+       iput(whiteout);
+ 
+       return err;
+ }
+ 
+ 
   static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-                          struct inode *new_dir, struct dentry *new_dentry)
+                          struct inode *new_dir, struct dentry *new_dentry,
+                          unsigned int flags)
   {
         struct btrfs_trans_handle *trans;
+       unsigned int trans_num_items;
         struct btrfs_root *root = BTRFS_I(old_dir)->root;
         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
         struct inode *new_inode = d_inode(new_dentry);
@@@ -9399,6 -9693,7 +9693,7 @@@
         u64 root_objectid;
         int ret;
         u64 old_ino = btrfs_ino(old_inode);
+       bool log_pinned = false;
   
         if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                 return -EPERM;
@@@ -9449,15 -9744,21 +9744,21 @@@
          * We want to reserve the absolute worst case amount of items.  So if
          * both inodes are subvols and we need to unlink them then that would
          * require 4 item modifications, but if they are both normal inodes it
-        * would require 5 item modifications, so we'll assume their normal
+        * would require 5 item modifications, so we'll assume they are normal
          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
          * should cover the worst case number of items we'll modify.
+        * If our rename has the whiteout flag, we need 5 more units for the
+        * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
+        * when selinux is enabled).
          */
-       trans = btrfs_start_transaction(root, 11);
+       trans_num_items = 11;
+       if (flags & RENAME_WHITEOUT)
+               trans_num_items += 5;
+       trans = btrfs_start_transaction(root, trans_num_items);
         if (IS_ERR(trans)) {
-                 ret = PTR_ERR(trans);
-                 goto out_notrans;
-         }
+               ret = PTR_ERR(trans);
+               goto out_notrans;
+       }
   
         if (dest != root)
                 btrfs_record_root_in_trans(trans, dest);
@@@ -9471,6 -9772,8 +9772,8 @@@
                 /* force full log commit if subvolume involved. */
                 btrfs_set_log_full_commit(root->fs_info, trans);
         } else {
+               btrfs_pin_log_trans(root);
+               log_pinned = true;
                 ret = btrfs_insert_inode_ref(trans, dest,
                                              new_dentry->d_name.name,
                                              new_dentry->d_name.len,
@@@ -9478,14 -9781,6 +9781,6 @@@
                                              btrfs_ino(new_dir), index);
                 if (ret)
                         goto out_fail;
-               /*
-                * this is an ugly little race, but the rename is required
-                * to make sure that if we crash, the inode is either at the
-                * old name or the new one.  pinning the log transaction lets
-                * us make sure we don't allow a log commit to come in after
-                * we unlink the name but before we add the new name back in.
-                */
-               btrfs_pin_log_trans(root);
         }
   
         inode_inc_iversion(old_dir);
@@@ -9552,12 -9847,46 +9847,46 @@@
         if (old_inode->i_nlink == 1)
                 BTRFS_I(old_inode)->dir_index = index;
   
-       if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+       if (log_pinned) {
                 struct dentry *parent = new_dentry->d_parent;
+ 
                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
                 btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
+ 
+       if (flags & RENAME_WHITEOUT) {
+               ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+                                               old_dentry);
+ 
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_fail;
+               }
         }
   out_fail:
+       /*
+        * If we have pinned the log and an error happened, we unpin tasks
+        * trying to sync the log and force them to fallback to a transaction
+        * commit if the log currently contains any of the inodes involved in
+        * this rename operation (to ensure we do not persist a log with an
+        * inconsistent state for any of these inodes or leading to any
+        * inconsistencies when replayed). If the transaction was aborted, the
+        * abortion reason is propagated to userspace when attempting to commit
+        * the transaction. If the log does not contain any of these inodes, we
+        * allow the tasks to sync it.
+        */
+       if (ret && log_pinned) {
+               if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
+                   btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
+                   (new_inode &&
+                    btrfs_inode_in_log(new_inode, root->fs_info->generation)))
+                   btrfs_set_log_full_commit(root->fs_info, trans);
+ 
+               btrfs_end_log_trans(root);
+               log_pinned = false;
+       }
         btrfs_end_transaction(trans, root);
   out_notrans:
         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@@ -9570,10 -9899,14 +9899,14 @@@ static int btrfs_rename2(struct inode *
                          struct inode *new_dir, struct dentry *new_dentry,
                          unsigned int flags)
   {
-       if (flags & ~RENAME_NOREPLACE)
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                 return -EINVAL;
   
-       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (flags & RENAME_EXCHANGE)
+               return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+                                         new_dentry);
+ 
+       return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
   }
   
   static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@@ -9942,6 -10275,7 +10275,7 @@@ static int __btrfs_prealloc_file_range(
                                 btrfs_end_transaction(trans, root);
                         break;
                 }
+               btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
   
                 last_alloc = ins.offset;
                 ret = insert_reserved_file_extent(trans, inode,
@@@ -10184,7 -10518,7 +10518,7 @@@ static const struct file_operations btr
         .iterate        = btrfs_real_readdir,
         .unlocked_ioctl = btrfs_ioctl,
   #ifdef CONFIG_COMPAT
- -      .compat_ioctl   = btrfs_ioctl,
+ +      .compat_ioctl   = btrfs_compat_ioctl,
   #endif
         .release        = btrfs_release_file,
         .fsync          = btrfs_sync_file,
diff --combined fs/btrfs/ioctl.c

index aa97dfe8ae70820ecc504e9959e960bb8b3c1f21,697cc336bd1ce19fad52743964eec4f2d4073f50..73c0be77b4bdc5be262251d89b30d997a3884def
--- 1/fs/btrfs/ioctl.c
--- 2/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@@ -125,10 -125,10 +125,10 @@@ static unsigned int btrfs_flags_to_ioct
         if (flags & BTRFS_INODE_NODATACOW)
                 iflags |= FS_NOCOW_FL;
   
- -      if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
- -              iflags |= FS_COMPR_FL;
- -      else if (flags & BTRFS_INODE_NOCOMPRESS)
+ +      if (flags & BTRFS_INODE_NOCOMPRESS)
                 iflags |= FS_NOCOMP_FL;
+ +      else if (flags & BTRFS_INODE_COMPRESS)
+ +              iflags |= FS_COMPR_FL;
   
         return iflags;
   }
@@@ -439,7 -439,7 +439,7 @@@ static noinline int create_subvol(struc
   {
         struct btrfs_trans_handle *trans;
         struct btrfs_key key;
- -      struct btrfs_root_item root_item;
+ +      struct btrfs_root_item *root_item;
         struct btrfs_inode_item *inode_item;
         struct extent_buffer *leaf;
         struct btrfs_root *root = BTRFS_I(dir)->root;
@@@ -455,22 -455,16 +455,22 @@@
         u64 qgroup_reserved;
         uuid_le new_uuid;
   
+ +      root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
+ +      if (!root_item)
+ +              return -ENOMEM;
+ +
         ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
         if (ret)
- -              return ret;
+ +              goto fail_free;
   
         /*
          * Don't create subvolume whose level is not zero. Or qgroup will be
          * screwed up since it assume subvolme qgroup's level to be 0.
          */
- -      if (btrfs_qgroup_level(objectid))
- -              return -ENOSPC;
+ +      if (btrfs_qgroup_level(objectid)) {
+ +              ret = -ENOSPC;
+ +              goto fail_free;
+ +      }
   
         btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
         /*
@@@ -480,14 -474,14 +480,14 @@@
         ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
                                                8, &qgroup_reserved, false);
         if (ret)
- -              return ret;
+ +              goto fail_free;
   
         trans = btrfs_start_transaction(root, 0);
         if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
                 btrfs_subvolume_release_metadata(root, &block_rsv,
                                                  qgroup_reserved);
- -              return ret;
+ +              goto fail_free;
         }
         trans->block_rsv = &block_rsv;
         trans->bytes_reserved = block_rsv.size;
@@@ -515,45 -509,47 +515,45 @@@
                             BTRFS_UUID_SIZE);
         btrfs_mark_buffer_dirty(leaf);
   
- -      memset(&root_item, 0, sizeof(root_item));
- -
- -      inode_item = &root_item.inode;
+ +      inode_item = &root_item->inode;
         btrfs_set_stack_inode_generation(inode_item, 1);
         btrfs_set_stack_inode_size(inode_item, 3);
         btrfs_set_stack_inode_nlink(inode_item, 1);
         btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
   
- -      btrfs_set_root_flags(&root_item, 0);
- -      btrfs_set_root_limit(&root_item, 0);
+ +      btrfs_set_root_flags(root_item, 0);
+ +      btrfs_set_root_limit(root_item, 0);
         btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
   
- -      btrfs_set_root_bytenr(&root_item, leaf->start);
- -      btrfs_set_root_generation(&root_item, trans->transid);
- -      btrfs_set_root_level(&root_item, 0);
- -      btrfs_set_root_refs(&root_item, 1);
- -      btrfs_set_root_used(&root_item, leaf->len);
- -      btrfs_set_root_last_snapshot(&root_item, 0);
+ +      btrfs_set_root_bytenr(root_item, leaf->start);
+ +      btrfs_set_root_generation(root_item, trans->transid);
+ +      btrfs_set_root_level(root_item, 0);
+ +      btrfs_set_root_refs(root_item, 1);
+ +      btrfs_set_root_used(root_item, leaf->len);
+ +      btrfs_set_root_last_snapshot(root_item, 0);
   
- -      btrfs_set_root_generation_v2(&root_item,
- -                      btrfs_root_generation(&root_item));
+ +      btrfs_set_root_generation_v2(root_item,
+ +                      btrfs_root_generation(root_item));
         uuid_le_gen(&new_uuid);
- -      memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
- -      btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
- -      btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
- -      root_item.ctime = root_item.otime;
- -      btrfs_set_root_ctransid(&root_item, trans->transid);
- -      btrfs_set_root_otransid(&root_item, trans->transid);
+ +      memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+ +      btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
+ +      btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
+ +      root_item->ctime = root_item->otime;
+ +      btrfs_set_root_ctransid(root_item, trans->transid);
+ +      btrfs_set_root_otransid(root_item, trans->transid);
   
         btrfs_tree_unlock(leaf);
         free_extent_buffer(leaf);
         leaf = NULL;
   
- -      btrfs_set_root_dirid(&root_item, new_dirid);
+ +      btrfs_set_root_dirid(root_item, new_dirid);
   
         key.objectid = objectid;
         key.offset = 0;
         key.type = BTRFS_ROOT_ITEM_KEY;
         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
- -                              &root_item);
+ +                              root_item);
         if (ret)
                 goto fail;
   
@@@ -605,13 -601,12 +605,13 @@@
         BUG_ON(ret);
   
         ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
- -                                root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+ +                                root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
                                   objectid);
         if (ret)
                 btrfs_abort_transaction(trans, root, ret);
   
   fail:
+ +      kfree(root_item);
         trans->block_rsv = NULL;
         trans->bytes_reserved = 0;
         btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
@@@ -634,10 -629,6 +634,10 @@@
                 d_instantiate(dentry, inode);
         }
         return ret;
+ +
+ +fail_free:
+ +      kfree(root_item);
+ +      return ret;
   }
   
   static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
@@@ -690,7 -681,7 +690,7 @@@ static int create_snapshot(struct btrfs
         if (ret)
                 goto dec_and_free;
   
-       btrfs_wait_ordered_extents(root, -1);
+       btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
   
         btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                              BTRFS_BLOCK_RSV_TEMP);
@@@ -2676,10 -2667,10 +2676,10 @@@ out
         return ret;
   }
   
- -static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+ +static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
   {
         struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
- -      struct btrfs_ioctl_vol_args *vol_args;
+ +      struct btrfs_ioctl_vol_args_v2 *vol_args;
         int ret;
   
         if (!capable(CAP_SYS_ADMIN))
@@@ -2695,9 -2686,7 +2695,9 @@@
                 goto err_drop;
         }
   
- -      vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ +      /* Check for compatibility: reject unknown flags */
+ +      if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
+ +              return -EOPNOTSUPP;
   
         if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
                         1)) {
@@@ -2706,23 -2695,13 +2706,23 @@@
         }
   
         mutex_lock(&root->fs_info->volume_mutex);
- -      ret = btrfs_rm_device(root, vol_args->name);
+ +      if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ +              ret = btrfs_rm_device(root, NULL, vol_args->devid);
+ +      } else {
+ +              vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ +              ret = btrfs_rm_device(root, vol_args->name, 0);
+ +      }
         mutex_unlock(&root->fs_info->volume_mutex);
         atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
   
- -      if (!ret)
- -              btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
- -
+ +      if (!ret) {
+ +              if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
+ +                      btrfs_info(root->fs_info, "device deleted: id %llu",
+ +                                      vol_args->devid);
+ +              else
+ +                      btrfs_info(root->fs_info, "device deleted: %s",
+ +                                      vol_args->name);
+ +      }
   out:
         kfree(vol_args);
   err_drop:
@@@ -2730,47 -2709,6 +2730,47 @@@
         return ret;
   }
   
+ +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
+ +{
+ +      struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ +      struct btrfs_ioctl_vol_args *vol_args;
+ +      int ret;
+ +
+ +      if (!capable(CAP_SYS_ADMIN))
+ +              return -EPERM;
+ +
+ +      ret = mnt_want_write_file(file);
+ +      if (ret)
+ +              return ret;
+ +
+ +      if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+ +                      1)) {
+ +              ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ +              goto out_drop_write;
+ +      }
+ +
+ +      vol_args = memdup_user(arg, sizeof(*vol_args));
+ +      if (IS_ERR(vol_args)) {
+ +              ret = PTR_ERR(vol_args);
+ +              goto out;
+ +      }
+ +
+ +      vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ +      mutex_lock(&root->fs_info->volume_mutex);
+ +      ret = btrfs_rm_device(root, vol_args->name, 0);
+ +      mutex_unlock(&root->fs_info->volume_mutex);
+ +
+ +      if (!ret)
+ +              btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
+ +      kfree(vol_args);
+ +out:
+ +      atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ +out_drop_write:
+ +      mnt_drop_write_file(file);
+ +
+ +      return ret;
+ +}
+ +
   static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
   {
         struct btrfs_ioctl_fs_info_args *fi_args;
@@@ -3530,16 -3468,13 +3530,16 @@@ static int btrfs_clone(struct inode *sr
         u64 last_dest_end = destoff;
   
         ret = -ENOMEM;
- -      buf = vmalloc(root->nodesize);
- -      if (!buf)
- -              return ret;
+ +      buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
+ +      if (!buf) {
+ +              buf = vmalloc(root->nodesize);
+ +              if (!buf)
+ +                      return ret;
+ +      }
   
         path = btrfs_alloc_path();
         if (!path) {
- -              vfree(buf);
+ +              kvfree(buf);
                 return ret;
         }
   
@@@ -3840,7 -3775,7 +3840,7 @@@ process_slot
   
   out:
         btrfs_free_path(path);
- -      vfree(buf);
+ +      kvfree(buf);
         return ret;
   }
   
@@@ -4441,7 -4376,7 +4441,7 @@@ static long btrfs_ioctl_dev_replace(str
                         1)) {
                         ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                 } else {
- -                      ret = btrfs_dev_replace_start(root, p);
+ +                      ret = btrfs_dev_replace_by_ioctl(root, p);
                         atomic_set(
                          &root->fs_info->mutually_exclusive_operation_running,
                          0);
@@@ -4912,8 -4847,8 +4912,8 @@@ static long btrfs_ioctl_qgroup_assign(s
         /* update qgroup status and info */
         err = btrfs_run_qgroups(trans, root->fs_info);
         if (err < 0)
- -              btrfs_std_error(root->fs_info, ret,
- -                          "failed to update qgroup status and info\n");
+ +              btrfs_handle_fs_error(root->fs_info, err,
+ +                          "failed to update qgroup status and info");
         err = btrfs_end_transaction(trans, root);
         if (err && !ret)
                 ret = err;
@@@ -5459,15 -5394,9 +5459,15 @@@ static int btrfs_ioctl_set_features(str
         if (ret)
                 return ret;
   
+ +      ret = mnt_want_write_file(file);
+ +      if (ret)
+ +              return ret;
+ +
         trans = btrfs_start_transaction(root, 0);
- -      if (IS_ERR(trans))
- -              return PTR_ERR(trans);
+ +      if (IS_ERR(trans)) {
+ +              ret = PTR_ERR(trans);
+ +              goto out_drop_write;
+ +      }
   
         spin_lock(&root->fs_info->super_lock);
         newflags = btrfs_super_compat_flags(super_block);
@@@ -5486,11 -5415,7 +5486,11 @@@
         btrfs_set_super_incompat_flags(super_block, newflags);
         spin_unlock(&root->fs_info->super_lock);
   
- -      return btrfs_commit_transaction(trans, root);
+ +      ret = btrfs_commit_transaction(trans, root);
+ +out_drop_write:
+ +      mnt_drop_write_file(file);
+ +
+ +      return ret;
   }
   
   long btrfs_ioctl(struct file *file, unsigned int
@@@ -5534,8 -5459,6 +5534,8 @@@
                 return btrfs_ioctl_add_dev(root, argp);
         case BTRFS_IOC_RM_DEV:
                 return btrfs_ioctl_rm_dev(file, argp);
+ +      case BTRFS_IOC_RM_DEV_V2:
+ +              return btrfs_ioctl_rm_dev_v2(file, argp);
         case BTRFS_IOC_FS_INFO:
                 return btrfs_ioctl_fs_info(root, argp);
         case BTRFS_IOC_DEV_INFO:
@@@ -5629,24 -5552,3 +5629,24 @@@
   
         return -ENOTTY;
   }
+ +
+ +#ifdef CONFIG_COMPAT
+ +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+ +{
+ +      switch (cmd) {
+ +      case FS_IOC32_GETFLAGS:
+ +              cmd = FS_IOC_GETFLAGS;
+ +              break;
+ +      case FS_IOC32_SETFLAGS:
+ +              cmd = FS_IOC_SETFLAGS;
+ +              break;
+ +      case FS_IOC32_GETVERSION:
+ +              cmd = FS_IOC_GETVERSION;
+ +              break;
+ +      default:
+ +              return -ENOIOCTLCMD;
+ +      }
+ +
+ +      return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+ +}
+ +#endif
diff --combined fs/btrfs/relocation.c

index 1c29514d8aff828c3e696f9d8fe6dd21af83a232,054d9a80e77e58e134c3042647b847f0fc9a36a7..1cfd35cfac761cdacc43918060f248a8180881cd
--- 1/fs/btrfs/relocation.c
--- 2/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@@ -2418,7 -2418,7 +2418,7 @@@ again
         }
   out:
         if (ret) {
- -              btrfs_std_error(root->fs_info, ret, NULL);
+ +              btrfs_handle_fs_error(root->fs_info, ret, NULL);
                 if (!list_empty(&reloc_roots))
                         free_reloc_roots(&reloc_roots);
   
@@@ -4254,12 -4254,11 +4254,11 @@@ int btrfs_relocate_block_group(struct b
         btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
                rc->block_group->key.objectid, rc->block_group->flags);
   
-       ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
-       if (ret < 0) {
-               err = ret;
-               goto out;
-       }
-       btrfs_wait_ordered_roots(fs_info, -1);
+       btrfs_wait_block_group_reservations(rc->block_group);
+       btrfs_wait_nocow_writers(rc->block_group);
+       btrfs_wait_ordered_roots(fs_info, -1,
+                                rc->block_group->key.objectid,
+                                rc->block_group->key.offset);
   
         while (1) {
                 mutex_lock(&fs_info->cleaner_mutex);
diff --combined fs/btrfs/super.c

index bc060cf2675cae09c06b6c6ecab7874c622ace14,89d134794d47d65b5e841e5b078210c0f3870849..bf71071ab6f6c48180837b3e85b818ad1cc60968
--- 1/fs/btrfs/super.c
--- 2/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@@ -97,6 -97,15 +97,6 @@@ const char *btrfs_decode_error(int errn
         return errstr;
   }
   
- -static void save_error_info(struct btrfs_fs_info *fs_info)
- -{
- -      /*
- -       * today we only save the error info into ram.  Long term we'll
- -       * also send it down to the disk
- -       */
- -      set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
- -}
- -
   /* btrfs handle error by forcing the filesystem readonly */
   static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
   {
@@@ -122,11 -131,11 +122,11 @@@
   }
   
   /*
- - * __btrfs_std_error decodes expected errors from the caller and
+ + * __btrfs_handle_fs_error decodes expected errors from the caller and
    * invokes the approciate error response.
    */
   __cold
- -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+ +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
                        unsigned int line, int errno, const char *fmt, ...)
   {
         struct super_block *sb = fs_info->sb;
@@@ -161,13 -170,8 +161,13 @@@
         }
   #endif
   
+ +      /*
+ +       * Today we only save the error info to memory.  Long term we'll
+ +       * also send it down to the disk
+ +       */
+ +      set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+ +
         /* Don't go through full error handling during mount */
- -      save_error_info(fs_info);
         if (sb->s_flags & MS_BORN)
                 btrfs_handle_error(fs_info);
   }
@@@ -248,7 -252,7 +248,7 @@@ void __btrfs_abort_transaction(struct b
         /* Wake up anybody who may be waiting on this transaction */
         wake_up(&root->fs_info->transaction_wait);
         wake_up(&root->fs_info->transaction_blocked_wait);
- -      __btrfs_std_error(root->fs_info, function, line, errno, NULL);
+ +      __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL);
   }
   /*
    * __btrfs_panic decodes unexpected, fatal errors from the caller,
@@@ -1156,7 -1160,7 +1156,7 @@@ int btrfs_sync_fs(struct super_block *s
                 return 0;
         }
   
-       btrfs_wait_ordered_roots(fs_info, -1);
+       btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
   
         trans = btrfs_attach_transaction_barrier(root);
         if (IS_ERR(trans)) {
@@@ -1484,10 -1488,10 +1484,10 @@@ static int setup_security_options(struc
                 memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
         } else {
                 /*
- -               * Since SELinux(the only one supports security_mnt_opts) does
- -               * NOT support changing context during remount/mount same sb,
- -               * This must be the same or part of the same security options,
- -               * just free it.
+ +               * Since SELinux (the only one supporting security_mnt_opts)
+ +               * does NOT support changing context during remount/mount of
+ +               * the same sb, this must be the same or part of the same
+ +               * security options, just free it.
                  */
                 security_free_mnt_opts(sec_opts);
         }
@@@ -1665,8 -1669,8 +1665,8 @@@ static inline void btrfs_remount_cleanu
                                          unsigned long old_opts)
   {
         /*
- -       * We need cleanup all defragable inodes if the autodefragment is
- -       * close or the fs is R/O.
+ +       * We need to cleanup all defragable inodes if the autodefragment is
+ +       * close or the filesystem is read only.
          */
         if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
             (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
@@@ -2047,10 -2051,9 +2047,10 @@@ static int btrfs_statfs(struct dentry *
         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
         int ret;
         u64 thresh = 0;
+ +      int mixed = 0;
   
         /*
- -       * holding chunk_muext to avoid allocating new chunks, holding
+ +       * holding chunk_mutex to avoid allocating new chunks, holding
          * device_list_mutex to avoid the device being removed
          */
         rcu_read_lock();
@@@ -2073,17 -2076,8 +2073,17 @@@
                                 }
                         }
                 }
- -              if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
- -                      total_free_meta += found->disk_total - found->disk_used;
+ +
+ +              /*
+ +               * Metadata in mixed block group profiles are accounted in data
+ +               */
+ +              if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ +                      if (found->flags & BTRFS_BLOCK_GROUP_DATA)
+ +                              mixed = 1;
+ +                      else
+ +                              total_free_meta += found->disk_total -
+ +                                      found->disk_used;
+ +              }
   
                 total_used += found->disk_used;
         }
@@@ -2096,11 -2090,7 +2096,11 @@@
   
         /* Account global block reserve as used, it's in logical size already */
         spin_lock(&block_rsv->lock);
- -      buf->f_bfree -= block_rsv->size >> bits;
+ +      /* Mixed block groups accounting is not byte-accurate, avoid overflow */
+ +      if (buf->f_bfree >= block_rsv->size >> bits)
+ +              buf->f_bfree -= block_rsv->size >> bits;
+ +      else
+ +              buf->f_bfree = 0;
         spin_unlock(&block_rsv->lock);
   
         buf->f_bavail = div_u64(total_free_data, factor);
@@@ -2125,7 -2115,7 +2125,7 @@@
          */
         thresh = 4 * 1024 * 1024;
   
- -      if (total_free_meta - thresh < block_rsv->size)
+ +      if (!mixed && total_free_meta - thresh < block_rsv->size)
                 buf->f_bavail = 0;
   
         buf->f_type = BTRFS_SUPER_MAGIC;
diff --combined fs/btrfs/transaction.c

index abf3a4604ac6f19a43378bcd42e2b3267d9b26d2,f0bb54a773148c5acfeb2c235444da0854bcf866..5b0b758a3f79d108a6b5f9dc3bac1ad95a9b4057
--- 1/fs/btrfs/transaction.c
--- 2/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@@ -311,11 -311,10 +311,11 @@@ loop
    * when the transaction commits
    */
   static int record_root_in_trans(struct btrfs_trans_handle *trans,
- -                             struct btrfs_root *root)
+ +                             struct btrfs_root *root,
+ +                             int force)
   {
- -      if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
- -          root->last_trans < trans->transid) {
+ +      if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ +          root->last_trans < trans->transid) || force) {
                 WARN_ON(root == root->fs_info->extent_root);
                 WARN_ON(root->commit_root != root->node);
   
@@@ -332,7 -331,7 +332,7 @@@
                 smp_wmb();
   
                 spin_lock(&root->fs_info->fs_roots_radix_lock);
- -              if (root->last_trans == trans->transid) {
+ +              if (root->last_trans == trans->transid && !force) {
                         spin_unlock(&root->fs_info->fs_roots_radix_lock);
                         return 0;
                 }
@@@ -403,7 -402,7 +403,7 @@@ int btrfs_record_root_in_trans(struct b
                 return 0;
   
         mutex_lock(&root->fs_info->reloc_mutex);
- -      record_root_in_trans(trans, root);
+ +      record_root_in_trans(trans, root, 0);
         mutex_unlock(&root->fs_info->reloc_mutex);
   
         return 0;
@@@ -1311,97 -1310,6 +1311,97 @@@ int btrfs_defrag_root(struct btrfs_roo
         return ret;
   }
   
+ +/* Bisectability fixup, remove in 4.8 */
+ +#ifndef btrfs_std_error
+ +#define btrfs_std_error btrfs_handle_fs_error
+ +#endif
+ +
+ +/*
+ + * Do all the special snapshot-related qgroup dirty hacks.
+ + *
+ + * This does all the needed qgroup inheritance and dirty hacks, like switching
+ + * commit roots inside one transaction and writing all btrees to disk, to make
+ + * qgroups work.
+ + */
+ +static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
+ +                                 struct btrfs_root *src,
+ +                                 struct btrfs_root *parent,
+ +                                 struct btrfs_qgroup_inherit *inherit,
+ +                                 u64 dst_objectid)
+ +{
+ +      struct btrfs_fs_info *fs_info = src->fs_info;
+ +      int ret;
+ +
+ +      /*
+ +       * Save some performance in the case that qgroups are not
+ +       * enabled. If this check races with the ioctl, rescan will
+ +       * kick in anyway.
+ +       */
+ +      mutex_lock(&fs_info->qgroup_ioctl_lock);
+ +      if (!fs_info->quota_enabled) {
+ +              mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ +              return 0;
+ +      }
+ +      mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ +
+ +      /*
+ +       * We are going to commit transaction, see btrfs_commit_transaction()
+ +       * comment for reason locking tree_log_mutex
+ +       */
+ +      mutex_lock(&fs_info->tree_log_mutex);
+ +
+ +      ret = commit_fs_roots(trans, src);
+ +      if (ret)
+ +              goto out;
+ +      ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+ +      if (ret < 0)
+ +              goto out;
+ +      ret = btrfs_qgroup_account_extents(trans, fs_info);
+ +      if (ret < 0)
+ +              goto out;
+ +
+ +      /* Now qgroup are all updated, we can inherit it to new qgroups */
+ +      ret = btrfs_qgroup_inherit(trans, fs_info,
+ +                                 src->root_key.objectid, dst_objectid,
+ +                                 inherit);
+ +      if (ret < 0)
+ +              goto out;
+ +
+ +      /*
+ +       * Now we do a simplified commit transaction, which will:
+ +       * 1) commit all subvolume and extent tree
+ +       *    To ensure all subvolume and extent tree have a valid
+ +       *    commit_root to accounting later insert_dir_item()
+ +       * 2) write all btree blocks onto disk
+ +       *    This is to make sure later btree modification will be cowed
+ +       *    Or commit_root can be populated and cause wrong qgroup numbers
+ +       * In this simplified commit, we don't really care about other trees
+ +       * like chunk and root tree, as they won't affect qgroup.
+ +       * And we don't write super to avoid half committed status.
+ +       */
+ +      ret = commit_cowonly_roots(trans, src);
+ +      if (ret)
+ +              goto out;
+ +      switch_commit_roots(trans->transaction, fs_info);
+ +      ret = btrfs_write_and_wait_transaction(trans, src);
+ +      if (ret)
+ +              btrfs_std_error(fs_info, ret,
+ +                      "Error while writing out transaction for qgroup");
+ +
+ +out:
+ +      mutex_unlock(&fs_info->tree_log_mutex);
+ +
+ +      /*
+ +       * Force parent root to be updated, as we recorded it before so its
+ +       * last_trans == cur_transid.
+ +       * Otherwise it won't be committed to disk again after the later
+ +       * insert_dir_item().
+ +       */
+ +      if (!ret)
+ +              record_root_in_trans(trans, parent, 1);
+ +      return ret;
+ +}
+ +
   /*
    * new snapshots need to be created at a very specific time in the
    * transaction commit.  This does the actual creation.
@@@ -1475,7 -1383,7 +1475,7 @@@ static noinline int create_pending_snap
         dentry = pending->dentry;
         parent_inode = pending->dir;
         parent_root = BTRFS_I(parent_inode)->root;
- -      record_root_in_trans(trans, parent_root);
+ +      record_root_in_trans(trans, parent_root, 0);
   
         cur_time = current_fs_time(parent_inode->i_sb);
   
@@@ -1512,7 -1420,7 +1512,7 @@@
                 goto fail;
         }
   
- -      record_root_in_trans(trans, root);
+ +      record_root_in_trans(trans, root, 0);
         btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
         memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
         btrfs_check_and_init_root_item(new_root_item);
@@@ -1608,17 -1516,6 +1608,17 @@@
                 goto fail;
         }
   
+ +      /*
+ +       * Do special qgroup accounting for the snapshot, as we use a qgroup
+ +       * snapshot hack to make snapshots fast.
+ +       * To co-operate with that hack, we apply another hack here.
+ +       * Otherwise snapshot will be greatly slowed down by a subtree qgroup rescan
+ +       */
+ +      ret = qgroup_account_snapshot(trans, root, parent_root,
+ +                                    pending->inherit, objectid);
+ +      if (ret < 0)
+ +              goto fail;
+ +
         ret = btrfs_insert_dir_item(trans, parent_root,
                                     dentry->d_name.name, dentry->d_name.len,
                                     parent_inode, &key,
@@@ -1662,6 -1559,23 +1662,6 @@@
                 goto fail;
         }
   
- -      /*
- -       * account qgroup counters before qgroup_inherit()
- -       */
- -      ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
- -      if (ret)
- -              goto fail;
- -      ret = btrfs_qgroup_account_extents(trans, fs_info);
- -      if (ret)
- -              goto fail;
- -      ret = btrfs_qgroup_inherit(trans, fs_info,
- -                                 root->root_key.objectid,
- -                                 objectid, pending->inherit);
- -      if (ret) {
- -              btrfs_abort_transaction(trans, root, ret);
- -              goto fail;
- -      }
- -
   fail:
         pending->error = ret;
   dir_item_existed:
@@@ -1907,7 -1821,7 +1907,7 @@@ static inline int btrfs_start_delalloc_
   static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
   {
         if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-               btrfs_wait_ordered_roots(fs_info, -1);
+               btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
   }
   
   static inline void
@@@ -2231,7 -2145,7 +2231,7 @@@ int btrfs_commit_transaction(struct btr
   
         ret = btrfs_write_and_wait_transaction(trans, root);
         if (ret) {
- -              btrfs_std_error(root->fs_info, ret,
+ +              btrfs_handle_fs_error(root->fs_info, ret,
                             "Error while writing out transaction");
                 mutex_unlock(&root->fs_info->tree_log_mutex);
                 goto scrub_continue;
diff --combined fs/btrfs/tree-log.c

index 16a74d1a272024e5c734fd876b664d7c769f0ee9,003a826f4cffbc58bd6e2041cc22371e33b96319..6aaab31a722e0cf553919201f0a37e278f1f160a
--- 1/fs/btrfs/tree-log.c
--- 2/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@@ -4141,6 -4141,7 +4141,7 @@@ static int btrfs_log_changed_extents(st
   
         INIT_LIST_HEAD(&extents);
   
+       down_write(&BTRFS_I(inode)->dio_sem);
         write_lock(&tree->lock);
         test_gen = root->fs_info->last_trans_committed;
   
@@@ -4169,13 -4170,20 +4170,20 @@@
         }
   
         list_sort(NULL, &extents, extent_cmp);
+       btrfs_get_logged_extents(inode, logged_list, start, end);
         /*
-        * Collect any new ordered extents within the range. This is to
-        * prevent logging file extent items without waiting for the disk
-        * location they point to being written. We do this only to deal
-        * with races against concurrent lockless direct IO writes.
+        * Some ordered extents started by fsync might have completed
+        * before we could collect them into the list logged_list, which
+        * means they're gone, not in our logged_list nor in the inode's
+        * ordered tree. We want the application/user space to know an
+        * error happened while attempting to persist file data so that
+        * it can take proper action. If such error happened, we leave
+        * without writing to the log tree and the fsync must report the
+        * file data write error and not commit the current transaction.
          */
-       btrfs_get_logged_extents(inode, logged_list, start, end);
+       ret = btrfs_inode_check_errors(inode);
+       if (ret)
+               ctx->io_err = ret;
   process:
         while (!list_empty(&extents)) {
                 em = list_entry(extents.next, struct extent_map, list);
@@@ -4202,6 -4210,7 +4210,7 @@@
         }
         WARN_ON(!list_empty(&extents));
         write_unlock(&tree->lock);
+       up_write(&BTRFS_I(inode)->dio_sem);
   
         btrfs_release_path(path);
         return ret;
@@@ -4622,23 -4631,6 +4631,6 @@@ static int btrfs_log_inode(struct btrfs
   
         mutex_lock(&BTRFS_I(inode)->log_mutex);
   
-       /*
-        * Collect ordered extents only if we are logging data. This is to
-        * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
-        * will process the ordered extents if they still exists at the time,
-        * because when we collect them we test and set for the flag
-        * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
-        * same ordered extents. The consequence for the LOG_INODE_ALL log mode
-        * not processing the ordered extents is that we end up logging the
-        * corresponding file extent items, based on the extent maps in the
-        * inode's extent_map_tree's modified_list, without logging the
-        * respective checksums (since the may still be only attached to the
-        * ordered extents and have not been inserted in the csum tree by
-        * btrfs_finish_ordered_io() yet).
-        */
-       if (inode_only == LOG_INODE_ALL)
-               btrfs_get_logged_extents(inode, &logged_list, start, end);
- 
         /*
          * a brute force approach to making sure we get the most uptodate
          * copies of everything.
@@@ -4846,21 -4838,6 +4838,6 @@@ log_extents
                         goto out_unlock;
         }
         if (fast_search) {
-               /*
-                * Some ordered extents started by fsync might have completed
-                * before we collected the ordered extents in logged_list, which
-                * means they're gone, not in our logged_list nor in the inode's
-                * ordered tree. We want the application/user space to know an
-                * error happened while attempting to persist file data so that
-                * it can take proper action. If such error happened, we leave
-                * without writing to the log tree and the fsync must report the
-                * file data write error and not commit the current transaction.
-                */
-               err = btrfs_inode_check_errors(inode);
-               if (err) {
-                       ctx->io_err = err;
-                       goto out_unlock;
-               }
                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
                                                 &logged_list, ctx, start, end);
                 if (ret) {
@@@ -5158,7 -5135,7 +5135,7 @@@ process_leaf
                         }
   
                         ctx->log_new_dentries = false;
-                       if (type == BTRFS_FT_DIR)
+                       if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
                                 log_mode = LOG_INODE_ALL;
                         btrfs_release_path(path);
                         ret = btrfs_log_inode(trans, root, di_inode,
@@@ -5278,11 -5255,16 +5255,16 @@@ static int btrfs_log_all_parents(struc
                         if (IS_ERR(dir_inode))
                                 continue;
   
+                       if (ctx)
+                               ctx->log_new_dentries = false;
                         ret = btrfs_log_inode(trans, root, dir_inode,
                                               LOG_INODE_ALL, 0, LLONG_MAX, ctx);
                         if (!ret &&
                             btrfs_must_commit_transaction(trans, dir_inode))
                                 ret = 1;
+                       if (!ret && ctx && ctx->log_new_dentries)
+                               ret = log_new_dir_dentries(trans, root,
+                                                          dir_inode, ctx);
                         iput(dir_inode);
                         if (ret)
                                 goto out;
@@@ -5519,7 -5501,7 +5501,7 @@@ int btrfs_recover_log_trees(struct btrf
   
         ret = walk_log_tree(trans, log_root_tree, &wc);
         if (ret) {
- -              btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
+ +              btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while "
                             "recovering log root tree.");
                 goto error;
         }
@@@ -5533,7 -5515,7 +5515,7 @@@ again
                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
   
                 if (ret < 0) {
- -                      btrfs_std_error(fs_info, ret,
+ +                      btrfs_handle_fs_error(fs_info, ret,
                                     "Couldn't find tree log root.");
                         goto error;
                 }
@@@ -5551,7 -5533,7 +5533,7 @@@
                 log = btrfs_read_fs_root(log_root_tree, &found_key);
                 if (IS_ERR(log)) {
                         ret = PTR_ERR(log);
- -                      btrfs_std_error(fs_info, ret,
+ +                      btrfs_handle_fs_error(fs_info, ret,
                                     "Couldn't read tree log root.");
                         goto error;
                 }
@@@ -5566,7 -5548,7 +5548,7 @@@
                         free_extent_buffer(log->node);
                         free_extent_buffer(log->commit_root);
                         kfree(log);
- -                      btrfs_std_error(fs_info, ret, "Couldn't read target root "
+ +                      btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root "
                                     "for tree log recovery.");
                         goto error;
                 }
@@@ -5652,11 -5634,9 +5634,9 @@@ void btrfs_record_unlink_dir(struct btr
          * into the file.  When the file is logged we check it and
          * don't log the parents if the file is fully on disk.
          */
-       if (S_ISREG(inode->i_mode)) {
-               mutex_lock(&BTRFS_I(inode)->log_mutex);
-               BTRFS_I(inode)->last_unlink_trans = trans->transid;
-               mutex_unlock(&BTRFS_I(inode)->log_mutex);
-       }
+       mutex_lock(&BTRFS_I(inode)->log_mutex);
+       BTRFS_I(inode)->last_unlink_trans = trans->transid;
+       mutex_unlock(&BTRFS_I(inode)->log_mutex);
   
         /*
          * if this directory was already logged any new
author	Chris Mason <clm@fb.com>
	Tue, 17 May 2016 21:43:19 +0000 (14:43 -0700)
committer	Chris Mason <clm@fb.com>
	Tue, 17 May 2016 21:43:19 +0000 (14:43 -0700)
		1	2
fs/btrfs/ctree.h	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/dev-replace.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/extent-tree.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/ioctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/relocation.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/super.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/transaction.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/tree-log.c	patch \|	diff1 \|	diff2 \|	blob \| history