Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
author    Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 26 May 2011 16:53:20 +0000 (09:53 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 26 May 2011 16:53:20 +0000 (09:53 -0700)
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (61 commits)
  jbd2: Add MAINTAINERS entry
  jbd2: fix a potential leak of a journal_head on an error path
  ext4: teach ext4_ext_split to calculate extents efficiently
  ext4: Convert ext4 to new truncate calling convention
  ext4: do not normalize block requests from fallocate()
  ext4: enable "punch hole" functionality
  ext4: add "punch hole" flag to ext4_map_blocks()
  ext4: punch out extents
  ext4: add new function ext4_block_zero_page_range()
  ext4: add flag to ext4_has_free_blocks
  ext4: reserve inodes and feature code for 'quota' feature
  ext4: add support for multiple mount protection
  ext4: ensure f_bfree returned by ext4_statfs() is non-negative
  ext4: protect bb_first_free in ext4_trim_all_free() with group lock
  ext4: only load buddy bitmap in ext4_trim_fs() when it is needed
  jbd2: Fix comment to match the code in jbd2__journal_start()
  ext4: fix waiting and sending of a barrier in ext4_sync_file()
  jbd2: Add function jbd2_trans_will_send_data_barrier()
  jbd2: fix sending of data flush on journal commit
  ext4: fix ext4_ext_fiemap_cb() to handle blocks before request range correctly
  ...

24 files changed:
Documentation/filesystems/ext4.txt
MAINTAINERS
fs/ext4/Makefile
fs/ext4/balloc.c
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/fsync.c
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/mballoc.h
fs/ext4/migrate.c
fs/ext4/mmp.c [new file with mode: 0644]
fs/ext4/move_extent.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
include/linux/jbd2.h
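
Several commits in this series add the "punch hole" functionality; userspace reaches it through fallocate(2) with FALLOC_FL_PUNCH_HOLE, which the kernel requires to be combined with FALLOC_FL_KEEP_SIZE. A minimal illustrative sketch (not part of this commit):

/* Punch a 1 MiB hole into an existing file via fallocate(2). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>	/* FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Deallocate 1 MiB starting at offset 4096; the file size is kept. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4096, 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}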

diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index c79ec58fd7f6f3df8ee32d98448cf4acfe5d7705..3ae9bc94352a660f2d3ed9feccc0b3aa8955ffcc 100644
@@ -226,10 +226,6 @@ acl                        Enables POSIX Access Control Lists support.
 noacl                  This option disables POSIX Access Control List
                        support.
 
-reservation
-
-noreservation
-
 bsddf          (*)     Make 'df' act like BSD.
 minixdf                        Make 'df' act like Minix.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 1ab17de642e572298ef32b38b5a69198739b7cf8..d54d551004f7569202d410c78bc730997c0d8726 100644
@@ -3572,9 +3572,16 @@ M:       Andrew Morton <akpm@linux-foundation.org>
 M:     Jan Kara <jack@suse.cz>
 L:     linux-ext4@vger.kernel.org
 S:     Maintained
-F:     fs/jbd*/
-F:     include/linux/ext*jbd*.h
-F:     include/linux/jbd*.h
+F:     fs/jbd/
+F:     include/linux/ext3_jbd.h
+F:     include/linux/jbd.h
+
+JOURNALLING LAYER FOR BLOCK DEVICES (JBD2)
+M:     "Theodore Ts'o" <tytso@mit.edu>
+L:     linux-ext4@vger.kernel.org
+S:     Maintained
+F:     fs/jbd2/
+F:     include/linux/jbd2.h
 
 JSM Neo PCI based serial card
 M:     Breno Leitao <leitao@linux.vnet.ibm.com>
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6c9d568b0c00b28c778ac2216996d5..04109460ba9e3f22d0115f8cc8b16d4e9b13d3f6 100644
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
                ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-               ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+               ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+               mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)           += xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)       += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139ad4b422119c76d10f384dad9eabdf9158..264f6949511ef842169438571049ccb736955531 100644
@@ -361,130 +361,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
        return bh;
 }
 
-/**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle:                    handle to this transaction
- * @sb:                                super block
- * @block:                     start physcial block to add to the block group
- * @count:                     number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-                        ext4_fsblk_t block, unsigned long count)
-{
-       struct buffer_head *bitmap_bh = NULL;
-       struct buffer_head *gd_bh;
-       ext4_group_t block_group;
-       ext4_grpblk_t bit;
-       unsigned int i;
-       struct ext4_group_desc *desc;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       int err = 0, ret, blk_free_count;
-       ext4_grpblk_t blocks_freed;
-       struct ext4_group_info *grp;
-
-       ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
-       ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
-       grp = ext4_get_group_info(sb, block_group);
-       /*
-        * Check to see if we are freeing blocks across a group
-        * boundary.
-        */
-       if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-               goto error_return;
-       }
-       bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-       if (!bitmap_bh)
-               goto error_return;
-       desc = ext4_get_group_desc(sb, block_group, &gd_bh);
-       if (!desc)
-               goto error_return;
-
-       if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-           in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-           in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-           in_range(block + count - 1, ext4_inode_table(sb, desc),
-                    sbi->s_itb_per_group)) {
-               ext4_error(sb, "Adding blocks in system zones - "
-                          "Block = %llu, count = %lu",
-                          block, count);
-               goto error_return;
-       }
-
-       /*
-        * We are about to add blocks to the bitmap,
-        * so we need undo access.
-        */
-       BUFFER_TRACE(bitmap_bh, "getting undo access");
-       err = ext4_journal_get_undo_access(handle, bitmap_bh);
-       if (err)
-               goto error_return;
-
-       /*
-        * We are about to modify some metadata.  Call the journal APIs
-        * to unshare ->b_data if a currently-committing transaction is
-        * using it
-        */
-       BUFFER_TRACE(gd_bh, "get_write_access");
-       err = ext4_journal_get_write_access(handle, gd_bh);
-       if (err)
-               goto error_return;
-       /*
-        * make sure we don't allow a parallel init on other groups in the
-        * same buddy cache
-        */
-       down_write(&grp->alloc_sem);
-       for (i = 0, blocks_freed = 0; i < count; i++) {
-               BUFFER_TRACE(bitmap_bh, "clear bit");
-               if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-                                               bit + i, bitmap_bh->b_data)) {
-                       ext4_error(sb, "bit already cleared for block %llu",
-                                  (ext4_fsblk_t)(block + i));
-                       BUFFER_TRACE(bitmap_bh, "bit already cleared");
-               } else {
-                       blocks_freed++;
-               }
-       }
-       ext4_lock_group(sb, block_group);
-       blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
-       ext4_free_blks_set(sb, desc, blk_free_count);
-       desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-       ext4_unlock_group(sb, block_group);
-       percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
-       if (sbi->s_log_groups_per_flex) {
-               ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               atomic_add(blocks_freed,
-                          &sbi->s_flex_groups[flex_group].free_blocks);
-       }
-       /*
-        * request to reload the buddy with the
-        * new bitmap information
-        */
-       set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-       grp->bb_free += blocks_freed;
-       up_write(&grp->alloc_sem);
-
-       /* We dirtied the bitmap block */
-       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-       /* And the group descriptor block */
-       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-       ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-       if (!err)
-               err = ret;
-
-error_return:
-       brelse(bitmap_bh);
-       ext4_std_error(sb, err);
-       return;
-}
-
 /**
  * ext4_has_free_blocks()
  * @sbi:       in-core super block structure.
@@ -493,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+                               s64 nblocks, unsigned int flags)
 {
        s64 free_blocks, dirty_blocks, root_blocks;
        struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
                                                EXT4_FREEBLOCKS_WATERMARK) {
                free_blocks  = percpu_counter_sum_positive(fbc);
                dirty_blocks = percpu_counter_sum_positive(dbc);
-               if (dirty_blocks < 0) {
-                       printk(KERN_CRIT "Dirty block accounting "
-                                       "went wrong %lld\n",
-                                       (long long)dirty_blocks);
-               }
        }
        /* Check whether we have space after
         * accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
        /* Hm, nope.  Are (enough) root reserved blocks available? */
        if (sbi->s_resuid == current_fsuid() ||
            ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-           capable(CAP_SYS_RESOURCE)) {
+           capable(CAP_SYS_RESOURCE) ||
+               (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
                if (free_blocks >= (nblocks + dirty_blocks))
                        return 1;
        }
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-                                               s64 nblocks)
+                          s64 nblocks, unsigned int flags)
 {
-       if (ext4_has_free_blocks(sbi, nblocks)) {
+       if (ext4_has_free_blocks(sbi, nblocks, flags)) {
                percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
                return 0;
        } else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-       if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+       if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
            (*retries)++ > 3 ||
            !EXT4_SB(sb)->s_journal)
                return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-               ext4_fsblk_t goal, unsigned long *count, int *errp)
+                                 ext4_fsblk_t goal, unsigned int flags,
+                                 unsigned long *count, int *errp)
 {
        struct ext4_allocation_request ar;
        ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
        ar.inode = inode;
        ar.goal = goal;
        ar.len = count ? *count : 1;
+       ar.flags = flags;
 
        ret = ext4_mb_new_blocks(handle, &ar, errp);
        if (count)
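
The point of the new flags argument above: the root-reserved block pool, previously available only to privileged callers, can now also be tapped by callers passing EXT4_MB_USE_ROOT_BLOCKS (the punch-hole path sets it, since freeing part of an extent may first require metadata blocks for a split). A standalone model of the modified decision logic, using hypothetical names; the kernel version operates on struct ext4_sb_info and per-CPU counters:

/* Illustrative model of the ext4_has_free_blocks() logic after this patch. */
#include <stdbool.h>

#define EXT4_MB_USE_ROOT_BLOCKS	0x1000	/* from the ext4.h hunk below */

struct fs_counters {
	long long free_blocks;		/* s_freeblocks_counter */
	long long dirty_blocks;		/* s_dirtyblocks_counter */
	long long root_blocks;		/* root-reserved blocks */
	bool privileged;		/* resuid/resgid match or CAP_SYS_RESOURCE */
};

static bool has_free_blocks(const struct fs_counters *c, long long nblocks,
			    unsigned int flags)
{
	/* Ordinary allocations must leave the root reservation intact. */
	if (c->free_blocks - (nblocks + c->root_blocks + c->dirty_blocks) >= 0)
		return true;
	/* Privileged callers, and now EXT4_MB_USE_ROOT_BLOCKS callers,
	 * may dip into the reserved pool. */
	if (c->privileged || (flags & EXT4_MB_USE_ROOT_BLOCKS))
		return c->free_blocks >= nblocks + c->dirty_blocks;
	return false;
}

int main(void)
{
	/* 1000 free, 50 dirty, 100 reserved: an unprivileged request for 900
	 * blocks succeeds only because the flag unlocks the reservation. */
	struct fs_counters c = { 1000, 50, 100, false };
	return has_free_blocks(&c, 900, EXT4_MB_USE_ROOT_BLOCKS) ? 0 : 1;
}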
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b753f4fa037fd783ac34e47e15d1438b55..a74b89c09f90f1dcc587b9e5b81b0dfc222257db 100644
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED      0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC           0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS                0x1000
 
 struct ext4_allocation_request {
        /* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
  */
 #define        EXT4_BAD_INO             1      /* Bad blocks inode */
 #define EXT4_ROOT_INO           2      /* Root inode */
+#define EXT4_USR_QUOTA_INO      3      /* User quota inode */
+#define EXT4_GRP_QUOTA_INO      4      /* Group quota inode */
 #define EXT4_BOOT_LOADER_INO    5      /* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO      6      /* Undelete directory inode */
 #define EXT4_RESIZE_INO                 7      /* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
        /* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT         (EXT4_GET_BLOCKS_CONVERT|\
                                         EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+       /* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT          0x0020
+       /* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE           0x0040
 
 /*
  * Flags used by ext4_free_blocks
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
        __le16  s_want_extra_isize;     /* New inodes should reserve # bytes */
        __le32  s_flags;                /* Miscellaneous flags */
        __le16  s_raid_stride;          /* RAID stride */
-       __le16  s_mmp_interval;         /* # seconds to wait in MMP checking */
+       __le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
        __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
        unsigned long s_ext_blocks;
        unsigned long s_ext_extents;
 #endif
+       /* ext4 extent cache stats */
+       unsigned long extent_cache_hits;
+       unsigned long extent_cache_misses;
 
        /* for buddy allocator */
        struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
        struct ext4_li_request *s_li_request;
        /* Wait multiplier for lazy initialization thread */
        unsigned int s_li_wait_mult;
+
+       /* Kernel thread for multiple mount protection */
+       struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM                0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK       0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE     0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA           0x0100
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION      0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE         0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_EA_INODE         0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA          0x1000 /* data in dirent */
 
+#define EXT2_FEATURE_COMPAT_SUPP       EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP     (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+                                        EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP    (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+                                        EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP       EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP     (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+                                        EXT4_FEATURE_INCOMPAT_RECOVER| \
+                                        EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP    (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+                                        EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
 #define EXT4_FEATURE_COMPAT_SUPP       EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP     (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG| \
                                         EXT4_FEATURE_INCOMPAT_EXTENTS| \
                                         EXT4_FEATURE_INCOMPAT_64BIT| \
-                                        EXT4_FEATURE_INCOMPAT_FLEX_BG)
+                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+                                        EXT4_FEATURE_INCOMPAT_MMP)
 #define EXT4_FEATURE_RO_COMPAT_SUPP    (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
  */
 struct ext4_lazy_init {
        unsigned long           li_state;
-
-       wait_queue_head_t       li_wait_daemon;
-       wait_queue_head_t       li_wait_task;
-       struct timer_list       li_timer;
-       struct task_struct      *li_task;
-
        struct list_head        li_request_list;
        struct mutex            li_list_mtx;
 };
@@ -1614,6 +1638,67 @@ struct ext4_features {
        struct completion f_kobj_unregister;
 };
 
+/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+       __le32  mmp_magic;              /* Magic number for MMP */
+       __le32  mmp_seq;                /* Sequence no. updated periodically */
+
+       /*
+        * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+        * purposes and do not affect the correctness of the algorithm
+        */
+       __le64  mmp_time;               /* Time last updated */
+       char    mmp_nodename[64];       /* Node which last updated MMP block */
+       char    mmp_bdevname[32];       /* Bdev which last updated MMP block */
+
+       /*
+        * mmp_check_interval is used to verify if the MMP block has been
+        * updated on the block device. The value is updated based on the
+        * maximum time to write the MMP block during an update cycle.
+        */
+       __le16  mmp_check_interval;
+
+       __le16  mmp_pad1;
+       __le32  mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+       struct buffer_head *bh; /* bh from initial read_mmp_block() */
+       struct super_block *sb;  /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT            2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL    5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL    300UL
+
 /*
  * Function prototypes
  */
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-                               ext4_fsblk_t block, unsigned long count);
+                                        ext4_fsblk_t goal,
+                                        unsigned int flags,
+                                        unsigned long *count,
+                                        int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+                                 s64 nblocks, unsigned int flags);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+                               ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+               struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
                                                       __LINE__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+                          const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg)     __dump_mmp_msg(sb, mmp, __func__, \
+                                                      __LINE__, msg)
 extern void __ext4_grp_locked_error(const char *, unsigned int, \
                                    struct super_block *, ext4_group_t, \
                                    unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                               struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+                               loff_t length);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               int len,
                               struct writeback_control *wbc);
 
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
        BH_Uninit       /* blocks are allocated but uninitialized on disk */
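
The protocol implied by the MMP structure and constants above: a node that wants to mount reads the block at s_mmp_block, verifies mmp_magic, records mmp_seq, waits at least mmp_check_interval seconds, and re-reads; an unchanged sequence suggests no other node is updating the block. An illustrative userspace-style sketch, assuming a hypothetical read_mmp_block() helper:

/* Illustrative MMP liveness probe; read_mmp_block() is a hypothetical
 * helper, and the __le32/__le16 byte-order conversions are omitted. */
#include <stdint.h>
#include <unistd.h>

#define EXT4_MMP_MAGIC     0x004D4D50U
#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U
#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU

struct mmp_block {
	uint32_t mmp_magic;
	uint32_t mmp_seq;
	uint16_t mmp_check_interval;
	/* remaining fields omitted */
};

/* Hypothetical: read the block at s_mmp_block from the device into *m. */
int read_mmp_block(int fd, uint64_t mmp_blkno, struct mmp_block *m);

/* Return 0 if the filesystem looks unused, -1 if it is (or may be) busy. */
static int mmp_fs_is_free(int fd, uint64_t mmp_blkno)
{
	struct mmp_block m;
	uint32_t seq;

	if (read_mmp_block(fd, mmp_blkno, &m) || m.mmp_magic != EXT4_MMP_MAGIC)
		return -1;
	seq = m.mmp_seq;
	if (seq == EXT4_MMP_SEQ_CLEAN)
		return 0;		/* cleanly unmounted */
	if (seq > EXT4_MMP_SEQ_MAX)
		return -1;		/* being fscked, or unknown state */

	/* Wait one advertised interval; an updated seq means another node
	 * is actively maintaining the MMP block. */
	sleep(m.mmp_check_interval);
	if (read_mmp_block(fd, mmp_blkno, &m))
		return -1;
	return m.mmp_seq == seq ? 0 : -1;
}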
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96c4938cc357ef498c760bde67d39e..f5240aa15601a6b34c40b339b0153e6f8bc6867d 100644
@@ -6,20 +6,6 @@
 
 #include <trace/events/ext4.h>
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-                                  handle_t *handle, struct buffer_head *bh)
-{
-       int err = 0;
-
-       if (ext4_handle_valid(handle)) {
-               err = jbd2_journal_get_undo_access(handle, bh);
-               if (err)
-                       ext4_journal_abort_handle(where, line, __func__, bh,
-                                                 handle, err);
-       }
-       return err;
-}
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct buffer_head *bh)
 {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f53538a57fd8663b03443898e4d12492ff399b..bb85757689b6af513dd556559a407049a1fc9782 100644
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
                               const char *err_fn,
                struct buffer_head *bh, handle_t *handle, int err);
 
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
-                                  handle_t *handle, struct buffer_head *bh);
-
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct buffer_head *bh);
 
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
                              handle_t *handle, struct super_block *sb);
 
-#define ext4_journal_get_undo_access(handle, bh) \
-       __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
        __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f3ad15a976417ae542296a0051fe97bc27..5199bac7fc625d5a19ad5934417f724fb699fe22 100644
 
 #include <trace/events/ext4.h>
 
+static int ext4_split_extent(handle_t *handle,
+                               struct inode *inode,
+                               struct ext4_ext_path *path,
+                               struct ext4_map_blocks *map,
+                               int split_flag,
+                               int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
                                            struct inode *inode,
                                            int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 static ext4_fsblk_t
 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path,
-                       struct ext4_extent *ex, int *err)
+                       struct ext4_extent *ex, int *err, unsigned int flags)
 {
        ext4_fsblk_t goal, newblock;
 
        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-       newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+       newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+                                       NULL, err);
        return newblock;
 }
 
@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
        }
        ext_debug("\n");
 }
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+                       ext4_fsblk_t newblock, int level)
+{
+       int depth = ext_depth(inode);
+       struct ext4_extent *ex;
+
+       if (depth != level) {
+               struct ext4_extent_idx *idx;
+               idx = path[level].p_idx;
+               while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+                       ext_debug("%d: move %d:%llu in new index %llu\n", level,
+                                       le32_to_cpu(idx->ei_block),
+                                       ext4_idx_pblock(idx),
+                                       newblock);
+                       idx++;
+               }
+
+               return;
+       }
+
+       ex = path[depth].p_ext;
+       while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+               ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+                               le32_to_cpu(ex->ee_block),
+                               ext4_ext_pblock(ex),
+                               ext4_ext_is_uninitialized(ex),
+                               ext4_ext_get_actual_len(ex),
+                               newblock);
+               ex++;
+       }
+}
+
 #else
 #define ext4_ext_show_path(inode, path)
 #define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
 #endif
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
  * - initializes subtree
  */
 static int ext4_ext_split(handle_t *handle, struct inode *inode,
-                               struct ext4_ext_path *path,
-                               struct ext4_extent *newext, int at)
+                         unsigned int flags,
+                         struct ext4_ext_path *path,
+                         struct ext4_extent *newext, int at)
 {
        struct buffer_head *bh = NULL;
        int depth = ext_depth(inode);
        struct ext4_extent_header *neh;
        struct ext4_extent_idx *fidx;
-       struct ext4_extent *ex;
        int i = at, k, m, a;
        ext4_fsblk_t newblock, oldblock;
        __le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
        ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
        for (a = 0; a < depth - at; a++) {
                newblock = ext4_ext_new_meta_block(handle, inode, path,
-                                                  newext, &err);
+                                                  newext, &err, flags);
                if (newblock == 0)
                        goto cleanup;
                ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
        neh->eh_magic = EXT4_EXT_MAGIC;
        neh->eh_depth = 0;
-       ex = EXT_FIRST_EXTENT(neh);
 
        /* move remainder of path[depth] to the new leaf */
        if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                goto cleanup;
        }
        /* start copy from next extent */
-       /* TODO: we could do it by single memmove */
-       m = 0;
-       path[depth].p_ext++;
-       while (path[depth].p_ext <=
-                       EXT_MAX_EXTENT(path[depth].p_hdr)) {
-               ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
-                               le32_to_cpu(path[depth].p_ext->ee_block),
-                               ext4_ext_pblock(path[depth].p_ext),
-                               ext4_ext_is_uninitialized(path[depth].p_ext),
-                               ext4_ext_get_actual_len(path[depth].p_ext),
-                               newblock);
-               /*memmove(ex++, path[depth].p_ext++,
-                               sizeof(struct ext4_extent));
-               neh->eh_entries++;*/
-               path[depth].p_ext++;
-               m++;
-       }
+       m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+       ext4_ext_show_move(inode, path, newblock, depth);
        if (m) {
-               memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+               struct ext4_extent *ex;
+               ex = EXT_FIRST_EXTENT(neh);
+               memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
                le16_add_cpu(&neh->eh_entries, m);
        }
 
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
                ext_debug("int.index at %d (block %llu): %u -> %llu\n",
                                i, newblock, le32_to_cpu(border), oldblock);
-               /* copy indexes */
-               m = 0;
-               path[i].p_idx++;
 
-               ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
-                               EXT_MAX_INDEX(path[i].p_hdr));
+               /* move remainder of path[i] to the new index block */
                if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
                                        EXT_LAST_INDEX(path[i].p_hdr))) {
                        EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                        err = -EIO;
                        goto cleanup;
                }
-               while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
-                       ext_debug("%d: move %d:%llu in new index %llu\n", i,
-                                       le32_to_cpu(path[i].p_idx->ei_block),
-                                       ext4_idx_pblock(path[i].p_idx),
-                                       newblock);
-                       /*memmove(++fidx, path[i].p_idx++,
-                                       sizeof(struct ext4_extent_idx));
-                       neh->eh_entries++;
-                       BUG_ON(neh->eh_entries > neh->eh_max);*/
-                       path[i].p_idx++;
-                       m++;
-               }
+               /* start copy indexes */
+               m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+               ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+                               EXT_MAX_INDEX(path[i].p_hdr));
+               ext4_ext_show_move(inode, path, newblock, i);
                if (m) {
-                       memmove(++fidx, path[i].p_idx - m,
+                       memmove(++fidx, path[i].p_idx,
                                sizeof(struct ext4_extent_idx) * m);
                        le16_add_cpu(&neh->eh_entries, m);
                }
@@ -1056,8 +1073,9 @@ cleanup:
  *   just created block
  */
 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
-                                       struct ext4_ext_path *path,
-                                       struct ext4_extent *newext)
+                                unsigned int flags,
+                                struct ext4_ext_path *path,
+                                struct ext4_extent *newext)
 {
        struct ext4_ext_path *curp = path;
        struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        ext4_fsblk_t newblock;
        int err = 0;
 
-       newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+       newblock = ext4_ext_new_meta_block(handle, inode, path,
+               newext, &err, flags);
        if (newblock == 0)
                return err;
 
@@ -1140,8 +1159,9 @@ out:
  * if no free index is found, then it requests in-depth growing.
  */
 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
-                                       struct ext4_ext_path *path,
-                                       struct ext4_extent *newext)
+                                   unsigned int flags,
+                                   struct ext4_ext_path *path,
+                                   struct ext4_extent *newext)
 {
        struct ext4_ext_path *curp;
        int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
        if (EXT_HAS_FREE_INDEX(curp)) {
                /* if we found index with free entry, then use that
                 * entry: create all needed subtree and add new leaf */
-               err = ext4_ext_split(handle, inode, path, newext, i);
+               err = ext4_ext_split(handle, inode, flags, path, newext, i);
                if (err)
                        goto out;
 
@@ -1174,7 +1194,8 @@ repeat:
                        err = PTR_ERR(path);
        } else {
                /* tree is full, time to grow in depth */
-               err = ext4_ext_grow_indepth(handle, inode, path, newext);
+               err = ext4_ext_grow_indepth(handle, inode, flags,
+                                           path, newext);
                if (err)
                        goto out;
 
@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
  * 1 if they got merged.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
                                 struct ext4_ext_path *path,
                                 struct ext4_extent *ex)
 {
@@ -1602,6 +1623,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
        return merge_done;
 }
 
+/*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static int ext4_ext_try_to_merge(struct inode *inode,
+                                 struct ext4_ext_path *path,
+                                 struct ext4_extent *ex) {
+       struct ext4_extent_header *eh;
+       unsigned int depth;
+       int merge_done = 0;
+       int ret = 0;
+
+       depth = ext_depth(inode);
+       BUG_ON(path[depth].p_hdr == NULL);
+       eh = path[depth].p_hdr;
+
+       if (ex > EXT_FIRST_EXTENT(eh))
+               merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+       if (!merge_done)
+               ret = ext4_ext_try_to_merge_right(inode, path, ex);
+
+       return ret;
+}
+
 /*
  * check if a portion of the "newext" extent overlaps with an
  * existing extent.
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        int depth, len, err;
        ext4_lblk_t next;
        unsigned uninitialized = 0;
+       int flags = 0;
 
        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
         * There is no free space in the found leaf.
         * We're gonna add a new leaf in the tree.
         */
-       err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+       if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+               flags = EXT4_MB_USE_ROOT_BLOCKS;
+       err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
        if (err)
                goto cleanup;
        depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer.  If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-                       struct ext4_extent *ex)
-{
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+       struct ext4_ext_cache *ex){
        struct ext4_ext_cache *cex;
+       struct ext4_sb_info *sbi;
        int ret = 0;
 
        /*
@@ -2017,25 +2078,59 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
         */
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        cex = &EXT4_I(inode)->i_cached_extent;
+       sbi = EXT4_SB(inode->i_sb);
 
        /* has cache valid data? */
        if (cex->ec_len == 0)
                goto errout;
 
        if (in_range(block, cex->ec_block, cex->ec_len)) {
-               ex->ee_block = cpu_to_le32(cex->ec_block);
-               ext4_ext_store_pblock(ex, cex->ec_start);
-               ex->ee_len = cpu_to_le16(cex->ec_len);
+               memcpy(ex, cex, sizeof(struct ext4_ext_cache));
                ext_debug("%u cached by %u:%u:%llu\n",
                                block,
                                cex->ec_block, cex->ec_len, cex->ec_start);
                ret = 1;
        }
 errout:
+       if (!ret)
+               sbi->extent_cache_misses++;
+       else
+               sbi->extent_cache_hits++;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        return ret;
 }
 
+/*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * extent pointer.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex:    Pointer where the cached extent will be stored
+ *         if it contains block
+ *
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+                       struct ext4_extent *ex)
+{
+       struct ext4_ext_cache cex;
+       int ret = 0;
+
+       if (ext4_ext_check_cache(inode, block, &cex)) {
+               ex->ee_block = cpu_to_le32(cex.ec_block);
+               ext4_ext_store_pblock(ex, cex.ec_start);
+               ex->ee_len = cpu_to_le16(cex.ec_len);
+               ret = 1;
+       }
+
+       return ret;
+}
+
+
 /*
  * ext4_ext_rm_idx:
  * removes index from the index block.
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                ext4_free_blocks(handle, inode, NULL, start, num, flags);
        } else if (from == le32_to_cpu(ex->ee_block)
                   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-               printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
-                       from, to, le32_to_cpu(ex->ee_block), ee_len);
+               /* head removal */
+               ext4_lblk_t num;
+               ext4_fsblk_t start;
+
+               num = to - from;
+               start = ext4_ext_pblock(ex);
+
+               ext_debug("free first %u blocks starting %llu\n", num, start);
+               ext4_free_blocks(handle, inode, 0, start, num, flags);
+
        } else {
                printk(KERN_INFO "strange request: removal(2) "
                                "%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
        return 0;
 }
 
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode:  The files inode
+ * @path:   The path to the leaf
+ * @start:  The first block to remove
+ * @end:   The last block to remove
+ */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-               struct ext4_ext_path *path, ext4_lblk_t start)
+               struct ext4_ext_path *path, ext4_lblk_t start,
+               ext4_lblk_t end)
 {
        int err = 0, correct_index = 0;
        int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        unsigned short ex_ee_len;
        unsigned uninitialized = 0;
        struct ext4_extent *ex;
+       struct ext4_map_blocks map;
 
        /* the header must be checked already in ext4_ext_remove_space() */
        ext_debug("truncate since %u in leaf\n", start);
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                path[depth].p_ext = ex;
 
                a = ex_ee_block > start ? ex_ee_block : start;
-               b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
-                       ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+               b = ex_ee_block+ex_ee_len - 1 < end ?
+                       ex_ee_block+ex_ee_len - 1 : end;
 
                ext_debug("  border %u:%u\n", a, b);
 
-               if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
-                       block = 0;
-                       num = 0;
-                       BUG();
+               /* If this extent is beyond the end of the hole, skip it */
+               if (end <= ex_ee_block) {
+                       ex--;
+                       ex_ee_block = le32_to_cpu(ex->ee_block);
+                       ex_ee_len = ext4_ext_get_actual_len(ex);
+                       continue;
+               } else if (a != ex_ee_block &&
+                       b != ex_ee_block + ex_ee_len - 1) {
+                       /*
+                        * If this is a truncate, then this condition should
+                        * never happen because at least one of the end points
+                        * needs to be on the edge of the extent.
+                        */
+                       if (end == EXT_MAX_BLOCK) {
+                               ext_debug("  bad truncate %u:%u\n",
+                                               start, end);
+                               block = 0;
+                               num = 0;
+                               err = -EIO;
+                               goto out;
+                       }
+                       /*
+                        * else this is a hole punch, so the extent needs to
+                        * be split since neither edge of the hole is on the
+                        * extent edge
+                        */
+                       else{
+                               map.m_pblk = ext4_ext_pblock(ex);
+                               map.m_lblk = ex_ee_block;
+                               map.m_len = b - ex_ee_block;
+
+                               err = ext4_split_extent(handle,
+                                       inode, path, &map, 0,
+                                       EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+                                       EXT4_GET_BLOCKS_PRE_IO);
+
+                               if (err < 0)
+                                       goto out;
+
+                               ex_ee_len = ext4_ext_get_actual_len(ex);
+
+                               b = ex_ee_block+ex_ee_len - 1 < end ?
+                                       ex_ee_block+ex_ee_len - 1 : end;
+
+                               /* Then remove tail of this extent */
+                               block = ex_ee_block;
+                               num = a - block;
+                       }
                } else if (a != ex_ee_block) {
                        /* remove tail of the extent */
                        block = ex_ee_block;
                        num = a - block;
                } else if (b != ex_ee_block + ex_ee_len - 1) {
                        /* remove head of the extent */
-                       block = a;
-                       num = b - a;
-                       /* there is no "make a hole" API yet */
-                       BUG();
+                       block = b;
+                       num =  ex_ee_block + ex_ee_len - b;
+
+                       /*
+                        * If this is a truncate, this condition
+                        * should never happen
+                        */
+                       if (end == EXT_MAX_BLOCK) {
+                               ext_debug("  bad truncate %u:%u\n",
+                                       start, end);
+                               err = -EIO;
+                               goto out;
+                       }
                } else {
                        /* remove whole extent: excellent! */
                        block = ex_ee_block;
                        num = 0;
-                       BUG_ON(a != ex_ee_block);
-                       BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+                       if (a != ex_ee_block) {
+                               ext_debug("  bad truncate %u:%u\n",
+                                       start, end);
+                               err = -EIO;
+                               goto out;
+                       }
+
+                       if (b != ex_ee_block + ex_ee_len - 1) {
+                               ext_debug("  bad truncate %u:%u\n",
+                                       start, end);
+                               err = -EIO;
+                               goto out;
+                       }
                }
 
                /*
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                if (num == 0) {
                        /* this extent is removed; mark slot entirely unused */
                        ext4_ext_store_pblock(ex, 0);
-                       le16_add_cpu(&eh->eh_entries, -1);
+               } else if (block != ex_ee_block) {
+                       /*
+                        * If this was a head removal, then we need to update
+                        * the physical block since it is now at a different
+                        * location
+                        */
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
                }
 
                ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                if (err)
                        goto out;
 
+               /*
+                * If the extent was completely released,
+                * we need to remove it from the leaf
+                */
+               if (num == 0) {
+                       if (end != EXT_MAX_BLOCK) {
+                               /*
+                                * For hole punching, we need to scoot all the
+                                * extents up when an extent is removed so that
+                                * we dont have blank extents in the middle
+                                */
+                               memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+                                       sizeof(struct ext4_extent));
+
+                               /* Now get rid of the one at the end */
+                               memset(EXT_LAST_EXTENT(eh), 0,
+                                       sizeof(struct ext4_extent));
+                       }
+                       le16_add_cpu(&eh->eh_entries, -1);
+               }
+
                ext_debug("new extent: %u:%u:%llu\n", block, num,
                                ext4_ext_pblock(ex));
                ex--;
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
        return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+                               ext4_lblk_t end)
 {
        struct super_block *sb = inode->i_sb;
        int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
        while (i >= 0 && err == 0) {
                if (i == depth) {
                        /* this is leaf block */
-                       err = ext4_ext_rm_leaf(handle, inode, path, start);
+                       err = ext4_ext_rm_leaf(handle, inode, path,
+                                       start, end);
                        /* root level has p_bh == NULL, brelse() eats this */
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
        return ret;
 }
 
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT   0x1  /* safe to zeroout if split fails \
+                                       due to ENOSPC */
+#define EXT4_EXT_MARK_UNINIT1  0x2  /* mark first half uninitialized */
+#define EXT4_EXT_MARK_UNINIT2  0x4  /* mark second half uninitialized */
+
+/*
+ * ext4_split_extent_at() splits an extent at given block.
+ *
+ * @handle: the journal handle
+ * @inode: the file inode
+ * @path: the path to the extent
+ * @split: the logical block where the extent is splitted.
+ * @split_flags: indicates if the extent could be zeroout if split fails, and
+ *              the states(init or uninit) of new extents.
+ * @flags: flags used to insert new extent to extent tree.
+ *
+ *
+ * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
+ * of which are deterimined by split_flag.
+ *
+ * There are two cases:
+ *  a> the extent are splitted into two extent.
+ *  b> split is not needed, and just mark the extent.
+ *
+ * return 0 on success.
+ */
+static int ext4_split_extent_at(handle_t *handle,
+                            struct inode *inode,
+                            struct ext4_ext_path *path,
+                            ext4_lblk_t split,
+                            int split_flag,
+                            int flags)
+{
+       ext4_fsblk_t newblock;
+       ext4_lblk_t ee_block;
+       struct ext4_extent *ex, newex, orig_ex;
+       struct ext4_extent *ex2 = NULL;
+       unsigned int ee_len, depth;
+       int err = 0;
+
+       ext_debug("ext4_split_extents_at: inode %lu, logical"
+               "block %llu\n", inode->i_ino, (unsigned long long)split);
+
+       ext4_ext_show_leaf(inode, path);
+
+       depth = ext_depth(inode);
+       ex = path[depth].p_ext;
+       ee_block = le32_to_cpu(ex->ee_block);
+       ee_len = ext4_ext_get_actual_len(ex);
+       newblock = split - ee_block + ext4_ext_pblock(ex);
+
+       BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+
+       err = ext4_ext_get_access(handle, inode, path + depth);
+       if (err)
+               goto out;
+
+       if (split == ee_block) {
+               /*
+                * case b: block @split is the block that the extent begins with
+                * then we just change the state of the extent, and splitting
+                * is not needed.
+                */
+               if (split_flag & EXT4_EXT_MARK_UNINIT2)
+                       ext4_ext_mark_uninitialized(ex);
+               else
+                       ext4_ext_mark_initialized(ex);
+
+               if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+                       ext4_ext_try_to_merge(inode, path, ex);
+
+               err = ext4_ext_dirty(handle, inode, path + depth);
+               goto out;
+       }
+
+       /* case a */
+       memcpy(&orig_ex, ex, sizeof(orig_ex));
+       ex->ee_len = cpu_to_le16(split - ee_block);
+       if (split_flag & EXT4_EXT_MARK_UNINIT1)
+               ext4_ext_mark_uninitialized(ex);
+
+       /*
+        * path may lead to new leaf, not to original leaf any more
+        * after ext4_ext_insert_extent() returns,
+        */
+       err = ext4_ext_dirty(handle, inode, path + depth);
+       if (err)
+               goto fix_extent_len;
+
+       ex2 = &newex;
+       ex2->ee_block = cpu_to_le32(split);
+       ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
+       ext4_ext_store_pblock(ex2, newblock);
+       if (split_flag & EXT4_EXT_MARK_UNINIT2)
+               ext4_ext_mark_uninitialized(ex2);
+
+       err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+       if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+               err = ext4_ext_zeroout(inode, &orig_ex);
+               if (err)
+                       goto fix_extent_len;
+               /* update the extent length and mark as initialized */
+               ex->ee_len = cpu_to_le32(ee_len);
+               ext4_ext_try_to_merge(inode, path, ex);
+               err = ext4_ext_dirty(handle, inode, path + depth);
+               goto out;
+       } else if (err)
+               goto fix_extent_len;
+
+out:
+       ext4_ext_show_leaf(inode, path);
+       return err;
+
+fix_extent_len:
+       ex->ee_len = orig_ex.ee_len;
+       ext4_ext_dirty(handle, inode, path + depth);
+       return err;
+}
+
+/*
+ * ext4_split_extent() splits an extent and marks the extent covered
+ * by @map according to @split_flag.
+ *
+ * It may result in splitting the extent into multiple extents (up to three).
+ * There are three possibilities:
+ *   a> There is no split required
+ *   b> Splits into two extents: split occurs at either end of the extent
+ *   c> Splits into three extents: someone is splitting in the middle of
+ *      the extent
+ */
+static int ext4_split_extent(handle_t *handle,
+                             struct inode *inode,
+                             struct ext4_ext_path *path,
+                             struct ext4_map_blocks *map,
+                             int split_flag,
+                             int flags)
+{
+       ext4_lblk_t ee_block;
+       struct ext4_extent *ex;
+       unsigned int ee_len, depth;
+       int err = 0;
+       int uninitialized;
+       int split_flag1, flags1;
+
+       depth = ext_depth(inode);
+       ex = path[depth].p_ext;
+       ee_block = le32_to_cpu(ex->ee_block);
+       ee_len = ext4_ext_get_actual_len(ex);
+       uninitialized = ext4_ext_is_uninitialized(ex);
+
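+       /*
+        * If the extent extends past the end of the requested range,
+        * split off the tail first.
+        */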
+       if (map->m_lblk + map->m_len < ee_block + ee_len) {
+               split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+                             EXT4_EXT_MAY_ZEROOUT : 0;
+               flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+               if (uninitialized)
+                       split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
+                                      EXT4_EXT_MARK_UNINIT2;
+               err = ext4_split_extent_at(handle, inode, path,
+                               map->m_lblk + map->m_len, split_flag1, flags1);
+               if (err)
+                       goto out;
+       }
+
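+       /*
+        * The tree may have changed during the first split, so re-find
+        * the extent containing map->m_lblk before splitting at its start.
+        */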
+       ext4_ext_drop_refs(path);
+       path = ext4_ext_find_extent(inode, map->m_lblk, path);
+       if (IS_ERR(path))
+               return PTR_ERR(path);
+
+       if (map->m_lblk >= ee_block) {
+               split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+                             EXT4_EXT_MAY_ZEROOUT : 0;
+               if (uninitialized)
+                       split_flag1 |= EXT4_EXT_MARK_UNINIT1;
+               if (split_flag & EXT4_EXT_MARK_UNINIT2)
+                       split_flag1 |= EXT4_EXT_MARK_UNINIT2;
+               err = ext4_split_extent_at(handle, inode, path,
+                               map->m_lblk, split_flag1, flags);
+               if (err)
+                       goto out;
+       }
+
+       ext4_ext_show_leaf(inode, path);
+out:
+       return err ? err : map->m_len;
+}
+
 #define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                           struct ext4_map_blocks *map,
                                           struct ext4_ext_path *path)
 {
-       struct ext4_extent *ex, newex, orig_ex;
-       struct ext4_extent *ex1 = NULL;
-       struct ext4_extent *ex2 = NULL;
-       struct ext4_extent *ex3 = NULL;
-       struct ext4_extent_header *eh;
+       struct ext4_map_blocks split_map;
+       struct ext4_extent zero_ex;
+       struct ext4_extent *ex;
        ext4_lblk_t ee_block, eof_block;
        unsigned int allocated, ee_len, depth;
-       ext4_fsblk_t newblock;
        int err = 0;
-       int ret = 0;
-       int may_zeroout;
+       int split_flag = 0;
 
        ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
                "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                eof_block = map->m_lblk + map->m_len;
 
        depth = ext_depth(inode);
-       eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        allocated = ee_len - (map->m_lblk - ee_block);
-       newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
-
-       ex2 = ex;
-       orig_ex.ee_block = ex->ee_block;
-       orig_ex.ee_len   = cpu_to_le16(ee_len);
-       ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
 
+       WARN_ON(map->m_lblk < ee_block);
        /*
         * It is safe to convert extent to initialized via explicit
         * zeroout only if extent is fully inside i_size or new_size.
         */
-       may_zeroout = ee_block + ee_len <= eof_block;
+       split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
-       err = ext4_ext_get_access(handle, inode, path + depth);
-       if (err)
-               goto out;
        /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-       if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
-               err =  ext4_ext_zeroout(inode, &orig_ex);
+       if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
+           (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+               err = ext4_ext_zeroout(inode, ex);
                if (err)
-                       goto fix_extent_len;
-               /* update the extent length and mark as initialized */
-               ex->ee_block = orig_ex.ee_block;
-               ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-               ext4_ext_dirty(handle, inode, path + depth);
-               /* zeroed the full extent */
-               return allocated;
-       }
+                       goto out;
 
-       /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
-       if (map->m_lblk > ee_block) {
-               ex1 = ex;
-               ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
-               ext4_ext_mark_uninitialized(ex1);
-               ex2 = &newex;
+               err = ext4_ext_get_access(handle, inode, path + depth);
+               if (err)
+                       goto out;
+               ext4_ext_mark_initialized(ex);
+               ext4_ext_try_to_merge(inode, path, ex);
+               err = ext4_ext_dirty(handle, inode, path + depth);
+               goto out;
        }
+
        /*
-        * for sanity, update the length of the ex2 extent before
-        * we insert ex3, if ex1 is NULL. This is to avoid temporary
-        * overlap of blocks.
+        * Four cases:
+        * 1. split the extent into three extents.
+        * 2. split the extent into two extents, zeroing out the first half.
+        * 3. split the extent into two extents, zeroing out the second half.
+        * 4. split the extent into two extents without zeroing out.
         */
-       if (!ex1 && allocated > map->m_len)
-               ex2->ee_len = cpu_to_le16(map->m_len);
-       /* ex3: to ee_block + ee_len : uninitialised */
-       if (allocated > map->m_len) {
-               unsigned int newdepth;
-               /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
-               if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
-                       /*
-                        * map->m_lblk == ee_block is handled by the zerouout
-                        * at the beginning.
-                        * Mark first half uninitialized.
-                        * Mark second half initialized and zero out the
-                        * initialized extent
-                        */
-                       ex->ee_block = orig_ex.ee_block;
-                       ex->ee_len   = cpu_to_le16(ee_len - allocated);
-                       ext4_ext_mark_uninitialized(ex);
-                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-                       ext4_ext_dirty(handle, inode, path + depth);
-
-                       ex3 = &newex;
-                       ex3->ee_block = cpu_to_le32(map->m_lblk);
-                       ext4_ext_store_pblock(ex3, newblock);
-                       ex3->ee_len = cpu_to_le16(allocated);
-                       err = ext4_ext_insert_extent(handle, inode, path,
-                                                       ex3, 0);
-                       if (err == -ENOSPC) {
-                               err =  ext4_ext_zeroout(inode, &orig_ex);
-                               if (err)
-                                       goto fix_extent_len;
-                               ex->ee_block = orig_ex.ee_block;
-                               ex->ee_len   = orig_ex.ee_len;
-                               ext4_ext_store_pblock(ex,
-                                       ext4_ext_pblock(&orig_ex));
-                               ext4_ext_dirty(handle, inode, path + depth);
-                               /* blocks available from map->m_lblk */
-                               return allocated;
-
-                       } else if (err)
-                               goto fix_extent_len;
+       split_map.m_lblk = map->m_lblk;
+       split_map.m_len = map->m_len;
 
-                       /*
-                        * We need to zero out the second half because
-                        * an fallocate request can update file size and
-                        * converting the second half to initialized extent
-                        * implies that we can leak some junk data to user
-                        * space.
-                        */
-                       err =  ext4_ext_zeroout(inode, ex3);
-                       if (err) {
-                               /*
-                                * We should actually mark the
-                                * second half as uninit and return error
-                                * Insert would have changed the extent
-                                */
-                               depth = ext_depth(inode);
-                               ext4_ext_drop_refs(path);
-                               path = ext4_ext_find_extent(inode, map->m_lblk,
-                                                           path);
-                               if (IS_ERR(path)) {
-                                       err = PTR_ERR(path);
-                                       return err;
-                               }
-                               /* get the second half extent details */
-                               ex = path[depth].p_ext;
-                               err = ext4_ext_get_access(handle, inode,
-                                                               path + depth);
+       if (allocated > map->m_len) {
+               if (allocated <= EXT4_EXT_ZERO_LEN &&
+                   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+                       /* case 3 */
+                       zero_ex.ee_block = cpu_to_le32(map->m_lblk);
+                       zero_ex.ee_len = cpu_to_le16(allocated);
+                       ext4_ext_store_pblock(&zero_ex,
+                               ext4_ext_pblock(ex) + map->m_lblk - ee_block);
+                       err = ext4_ext_zeroout(inode, &zero_ex);
+                       if (err)
+                               goto out;
+                       split_map.m_lblk = map->m_lblk;
+                       split_map.m_len = allocated;
+               } else if ((map->m_lblk - ee_block + map->m_len <
+                          EXT4_EXT_ZERO_LEN) &&
+                          (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+                       /* case 2 */
+                       if (map->m_lblk != ee_block) {
+                               zero_ex.ee_block = ex->ee_block;
+                               zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+                                                       ee_block);
+                               ext4_ext_store_pblock(&zero_ex,
+                                                     ext4_ext_pblock(ex));
+                               err = ext4_ext_zeroout(inode, &zero_ex);
                                if (err)
-                                       return err;
-                               ext4_ext_mark_uninitialized(ex);
-                               ext4_ext_dirty(handle, inode, path + depth);
-                               return err;
+                                       goto out;
                        }
 
-                       /* zeroed the second half */
-                       return allocated;
-               }
-               ex3 = &newex;
-               ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
-               ext4_ext_store_pblock(ex3, newblock + map->m_len);
-               ex3->ee_len = cpu_to_le16(allocated - map->m_len);
-               ext4_ext_mark_uninitialized(ex3);
-               err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
-               if (err == -ENOSPC && may_zeroout) {
-                       err =  ext4_ext_zeroout(inode, &orig_ex);
-                       if (err)
-                               goto fix_extent_len;
-                       /* update the extent length and mark as initialized */
-                       ex->ee_block = orig_ex.ee_block;
-                       ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-                       ext4_ext_dirty(handle, inode, path + depth);
-                       /* zeroed the full extent */
-                       /* blocks available from map->m_lblk */
-                       return allocated;
-
-               } else if (err)
-                       goto fix_extent_len;
-               /*
-                * The depth, and hence eh & ex might change
-                * as part of the insert above.
-                */
-               newdepth = ext_depth(inode);
-               /*
-                * update the extent length after successful insert of the
-                * split extent
-                */
-               ee_len -= ext4_ext_get_actual_len(ex3);
-               orig_ex.ee_len = cpu_to_le16(ee_len);
-               may_zeroout = ee_block + ee_len <= eof_block;
-
-               depth = newdepth;
-               ext4_ext_drop_refs(path);
-               path = ext4_ext_find_extent(inode, map->m_lblk, path);
-               if (IS_ERR(path)) {
-                       err = PTR_ERR(path);
-                       goto out;
+                       split_map.m_lblk = ee_block;
+                       split_map.m_len = map->m_lblk - ee_block + map->m_len;
+                       allocated = map->m_len;
                }
-               eh = path[depth].p_hdr;
-               ex = path[depth].p_ext;
-               if (ex2 != &newex)
-                       ex2 = ex;
-
-               err = ext4_ext_get_access(handle, inode, path + depth);
-               if (err)
-                       goto out;
+       }
 
-               allocated = map->m_len;
+       allocated = ext4_split_extent(handle, inode, path,
+                                      &split_map, split_flag, 0);
+       if (allocated < 0)
+               err = allocated;
 
-               /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
-                * to insert a extent in the middle zerout directly
-                * otherwise give the extent a chance to merge to left
-                */
-               if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
-                       map->m_lblk != ee_block && may_zeroout) {
-                       err =  ext4_ext_zeroout(inode, &orig_ex);
-                       if (err)
-                               goto fix_extent_len;
-                       /* update the extent length and mark as initialized */
-                       ex->ee_block = orig_ex.ee_block;
-                       ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-                       ext4_ext_dirty(handle, inode, path + depth);
-                       /* zero out the first half */
-                       /* blocks available from map->m_lblk */
-                       return allocated;
-               }
-       }
-       /*
-        * If there was a change of depth as part of the
-        * insertion of ex3 above, we need to update the length
-        * of the ex1 extent again here
-        */
-       if (ex1 && ex1 != ex) {
-               ex1 = ex;
-               ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
-               ext4_ext_mark_uninitialized(ex1);
-               ex2 = &newex;
-       }
-       /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
-       ex2->ee_block = cpu_to_le32(map->m_lblk);
-       ext4_ext_store_pblock(ex2, newblock);
-       ex2->ee_len = cpu_to_le16(allocated);
-       if (ex2 != ex)
-               goto insert;
-       /*
-        * New (initialized) extent starts from the first block
-        * in the current extent. i.e., ex2 == ex
-        * We have to see if it can be merged with the extent
-        * on the left.
-        */
-       if (ex2 > EXT_FIRST_EXTENT(eh)) {
-               /*
-                * To merge left, pass "ex2 - 1" to try_to_merge(),
-                * since it merges towards right _only_.
-                */
-               ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
-               if (ret) {
-                       err = ext4_ext_correct_indexes(handle, inode, path);
-                       if (err)
-                               goto out;
-                       depth = ext_depth(inode);
-                       ex2--;
-               }
-       }
-       /*
-        * Try to Merge towards right. This might be required
-        * only when the whole extent is being written to.
-        * i.e. ex2 == ex and ex3 == NULL.
-        */
-       if (!ex3) {
-               ret = ext4_ext_try_to_merge(inode, path, ex2);
-               if (ret) {
-                       err = ext4_ext_correct_indexes(handle, inode, path);
-                       if (err)
-                               goto out;
-               }
-       }
-       /* Mark modified extent as dirty */
-       err = ext4_ext_dirty(handle, inode, path + depth);
-       goto out;
-insert:
-       err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
-       if (err == -ENOSPC && may_zeroout) {
-               err =  ext4_ext_zeroout(inode, &orig_ex);
-               if (err)
-                       goto fix_extent_len;
-               /* update the extent length and mark as initialized */
-               ex->ee_block = orig_ex.ee_block;
-               ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-               ext4_ext_dirty(handle, inode, path + depth);
-               /* zero out the first half */
-               return allocated;
-       } else if (err)
-               goto fix_extent_len;
 out:
-       ext4_ext_show_leaf(inode, path);
        return err ? err : allocated;
-
-fix_extent_len:
-       ex->ee_block = orig_ex.ee_block;
-       ex->ee_len   = orig_ex.ee_len;
-       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-       ext4_ext_mark_uninitialized(ex);
-       ext4_ext_dirty(handle, inode, path + depth);
-       return err;
 }
 
 /*
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                                        struct ext4_ext_path *path,
                                        int flags)
 {
-       struct ext4_extent *ex, newex, orig_ex;
-       struct ext4_extent *ex1 = NULL;
-       struct ext4_extent *ex2 = NULL;
-       struct ext4_extent *ex3 = NULL;
-       ext4_lblk_t ee_block, eof_block;
-       unsigned int allocated, ee_len, depth;
-       ext4_fsblk_t newblock;
-       int err = 0;
-       int may_zeroout;
+       ext4_lblk_t eof_block;
+       ext4_lblk_t ee_block;
+       struct ext4_extent *ex;
+       unsigned int ee_len;
+       int split_flag = 0, depth;
 
        ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
                "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                inode->i_sb->s_blocksize_bits;
        if (eof_block < map->m_lblk + map->m_len)
                eof_block = map->m_lblk + map->m_len;
-
-       depth = ext_depth(inode);
-       ex = path[depth].p_ext;
-       ee_block = le32_to_cpu(ex->ee_block);
-       ee_len = ext4_ext_get_actual_len(ex);
-       allocated = ee_len - (map->m_lblk - ee_block);
-       newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
-
-       ex2 = ex;
-       orig_ex.ee_block = ex->ee_block;
-       orig_ex.ee_len   = cpu_to_le16(ee_len);
-       ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
-
        /*
         * It is safe to convert extent to initialized via explicit
         * zeroout only if extent is fully inside i_size or new_size.
         */
-       may_zeroout = ee_block + ee_len <= eof_block;
-
-       /*
-        * If the uninitialized extent begins at the same logical
-        * block where the write begins, and the write completely
-        * covers the extent, then we don't need to split it.
-        */
-       if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
-               return allocated;
-
-       err = ext4_ext_get_access(handle, inode, path + depth);
-       if (err)
-               goto out;
-       /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
-       if (map->m_lblk > ee_block) {
-               ex1 = ex;
-               ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
-               ext4_ext_mark_uninitialized(ex1);
-               ex2 = &newex;
-       }
-       /*
-        * for sanity, update the length of the ex2 extent before
-        * we insert ex3, if ex1 is NULL. This is to avoid temporary
-        * overlap of blocks.
-        */
-       if (!ex1 && allocated > map->m_len)
-               ex2->ee_len = cpu_to_le16(map->m_len);
-       /* ex3: to ee_block + ee_len : uninitialised */
-       if (allocated > map->m_len) {
-               unsigned int newdepth;
-               ex3 = &newex;
-               ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
-               ext4_ext_store_pblock(ex3, newblock + map->m_len);
-               ex3->ee_len = cpu_to_le16(allocated - map->m_len);
-               ext4_ext_mark_uninitialized(ex3);
-               err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
-               if (err == -ENOSPC && may_zeroout) {
-                       err =  ext4_ext_zeroout(inode, &orig_ex);
-                       if (err)
-                               goto fix_extent_len;
-                       /* update the extent length and mark as initialized */
-                       ex->ee_block = orig_ex.ee_block;
-                       ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-                       ext4_ext_dirty(handle, inode, path + depth);
-                       /* zeroed the full extent */
-                       /* blocks available from map->m_lblk */
-                       return allocated;
-
-               } else if (err)
-                       goto fix_extent_len;
-               /*
-                * The depth, and hence eh & ex might change
-                * as part of the insert above.
-                */
-               newdepth = ext_depth(inode);
-               /*
-                * update the extent length after successful insert of the
-                * split extent
-                */
-               ee_len -= ext4_ext_get_actual_len(ex3);
-               orig_ex.ee_len = cpu_to_le16(ee_len);
-               may_zeroout = ee_block + ee_len <= eof_block;
-
-               depth = newdepth;
-               ext4_ext_drop_refs(path);
-               path = ext4_ext_find_extent(inode, map->m_lblk, path);
-               if (IS_ERR(path)) {
-                       err = PTR_ERR(path);
-                       goto out;
-               }
-               ex = path[depth].p_ext;
-               if (ex2 != &newex)
-                       ex2 = ex;
-
-               err = ext4_ext_get_access(handle, inode, path + depth);
-               if (err)
-                       goto out;
+       depth = ext_depth(inode);
+       ex = path[depth].p_ext;
+       ee_block = le32_to_cpu(ex->ee_block);
+       ee_len = ext4_ext_get_actual_len(ex);
 
-               allocated = map->m_len;
-       }
-       /*
-        * If there was a change of depth as part of the
-        * insertion of ex3 above, we need to update the length
-        * of the ex1 extent again here
-        */
-       if (ex1 && ex1 != ex) {
-               ex1 = ex;
-               ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
-               ext4_ext_mark_uninitialized(ex1);
-               ex2 = &newex;
-       }
-       /*
-        * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
-        * using direct I/O, uninitialised still.
-        */
-       ex2->ee_block = cpu_to_le32(map->m_lblk);
-       ext4_ext_store_pblock(ex2, newblock);
-       ex2->ee_len = cpu_to_le16(allocated);
-       ext4_ext_mark_uninitialized(ex2);
-       if (ex2 != ex)
-               goto insert;
-       /* Mark modified extent as dirty */
-       err = ext4_ext_dirty(handle, inode, path + depth);
-       ext_debug("out here\n");
-       goto out;
-insert:
-       err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
-       if (err == -ENOSPC && may_zeroout) {
-               err =  ext4_ext_zeroout(inode, &orig_ex);
-               if (err)
-                       goto fix_extent_len;
-               /* update the extent length and mark as initialized */
-               ex->ee_block = orig_ex.ee_block;
-               ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-               ext4_ext_dirty(handle, inode, path + depth);
-               /* zero out the first half */
-               return allocated;
-       } else if (err)
-               goto fix_extent_len;
-out:
-       ext4_ext_show_leaf(inode, path);
-       return err ? err : allocated;
+       split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
+       split_flag |= EXT4_EXT_MARK_UNINIT2;
 
-fix_extent_len:
-       ex->ee_block = orig_ex.ee_block;
-       ex->ee_len   = orig_ex.ee_len;
-       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
-       ext4_ext_mark_uninitialized(ex);
-       ext4_ext_dirty(handle, inode, path + depth);
-       return err;
+       flags |= EXT4_GET_BLOCKS_PRE_IO;
+       return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
+
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                                              struct inode *inode,
                                              struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
        struct ext4_extent_header *eh;
        int depth;
        int err = 0;
-       int ret = 0;
 
        depth = ext_depth(inode);
        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
 
+       ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
+               "block %llu, max_blocks %u\n", inode->i_ino,
+               (unsigned long long)le32_to_cpu(ex->ee_block),
+               ext4_ext_get_actual_len(ex));
+
        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
                goto out;
        /* first mark the extent as initialized */
        ext4_ext_mark_initialized(ex);
 
-       /*
-        * We have to see if it can be merged with the extent
-        * on the left.
-        */
-       if (ex > EXT_FIRST_EXTENT(eh)) {
-               /*
-                * To merge left, pass "ex - 1" to try_to_merge(),
-                * since it merges towards right _only_.
-                */
-               ret = ext4_ext_try_to_merge(inode, path, ex - 1);
-               if (ret) {
-                       err = ext4_ext_correct_indexes(handle, inode, path);
-                       if (err)
-                               goto out;
-                       depth = ext_depth(inode);
-                       ex--;
-               }
-       }
-       /*
-        * Try to Merge towards right.
+       /*
+        * Note: ext4_ext_correct_indexes() isn't needed here because
+        * the extent borders are not changed.
         */
-       ret = ext4_ext_try_to_merge(inode, path, ex);
-       if (ret) {
-               err = ext4_ext_correct_indexes(handle, inode, path);
-               if (err)
-                       goto out;
-               depth = ext_depth(inode);
-       }
+       ext4_ext_try_to_merge(inode, path, ex);
+
        /* Mark modified extent as dirty */
        err = ext4_ext_dirty(handle, inode, path + depth);
 out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        ext4_fsblk_t newblock = 0;
        int err = 0, depth, ret;
        unsigned int allocated = 0;
+       unsigned int punched_out = 0;
+       unsigned int result = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+       struct ext4_map_blocks punch_map;
 
        ext_debug("blocks %u/%u requested for inode %lu\n",
                  map->m_lblk, map->m_len, inode->i_ino);
        trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
        /* check in cache */
-       if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+       if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
+               ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
                if (!newex.ee_start_lo && !newex.ee_start_hi) {
                        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                                /*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
                                  ee_block, ee_len, newblock);
 
-                       /* Do not put uninitialized extent in the cache */
-                       if (!ext4_ext_is_uninitialized(ex)) {
-                               ext4_ext_put_in_cache(inode, ee_block,
-                                                       ee_len, ee_start);
-                               goto out;
+                       if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
+                               /*
+                                * Do not put uninitialized extent
+                                * in the cache
+                                */
+                               if (!ext4_ext_is_uninitialized(ex)) {
+                                       ext4_ext_put_in_cache(inode, ee_block,
+                                               ee_len, ee_start);
+                                       goto out;
+                               }
+                               ret = ext4_ext_handle_uninitialized_extents(
+                                       handle, inode, map, path, flags,
+                                       allocated, newblock);
+                               return ret;
                        }
-                       ret = ext4_ext_handle_uninitialized_extents(handle,
-                                       inode, map, path, flags, allocated,
-                                       newblock);
-                       return ret;
+
+                       /*
+                        * Punch out the map length, but only to the
+                        * end of the extent
+                        */
+                       punched_out = allocated < map->m_len ?
+                               allocated : map->m_len;
+
+                       /*
+                        * Since extents need to be converted to
+                        * uninitialized, they must fit in an
+                        * uninitialized extent.
+                        */
+                       if (punched_out > EXT_UNINIT_MAX_LEN)
+                               punched_out = EXT_UNINIT_MAX_LEN;
+
+                       punch_map.m_lblk = map->m_lblk;
+                       punch_map.m_pblk = newblock;
+                       punch_map.m_len = punched_out;
+                       punch_map.m_flags = 0;
+
+                       /* Check to see if the extent needs to be split */
+                       if (punch_map.m_len != ee_len ||
+                               punch_map.m_lblk != ee_block) {
+
+                               ret = ext4_split_extent(handle, inode,
+                                       path, &punch_map, 0,
+                                       EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+                                       EXT4_GET_BLOCKS_PRE_IO);
+
+                               if (ret < 0) {
+                                       err = ret;
+                                       goto out2;
+                               }
+                               /*
+                                * find extent for the block at
+                                * the start of the hole
+                                */
+                               ext4_ext_drop_refs(path);
+                               kfree(path);
+
+                               path = ext4_ext_find_extent(inode,
+                                       map->m_lblk, NULL);
+                               if (IS_ERR(path)) {
+                                       err = PTR_ERR(path);
+                                       path = NULL;
+                                       goto out2;
+                               }
+
+                               depth = ext_depth(inode);
+                               ex = path[depth].p_ext;
+                               ee_len = ext4_ext_get_actual_len(ex);
+                               ee_block = le32_to_cpu(ex->ee_block);
+                               ee_start = ext4_ext_pblock(ex);
+
+                       }
+
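+                       /*
+                        * Mark the punched extent uninitialized so the
+                        * range reads back as zeros while its blocks
+                        * are being removed.
+                        */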
+                       ext4_ext_mark_uninitialized(ex);
+
+                       err = ext4_ext_remove_space(inode, map->m_lblk,
+                               map->m_lblk + punched_out);
+
+                       goto out2;
                }
        }
 
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        else
                /* disable in-core preallocation for non-regular files */
                ar.flags = 0;
+       if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+               ar.flags |= EXT4_MB_HINT_NOPREALLOC;
        newblock = ext4_mb_new_blocks(handle, &ar, &err);
        if (!newblock)
                goto out2;
@@ -3529,7 +3647,11 @@ out2:
        }
        trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
                newblock, map->m_len, err ? err : allocated);
-       return err ? err : allocated;
+
+       result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
+                       punched_out : allocated;
+
+       return err ? err : result;
 }
 
 void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
 
        last_block = (inode->i_size + sb->s_blocksize - 1)
                        >> EXT4_BLOCK_SIZE_BITS(sb);
-       err = ext4_ext_remove_space(inode, last_block);
+       err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
 
        /* In a multi-transaction truncate, we only make the final
         * transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
 
-out_stop:
        up_write(&EXT4_I(inode)->i_data_sem);
+
+out_stop:
        /*
         * If this was a simple ftruncate() and the file will remain alive,
         * then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        struct ext4_map_blocks map;
        unsigned int credits, blkbits = inode->i_blkbits;
 
-       /* We only support the FALLOC_FL_KEEP_SIZE mode */
-       if (mode & ~FALLOC_FL_KEEP_SIZE)
-               return -EOPNOTSUPP;
-
        /*
         * currently supporting (pre)allocate mode for extent-based
         * files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return -EOPNOTSUPP;
 
+       /* Return error if mode is not supported */
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+               return -EOPNOTSUPP;
+
+       if (mode & FALLOC_FL_PUNCH_HOLE)
+               return ext4_punch_hole(file, offset, len);
+
        trace_ext4_fallocate_enter(inode, offset, len, mode);
        map.m_lblk = offset >> blkbits;
        /*
@@ -3691,7 +3817,8 @@ retry:
                        break;
                }
                ret = ext4_map_blocks(handle, inode, &map,
-                                     EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
+                                     EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+                                     EXT4_GET_BLOCKS_NO_NORMALIZE);
                if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
                        WARN_ON(ret <= 0);
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
                pgoff_t         last_offset;
                pgoff_t         offset;
                pgoff_t         index;
+               pgoff_t         start_index = 0;
                struct page     **pages = NULL;
                struct buffer_head *bh = NULL;
                struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
                                kfree(pages);
                                return EXT_CONTINUE;
                        }
+                       index = 0;
 
+next_page:
                        /* Try to find the 1st mapped buffer. */
-                       end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
+                       end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
                                  blksize_bits;
-                       if (!page_has_buffers(pages[0]))
+                       if (!page_has_buffers(pages[index]))
                                goto out;
-                       head = page_buffers(pages[0]);
+                       head = page_buffers(pages[index]);
                        if (!head)
                                goto out;
 
+                       index++;
                        bh = head;
                        do {
-                               if (buffer_mapped(bh)) {
+                               if (end >= newex->ec_block +
+                                       newex->ec_len)
+                                       /* The buffer is out of
+                                        * the request range.
+                                        */
+                                       goto out;
+
+                               if (buffer_mapped(bh) &&
+                                   end >= newex->ec_block) {
+                                       start_index = index - 1;
                                        /* get the 1st mapped buffer. */
-                                       if (end > newex->ec_block +
-                                               newex->ec_len)
-                                               /* The buffer is out of
-                                                * the request range.
-                                                */
-                                               goto out;
                                        goto found_mapped_buffer;
                                }
+
                                bh = bh->b_this_page;
                                end++;
                        } while (bh != head);
 
-                       /* No mapped buffer found. */
-                       goto out;
+                       /* No mapped buffer in the range was found in this
+                        * page, so we need to look up the next page.
+                        */
+                       if (index >= ret) {
+                               /* There is no page left, but we need to limit
+                                * newex->ec_len.
+                                */
+                               newex->ec_len = end - newex->ec_block;
+                               goto out;
+                       }
+                       goto next_page;
                } else {
                        /*Find contiguous delayed buffers. */
                        if (ret > 0 && pages[0]->index == last_offset)
                                head = page_buffers(pages[0]);
                        bh = head;
+                       index = 1;
+                       start_index = 0;
                }
 
 found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
                                end++;
                        } while (bh != head);
 
-                       for (index = 1; index < ret; index++) {
+                       for (; index < ret; index++) {
                                if (!page_has_buffers(pages[index])) {
                                        bh = NULL;
                                        break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
                                        bh = NULL;
                                        break;
                                }
+
                                if (pages[index]->index !=
-                                       pages[0]->index + index) {
+                                   pages[start_index]->index + index
+                                   - start_index) {
                                        /* Blocks are not contiguous. */
                                        bh = NULL;
                                        break;
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
        return (error < 0 ? error : 0);
 }
 
+/*
+ * ext4_ext_punch_hole
+ *
+ * Punches a hole of "length" bytes in a file starting
+ * at byte "offset"
+ *
+ * @inode:  The inode of the file to punch a hole in
+ * @offset: The starting byte offset of the hole
+ * @length: The length of the hole
+ *
+ * Returns the number of blocks removed or negative on err
+ */
+int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct super_block *sb = inode->i_sb;
+       struct ext4_ext_cache cache_ex;
+       ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
+       struct address_space *mapping = inode->i_mapping;
+       struct ext4_map_blocks map;
+       handle_t *handle;
+       loff_t first_block_offset, last_block_offset, block_len;
+       loff_t first_page, last_page, first_page_offset, last_page_offset;
+       int ret, credits, blocks_released, err = 0;
+
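+       /*
+        * Blocks in [first_block, last_block) lie entirely within the
+        * hole; partial blocks at either end are zeroed rather than
+        * freed.
+        */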
+       first_block = (offset + sb->s_blocksize - 1) >>
+               EXT4_BLOCK_SIZE_BITS(sb);
+       last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+       first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
+       last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
+
+       first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+       last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+       first_page_offset = first_page << PAGE_CACHE_SHIFT;
+       last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+       /*
+        * Write out all dirty pages to avoid race conditions
+        * Then release them.
+        */
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               err = filemap_write_and_wait_range(mapping,
+                       first_page_offset == 0 ? 0 : first_page_offset-1,
+                       last_page_offset);
+
+               if (err)
+                       return err;
+       }
+
+       /* Now release the pages */
+       if (last_page_offset > first_page_offset) {
+               truncate_inode_pages_range(mapping, first_page_offset,
+                                          last_page_offset-1);
+       }
+
+       /* finish any pending end_io work */
+       ext4_flush_completed_IO(inode);
+
+       credits = ext4_writepage_trans_blocks(inode);
+       handle = ext4_journal_start(inode, credits);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+
+       err = ext4_orphan_add(handle, inode);
+       if (err)
+               goto out;
+
+       /*
+        * Now we need to zero out the non-block-aligned data.
+        * If the hole lies entirely within one block, just
+        * zero out the middle of that block.
+        */
+       if (first_block > last_block)
+               ext4_block_zero_page_range(handle, mapping, offset, length);
+       else {
+               /* zero out the head of the hole before the first block */
+               block_len  = first_block_offset - offset;
+               if (block_len > 0)
+                       ext4_block_zero_page_range(handle, mapping,
+                                                  offset, block_len);
+
+               /* zero out the tail of the hole after the last block */
+               block_len = offset + length - last_block_offset;
+               if (block_len > 0) {
+                       ext4_block_zero_page_range(handle, mapping,
+                                       last_block_offset, block_len);
+               }
+       }
+
+       /* If there are no blocks to remove, return now */
+       if (first_block >= last_block)
+               goto out;
+
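+       /* i_data_sem protects the extent tree against concurrent updates */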
+       down_write(&EXT4_I(inode)->i_data_sem);
+       ext4_ext_invalidate_cache(inode);
+       ext4_discard_preallocations(inode);
+
+       /*
+        * Loop over all the blocks and identify blocks
+        * that need to be punched out
+        */
+       iblock = first_block;
+       blocks_released = 0;
+       while (iblock < last_block) {
+               max_blocks = last_block - iblock;
+               num_blocks = 1;
+               memset(&map, 0, sizeof(map));
+               map.m_lblk = iblock;
+               map.m_len = max_blocks;
+               ret = ext4_ext_map_blocks(handle, inode, &map,
+                       EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+
+               if (ret > 0) {
+                       blocks_released += ret;
+                       num_blocks = ret;
+               } else if (ret == 0) {
+                       /*
+                        * If map blocks could not find the block,
+                        * then it is in a hole.  If the hole was
+                        * not already cached, map blocks should have
+                        * put it in the cache, so we can read the
+                        * hole's extent out of the cache.
+                        */
+                       memset(&cache_ex, 0, sizeof(cache_ex));
+                       if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
+                               !cache_ex.ec_start) {
+
+                               /* The hole is cached */
+                               num_blocks = cache_ex.ec_block +
+                                       cache_ex.ec_len - iblock;
+
+                       } else {
+                               /* The block could not be identified */
+                               err = -EIO;
+                               break;
+                       }
+               } else {
+                       /* Map blocks error */
+                       err = ret;
+                       break;
+               }
+
+               if (num_blocks == 0) {
+                       /* This condition should never happen */
+                       ext_debug("Block lookup failed");
+                       err = -EIO;
+                       break;
+               }
+
+               iblock += num_blocks;
+       }
+
+       if (blocks_released > 0) {
+               ext4_ext_invalidate_cache(inode);
+               ext4_discard_preallocations(inode);
+       }
+
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);
+
+       up_write(&EXT4_I(inode)->i_data_sem);
+
+out:
+       ext4_orphan_del(handle, inode);
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+       ext4_mark_inode_dirty(handle, inode);
+       ext4_journal_stop(handle);
+       return err;
+}
+
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
 {
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
        return error;
 }
-
index 7b80d543b89e71c41764b54a5d6eee59992c712c..2c09723220091f7520b5a9c04622e1d788b9b1c4 100644 (file)
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
 };
 
 const struct inode_operations ext4_file_inode_operations = {
-       .truncate       = ext4_truncate,
        .setattr        = ext4_setattr,
        .getattr        = ext4_getattr,
 #ifdef CONFIG_EXT4_FS_XATTR
index e9473cbe80dfd00a7245e13f4de57229011a7bb2..ce66d2fe826cbdf99389dea50aa3f72240e911d9 100644 (file)
@@ -36,7 +36,7 @@
 
 static void dump_completed_IO(struct inode * inode)
 {
-#ifdef EXT4_DEBUG
+#ifdef EXT4FS_DEBUG
        struct list_head *cur, *before, *after;
        ext4_io_end_t *io, *io0, *io1;
        unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        int ret;
        tid_t commit_tid;
+       bool needs_barrier = false;
 
        J_ASSERT(ext4_journal_current_handle() == NULL);
 
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
        }
 
        commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
-       if (jbd2_log_start_commit(journal, commit_tid)) {
-               /*
-                * When the journal is on a different device than the
-                * fs data disk, we need to issue the barrier in
-                * writeback mode.  (In ordered mode, the jbd2 layer
-                * will take care of issuing the barrier.  In
-                * data=journal, all of the data blocks are written to
-                * the journal device.)
-                */
-               if (ext4_should_writeback_data(inode) &&
-                   (journal->j_fs_dev != journal->j_dev) &&
-                   (journal->j_flags & JBD2_BARRIER))
-                       blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
-                                       NULL);
-               ret = jbd2_log_wait_commit(journal, commit_tid);
-       } else if (journal->j_flags & JBD2_BARRIER)
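+       /*
+        * If the commit will not itself flush the data device, issue
+        * the flush here after waiting for the commit to complete.
+        */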
+       if (journal->j_flags & JBD2_BARRIER &&
+           !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+               needs_barrier = true;
+       jbd2_log_start_commit(journal, commit_tid);
+       ret = jbd2_log_wait_commit(journal, commit_tid);
+       if (needs_barrier)
                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
  out:
        trace_ext4_sync_file_exit(inode, ret);
index f2fa5e8a582caf92dba32df3cfad714920a85721..50d0e9c645845a4001e4da390318fa0c9e6b45b9 100644 (file)
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
        while (target > 0) {
                count = target;
                /* allocating blocks for indirect blocks and direct blocks */
-               current_block = ext4_new_meta_blocks(handle, inode,
-                                                       goal, &count, err);
+               current_block = ext4_new_meta_blocks(handle, inode, goal,
+                                                    0, &count, err);
                if (*err)
                        goto failed_out;
 
@@ -1930,7 +1930,7 @@ repeat:
         * We do still charge estimated metadata to the sb though;
         * we cannot afford to run out of free blocks.
         */
-       if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+       if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
                dquot_release_reservation_block(inode, 1);
                if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
                        yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
                                continue;
                        }
 
-                       if (PageWriteback(page))
-                               wait_on_page_writeback(page);
-
+                       wait_on_page_writeback(page);
                        BUG_ON(PageWriteback(page));
 
                        if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
                        loff_t end = offset + iov_length(iov, nr_segs);
 
                        if (end > isize)
-                               vmtruncate(inode, isize);
+                               ext4_truncate_failed_write(inode);
                }
        }
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3915,10 +3913,31 @@ void ext4_set_aops(struct inode *inode)
  */
 int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from)
+{
+       unsigned offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned length;
+       unsigned blocksize;
+       struct inode *inode = mapping->host;
+
+       blocksize = inode->i_sb->s_blocksize;
+       length = blocksize - (offset & (blocksize - 1));
+
+       return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zeroed must
+ * be contained within one block.  If the specified range exceeds
+ * the end of the block it will be shortened to the end of the block
+ * that corresponds to 'from'.
+ */
+int ext4_block_zero_page_range(handle_t *handle,
+               struct address_space *mapping, loff_t from, loff_t length)
 {
        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
        unsigned offset = from & (PAGE_CACHE_SIZE-1);
-       unsigned blocksize, length, pos;
+       unsigned blocksize, max, pos;
        ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
                return -EINVAL;
 
        blocksize = inode->i_sb->s_blocksize;
-       length = blocksize - (offset & (blocksize - 1));
+       max = blocksize - (offset & (blocksize - 1));
+
+       /*
+        * clamp 'length' so the range ends no later than
+        * the end of the block containing 'from'
+        */
+       if (length > max || length < 0)
+               length = max;
+
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
        if (!page_has_buffers(page))
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 
 int ext4_can_truncate(struct inode *inode)
 {
-       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-               return 0;
        if (S_ISREG(inode->i_mode))
                return 1;
        if (S_ISDIR(inode->i_mode))
@@ -4391,6 +4416,31 @@ int ext4_can_truncate(struct inode *inode)
        return 0;
 }
 
+/*
+ * ext4_punch_hole: punches a hole in a file by releasing the blocks
+ * associated with the given offset and length
+ *
+ * @file:   The file to punch a hole in
+ * @offset: The offset where the hole will begin
+ * @length: The length of the hole
+ *
+ * Returns: 0 on success or negative on failure
+ */
+
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+
+       if (!S_ISREG(inode->i_mode))
+               return -ENOTSUPP;
+
+       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               /* TODO: Add support for non extent hole punching */
+               return -ENOTSUPP;
+       }
+
+       return ext4_ext_punch_hole(file, offset, length);
+}
+
 /*
  * ext4_truncate()
  *
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
        /*
         * Figure out the offset within the block group inode table
         */
-       inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+       inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
        inode_offset = ((inode->i_ino - 1) %
                        EXT4_INODES_PER_GROUP(sb));
        block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
        if (S_ISREG(inode->i_mode) &&
            attr->ia_valid & ATTR_SIZE &&
-           (attr->ia_size < inode->i_size ||
-            (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
+           (attr->ia_size < inode->i_size)) {
                handle_t *handle;
 
                handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                                goto err_out;
                        }
                }
-               /* ext4_truncate will clear the flag */
-               if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
-                       ext4_truncate(inode);
        }
 
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode))
-               rc = vmtruncate(inode, attr->ia_size);
+       if (attr->ia_valid & ATTR_SIZE) {
+               if (attr->ia_size != i_size_read(inode)) {
+                       truncate_setsize(inode, attr->ia_size);
+                       ext4_truncate(inode);
+               } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+                       ext4_truncate(inode);
+       }
 
        if (!rc) {
                setattr_copy(inode, attr);
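
The hunk above adopts the new truncate calling convention: the filesystem
first commits the new size and prunes the page cache with truncate_setsize(),
then releases blocks itself, instead of routing through vmtruncate(). A toy
userspace sketch of the two-step flow, with stubs standing in for the kernel
internals (names are hypothetical):

#include <stdio.h>

struct toy_inode { long long i_size; };

/* stands in for truncate_setsize(): update i_size, drop stale pagecache */
static void toy_setsize(struct toy_inode *inode, long long newsize)
{
        inode->i_size = newsize;
}

/* stands in for ext4_truncate(): release blocks past i_size */
static void toy_truncate(struct toy_inode *inode)
{
        printf("freeing blocks beyond offset %lld\n", inode->i_size);
}

int main(void)
{
        struct toy_inode ino = { .i_size = 1 << 20 };
        toy_setsize(&ino, 4096);   /* size and pagecache first ... */
        toy_truncate(&ino);        /* ... then the blocks */
        return 0;
}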
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                goto out_unlock;
        }
        ret = 0;
-       if (PageMappedToDisk(page))
-               goto out_unlock;
+
+       lock_page(page);
+       wait_on_page_writeback(page);
+       if (PageMappedToDisk(page)) {
+               up_read(&inode->i_alloc_sem);
+               return VM_FAULT_LOCKED;
+       }
 
        if (page->index == size >> PAGE_CACHE_SHIFT)
                len = size & ~PAGE_CACHE_MASK;
        else
                len = PAGE_CACHE_SIZE;
 
-       lock_page(page);
        /*
         * return if we have all the buffers mapped. This avoids
         * the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (page_has_buffers(page)) {
                if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
                                        ext4_bh_unmapped)) {
-                       unlock_page(page);
-                       goto out_unlock;
+                       up_read(&inode->i_alloc_sem);
+                       return VM_FAULT_LOCKED;
                }
        }
        unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (ret < 0)
                goto out_unlock;
        ret = 0;
+
+       /*
+        * write_begin/end might have created a dirty page and someone
+        * could wander in and start the IO.  Make sure that hasn't
+        * happened.
+        */
+       lock_page(page);
+       wait_on_page_writeback(page);
+       up_read(&inode->i_alloc_sem);
+       return VM_FAULT_LOCKED;
 out_unlock:
        if (ret)
                ret = VM_FAULT_SIGBUS;
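
The reworked fault path now returns VM_FAULT_LOCKED with the page locked
after waiting for writeback, closing the window in which I/O could start on
the page before the fault returns. A userspace sketch that drives this path
by taking a write fault on a shared mapping (an assumed test, not part of
the patch):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);
        if (fd < 0)
                return 1;
        if (ftruncate(fd, 4096))
                return 1;
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        p[0] = 'x';               /* first write fault -> page_mkwrite */
        munmap(p, 4096);
        close(fd);
        return 0;
}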
index d8a16eecf1d55748f59c6b66efe8cdf46b589803..859f2ae8864e6af2b85dc62135f7f89cb0258d4f 100644 (file)
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
        struct inode *inode;
        char *data;
        char *bitmap;
+       struct ext4_group_info *grinfo;
 
        mb_debug(1, "init page %lu\n", page->index);
 
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                if (first_group + i >= ngroups)
                        break;
 
+               grinfo = ext4_get_group_info(sb, first_group + i);
+               /*
+                * If page is uptodate then we came here after online resize
+                * which added some new uninitialized group info structs, so
+                * we must skip all initialized uptodate buddies on the page,
+                * which may be currently in use by an allocating task.
+                */
+               if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+                       bh[i] = NULL;
+                       continue;
+               }
+
                err = -EIO;
                desc = ext4_get_group_desc(sb, first_group + i, NULL);
                if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
        }
 
        /* wait for I/O completion */
-       for (i = 0; i < groups_per_page && bh[i]; i++)
-               wait_on_buffer(bh[i]);
+       for (i = 0; i < groups_per_page; i++)
+               if (bh[i])
+                       wait_on_buffer(bh[i]);
 
        err = -EIO;
-       for (i = 0; i < groups_per_page && bh[i]; i++)
-               if (!buffer_uptodate(bh[i]))
+       for (i = 0; i < groups_per_page; i++)
+               if (bh[i] && !buffer_uptodate(bh[i]))
                        goto out;
 
        err = 0;
        first_block = page->index * blocks_per_page;
-       /* init the page  */
-       memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
        for (i = 0; i < blocks_per_page; i++) {
                int group;
-               struct ext4_group_info *grinfo;
 
                group = (first_block + i) >> 1;
                if (group >= ngroups)
                        break;
 
+               if (!bh[group - first_group])
+                       /* skip initialized uptodate buddy */
+                       continue;
+
                /*
                 * data carries information regarding this
                 * particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                         * incore got set to the group block bitmap below
                         */
                        ext4_lock_group(sb, group);
+                       /* init the buddy */
+                       memset(data, 0xff, blocksize);
                        ext4_mb_generate_buddy(sb, data, incore, group);
                        ext4_unlock_group(sb, group);
                        incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 out:
        if (bh) {
-               for (i = 0; i < groups_per_page && bh[i]; i++)
+               for (i = 0; i < groups_per_page; i++)
                        brelse(bh[i]);
                if (bh != &bhs)
                        kfree(bh);
@@ -957,22 +974,21 @@ out:
 }
 
 /*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen  whild holding the buddy cache
- * lock
+ * Lock the buddy and bitmap pages. This makes sure no parallel init_group
+ * runs on the same buddy page while we hold the buddy page lock. The locked
+ * buddy and bitmap pages are returned via the e4b struct. If buddy and bitmap
+ * are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
  */
-static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
-                                       ext4_group_t group)
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+               ext4_group_t group, struct ext4_buddy *e4b)
 {
-       int i;
-       int block, pnum;
+       struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+       int block, pnum, poff;
        int blocks_per_page;
-       int groups_per_page;
-       ext4_group_t ngroups = ext4_get_groups_count(sb);
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
+       struct page *page;
+
+       e4b->bd_buddy_page = NULL;
+       e4b->bd_bitmap_page = NULL;
 
        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
        /*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
         */
        block = group * 2;
        pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-
-       groups_per_page = blocks_per_page >> 1;
-       if (groups_per_page == 0)
-               groups_per_page = 1;
-       /* read all groups the page covers into the cache */
-       for (i = 0; i < groups_per_page; i++) {
+       poff = block % blocks_per_page;
+       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+       if (!page)
+               return -EIO;
+       BUG_ON(page->mapping != inode->i_mapping);
+       e4b->bd_bitmap_page = page;
+       e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-               if ((first_group + i) >= ngroups)
-                       break;
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               down_write_nested(&grp->alloc_sem, i);
+       if (blocks_per_page >= 2) {
+               /* buddy and bitmap are on the same page */
+               return 0;
        }
-       return i;
+
+       block++;
+       pnum = block / blocks_per_page;
+       poff = block % blocks_per_page;
+       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+       if (!page)
+               return -EIO;
+       BUG_ON(page->mapping != inode->i_mapping);
+       e4b->bd_buddy_page = page;
+       return 0;
 }
 
-static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-                                        ext4_group_t group, int locked_group)
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
 {
-       int i;
-       int block, pnum;
-       int blocks_per_page;
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
-
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-       /* release locks on all the groups */
-       for (i = 0; i < locked_group; i++) {
-
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               up_write(&grp->alloc_sem);
+       if (e4b->bd_bitmap_page) {
+               unlock_page(e4b->bd_bitmap_page);
+               page_cache_release(e4b->bd_bitmap_page);
+       }
+       if (e4b->bd_buddy_page) {
+               unlock_page(e4b->bd_buddy_page);
+               page_cache_release(e4b->bd_buddy_page);
        }
-
 }
 
 /*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
 
-       int ret = 0;
-       void *bitmap;
-       int blocks_per_page;
-       int block, pnum, poff;
-       int num_grp_locked = 0;
        struct ext4_group_info *this_grp;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct inode *inode = sbi->s_buddy_cache;
-       struct page *page = NULL, *bitmap_page = NULL;
+       struct ext4_buddy e4b;
+       struct page *page;
+       int ret = 0;
 
        mb_debug(1, "init group %u\n", group);
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
        this_grp = ext4_get_group_info(sb, group);
        /*
         * This ensures that we don't reinit the buddy cache
         * page which map to the group from which we are already
         * allocating. If we are looking at the buddy cache we would
         * have taken a reference using ext4_mb_load_buddy and that
-        * would have taken the alloc_sem lock.
+        * would have pinned buddy page to page cache.
         */
-       num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
-       if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+       ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+       if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
                /*
                 * somebody initialized the group
                 * return without doing anything
                 */
-               ret = 0;
                goto err;
        }
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       poff = block % blocks_per_page;
-       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-       if (page) {
-               BUG_ON(page->mapping != inode->i_mapping);
-               ret = ext4_mb_init_cache(page, NULL);
-               if (ret) {
-                       unlock_page(page);
-                       goto err;
-               }
-               unlock_page(page);
-       }
-       if (page == NULL || !PageUptodate(page)) {
+
+       page = e4b.bd_bitmap_page;
+       ret = ext4_mb_init_cache(page, NULL);
+       if (ret)
+               goto err;
+       if (!PageUptodate(page)) {
                ret = -EIO;
                goto err;
        }
        mark_page_accessed(page);
-       bitmap_page = page;
-       bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-       /* init buddy cache */
-       block++;
-       pnum = block / blocks_per_page;
-       poff = block % blocks_per_page;
-       page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-       if (page == bitmap_page) {
+       if (e4b.bd_buddy_page == NULL) {
                /*
                 * If both the bitmap and buddy are in
                 * the same page we don't need to force
                 * init the buddy
                 */
-               unlock_page(page);
-       } else if (page) {
-               BUG_ON(page->mapping != inode->i_mapping);
-               ret = ext4_mb_init_cache(page, bitmap);
-               if (ret) {
-                       unlock_page(page);
-                       goto err;
-               }
-               unlock_page(page);
+               ret = 0;
+               goto err;
        }
-       if (page == NULL || !PageUptodate(page)) {
+       /* init buddy cache */
+       page = e4b.bd_buddy_page;
+       ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+       if (ret)
+               goto err;
+       if (!PageUptodate(page)) {
                ret = -EIO;
                goto err;
        }
        mark_page_accessed(page);
 err:
-       ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
-       if (bitmap_page)
-               page_cache_release(bitmap_page);
-       if (page)
-               page_cache_release(page);
+       ext4_mb_put_buddy_page_lock(&e4b);
        return ret;
 }
 
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
        e4b->bd_group = group;
        e4b->bd_buddy_page = NULL;
        e4b->bd_bitmap_page = NULL;
-       e4b->alloc_semp = &grp->alloc_sem;
-
-       /* Take the read lock on the group alloc
-        * sem. This would make sure a parallel
-        * ext4_mb_init_group happening on other
-        * groups mapped by the page is blocked
-        * till we are done with allocation
-        */
-repeat_load_buddy:
-       down_read(e4b->alloc_semp);
 
        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
-               /* we need to check for group need init flag
-                * with alloc_semp held so that we can be sure
-                * that new blocks didn't get added to the group
-                * when we are loading the buddy cache
-                */
-               up_read(e4b->alloc_semp);
                /*
                 * we need full data about the group
                 * to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
                ret = ext4_mb_init_group(sb, group);
                if (ret)
                        return ret;
-               goto repeat_load_buddy;
        }
 
        /*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
        return 0;
 
 err:
+       if (page)
+               page_cache_release(page);
        if (e4b->bd_bitmap_page)
                page_cache_release(e4b->bd_bitmap_page);
        if (e4b->bd_buddy_page)
                page_cache_release(e4b->bd_buddy_page);
        e4b->bd_buddy = NULL;
        e4b->bd_bitmap = NULL;
-
-       /* Done with the buddy cache */
-       up_read(e4b->alloc_semp);
        return ret;
 }
 
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
                page_cache_release(e4b->bd_bitmap_page);
        if (e4b->bd_buddy_page)
                page_cache_release(e4b->bd_buddy_page);
-       /* Done with the buddy cache */
-       if (e4b->alloc_semp)
-               up_read(e4b->alloc_semp);
 }
 
 
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
        get_page(ac->ac_bitmap_page);
        ac->ac_buddy_page = e4b->bd_buddy_page;
        get_page(ac->ac_buddy_page);
-       /* on allocation we use ac to track the held semaphore */
-       ac->alloc_semp =  e4b->alloc_semp;
-       e4b->alloc_semp = NULL;
        /* store last allocated for subsequent stream allocation */
        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
                spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        struct super_block *sb = journal->j_private;
        struct ext4_buddy e4b;
        struct ext4_group_info *db;
-       int err, ret, count = 0, count2 = 0;
+       int err, count = 0, count2 = 0;
        struct ext4_free_data *entry;
        struct list_head *l, *ltmp;
 
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
                         entry->count, entry->group, entry);
 
-               if (test_opt(sb, DISCARD)) {
-                       ret = ext4_issue_discard(sb, entry->group,
-                                       entry->start_blk, entry->count);
-                       if (unlikely(ret == -EOPNOTSUPP)) {
-                               ext4_warning(sb, "discard not supported, "
-                                                "disabling");
-                               clear_opt(sb, DISCARD);
-                       }
-               }
+               if (test_opt(sb, DISCARD))
+                       ext4_issue_discard(sb, entry->group,
+                                          entry->start_blk, entry->count);
 
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
                /* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
                        spin_unlock(&pa->pa_lock);
                }
        }
-       if (ac->alloc_semp)
-               up_read(ac->alloc_semp);
        if (pa) {
                /*
                 * We want to add the pa to the right bucket.
                 * Remove it from the list and while adding
                 * make sure the list to which we are adding
-                * doesn't grow big.  We need to release
-                * alloc_semp before calling ext4_mb_add_n_trim()
+                * doesn't grow big.
                 */
                if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
                        spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                 * there is enough free blocks to do block allocation
                 * and verify allocation doesn't exceed the quota limits.
                 */
-               while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+               while (ar->len &&
+                       ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
+
                        /* let others to free the space */
                        yield();
                        ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                        return 0;
                }
                reserv_blks = ar->len;
-               while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
-                       ar->flags |= EXT4_MB_HINT_NOPREALLOC;
-                       ar->len--;
+               if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+                       dquot_alloc_block_nofail(ar->inode, ar->len);
+               } else {
+                       while (ar->len &&
+                               dquot_alloc_block(ar->inode, ar->len)) {
+
+                               ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+                               ar->len--;
+                       }
                }
                inquota = ar->len;
                if (ar->len == 0) {
@@ -4703,6 +4644,127 @@ error_return:
        return;
 }
 
+/**
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
+ * @handle:                    handle to this transaction
+ * @sb:                                super block
+ * @block:                     start physical block to add to the block group
+ * @count:                     number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+                        ext4_fsblk_t block, unsigned long count)
+{
+       struct buffer_head *bitmap_bh = NULL;
+       struct buffer_head *gd_bh;
+       ext4_group_t block_group;
+       ext4_grpblk_t bit;
+       unsigned int i;
+       struct ext4_group_desc *desc;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_buddy e4b;
+       int err = 0, ret, blk_free_count;
+       ext4_grpblk_t blocks_freed;
+       struct ext4_group_info *grp;
+
+       ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+       ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+       grp = ext4_get_group_info(sb, block_group);
+       /*
+        * Check to see if we are freeing blocks across a group
+        * boundary.
+        */
+       if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
+               goto error_return;
+
+       bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+       if (!bitmap_bh)
+               goto error_return;
+       desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+       if (!desc)
+               goto error_return;
+
+       if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+           in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+           in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+           in_range(block + count - 1, ext4_inode_table(sb, desc),
+                    sbi->s_itb_per_group)) {
+               ext4_error(sb, "Adding blocks in system zones - "
+                          "Block = %llu, count = %lu",
+                          block, count);
+               goto error_return;
+       }
+
+       BUFFER_TRACE(bitmap_bh, "getting write access");
+       err = ext4_journal_get_write_access(handle, bitmap_bh);
+       if (err)
+               goto error_return;
+
+       /*
+        * We are about to modify some metadata.  Call the journal APIs
+        * to unshare ->b_data if a currently-committing transaction is
+        * using it
+        */
+       BUFFER_TRACE(gd_bh, "get_write_access");
+       err = ext4_journal_get_write_access(handle, gd_bh);
+       if (err)
+               goto error_return;
+
+       for (i = 0, blocks_freed = 0; i < count; i++) {
+               BUFFER_TRACE(bitmap_bh, "clear bit");
+               if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+                       ext4_error(sb, "bit already cleared for block %llu",
+                                  (ext4_fsblk_t)(block + i));
+                       BUFFER_TRACE(bitmap_bh, "bit already cleared");
+               } else {
+                       blocks_freed++;
+               }
+       }
+
+       err = ext4_mb_load_buddy(sb, block_group, &e4b);
+       if (err)
+               goto error_return;
+
+       /*
+        * We need to update group_info->bb_free and the bitmap
+        * with the group lock held; generate_buddy looks at
+        * them under the group lock as well.
+        */
+       ext4_lock_group(sb, block_group);
+       mb_clear_bits(bitmap_bh->b_data, bit, count);
+       mb_free_blocks(NULL, &e4b, bit, count);
+       blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+       ext4_free_blks_set(sb, desc, blk_free_count);
+       desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+       ext4_unlock_group(sb, block_group);
+       percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+
+       if (sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+               atomic_add(blocks_freed,
+                          &sbi->s_flex_groups[flex_group].free_blocks);
+       }
+
+       ext4_mb_unload_buddy(&e4b);
+
+       /* We dirtied the bitmap block */
+       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+       err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+       /* And the group descriptor block */
+       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+       ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+       if (!err)
+               err = ret;
+
+error_return:
+       brelse(bitmap_bh);
+       ext4_std_error(sb, err);
+       return;
+}
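
The system-zone guard above uses the kernel's in_range() helper, which treats
(first, len) as an inclusive run of blocks. A quick standalone check of the
semantics, with illustrative block numbers:

#include <stdio.h>

#define in_range(b, first, len) \
        ((b) >= (first) && (b) <= (first) + (len) - 1)

int main(void)
{
        /* would a bitmap block at 104 overlap added blocks 100..107? */
        printf("%d\n", in_range(104, 100, 8));  /* 1: overlap, reject */
        printf("%d\n", in_range(108, 100, 8));  /* 0: just past the run */
        return 0;
}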
+
 /**
  * ext4_trim_extent -- function to TRIM one single free extent in the group
  * @sb:                super block for the file system
@@ -4715,11 +4777,10 @@ error_return:
 * one will allocate those blocks and mark them as used in the buddy bitmap.
 * This must be called under the group lock.
  */
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
-               ext4_group_t group, struct ext4_buddy *e4b)
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+                            ext4_group_t group, struct ext4_buddy *e4b)
 {
        struct ext4_free_extent ex;
-       int ret = 0;
 
        assert_spin_locked(ext4_group_lock_ptr(sb, group));
 
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
         */
        mb_mark_used(e4b, &ex);
        ext4_unlock_group(sb, group);
-
-       ret = ext4_issue_discard(sb, group, start, count);
-
+       ext4_issue_discard(sb, group, start, count);
        ext4_lock_group(sb, group);
        mb_free_blocks(NULL, e4b, start, ex.fe_len);
-       return ret;
 }
 
 /**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
  * the group buddy bitmap. This is done until whole group is scanned.
  */
 static ext4_grpblk_t
-ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
-               ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+                  ext4_grpblk_t start, ext4_grpblk_t max,
+                  ext4_grpblk_t minblocks)
 {
        void *bitmap;
        ext4_grpblk_t next, count = 0;
-       ext4_group_t group;
-       int ret = 0;
+       struct ext4_buddy e4b;
+       int ret;
 
-       BUG_ON(e4b == NULL);
+       ret = ext4_mb_load_buddy(sb, group, &e4b);
+       if (ret) {
+               ext4_error(sb, "Error in loading buddy "
+                               "information for %u", group);
+               return ret;
+       }
+       bitmap = e4b.bd_bitmap;
 
-       bitmap = e4b->bd_bitmap;
-       group = e4b->bd_group;
-       start = (e4b->bd_info->bb_first_free > start) ?
-               e4b->bd_info->bb_first_free : start;
        ext4_lock_group(sb, group);
+       start = (e4b.bd_info->bb_first_free > start) ?
+               e4b.bd_info->bb_first_free : start;
 
        while (start < max) {
                start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
                next = mb_find_next_bit(bitmap, max, start);
 
                if ((next - start) >= minblocks) {
-                       ret = ext4_trim_extent(sb, start,
-                               next - start, group, e4b);
-                       if (ret < 0)
-                               break;
+                       ext4_trim_extent(sb, start,
+                                        next - start, group, &e4b);
                        count += next - start;
                }
                start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
                        ext4_lock_group(sb, group);
                }
 
-               if ((e4b->bd_info->bb_free - count) < minblocks)
+               if ((e4b.bd_info->bb_free - count) < minblocks)
                        break;
        }
        ext4_unlock_group(sb, group);
+       ext4_mb_unload_buddy(&e4b);
 
        ext4_debug("trimmed %d blocks in the group %d\n",
                count, group);
 
-       if (ret < 0)
-               count = ret;
-
        return count;
 }
 
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
  */
 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 {
-       struct ext4_buddy e4b;
+       struct ext4_group_info *grp;
        ext4_group_t first_group, last_group;
        ext4_group_t group, ngroups = ext4_get_groups_count(sb);
        ext4_grpblk_t cnt = 0, first_block, last_block;
-       uint64_t start, len, minlen, trimmed;
+       uint64_t start, len, minlen, trimmed = 0;
        ext4_fsblk_t first_data_blk =
                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
        int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
        start = range->start >> sb->s_blocksize_bits;
        len = range->len >> sb->s_blocksize_bits;
        minlen = range->minlen >> sb->s_blocksize_bits;
-       trimmed = 0;
 
        if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
                return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
                return -EINVAL;
 
        for (group = first_group; group <= last_group; group++) {
-               ret = ext4_mb_load_buddy(sb, group, &e4b);
-               if (ret) {
-                       ext4_error(sb, "Error in loading buddy "
-                                       "information for %u", group);
-                       break;
+               grp = ext4_get_group_info(sb, group);
+               /* We only do this if the grp has never been initialized */
+               if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+                       ret = ext4_mb_init_group(sb, group);
+                       if (ret)
+                               break;
                }
 
                /*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
                        last_block = first_block + len;
                len -= last_block - first_block;
 
-               if (e4b.bd_info->bb_free >= minlen) {
-                       cnt = ext4_trim_all_free(sb, &e4b, first_block,
+               if (grp->bb_free >= minlen) {
+                       cnt = ext4_trim_all_free(sb, group, first_block,
                                                last_block, minlen);
                        if (cnt < 0) {
                                ret = cnt;
-                               ext4_mb_unload_buddy(&e4b);
                                break;
                        }
                }
-               ext4_mb_unload_buddy(&e4b);
                trimmed += cnt;
                first_block = 0;
        }
index 22bd4d7f289b834b277fb55ce92cefa019c9d354..20b5e7bfebd175e27db63f959c0d5d89a9f929b0 100644 (file)
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
        __u8 ac_op;             /* operation, for history only */
        struct page *ac_bitmap_page;
        struct page *ac_buddy_page;
-       /*
-        * pointer to the held semaphore upon successful
-        * block allocation
-        */
-       struct rw_semaphore *alloc_semp;
        struct ext4_prealloc_space *ac_pa;
        struct ext4_locality_group *ac_lg;
 };
@@ -215,7 +210,6 @@ struct ext4_buddy {
        struct super_block *bd_sb;
        __u16 bd_blkbits;
        ext4_group_t bd_group;
-       struct rw_semaphore *alloc_semp;
 };
 #define EXT4_MB_BITMAP(e4b)    ((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)     ((e4b)->bd_buddy)
index 92816b4e0f16a143f555d539cb9cb47abc7e1d48..b57b98fb44d1457ec9f98e60290d6da3a90fb6c8 100644 (file)
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
         * We have the extent map build with the tmp inode.
         * Now copy the i_data across
         */
-       ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS);
+       ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
        memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
 
        /*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644 (file)
index 0000000..9bdef3f
--- /dev/null
@@ -0,0 +1,351 @@
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/*
+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block(struct buffer_head *bh)
+{
+       mark_buffer_dirty(bh);
+       lock_buffer(bh);
+       bh->b_end_io = end_buffer_write_sync;
+       get_bh(bh);
+       submit_bh(WRITE_SYNC, bh);
+       wait_on_buffer(bh);
+       if (unlikely(!buffer_uptodate(bh)))
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+                         ext4_fsblk_t mmp_block)
+{
+       struct mmp_struct *mmp;
+
+       if (*bh)
+               clear_buffer_uptodate(*bh);
+
+       /* This would be sb_bread(sb, mmp_block), except we need to be sure
+        * that the MD RAID device cache has been bypassed, and that the read
+        * is not blocked in the elevator. */
+       if (!*bh)
+               *bh = sb_getblk(sb, mmp_block);
+       if (*bh) {
+               get_bh(*bh);
+               lock_buffer(*bh);
+               (*bh)->b_end_io = end_buffer_read_sync;
+               submit_bh(READ_SYNC, *bh);
+               wait_on_buffer(*bh);
+               if (!buffer_uptodate(*bh)) {
+                       brelse(*bh);
+                       *bh = NULL;
+               }
+       }
+       if (!*bh) {
+               ext4_warning(sb, "Error while reading MMP block %llu",
+                            mmp_block);
+               return -EIO;
+       }
+
+       mmp = (struct mmp_struct *)((*bh)->b_data);
+       if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+                   const char *function, unsigned int line, const char *msg)
+{
+       __ext4_warning(sb, function, line, msg);
+       __ext4_warning(sb, function, line,
+                      "MMP failure info: last update time: %llu, last update "
+                      "node: %s, last update device: %s\n",
+                      (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+                      mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+       struct super_block *sb = ((struct mmpd_data *) data)->sb;
+       struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+       struct mmp_struct *mmp;
+       ext4_fsblk_t mmp_block;
+       u32 seq = 0;
+       unsigned long failed_writes = 0;
+       int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+       unsigned mmp_check_interval;
+       unsigned long last_update_time;
+       unsigned long diff;
+       int retval;
+
+       mmp_block = le64_to_cpu(es->s_mmp_block);
+       mmp = (struct mmp_struct *)(bh->b_data);
+       mmp->mmp_time = cpu_to_le64(get_seconds());
+       /*
+        * Start with the higher mmp_check_interval and reduce it if
+        * the MMP block is being updated on time.
+        */
+       mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+                                EXT4_MMP_MIN_CHECK_INTERVAL);
+       mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+       bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+       memcpy(mmp->mmp_nodename, init_utsname()->sysname,
+              sizeof(mmp->mmp_nodename));
+
+       while (!kthread_should_stop()) {
+               if (++seq > EXT4_MMP_SEQ_MAX)
+                       seq = 1;
+
+               mmp->mmp_seq = cpu_to_le32(seq);
+               mmp->mmp_time = cpu_to_le64(get_seconds());
+               last_update_time = jiffies;
+
+               retval = write_mmp_block(bh);
+               /*
+                * Don't spew too many error messages. Print one every
+                * (s_mmp_update_interval * 60) seconds.
+                */
+               if (retval && (failed_writes % 60) == 0) {
+                       ext4_error(sb, "Error writing to MMP block");
+                       failed_writes++;
+               }
+
+               if (!(le32_to_cpu(es->s_feature_incompat) &
+                   EXT4_FEATURE_INCOMPAT_MMP)) {
+                       ext4_warning(sb, "kmmpd being stopped since MMP feature"
+                                    " has been disabled.");
+                       EXT4_SB(sb)->s_mmp_tsk = NULL;
+                       goto failed;
+               }
+
+               if (sb->s_flags & MS_RDONLY) {
+                       ext4_warning(sb, "kmmpd being stopped since filesystem "
+                                    "has been remounted as readonly.");
+                       EXT4_SB(sb)->s_mmp_tsk = NULL;
+                       goto failed;
+               }
+
+               diff = jiffies - last_update_time;
+               if (diff < mmp_update_interval * HZ)
+                       schedule_timeout_interruptible(mmp_update_interval *
+                                                      HZ - diff);
+
+               /*
+                * We need to make sure that more than mmp_check_interval
+                * seconds have not passed since writing. If that has happened
+                * we need to check if the MMP block is as we left it.
+                */
+               diff = jiffies - last_update_time;
+               if (diff > mmp_check_interval * HZ) {
+                       struct buffer_head *bh_check = NULL;
+                       struct mmp_struct *mmp_check;
+
+                       retval = read_mmp_block(sb, &bh_check, mmp_block);
+                       if (retval) {
+                               ext4_error(sb, "error reading MMP data: %d",
+                                          retval);
+
+                               EXT4_SB(sb)->s_mmp_tsk = NULL;
+                               goto failed;
+                       }
+
+                       mmp_check = (struct mmp_struct *)(bh_check->b_data);
+                       if (mmp->mmp_seq != mmp_check->mmp_seq ||
+                           memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+                                  sizeof(mmp->mmp_nodename))) {
+                               dump_mmp_msg(sb, mmp_check,
+                                            "Error while updating MMP info. "
+                                            "The filesystem seems to have been"
+                                            " multiply mounted.");
+                               ext4_error(sb, "abort");
+                               goto failed;
+                       }
+                       put_bh(bh_check);
+               }
+
+                /*
+                * Adjust the mmp_check_interval depending on how much time
+                * it took for the MMP block to be written.
+                */
+               mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+                                            EXT4_MMP_MAX_CHECK_INTERVAL),
+                                        EXT4_MMP_MIN_CHECK_INTERVAL);
+               mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+       }
+
+       /*
+        * Unmount seems to be clean.
+        */
+       mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+       mmp->mmp_time = cpu_to_le64(get_seconds());
+
+       retval = write_mmp_block(bh);
+
+failed:
+       kfree(data);
+       brelse(bh);
+       return retval;
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+       u32 new_seq;
+
+       do {
+               get_random_bytes(&new_seq, sizeof(u32));
+       } while (new_seq > EXT4_MMP_SEQ_MAX);
+
+       return new_seq;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+                                   ext4_fsblk_t mmp_block)
+{
+       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+       struct buffer_head *bh = NULL;
+       struct mmp_struct *mmp = NULL;
+       struct mmpd_data *mmpd_data;
+       u32 seq;
+       unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+       unsigned int wait_time = 0;
+       int retval;
+
+       if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+           mmp_block >= ext4_blocks_count(es)) {
+               ext4_warning(sb, "Invalid MMP block in superblock");
+               goto failed;
+       }
+
+       retval = read_mmp_block(sb, &bh, mmp_block);
+       if (retval)
+               goto failed;
+
+       mmp = (struct mmp_struct *)(bh->b_data);
+
+       if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+               mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+       /*
+        * If check_interval in MMP block is larger, use that instead of
+        * update_interval from the superblock.
+        */
+       if (mmp->mmp_check_interval > mmp_check_interval)
+               mmp_check_interval = mmp->mmp_check_interval;
+
+       seq = le32_to_cpu(mmp->mmp_seq);
+       if (seq == EXT4_MMP_SEQ_CLEAN)
+               goto skip;
+
+       if (seq == EXT4_MMP_SEQ_FSCK) {
+               dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+               goto failed;
+       }
+
+       wait_time = min(mmp_check_interval * 2 + 1,
+                       mmp_check_interval + 60);
+
+       /* Print MMP interval if more than 20 secs. */
+       if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+               ext4_warning(sb, "MMP interval %u higher than expected, please"
+                            " wait.\n", wait_time * 2);
+
+       if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+               ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+               goto failed;
+       }
+
+       retval = read_mmp_block(sb, &bh, mmp_block);
+       if (retval)
+               goto failed;
+       mmp = (struct mmp_struct *)(bh->b_data);
+       if (seq != le32_to_cpu(mmp->mmp_seq)) {
+               dump_mmp_msg(sb, mmp,
+                            "Device is already active on another node.");
+               goto failed;
+       }
+
+skip:
+       /*
+        * write a new random sequence number.
+        */
+       mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
+
+       retval = write_mmp_block(bh);
+       if (retval)
+               goto failed;
+
+       /*
+        * wait for MMP interval and check mmp_seq.
+        */
+       if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+               ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+               goto failed;
+       }
+
+       retval = read_mmp_block(sb, &bh, mmp_block);
+       if (retval)
+               goto failed;
+       mmp = (struct mmp_struct *)(bh->b_data);
+       if (seq != le32_to_cpu(mmp->mmp_seq)) {
+               dump_mmp_msg(sb, mmp,
+                            "Device is already active on another node.");
+               goto failed;
+       }
+
+       mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+       if (!mmpd_data) {
+               ext4_warning(sb, "not enough memory for mmpd_data");
+               goto failed;
+       }
+       mmpd_data->sb = sb;
+       mmpd_data->bh = bh;
+
+       /*
+        * Start a kernel thread to update the MMP block periodically.
+        */
+       EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+                                            bdevname(bh->b_bdev,
+                                                     mmp->mmp_bdevname));
+       if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+               EXT4_SB(sb)->s_mmp_tsk = NULL;
+               kfree(mmpd_data);
+               ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+                            sb->s_id);
+               goto failed;
+       }
+
+       return 0;
+
+failed:
+       brelse(bh);
+       return 1;
+}
+
+
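The startup wait computed above is min(2 * interval + 1, interval + 60)
seconds; a quick standalone check of how the cap behaves for a few check
intervals (values are illustrative):

#include <stdio.h>

static unsigned wait_time(unsigned check_interval)
{
        unsigned a = check_interval * 2 + 1;
        unsigned b = check_interval + 60;

        return a < b ? a : b;
}

int main(void)
{
        printf("%u %u %u\n", wait_time(5),     /* 11: doubling wins */
                             wait_time(30),    /* 61: doubling wins */
                             wait_time(120));  /* 180: +60 cap wins */
        return 0;
}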
index b9f3e7862f13834b2c166b21a1b96d3e36dde428..2b8304bf3c50e1d1363287e79a66d5641fccceaa 100644 (file)
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
         * It needs to call wait_on_page_writeback() to wait for the
         * writeback of the page.
         */
-       if (PageWriteback(page))
-               wait_on_page_writeback(page);
+       wait_on_page_writeback(page);
 
        /* Release old bh and drop refs */
        try_to_release_page(page, 0);
index 67fd0b0258589ae64428d26530807b898e79854b..b754b7721f51fea4dd943a68e436fe5677f301a0 100644 (file)
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        frame->at = entries;
        frame->bh = bh;
        bh = bh2;
+
+       ext4_handle_dirty_metadata(handle, dir, frame->bh);
+       ext4_handle_dirty_metadata(handle, dir, bh);
+
        de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
-       dx_release (frames);
-       if (!(de))
+       if (!de) {
+               /*
+                * Even if the block split failed, we have to properly write
+                * out all the changes we did so far. Otherwise we can end up
+                * with a corrupted filesystem.
+                */
+               ext4_mark_inode_dirty(handle, dir);
+               dx_release(frames);
                return retval;
+       }
+       dx_release(frames);
 
        retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
        brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
        handle_t *handle;
        struct inode *inode;
        int l, err, retries = 0;
+       int credits;
 
        l = strlen(symname)+1;
        if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
 
        dquot_initialize(dir);
 
+       if (l > EXT4_N_BLOCKS * 4) {
+               /*
+                * For non-fast symlinks, we just allocate inode and put it on
+                * orphan list in the first transaction => we need bitmap,
+                * group descriptor, sb, inode block, quota blocks.
+                */
+               credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+       } else {
+               /*
+                * Fast symlink. We have to add entry to directory
+                * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
+                * allocate new inode (bitmap, group descriptor, inode block,
+                * quota blocks, sb is already counted in previous macros).
+                */
+               credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+                         EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+                         EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+       }
 retry:
-       handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-                                       EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-                                       EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+       handle = ext4_journal_start(dir, credits);
        if (IS_ERR(handle))
                return PTR_ERR(handle);
 
@@ -2263,21 +2292,44 @@ retry:
        if (IS_ERR(inode))
                goto out_stop;
 
-       if (l > sizeof(EXT4_I(inode)->i_data)) {
+       if (l > EXT4_N_BLOCKS * 4) {
                inode->i_op = &ext4_symlink_inode_operations;
                ext4_set_aops(inode);
                /*
-                * page_symlink() calls into ext4_prepare/commit_write.
-                * We have a transaction open.  All is sweetness.  It also sets
-                * i_size in generic_commit_write().
+                * We cannot call page_symlink() with transaction started
+                * because it calls into ext4_write_begin() which can wait
+                * for transaction commit if we are running out of space
+                * and thus we deadlock. So we have to stop transaction now
+                * and restart it when the symlink contents are written.
+                *
+                * To keep the fs consistent in case of a crash, we have to put
+                * the inode on the orphan list in the meantime.
                 */
+               drop_nlink(inode);
+               err = ext4_orphan_add(handle, inode);
+               ext4_journal_stop(handle);
+               if (err)
+                       goto err_drop_inode;
                err = __page_symlink(inode, symname, l, 1);
+               if (err)
+                       goto err_drop_inode;
+               /*
+                * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
+                * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+                */
+               handle = ext4_journal_start(dir,
+                               EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+                               EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+               if (IS_ERR(handle)) {
+                       err = PTR_ERR(handle);
+                       goto err_drop_inode;
+               }
+               inc_nlink(inode);
+               err = ext4_orphan_del(handle, inode);
                if (err) {
+                       ext4_journal_stop(handle);
                        clear_nlink(inode);
-                       unlock_new_inode(inode);
-                       ext4_mark_inode_dirty(handle, inode);
-                       iput(inode);
-                       goto out_stop;
+                       goto err_drop_inode;
                }
        } else {
                /* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
        return err;
+err_drop_inode:
+       unlock_new_inode(inode);
+       iput(inode);
+       return err;
 }
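
Any symlink target longer than EXT4_N_BLOCKS * 4 = 60 bytes takes this
block-backed, two-transaction path rather than being stored in i_data. A
trivial userspace trigger (an assumed test, not part of the patch):

#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        char target[128];

        memset(target, 'a', sizeof(target) - 1);
        target[sizeof(target) - 1] = '\0';  /* 127-byte target > 60 */
        if (symlink(target, "longlink"))
                perror("symlink");
        return 0;
}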
 
 static int ext4_link(struct dentry *old_dentry,
index b6dbd056fcb1d7f532f428e34cae4ef5248680ce..7bb8f76d470a019dd3801c6b9d8e6d76825f96c2 100644 (file)
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
        for (i = 0; i < io_end->num_io_pages; i++) {
                struct page *page = io_end->pages[i]->p_page;
                struct buffer_head *bh, *head;
-               int partial_write = 0;
+               loff_t offset;
+               loff_t io_end_offset;
 
-               head = page_buffers(page);
-               if (error)
+               if (error) {
                        SetPageError(page);
-               BUG_ON(!head);
-               if (head->b_size != PAGE_CACHE_SIZE) {
-                       loff_t offset;
-                       loff_t io_end_offset = io_end->offset + io_end->size;
+                       set_bit(AS_EIO, &page->mapping->flags);
+                       head = page_buffers(page);
+                       BUG_ON(!head);
+
+                       io_end_offset = io_end->offset + io_end->size;
 
                        offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
                        bh = head;
                        do {
                                if ((offset >= io_end->offset) &&
-                                   (offset+bh->b_size <= io_end_offset)) {
-                                       if (error)
-                                               buffer_io_error(bh);
-
-                               }
-                               if (buffer_delay(bh))
-                                       partial_write = 1;
-                               else if (!buffer_mapped(bh))
-                                       clear_buffer_dirty(bh);
-                               else if (buffer_dirty(bh))
-                                       partial_write = 1;
+                                   (offset+bh->b_size <= io_end_offset))
+                                       buffer_io_error(bh);
+
                                offset += bh->b_size;
                                bh = bh->b_this_page;
                        } while (bh != head);
                }
 
-               /*
-                * If this is a partial write which happened to make
-                * all buffers uptodate then we can optimize away a
-                * bogus readpage() for the next read(). Here we
-                * 'discover' whether the page went uptodate as a
-                * result of this (potentially partial) write.
-                */
-               if (!partial_write)
-                       SetPageUptodate(page);
-
                put_io_page(io_end->pages[i]);
        }
        io_end->num_io_pages = 0;
index 8553dfb310afd7ac2209d186125287e99a867f61..d9937df7f5cfe18c35d42f683b83333f46cc93a9 100644 (file)
@@ -75,11 +75,27 @@ static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data);
+static inline int ext2_feature_set_ok(struct super_block *sb);
+static inline int ext3_feature_set_ok(struct super_block *sb);
 static int ext4_feature_set_ok(struct super_block *sb, int readonly);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
 
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "ext2",
+       .mount          = ext4_mount,
+       .kill_sb        = kill_block_super,
+       .fs_flags       = FS_REQUIRES_DEV,
+};
+#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#else
+#define IS_EXT2_SB(sb) (0)
+#endif
+
+
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
        .owner          = THIS_MODULE,
@@ -806,6 +822,8 @@ static void ext4_put_super(struct super_block *sb)
                invalidate_bdev(sbi->journal_bdev);
                ext4_blkdev_remove(sbi);
        }
+       if (sbi->s_mmp_tsk)
+               kthread_stop(sbi->s_mmp_tsk);
        sb->s_fs_info = NULL;
        /*
         * Now that we are completely done shutting down the
@@ -1096,7 +1114,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 
        if (!test_opt(sb, INIT_INODE_TABLE))
                seq_puts(seq, ",noinit_inode_table");
-       else if (sbi->s_li_wait_mult)
+       else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
                seq_printf(seq, ",init_inode_table=%u",
                           (unsigned) sbi->s_li_wait_mult);
 
@@ -1187,9 +1205,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                const char *data, size_t len, loff_t off);
 
 static const struct dquot_operations ext4_quota_operations = {
-#ifdef CONFIG_QUOTA
        .get_reserved_space = ext4_get_reserved_space,
-#endif
        .write_dquot    = ext4_write_dquot,
        .acquire_dquot  = ext4_acquire_dquot,
        .release_dquot  = ext4_release_dquot,
@@ -1900,7 +1916,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                ext4_msg(sb, KERN_WARNING,
                         "warning: mounting fs with errors, "
                         "running e2fsck is recommended");
-       else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+       else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
                 le16_to_cpu(es->s_mnt_count) >=
                 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
                ext4_msg(sb, KERN_WARNING,
@@ -2425,6 +2441,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
                          EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 
+static ssize_t extent_cache_hits_show(struct ext4_attr *a,
+                                     struct ext4_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
+}
+
+static ssize_t extent_cache_misses_show(struct ext4_attr *a,
+                                       struct ext4_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
+}
+
 static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
                                          struct ext4_sb_info *sbi,
                                          const char *buf, size_t count)
@@ -2482,6 +2510,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RO_ATTR(extent_cache_hits);
+EXT4_RO_ATTR(extent_cache_misses);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
                 inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2497,6 +2527,8 @@ static struct attribute *ext4_attrs[] = {
        ATTR_LIST(delayed_allocation_blocks),
        ATTR_LIST(session_write_kbytes),
        ATTR_LIST(lifetime_write_kbytes),
+       ATTR_LIST(extent_cache_hits),
+       ATTR_LIST(extent_cache_misses),
        ATTR_LIST(inode_readahead_blks),
        ATTR_LIST(inode_goal),
        ATTR_LIST(mb_stats),
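
The two new read-only counters appear under /sys/fs/ext4/<disk>/ alongside the existing attributes. A small sketch of reading them from user space; the device name sda1 is only an example:

    #include <stdio.h>

    /* Read one sysfs counter; returns -1 on any failure. */
    static long read_counter(const char *path)
    {
        long val = -1;
        FILE *f = fopen(path, "r");

        if (f) {
            if (fscanf(f, "%ld", &val) != 1)
                val = -1;
            fclose(f);
        }
        return val;
    }

    int main(void)
    {
        printf("hits:   %ld\n",
               read_counter("/sys/fs/ext4/sda1/extent_cache_hits"));
        printf("misses: %ld\n",
               read_counter("/sys/fs/ext4/sda1/extent_cache_misses"));
        return 0;
    }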
@@ -2659,12 +2691,6 @@ static void print_daily_error_info(unsigned long arg)
        mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
 }
 
-static void ext4_lazyinode_timeout(unsigned long data)
-{
-       struct task_struct *p = (struct task_struct *)data;
-       wake_up_process(p);
-}
-
 /* Find next suitable group and run ext4_init_inode_table */
 static int ext4_run_li_request(struct ext4_li_request *elr)
 {
@@ -2696,11 +2722,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
                ret = ext4_init_inode_table(sb, group,
                                            elr->lr_timeout ? 0 : 1);
                if (elr->lr_timeout == 0) {
-                       timeout = jiffies - timeout;
-                       if (elr->lr_sbi->s_li_wait_mult)
-                               timeout *= elr->lr_sbi->s_li_wait_mult;
-                       else
-                               timeout *= 20;
+                       timeout = (jiffies - timeout) *
+                                 elr->lr_sbi->s_li_wait_mult;
                        elr->lr_timeout = timeout;
                }
                elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -2712,7 +2735,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
 
 /*
  * Remove lr_request from the list_request and free the
- * request tructure. Should be called with li_list_mtx held
+ * request structure. Should be called with li_list_mtx held
  */
 static void ext4_remove_li_request(struct ext4_li_request *elr)
 {
@@ -2730,14 +2753,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
 
 static void ext4_unregister_li_request(struct super_block *sb)
 {
-       struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
-
-       if (!ext4_li_info)
+       mutex_lock(&ext4_li_mtx);
+       if (!ext4_li_info) {
+               mutex_unlock(&ext4_li_mtx);
                return;
+       }
 
        mutex_lock(&ext4_li_info->li_list_mtx);
-       ext4_remove_li_request(elr);
+       ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
        mutex_unlock(&ext4_li_info->li_list_mtx);
+       mutex_unlock(&ext4_li_mtx);
 }
 
 static struct task_struct *ext4_lazyinit_task;
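
The reworked ext4_unregister_li_request() now tests ext4_li_info only while holding ext4_li_mtx, closing the window in which the lazyinit thread could free the structure between the NULL check and the list removal. A user-space sketch of this check-under-the-outer-lock pattern, assuming pthreads (names are illustrative):

    #include <pthread.h>
    #include <stddef.h>

    static pthread_mutex_t registry_mtx = PTHREAD_MUTEX_INITIALIZER;

    struct registry { pthread_mutex_t list_mtx; };
    static struct registry *registry;

    /* Safe: the registry cannot be freed while registry_mtx is held. */
    static void unregister_entry(void)
    {
        pthread_mutex_lock(&registry_mtx);
        if (!registry) {
            pthread_mutex_unlock(&registry_mtx);
            return;
        }
        pthread_mutex_lock(&registry->list_mtx);
        /* ... remove this sb's request from the list ... */
        pthread_mutex_unlock(&registry->list_mtx);
        pthread_mutex_unlock(&registry_mtx);
    }

    int main(void)
    {
        unregister_entry();    /* registry == NULL: returns safely */
        return 0;
    }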
@@ -2756,17 +2781,10 @@ static int ext4_lazyinit_thread(void *arg)
        struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
        struct list_head *pos, *n;
        struct ext4_li_request *elr;
-       unsigned long next_wakeup;
-       DEFINE_WAIT(wait);
+       unsigned long next_wakeup, cur;
 
        BUG_ON(NULL == eli);
 
-       eli->li_timer.data = (unsigned long)current;
-       eli->li_timer.function = ext4_lazyinode_timeout;
-
-       eli->li_task = current;
-       wake_up(&eli->li_wait_task);
-
 cont_thread:
        while (true) {
                next_wakeup = MAX_JIFFY_OFFSET;
@@ -2797,19 +2815,15 @@ cont_thread:
                if (freezing(current))
                        refrigerator();
 
-               if ((time_after_eq(jiffies, next_wakeup)) ||
+               cur = jiffies;
+               if ((time_after_eq(cur, next_wakeup)) ||
                    (MAX_JIFFY_OFFSET == next_wakeup)) {
                        cond_resched();
                        continue;
                }
 
-               eli->li_timer.expires = next_wakeup;
-               add_timer(&eli->li_timer);
-               prepare_to_wait(&eli->li_wait_daemon, &wait,
-                               TASK_INTERRUPTIBLE);
-               if (time_before(jiffies, next_wakeup))
-                       schedule();
-               finish_wait(&eli->li_wait_daemon, &wait);
+               schedule_timeout_interruptible(next_wakeup - cur);
+
                if (kthread_should_stop()) {
                        ext4_clear_request_list();
                        goto exit_thread;
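
Sampling jiffies into cur once means the time_after_eq() test and the schedule_timeout_interruptible() argument use the same snapshot, so the thread can never be handed a stale or negative interval, and the manual timer plus prepare_to_wait() dance becomes unnecessary. The same sample-once idiom in portable C (POSIX time functions stand in for jiffies):

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    int main(void)
    {
        time_t next_wakeup = time(NULL) + 2;    /* the deadline */
        time_t cur = time(NULL);                /* sample the clock once */

        /* The test and the sleep length both use the same snapshot. */
        if (cur < next_wakeup)
            sleep((unsigned int)(next_wakeup - cur));
        puts("woke up");
        return 0;
    }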
@@ -2833,12 +2847,7 @@ exit_thread:
                goto cont_thread;
        }
        mutex_unlock(&eli->li_list_mtx);
-       del_timer_sync(&ext4_li_info->li_timer);
-       eli->li_task = NULL;
-       wake_up(&eli->li_wait_task);
-
        kfree(ext4_li_info);
-       ext4_lazyinit_task = NULL;
        ext4_li_info = NULL;
        mutex_unlock(&ext4_li_mtx);
 
@@ -2866,7 +2875,6 @@ static int ext4_run_lazyinit_thread(void)
        if (IS_ERR(ext4_lazyinit_task)) {
                int err = PTR_ERR(ext4_lazyinit_task);
                ext4_clear_request_list();
-               del_timer_sync(&ext4_li_info->li_timer);
                kfree(ext4_li_info);
                ext4_li_info = NULL;
                printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2875,8 +2883,6 @@ static int ext4_run_lazyinit_thread(void)
                return err;
        }
        ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
-
-       wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
        return 0;
 }
 
@@ -2911,13 +2917,9 @@ static int ext4_li_info_new(void)
        if (!eli)
                return -ENOMEM;
 
-       eli->li_task = NULL;
        INIT_LIST_HEAD(&eli->li_request_list);
        mutex_init(&eli->li_list_mtx);
 
-       init_waitqueue_head(&eli->li_wait_daemon);
-       init_waitqueue_head(&eli->li_wait_task);
-       init_timer(&eli->li_timer);
        eli->li_state |= EXT4_LAZYINIT_QUIT;
 
        ext4_li_info = eli;
@@ -2960,20 +2962,19 @@ static int ext4_register_li_request(struct super_block *sb,
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        int ret = 0;
 
-       if (sbi->s_li_request != NULL)
+       if (sbi->s_li_request != NULL) {
+               /*
+                * Reset timeout so it can be computed again, because
+                * s_li_wait_mult might have changed.
+                */
+               sbi->s_li_request->lr_timeout = 0;
                return 0;
+       }
 
        if (first_not_zeroed == ngroups ||
            (sb->s_flags & MS_RDONLY) ||
-           !test_opt(sb, INIT_INODE_TABLE)) {
-               sbi->s_li_request = NULL;
+           !test_opt(sb, INIT_INODE_TABLE))
                return 0;
-       }
-
-       if (first_not_zeroed == ngroups) {
-               sbi->s_li_request = NULL;
-               return 0;
-       }
 
        elr = ext4_li_request_new(sb, first_not_zeroed);
        if (!elr)
@@ -3166,6 +3167,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
            ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
                set_opt(sb, DELALLOC);
 
+       /*
+        * Set the default s_li_wait_mult for lazyinit, in case there is
+        * no mount option specified.
+        */
+       sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
        if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
                           &journal_devnum, &journal_ioprio, NULL, 0)) {
                ext4_msg(sb, KERN_WARNING,
@@ -3187,6 +3194,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                       "feature flags set on rev 0 fs, "
                       "running e2fsck is recommended");
 
+       if (IS_EXT2_SB(sb)) {
+               if (ext2_feature_set_ok(sb))
+                       ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
+                                "using the ext4 subsystem");
+               else {
+                       ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
+                                "to feature incompatibilities");
+                       goto failed_mount;
+               }
+       }
+
+       if (IS_EXT3_SB(sb)) {
+               if (ext3_feature_set_ok(sb))
+                       ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
+                                "using the ext4 subsystem");
+               else {
+                       ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
+                                "to feature incompatibilities");
+                       goto failed_mount;
+               }
+       }
+
        /*
         * Check feature flags regardless of the revision level, since we
         * previously didn't change the revision level when setting the flags,
@@ -3459,6 +3488,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                          EXT4_HAS_INCOMPAT_FEATURE(sb,
                                    EXT4_FEATURE_INCOMPAT_RECOVER));
 
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+           !(sb->s_flags & MS_RDONLY))
+               if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+                       goto failed_mount3;
+
        /*
         * The first inode we look at is the journal inode.  Don't try
         * root first: it may be modified in the journal!
@@ -3474,7 +3508,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                goto failed_mount_wq;
        } else {
                clear_opt(sb, DATA_FLAGS);
-               set_opt(sb, WRITEBACK_DATA);
                sbi->s_journal = NULL;
                needs_recovery = 0;
                goto no_journal;
@@ -3707,6 +3740,8 @@ failed_mount3:
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+       if (sbi->s_mmp_tsk)
+               kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
        for (i = 0; i < db_count; i++)
                brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4277,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        int enable_quota = 0;
        ext4_group_t g;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-       int err;
+       int err = 0;
 #ifdef CONFIG_QUOTA
        int i;
 #endif
@@ -4368,6 +4403,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                goto restore_opts;
                        if (!ext4_setup_super(sb, es, 0))
                                sb->s_flags &= ~MS_RDONLY;
+                       if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+                                                    EXT4_FEATURE_INCOMPAT_MMP))
+                               if (ext4_multi_mount_protect(sb,
+                                               le64_to_cpu(es->s_mmp_block))) {
+                                       err = -EROFS;
+                                       goto restore_opts;
+                               }
                        enable_quota = 1;
                }
        }
@@ -4432,6 +4474,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        u64 fsid;
+       s64 bfree;
 
        if (test_opt(sb, MINIX_DF)) {
                sbi->s_overhead_last = 0;
@@ -4475,8 +4518,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_type = EXT4_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-       buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+       bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
                       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+       /* prevent underflow in case little free space is available */
+       buf->f_bfree = max_t(s64, bfree, 0);
        buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
        if (buf->f_bfree < ext4_r_blocks_count(es))
                buf->f_bavail = 0;
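
Since f_bfree is unsigned, subtracting the dirty-block sum directly could wrap to a huge value whenever dirty blocks momentarily exceed free blocks; the patch accumulates in a signed s64 and clamps with max_t() instead. A standalone illustration of the clamp:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t free_blocks = 10, dirty_blocks = 15;
        int64_t bfree = (int64_t)free_blocks - (int64_t)dirty_blocks;

        /* Clamp at zero instead of letting the unsigned copy wrap. */
        uint64_t f_bfree = bfree > 0 ? (uint64_t)bfree : 0;

        printf("%llu\n", (unsigned long long)f_bfree);  /* 0, not 2^64 - 5 */
        return 0;
    }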
@@ -4652,6 +4697,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
        if (test_opt(sb, DELALLOC))
                sync_filesystem(sb);
 
+       if (!inode)
+               goto out;
+
        /* Update modification times of quota files when userspace can
         * start looking at them */
        handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4820,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 }
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext2_fs_type = {
-       .owner          = THIS_MODULE,
-       .name           = "ext2",
-       .mount          = ext4_mount,
-       .kill_sb        = kill_block_super,
-       .fs_flags       = FS_REQUIRES_DEV,
-};
-
 static inline void register_as_ext2(void)
 {
        int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4832,22 @@ static inline void unregister_as_ext2(void)
 {
        unregister_filesystem(&ext2_fs_type);
 }
+
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+               return 0;
+       if (sb->s_flags & MS_RDONLY)
+               return 1;
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+               return 0;
+       return 1;
+}
 MODULE_ALIAS("ext2");
 #else
 static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
 
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4811,10 +4863,24 @@ static inline void unregister_as_ext3(void)
 {
        unregister_filesystem(&ext3_fs_type);
 }
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+               return 0;
+       if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+               return 0;
+       if (sb->s_flags & MS_RDONLY)
+               return 1;
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+               return 0;
+       return 1;
+}
 MODULE_ALIAS("ext3");
 #else
 static inline void register_as_ext3(void) { }
 static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
 
 static struct file_system_type ext4_fs_type = {
@@ -4898,8 +4964,8 @@ static int __init ext4_init_fs(void)
        err = init_inodecache();
        if (err)
                goto out1;
-       register_as_ext2();
        register_as_ext3();
+       register_as_ext2();
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;
index b545ca1c459c42e2cc6aef35ce49dea3fcd2694d..c757adc972506d672c78b7de03e654ede7eb8b1e 100644 (file)
@@ -820,8 +820,8 @@ inserted:
                        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                                goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 
-                       block = ext4_new_meta_blocks(handle, inode,
-                                                 goal, NULL, &error);
+                       block = ext4_new_meta_blocks(handle, inode, goal, 0,
+                                                    NULL, &error);
                        if (error)
                                goto cleanup;
 
index 29148a81c783728928cf4ea63d46160cc5c4e85c..7f21cf3aaf92e66d892259289cae31553423312b 100644 (file)
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
                        ret = err;
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
-               commit_transaction->t_flushed_data_blocks = 1;
                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -672,12 +671,16 @@ start_journal_io:
                err = 0;
        }
 
+       write_lock(&journal->j_state_lock);
+       J_ASSERT(commit_transaction->t_state == T_COMMIT);
+       commit_transaction->t_state = T_COMMIT_DFLUSH;
+       write_unlock(&journal->j_state_lock);
        /* 
         * If the journal is not located on the file system device,
         * then we must flush the file system device before we issue
         * the commit record
         */
-       if (commit_transaction->t_flushed_data_blocks &&
+       if (commit_transaction->t_need_data_flush &&
            (journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
                blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -754,8 +757,13 @@ wait_for_iobuf:
                    required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
-               /* Wake up any transactions which were waiting for this
-                  IO to complete */
+               /*
+                * Wake up any transactions which were waiting for this IO to
+                * complete. The barrier must be here so that changes by
+                * jbd2_journal_file_buffer() take effect before wake_up_bit()
+                * does the waitqueue check.
+                */
+               smp_mb();
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
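
The new smp_mb() orders the list move done by jbd2_journal_file_buffer() before the lock-free waitqueue check inside wake_up_bit(); without it, a waiter could re-check the buffer state, observe stale data, and sleep through the wakeup. A C11-atomics sketch of the publish-then-wake discipline (atomics and fences stand in for the kernel primitives):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int state;        /* stands in for bh->b_state */
    static atomic_int wake_flag;    /* stands in for the waitqueue wakeup */

    static void *waker(void *arg)
    {
        (void)arg;
        atomic_store_explicit(&state, 1, memory_order_relaxed);
        /* Full fence: publish 'state' before the wakeup becomes
         * visible, as smp_mb() does before wake_up_bit(). */
        atomic_thread_fence(memory_order_seq_cst);
        atomic_store_explicit(&wake_flag, 1, memory_order_relaxed);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, waker, NULL);
        while (!atomic_load_explicit(&wake_flag, memory_order_relaxed))
            ;    /* the waiter's lock-free re-check */
        atomic_thread_fence(memory_order_seq_cst);
        printf("state = %d\n", atomic_load(&state));    /* always 1 */
        pthread_join(t, NULL);
        return 0;
    }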
@@ -794,6 +802,10 @@ wait_for_iobuf:
                jbd2_journal_abort(journal, err);
 
        jbd_debug(3, "JBD: commit phase 5\n");
+       write_lock(&journal->j_state_lock);
+       J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
+       commit_transaction->t_state = T_COMMIT_JFLUSH;
+       write_unlock(&journal->j_state_lock);
 
        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -949,7 +961,7 @@ restart_loop:
 
        jbd_debug(3, "JBD: commit phase 7\n");
 
-       J_ASSERT(commit_transaction->t_state == T_COMMIT);
+       J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
 
        commit_transaction->t_start = jiffies;
        stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
index e0ec3db1c395b6c338acc6f7c14af6a2c11eb067..9a78269903041934f0896148e1999e8467fd0872 100644 (file)
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
        /*
-        * Are we already doing a recent enough commit?
+        * The only transaction we can possibly wait upon is the
+        * currently running transaction (if it exists).  Otherwise,
+        * the target tid must be an old one.
         */
-       if (!tid_geq(journal->j_commit_request, target)) {
+       if (journal->j_running_transaction &&
+           journal->j_running_transaction->t_tid == target) {
                /*
                 * We want a new commit: OK, mark the request and wakeup the
                 * commit thread.  We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
                          journal->j_commit_sequence);
                wake_up(&journal->j_wait_commit);
                return 1;
-       }
+       } else if (!tid_geq(journal->j_commit_request, target))
+               /* This should never happen, but if it does, preserve
+                  the evidence before kjournald goes into a loop and
+                  increments j_commit_sequence beyond all recognition. */
+               WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+                         journal->j_commit_request,
+                         journal->j_commit_sequence,
+                         target, journal->j_running_transaction ?
+                         journal->j_running_transaction->t_tid : 0);
        return 0;
 }
 
@@ -576,6 +587,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
        return ret;
 }
 
+/*
+ * Return 1 if a given transaction has not yet sent a barrier request
+ * connected with its transaction commit. If 0 is returned, the
+ * transaction may or may not have sent the barrier. Used to avoid
+ * sending the barrier twice in common cases.
+ */
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
+{
+       int ret = 0;
+       transaction_t *commit_trans;
+
+       if (!(journal->j_flags & JBD2_BARRIER))
+               return 0;
+       read_lock(&journal->j_state_lock);
+       /* Transaction already committed? */
+       if (tid_geq(journal->j_commit_sequence, tid))
+               goto out;
+       commit_trans = journal->j_committing_transaction;
+       if (!commit_trans || commit_trans->t_tid != tid) {
+               ret = 1;
+               goto out;
+       }
+       /*
+        * The transaction is being committed; have we already proceeded
+        * to submitting a flush to the fs partition?
+        */
+       if (journal->j_fs_dev != journal->j_dev) {
+               if (!commit_trans->t_need_data_flush ||
+                   commit_trans->t_state >= T_COMMIT_DFLUSH)
+                       goto out;
+       } else {
+               if (commit_trans->t_state >= T_COMMIT_JFLUSH)
+                       goto out;
+       }
+       ret = 1;
+out:
+       read_unlock(&journal->j_state_lock);
+       return ret;
+}
+EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
+
 /*
  * Wait for a specified commit to complete.
  * The caller may not hold the journal lock.
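
A caller such as ext4_sync_file() can use jbd2_trans_will_send_data_barrier() to skip issuing its own cache flush when the commit in progress will send one anyway. Below is a user-space model of the decision table; the T_COMMIT_* states and field meanings mirror this patch, while the struct layout, the plain >= comparison (the kernel uses the wrap-safe tid_geq()), and main() are illustrative only:

    #include <stdio.h>

    enum t_state { T_RUNNING, T_LOCKED, T_FLUSH, T_COMMIT,
                   T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, T_FINISHED };

    struct txn { unsigned int tid; enum t_state state; int need_data_flush; };

    struct jrn {
        int barrier;            /* JBD2_BARRIER set? */
        int external_journal;   /* j_fs_dev != j_dev */
        unsigned int commit_sequence;
        const struct txn *committing;
    };

    /* 1 = the commit of 'tid' has not yet sent its barrier. */
    static int will_send_data_barrier(const struct jrn *j, unsigned int tid)
    {
        if (!j->barrier)
            return 0;
        if (j->commit_sequence >= tid)          /* already committed */
            return 0;
        if (!j->committing || j->committing->tid != tid)
            return 1;                           /* commit not started */
        if (j->external_journal)
            return j->committing->need_data_flush &&
                   j->committing->state < T_COMMIT_DFLUSH;
        return j->committing->state < T_COMMIT_JFLUSH;
    }

    int main(void)
    {
        struct txn t = { 7, T_COMMIT, 1 };
        struct jrn j = { 1, 0, 6, &t };

        printf("%d\n", will_send_data_barrier(&j, 7));  /* prints 1 */
        return 0;
    }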
index 05fa77a23711f8db041cac46984452451daae70c..3eec82d32fd4c6886fdf823e7d3690ca2e665162 100644 (file)
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  */
 
 /*
- * Update transiaction's maximum wait time, if debugging is enabled.
+ * Update transaction's maximum wait time, if debugging is enabled.
  *
  * In order for t_max_wait to be reliable, it must be protected by a
  * lock.  But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  * means that maximum wait time reported by the jbd2_run_stats
  * tracepoint will always be zero.
  */
-static inline void update_t_max_wait(transaction_t *transaction)
+static inline void update_t_max_wait(transaction_t *transaction,
+                                    unsigned long ts)
 {
 #ifdef CONFIG_JBD2_DEBUG
-       unsigned long ts = jiffies;
-
        if (jbd2_journal_enable_debug &&
            time_after(transaction->t_start, ts)) {
                ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
        tid_t           tid;
        int             needed, need_to_start;
        int             nblocks = handle->h_buffer_credits;
+       unsigned long ts = jiffies;
 
        if (nblocks > journal->j_max_transaction_buffers) {
                printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ repeat:
        /* OK, account for the buffers that this operation expects to
         * use and add the handle to the running transaction. 
         */
-       update_t_max_wait(transaction);
+       update_t_max_wait(transaction, ts);
        handle->h_transaction = transaction;
        atomic_inc(&transaction->t_updates);
        atomic_inc(&transaction->t_handle_count);
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
  * This function is visible to journal users (like ext3fs), so is not
  * called with the journal already locked.
  *
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
  */
 handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
 {
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
         */
        JBUFFER_TRACE(jh, "cancelling revoke");
        jbd2_journal_cancel_revoke(handle, jh);
-       jbd2_journal_put_journal_head(jh);
 out:
+       jbd2_journal_put_journal_head(jh);
        return err;
 }
 
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
            jinode->i_next_transaction == transaction)
                goto done;
 
+       /*
+        * We only ever set this variable to 1 so the test is safe. Since
+        * t_need_data_flush is likely to be set, we do the test to save some
+        * cacheline bouncing.
+        */
+       if (!transaction->t_need_data_flush)
+               transaction->t_need_data_flush = 1;
        /* On some different transaction's list - should be
         * the committing one */
        if (jinode->i_transaction) {
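
Unconditionally storing to t_need_data_flush would dirty its cacheline on every CPU that files an inode here, even though the value only ever moves from 0 to 1; reading first keeps the line in the shared state in the common case. A standalone sketch of the test-before-set idiom:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int need_data_flush;  /* only ever moves from 0 to 1 */

    static void mark_needs_flush(void)
    {
        /* Read first: if the flag is already set, skip the store so
         * the cacheline can stay shared across CPUs. */
        if (!atomic_load_explicit(&need_data_flush, memory_order_relaxed))
            atomic_store_explicit(&need_data_flush, 1,
                                  memory_order_relaxed);
    }

    int main(void)
    {
        mark_needs_flush();
        mark_needs_flush();     /* second call is read-only */
        printf("%d\n", atomic_load(&need_data_flush));  /* prints 1 */
        return 0;
    }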
index a32dcaec04e147917a3e085a5e83146189782336..4ecb7b16b278061a280d8f29bdeb7c88eae24979 100644 (file)
@@ -529,9 +529,10 @@ struct transaction_s
        enum {
                T_RUNNING,
                T_LOCKED,
-               T_RUNDOWN,
                T_FLUSH,
                T_COMMIT,
+               T_COMMIT_DFLUSH,
+               T_COMMIT_JFLUSH,
                T_FINISHED
        }                       t_state;
 
@@ -658,7 +659,9 @@ struct transaction_s
         * waiting for it to finish.
         */
        unsigned int t_synchronous_commit:1;
-       unsigned int t_flushed_data_blocks:1;
+
+       /* Disk flush needs to be sent to fs partition [no locking] */
+       int                     t_need_data_flush;
 
        /*
         * For use by the filesystem to store fs-specific data
@@ -1228,6 +1231,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_journal_force_commit_nested(journal_t *journal);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
 
 void __jbd2_log_wait_for_space(journal_t *journal);
 extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);