ext4: avoid unnecessarily writing back dirty pages before hole punching

author Li Wang <liwang@ubuntukylin.com>

Thu, 13 Jun 2013 03:25:45 +0000 (23:25 -0400)

committer Theodore Ts'o <tytso@mit.edu>

Thu, 13 Jun 2013 03:25:45 +0000 (23:25 -0400)
author Li Wang <liwang@ubuntukylin.com>
Thu, 13 Jun 2013 03:25:45 +0000 (23:25 -0400)
committer Theodore Ts'o <tytso@mit.edu>
Thu, 13 Jun 2013 03:25:45 +0000 (23:25 -0400)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 0db830d541ecc16b73a5f339a6e2335241a8c9f0..06136b5a20275a97802ec25cff8cde672acb8a65 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3466,6 +3466,16 @@ int ext4_can_truncate(struct inode *inode)
         return 0;
  }
  
+static inline int ext4_begin_ordered_punch_hole(struct inode *inode,
+                                              loff_t start, loff_t length)
+{
+       if (!EXT4_I(inode)->jinode)
+               return 0;
+       return jbd2_journal_begin_ordered_punch_hole(EXT4_JOURNAL(inode),
+                                                   EXT4_I(inode)->jinode,
+                                                   start, start+length-1);
+}
+
  /*
   * ext4_punch_hole: punches a hole in a file by releaseing the blocks
   * associated with the given offset and length
@@ -3482,7 +3492,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
         struct inode *inode = file_inode(file);
         struct super_block *sb = inode->i_sb;
         ext4_lblk_t first_block, stop_block;
-       struct address_space *mapping = inode->i_mapping;
         loff_t first_block_offset, last_block_offset;
         handle_t *handle;
         unsigned int credits;
@@ -3498,17 +3507,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
  
         trace_ext4_punch_hole(inode, offset, length);
  
-       /*
-        * Write out all dirty pages to avoid race conditions
-        * Then release them.
-        */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-               ret = filemap_write_and_wait_range(mapping, offset,
-                                                  offset + length - 1);
-               if (ret)
-                       return ret;
-       }
-
         mutex_lock(&inode->i_mutex);
         /* It's not possible punch hole on append only file */
         if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
@@ -3537,6 +3535,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
         first_block_offset = round_up(offset, sb->s_blocksize);
         last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
  
+       if (ext4_should_order_data(inode)) {
+               ret = ext4_begin_ordered_punch_hole(inode, offset, length);
+               if (ret)
+                       goto out_mutex;
+       }
+
         /* Now release the pages and zero block aligned part of pages*/
         if (last_block_offset > first_block_offset)
                 truncate_pagecache_range(inode, first_block_offset,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c

index 915dd575cd46705d692346a13c3a623dc2af2bdb..4c8b8d477933040602b64bbdaf44d0ab7c4f2aba 100644 (file)
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -97,7 +97,7 @@ EXPORT_SYMBOL(jbd2_journal_force_commit);
  EXPORT_SYMBOL(jbd2_journal_file_inode);
  EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
  EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
-EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_punch_hole);
  EXPORT_SYMBOL(jbd2_inode_cache);
  
  static void __journal_abort_soft (journal_t *journal, int errno);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c

index dd422e6804183a5539f0174ce5ac004c2cb05956..91d62e15465de0586da2e9c2752a02c592df31fc 100644 (file)
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2419,29 +2419,10 @@ done:
         return 0;
  }
  
-/*
- * File truncate and transaction commit interact with each other in a
- * non-trivial way.  If a transaction writing data block A is
- * committing, we cannot discard the data by truncate until we have
- * written them.  Otherwise if we crashed after the transaction with
- * write has committed but before the transaction with truncate has
- * committed, we could see stale data in block A.  This function is a
- * helper to solve this problem.  It starts writeout of the truncated
- * part in case it is in the committing transaction.
- *
- * Filesystem code must call this function when inode is journaled in
- * ordered mode before truncation happens and after the inode has been
- * placed on orphan list with the new inode size. The second condition
- * avoids the race that someone writes new data and we start
- * committing the transaction after this function has been called but
- * before a transaction for truncate is started (and furthermore it
- * allows us to optimize the case where the addition to orphan list
- * happens in the same transaction as write --- we don't have to write
- * any data in such case).
- */
-int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+
+int jbd2_journal_begin_ordered_punch_hole(journal_t *journal,
                                         struct jbd2_inode *jinode,
-                                       loff_t new_size)
+                                       loff_t start, loff_t end)
  {
         transaction_t *inode_trans, *commit_trans;
         int ret = 0;
@@ -2460,10 +2441,12 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
         spin_unlock(&journal->j_list_lock);
         if (inode_trans == commit_trans) {
                 ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
-                       new_size, LLONG_MAX);
+                       start, end);
                 if (ret)
                         jbd2_journal_abort(journal, ret);
         }
  out:
         return ret;
  }
+
+
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h

index 0302f3f1406386684e4c677e76afc48aaab0469f..5f3c094d5bdca8264af1d5e90ce1724bf0f4ed01 100644 (file)
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1157,11 +1157,40 @@ extern int         jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *
  extern int        jbd2_journal_force_commit(journal_t *);
  extern int        jbd2_journal_force_commit_nested(journal_t *);
  extern int        jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
-extern int        jbd2_journal_begin_ordered_truncate(journal_t *journal,
-                               struct jbd2_inode *inode, loff_t new_size);
+extern int        jbd2_journal_begin_ordered_punch_hole(journal_t *,
+                                       struct jbd2_inode *,
+                                       loff_t, loff_t);
  extern void       jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
  extern void       jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
  
+/*
+ * File truncate and transaction commit interact with each other in a
+ * non-trivial way.  If a transaction writing data block A is
+ * committing, we cannot discard the data by truncate until we have
+ * written them.  Otherwise if we crashed after the transaction with
+ * write has committed but before the transaction with truncate has
+ * committed, we could see stale data in block A.  This function is a
+ * helper to solve this problem.  It starts writeout of the truncated
+ * part in case it is in the committing transaction.
+ *
+ * Filesystem code must call this function when inode is journaled in
+ * ordered mode before truncation happens and after the inode has been
+ * placed on orphan list with the new inode size. The second condition
+ * avoids the race that someone writes new data and we start
+ * committing the transaction after this function has been called but
+ * before a transaction for truncate is started (and furthermore it
+ * allows us to optimize the case where the addition to orphan list
+ * happens in the same transaction as write --- we don't have to write
+ * any data in such case).
+ */
+static inline int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+                                       struct jbd2_inode *jinode,
+                                       loff_t new_size)
+{
+       return jbd2_journal_begin_ordered_punch_hole(journal, jinode,
+                                                 new_size, LLONG_MAX);
+}
+
  /*
   * journal_head management
   */
author	Li Wang <liwang@ubuntukylin.com>
	Thu, 13 Jun 2013 03:25:45 +0000 (23:25 -0400)
committer	Theodore Ts'o <tytso@mit.edu>
	Thu, 13 Jun 2013 03:25:45 +0000 (23:25 -0400)
fs/ext4/inode.c		patch \| blob \| history
fs/jbd2/journal.c		patch \| blob \| history
fs/jbd2/transaction.c		patch \| blob \| history
include/linux/jbd2.h		patch \| blob \| history