]> git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
writeback: do not sync data dirtied after sync start
authorJan Kara <jack@suse.cz>
Tue, 5 Nov 2013 05:55:44 +0000 (16:55 +1100)
committerStephen Rothwell <sfr@canb.auug.org.au>
Tue, 5 Nov 2013 05:55:44 +0000 (16:55 +1100)
When there are processes heavily creating small files while sync(2) is
running, it can easily happen that quite some new files are created
between WB_SYNC_NONE and WB_SYNC_ALL pass of sync(2).  That can happen
especially if there are several busy filesystems (remember that sync
traverses filesystems sequentially and waits in WB_SYNC_ALL phase on one
fs before starting it on another fs).  Because WB_SYNC_ALL pass is slow
(e.g.  causes a transaction commit and cache flush for each inode in
ext3), resulting sync(2) times are rather large.

The following script reproduces the problem:

function run_writers
{
  for (( i = 0; i < 10; i++ )); do
    mkdir $1/dir$i
    for (( j = 0; j < 40000; j++ )); do
      dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null
    done &
  done
}

for dir in "$@"; do
  run_writers $dir
done

sleep 40
time sync
======

Fix the problem by disregarding inodes dirtied after sync(2) was called in
the WB_SYNC_ALL pass.  To allow for this, sync_inodes_sb() now takes a
time stamp when sync has started which is used for setting up work for
flusher threads.

To give some numbers, when above script is run on two ext4 filesystems on
simple SATA drive, the average sync time from 10 runs is 267.549 seconds
with standard deviation 104.799426.  With the patched kernel, the average
sync time from 10 runs is 2.995 seconds with standard deviation 0.096.

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/fs-writeback.c
fs/sync.c
fs/xfs/xfs_super.c
include/linux/writeback.h
include/trace/events/writeback.h

index 9f4935b8f2087eb44475bf7fb8aab20f44889e86..70837dadad72f83be73cb96ef8e010fc643f7ee6 100644 (file)
@@ -39,7 +39,7 @@
 struct wb_writeback_work {
        long nr_pages;
        struct super_block *sb;
-       unsigned long *older_than_this;
+       unsigned long older_than_this;
        enum writeback_sync_modes sync_mode;
        unsigned int tagged_writepages:1;
        unsigned int for_kupdate:1;
@@ -248,8 +248,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 
        while (!list_empty(delaying_queue)) {
                inode = wb_inode(delaying_queue->prev);
-               if (work->older_than_this &&
-                   inode_dirtied_after(inode, *work->older_than_this))
+               if (inode_dirtied_after(inode, work->older_than_this))
                        break;
                list_move(&inode->i_wb_list, &tmp);
                moved++;
@@ -791,12 +790,11 @@ static long wb_writeback(struct bdi_writeback *wb,
 {
        unsigned long wb_start = jiffies;
        long nr_pages = work->nr_pages;
-       unsigned long oldest_jif;
        struct inode *inode;
        long progress;
 
-       oldest_jif = jiffies;
-       work->older_than_this = &oldest_jif;
+       if (!work->older_than_this)
+               work->older_than_this = jiffies;
 
        spin_lock(&wb->list_lock);
        for (;;) {
@@ -830,10 +828,10 @@ static long wb_writeback(struct bdi_writeback *wb,
                 * safe.
                 */
                if (work->for_kupdate) {
-                       oldest_jif = jiffies -
+                       work->older_than_this = jiffies -
                                msecs_to_jiffies(dirty_expire_interval * 10);
                } else if (work->for_background)
-                       oldest_jif = jiffies;
+                       work->older_than_this = jiffies;
 
                trace_writeback_start(wb->bdi, work);
                if (list_empty(&wb->b_io))
@@ -1350,13 +1348,14 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  * This function writes and waits on any dirty inode belonging to this
  * super_block.
  */
-void sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this)
 {
        DECLARE_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_ALL,
                .nr_pages       = LONG_MAX,
+               .older_than_this = older_than_this,
                .range_cyclic   = 0,
                .done           = &done,
                .reason         = WB_REASON_SYNC,
index 905f3f6b3d8567032b9d5ad6044a184cb6ac3151..ff96f99fef645c38f34bd985b1a2be2184415774 100644 (file)
--- a/fs/sync.c
+++ b/fs/sync.c
  * wait == 1 case since in that case write_inode() functions do
  * sync_dirty_buffer() and thus effectively write one block at a time.
  */
-static int __sync_filesystem(struct super_block *sb, int wait)
+static int __sync_filesystem(struct super_block *sb, int wait,
+                            unsigned long start)
 {
        if (wait)
-               sync_inodes_sb(sb);
+               sync_inodes_sb(sb, start);
        else
                writeback_inodes_sb(sb, WB_REASON_SYNC);
 
@@ -47,6 +48,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 int sync_filesystem(struct super_block *sb)
 {
        int ret;
+       unsigned long start = jiffies;
 
        /*
         * We need to be protected against the filesystem going from
@@ -60,17 +62,17 @@ int sync_filesystem(struct super_block *sb)
        if (sb->s_flags & MS_RDONLY)
                return 0;
 
-       ret = __sync_filesystem(sb, 0);
+       ret = __sync_filesystem(sb, 0, start);
        if (ret < 0)
                return ret;
-       return __sync_filesystem(sb, 1);
+       return __sync_filesystem(sb, 1, start);
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
        if (!(sb->s_flags & MS_RDONLY))
-               sync_inodes_sb(sb);
+               sync_inodes_sb(sb, *((unsigned long *)arg));
 }
 
 static void sync_fs_one_sb(struct super_block *sb, void *arg)
@@ -102,9 +104,10 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 SYSCALL_DEFINE0(sync)
 {
        int nowait = 0, wait = 1;
+       unsigned long start = jiffies;
 
        wakeup_flusher_threads(0, WB_REASON_SYNC);
-       iterate_supers(sync_inodes_one_sb, NULL);
+       iterate_supers(sync_inodes_one_sb, &start);
        iterate_supers(sync_fs_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &wait);
        iterate_bdevs(fdatawrite_one_bdev, NULL);
index 15188cc9944919e275e43e4978056efa91801343..8968f5036fa17a5cda24c1bfdd9d7c3ed1957ca1 100644 (file)
@@ -918,7 +918,7 @@ xfs_flush_inodes(
        struct super_block      *sb = mp->m_super;
 
        if (down_read_trylock(&sb->s_umount)) {
-               sync_inodes_sb(sb);
+               sync_inodes_sb(sb, jiffies);
                up_read(&sb->s_umount);
        }
 }
index 021b8a319b9e2cf7f0f60a5f6fedcfe3367f8016..fc0e4320aa6d50d0e5f966e3f8b649506d461ac7 100644 (file)
@@ -97,7 +97,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
                                  enum wb_reason reason);
-void sync_inodes_sb(struct super_block *);
+void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
 
index 464ea82e10dbf1f1a519b75c52f9e129a5a715e0..c7bbbe794e65cdd0a41c7235bcee6e9718970876 100644 (file)
@@ -287,11 +287,11 @@ TRACE_EVENT(writeback_queue_io,
                __field(int,            reason)
        ),
        TP_fast_assign(
-               unsigned long *older_than_this = work->older_than_this;
+               unsigned long older_than_this = work->older_than_this;
                strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
-               __entry->older  = older_than_this ?  *older_than_this : 0;
+               __entry->older  = older_than_this;
                __entry->age    = older_than_this ?
-                                 (jiffies - *older_than_this) * 1000 / HZ : -1;
+                                 (jiffies - older_than_this) * 1000 / HZ : -1;
                __entry->moved  = moved;
                __entry->reason = work->reason;
        ),