Merge branch 'master' into tk71

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ab38fef1c9a1a52eab7128fa6a7dad217d4ad744..59c6e4956786e36b323bbbdea49822e7539ef355 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,13 +79,14 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
        return sb->s_bdi;
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-               struct wb_writeback_work *work)
+static inline struct inode *wb_inode(struct list_head *head)
 {
-       trace_writeback_queue(bdi, work);
+       return list_entry(head, struct inode, i_wb_list);
+}
 
-       spin_lock_bh(&bdi->wb_lock);
-       list_add_tail(&work->list, &bdi->work_list);
+/* Wake up flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
+{
        if (bdi->wb.task) {
                wake_up_process(bdi->wb.task);
        } else {
@@ -93,15 +94,26 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
                 * The bdi thread isn't there, wake up the forker thread which
                 * will create and run it.
                 */
-               trace_writeback_nothread(bdi, work);
                wake_up_process(default_backing_dev_info.wb.task);
        }
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+                          struct wb_writeback_work *work)
+{
+       trace_writeback_queue(bdi, work);
+
+       spin_lock_bh(&bdi->wb_lock);
+       list_add_tail(&work->list, &bdi->work_list);
+       if (!bdi->wb.task)
+               trace_writeback_nothread(bdi, work);
+       bdi_wakeup_flusher(bdi);
        spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-               bool range_cyclic, bool for_background)
+                     bool range_cyclic)
 {
        struct wb_writeback_work *work;
 
@@ -121,7 +133,6 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
        work->sync_mode = WB_SYNC_NONE;
        work->nr_pages  = nr_pages;
        work->range_cyclic = range_cyclic;
-       work->for_background = for_background;
 
        bdi_queue_work(bdi, work);
 }
@@ -139,7 +150,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-       __bdi_start_writeback(bdi, nr_pages, true, false);
+       __bdi_start_writeback(bdi, nr_pages, true);
 }
 
 /**
@@ -147,13 +158,21 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
  * @bdi: the backing device to write from
  *
  * Description:
- *   This does WB_SYNC_NONE background writeback. The IO is only
- *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   This makes sure WB_SYNC_NONE background writeback happens. When
+ *   this function returns, it is only guaranteed that some IO is in
+ *   flight for the given BDI if we are over the background dirty
+ *   threshold.  Caller need not hold the sb s_umount semaphore.
  */
 void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-       __bdi_start_writeback(bdi, LONG_MAX, true, true);
+       /*
+        * We just wake up the flusher thread. It will perform background
+        * writeback as soon as there is no other work to do.
+        */
+       trace_writeback_wake_background(bdi);
+       spin_lock_bh(&bdi->wb_lock);
+       bdi_wakeup_flusher(bdi);
+       spin_unlock_bh(&bdi->wb_lock);
 }
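
For context: the intended user of this wakeup-only path is the page-dirtying
throttle (balance_dirty_pages() in mm/page-writeback.c). A minimal sketch of
such a caller, reusing this file's over_bground_thresh() purely for
illustration:

	/*
	 * Hypothetical caller: once we are over the background dirty
	 * threshold, just poke the flusher thread.  No wb_writeback_work
	 * is allocated; the thread notices the threshold on its own in
	 * wb_check_background_flush() below.
	 */
	static void kick_background_writeback(struct backing_dev_info *bdi)
	{
		if (over_bground_thresh())
			bdi_start_background_writeback(bdi);
	}

The point of the change is that background writeback no longer costs a
kmalloc'd work item per request: callers pay only for a wakeup, and the
flusher thread decides how much to write.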
 
 /*
@@ -172,11 +191,11 @@ static void redirty_tail(struct inode *inode)
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
 
-               tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+               tail = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
-       list_move(&inode->i_list, &wb->b_dirty);
+       list_move(&inode->i_wb_list, &wb->b_dirty);
 }
 
 /*
@@ -186,7 +205,7 @@ static void requeue_io(struct inode *inode)
 {
        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
-       list_move(&inode->i_list, &wb->b_more_io);
+       list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -227,14 +246,14 @@ static void move_expired_inodes(struct list_head *delaying_queue,
        int do_sb_sort = 0;
 
        while (!list_empty(delaying_queue)) {
-               inode = list_entry(delaying_queue->prev, struct inode, i_list);
+               inode = wb_inode(delaying_queue->prev);
                if (older_than_this &&
                    inode_dirtied_after(inode, *older_than_this))
                        break;
                if (sb && sb != inode->i_sb)
                        do_sb_sort = 1;
                sb = inode->i_sb;
-               list_move(&inode->i_list, &tmp);
+               list_move(&inode->i_wb_list, &tmp);
        }
 
        /* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +264,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 
        /* Move inodes from one superblock together */
        while (!list_empty(&tmp)) {
-               inode = list_entry(tmp.prev, struct inode, i_list);
-               sb = inode->i_sb;
+               sb = wb_inode(tmp.prev)->i_sb;
                list_for_each_prev_safe(pos, node, &tmp) {
-                       inode = list_entry(pos, struct inode, i_list);
+                       inode = wb_inode(pos);
                        if (inode->i_sb == sb)
-                               list_move(&inode->i_list, dispatch_queue);
+                               list_move(&inode->i_wb_list, dispatch_queue);
                }
        }
 }
@@ -408,16 +426,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
                         * completion.
                         */
                        redirty_tail(inode);
-               } else if (atomic_read(&inode->i_count)) {
-                       /*
-                        * The inode is clean, inuse
-                        */
-                       list_move(&inode->i_list, &inode_in_use);
                } else {
                        /*
-                        * The inode is clean, unused
+                        * The inode is clean.  At this point we either have
+                        * a reference to the inode or it's on its way out.
+                        * No need to add it back to the LRU.
                         */
-                       list_move(&inode->i_list, &inode_unused);
+                       list_del_init(&inode->i_wb_list);
                }
        }
        inode_sync_complete(inode);
@@ -465,8 +480,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 {
        while (!list_empty(&wb->b_io)) {
                long pages_skipped;
-               struct inode *inode = list_entry(wb->b_io.prev,
-                                                struct inode, i_list);
+               struct inode *inode = wb_inode(wb->b_io.prev);
 
                if (inode->i_sb != sb) {
                        if (only_this_sb) {
@@ -487,10 +501,16 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                        return 0;
                }
 
-               if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+               /*
+                * Don't bother with new inodes or inodes being freed; the
+                * first kind does not need periodic writeout yet, and for the
+                * latter kind writeout is handled by the freer.
+                */
+               if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
                }
+
                /*
                 * Was this inode dirtied after sync_sb_inodes was called?
                 * This keeps sync from extra jobs and livelock.
@@ -498,7 +518,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
                if (inode_dirtied_after(inode, wbc->wb_start))
                        return 1;
 
-               BUG_ON(inode->i_state & I_FREEING);
                __iget(inode);
                pages_skipped = wbc->pages_skipped;
                writeback_single_inode(inode, wbc);
@@ -536,8 +555,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
                queue_io(wb, wbc->older_than_this);
 
        while (!list_empty(&wb->b_io)) {
-               struct inode *inode = list_entry(wb->b_io.prev,
-                                                struct inode, i_list);
+               struct inode *inode = wb_inode(wb->b_io.prev);
                struct super_block *sb = inode->i_sb;
 
                if (!pin_sb_for_writeback(sb)) {
@@ -582,7 +600,7 @@ static inline bool over_bground_thresh(void)
        global_dirty_limits(&background_thresh, &dirty_thresh);
 
        return (global_page_state(NR_FILE_DIRTY) +
-               global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+               global_page_state(NR_UNSTABLE_NFS) > background_thresh);
 }
 
 /*
@@ -612,6 +630,7 @@ static long wb_writeback(struct bdi_writeback *wb,
        };
        unsigned long oldest_jif;
        long wrote = 0;
+       long write_chunk;
        struct inode *inode;
 
        if (wbc.for_kupdate) {
@@ -624,6 +643,24 @@ static long wb_writeback(struct bdi_writeback *wb,
                wbc.range_end = LLONG_MAX;
        }
 
+       /*
+        * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+        * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+        * here avoids calling into writeback_inodes_wb() more than once.
+        *
+        * The intended call sequence for WB_SYNC_ALL writeback is:
+        *
+        *      wb_writeback()
+        *          __writeback_inodes_sb()     <== called only once
+        *              write_cache_pages()     <== called once for each inode
+        *                   (quickly) tag currently dirty pages
+        *                   (maybe slowly) sync all tagged pages
+        */
+       if (wbc.sync_mode == WB_SYNC_NONE)
+               write_chunk = MAX_WRITEBACK_PAGES;
+       else
+               write_chunk = LONG_MAX;
+
        wbc.wb_start = jiffies; /* livelock avoidance */
        for (;;) {
                /*
@@ -632,6 +669,16 @@ static long wb_writeback(struct bdi_writeback *wb,
                if (work->nr_pages <= 0)
                        break;
 
+               /*
+                * Background writeout and kupdate-style writeback may
+                * run forever. Stop them if there is other work to do
+                * so that e.g. sync can proceed. They'll be restarted
+                * after the other work is done.
+                */
+               if ((work->for_background || work->for_kupdate) &&
+                   !list_empty(&wb->bdi->work_list))
+                       break;
+
                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
@@ -640,7 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                        break;
 
                wbc.more_io = 0;
-               wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+               wbc.nr_to_write = write_chunk;
                wbc.pages_skipped = 0;
 
                trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -650,8 +697,8 @@ static long wb_writeback(struct bdi_writeback *wb,
                        writeback_inodes_wb(wb, &wbc);
                trace_wbc_writeback_written(&wbc, wb->bdi);
 
-               work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-               wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+               work->nr_pages -= write_chunk - wbc.nr_to_write;
+               wrote += write_chunk - wbc.nr_to_write;
 
                /*
                 * If we consumed everything, see if we have more
@@ -666,7 +713,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                /*
                 * Did we write something? Try for more
                 */
-               if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+               if (wbc.nr_to_write < write_chunk)
                        continue;
                /*
                 * Nothing written. Wait for some inode to
@@ -675,8 +722,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                 */
                spin_lock(&inode_lock);
                if (!list_empty(&wb->b_more_io))  {
-                       inode = list_entry(wb->b_more_io.prev,
-                                               struct inode, i_list);
+                       inode = wb_inode(wb->b_more_io.prev);
                        trace_wbc_writeback_wait(&wbc, wb->bdi);
                        inode_wait_for_writeback(inode);
                }
@@ -704,6 +750,34 @@ get_next_work_item(struct backing_dev_info *bdi)
        return work;
 }
 
+/*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
+static unsigned long get_nr_dirty_pages(void)
+{
+       return global_page_state(NR_FILE_DIRTY) +
+               global_page_state(NR_UNSTABLE_NFS) +
+               get_nr_dirty_inodes();
+}
+
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+       if (over_bground_thresh()) {
+
+               struct wb_writeback_work work = {
+                       .nr_pages       = LONG_MAX,
+                       .sync_mode      = WB_SYNC_NONE,
+                       .for_background = 1,
+                       .range_cyclic   = 1,
+               };
+
+               return wb_writeback(wb, &work);
+       }
+
+       return 0;
+}
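
Together with the new call in wb_do_writeback() below, the flusher thread's
per-wakeup order becomes: drain the explicit work_list first, then the
periodic kupdate flush, then this background flush. Condensed from the
surrounding code in this file:

	/* Condensed sketch of the resulting wb_do_writeback() flow. */
	while ((work = get_next_work_item(bdi)) != NULL)
		wrote += wb_writeback(wb, work);	/* queued requests */
	wrote += wb_check_old_data_flush(wb);		/* kupdate-style */
	wrote += wb_check_background_flush(wb);		/* background, last */

Because wb_writeback() now also bails out of background/kupdate work whenever
the work_list is non-empty (see the hunk above), a queued sync request can no
longer be starved behind an open-ended background flush.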
+
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
        unsigned long expired;
@@ -721,9 +795,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
                return 0;
 
        wb->last_old_flush = jiffies;
-       nr_pages = global_page_state(NR_FILE_DIRTY) +
-                       global_page_state(NR_UNSTABLE_NFS) +
-                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+       nr_pages = get_nr_dirty_pages();
 
        if (nr_pages) {
                struct wb_writeback_work work = {
@@ -775,6 +847,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
         * Check for periodic writeback, kupdated() style
         */
        wrote += wb_check_old_data_flush(wb);
+       wrote += wb_check_background_flush(wb);
        clear_bit(BDI_writeback_running, &wb->bdi->state);
 
        return wrote;
@@ -790,7 +863,7 @@ int bdi_writeback_thread(void *data)
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
 
-       current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+       current->flags |= PF_SWAPWRITE;
        set_freezable();
        wb->last_active = jiffies;
 
@@ -861,7 +934,7 @@ void wakeup_flusher_threads(long nr_pages)
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                if (!bdi_has_dirty_io(bdi))
                        continue;
-               __bdi_start_writeback(bdi, nr_pages, false, false);
+               __bdi_start_writeback(bdi, nr_pages, false);
        }
        rcu_read_unlock();
 }
@@ -962,7 +1035,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
-                       if (hlist_unhashed(&inode->i_hash))
+                       if (inode_unhashed(inode))
                                goto out;
                }
                if (inode->i_state & I_FREEING)
@@ -990,7 +1063,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                        }
 
                        inode->dirtied_when = jiffies;
-                       list_move(&inode->i_list, &bdi->wb.b_dirty);
+                       list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
                }
        }
 out:
@@ -1069,33 +1142,42 @@ static void wait_sb_inodes(struct super_block *sb)
 }
 
 /**
- * writeback_inodes_sb -       writeback dirty inodes from given super_block
+ * writeback_inodes_sb_nr -    writeback dirty inodes from given super_block
  * @sb: the superblock
+ * @nr: the number of pages to write
  *
  * Start writeback on some inodes on this super_block. No guarantees are made
  * on how many (if any) will be written, and this function does not wait
- * for IO completion of submitted IO. The number of pages submitted is
- * returned.
+ * for IO completion of submitted IO.
  */
-void writeback_inodes_sb(struct super_block *sb)
+void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
 {
-       unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-       unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
        DECLARE_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_NONE,
                .done           = &done,
+               .nr_pages       = nr,
        };
 
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
-
-       work.nr_pages = nr_dirty + nr_unstable +
-                       (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
        bdi_queue_work(sb->s_bdi, &work);
        wait_for_completion(&done);
 }
+EXPORT_SYMBOL(writeback_inodes_sb_nr);
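
Callers must already hold s_umount, as the WARN_ON above enforces. A
hypothetical user that wants only a bounded flush of one filesystem (the
function name and page count are invented for illustration):

	/* Hypothetical: write back roughly 1024 pages of this sb's inodes. */
	static void example_bounded_flush(struct super_block *sb)
	{
		down_read(&sb->s_umount);	/* writeback_inodes_sb_nr() requires it */
		writeback_inodes_sb_nr(sb, 1024);
		up_read(&sb->s_umount);
	}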
+
+/**
+ * writeback_inodes_sb -       writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+       return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
+}
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
@@ -1117,12 +1199,33 @@ int writeback_inodes_sb_if_idle(struct super_block *sb)
 }
 EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
 
+/**
+ * writeback_inodes_sb_nr_if_idle -    start writeback if none underway
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ *
+ * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
+                                  unsigned long nr)
+{
+       if (!writeback_in_progress(sb->s_bdi)) {
+               down_read(&sb->s_umount);
+               writeback_inodes_sb_nr(sb, nr);
+               up_read(&sb->s_umount);
+               return 1;
+       } else
+               return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
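
Unlike writeback_inodes_sb_nr() itself, this wrapper takes s_umount on its
own, so a filesystem can call it from contexts that do not already hold the
semaphore, e.g. to push out delayed allocations under space pressure. A
hedged sketch (the page count is arbitrary):

	/* Hypothetical ENOSPC-pressure path. */
	static int example_flush_if_idle(struct super_block *sb)
	{
		/* 1 if writeback was started, 0 if one was already running. */
		return writeback_inodes_sb_nr_if_idle(sb, 256);
	}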
+
 /**
  * sync_inodes_sb      -       sync sb inode pages
  * @sb: the superblock
  *
  * This function writes and waits on any dirty inode belonging to this
- * super_block. The number of pages synced is returned.
+ * super_block.
  */
 void sync_inodes_sb(struct super_block *sb)
 {
@@ -1198,3 +1301,23 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
        return ret;
 }
 EXPORT_SYMBOL(sync_inode);
+
+/**
+ * sync_inode_metadata - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust its dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+       struct writeback_control wbc = {
+               .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+               .nr_to_write = 0, /* metadata-only */
+       };
+
+       return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
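
sync_inode_metadata() is the natural building block for fsync()
implementations that have already written the data pages and only need the
inode itself on disk. A hedged sketch of such a use, modelled on
generic_file_fsync() and simplified:

	/* Hypothetical fsync tail for a simple filesystem. */
	static int example_fsync(struct file *file, int datasync)
	{
		struct inode *inode = file->f_mapping->host;

		if (!(inode->i_state & I_DIRTY))
			return 0;
		if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
			return 0;

		/* Data was written by the caller; flush just the inode. */
		return sync_inode_metadata(inode, 1);
	}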