Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
fs/xfs/linux-2.6/xfs_buf.c
index 6f3ebb634b8bc71089fbcb4dc69fea0e658d185a..6f76ba85f193e724e8bba96d5c8cc07e40fb1c50 100644
@@ -33,6 +33,7 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
@@ -1072,22 +1073,30 @@ xfs_buf_ioerror(
 }
 
 int
-xfs_bawrite(
-       void                    *mp,
+xfs_bwrite(
+       struct xfs_mount        *mp,
        struct xfs_buf          *bp)
 {
-       trace_xfs_buf_bawrite(bp, _RET_IP_);
+       int                     iowait = (bp->b_flags & XBF_ASYNC) == 0;
+       int                     error = 0;
 
-       ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+       bp->b_strat = xfs_bdstrat_cb;
+       bp->b_mount = mp;
+       bp->b_flags |= XBF_WRITE;
+       if (!iowait)
+               bp->b_flags |= _XBF_RUN_QUEUES;
 
        xfs_buf_delwri_dequeue(bp);
+       xfs_buf_iostrategy(bp);
 
-       bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
-       bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
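+       /* synchronous write: wait for completion and release the buffer */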
+       if (iowait) {
+               error = xfs_buf_iowait(bp);
+               if (error)
+                       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+               xfs_buf_relse(bp);
+       }
 
-       bp->b_mount = mp;
-       bp->b_strat = xfs_bdstrat_cb;
-       return xfs_bdstrat_cb(bp);
+       return error;
 }
 
 void
@@ -1106,6 +1115,126 @@ xfs_bdwrite(
        xfs_buf_delwri_queue(bp, 1);
 }
 
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+       xfs_buf_t *bp)
+{
+#ifdef XFSERRORDEBUG
+       ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+       /*
+        * No need to wait until the buffer is unpinned, we aren't flushing it.
+        */
+       XFS_BUF_ERROR(bp, EIO);
+
+       /*
+        * We're calling biodone, so delete the XBF_DONE flag.
+        */
+       XFS_BUF_UNREAD(bp);
+       XFS_BUF_UNDELAYWRITE(bp);
+       XFS_BUF_UNDONE(bp);
+       XFS_BUF_STALE(bp);
+
+       XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+       xfs_biodone(bp);
+
+       return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+       struct xfs_buf  *bp)
+{
+       int64_t         fl = XFS_BUF_BFLAGS(bp);
+       /*
+        * No need to wait until the buffer is unpinned.
+        * We aren't flushing it.
+        *
+        * chunkhold expects B_DONE to be set, whether
+        * we actually finish the I/O or not. We don't want to
+        * change that interface.
+        */
+       XFS_BUF_UNREAD(bp);
+       XFS_BUF_UNDELAYWRITE(bp);
+       XFS_BUF_DONE(bp);
+       XFS_BUF_STALE(bp);
+       XFS_BUF_CLR_IODONE_FUNC(bp);
+       XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+       if (!(fl & XBF_ASYNC)) {
+               /*
+                * Mark b_error and B_ERROR _both_.
+                * Lots of chunkcache code assumes that.
+                * There's no reason to mark error for
+                * ASYNC buffers.
+                */
+               XFS_BUF_ERROR(bp, EIO);
+               XFS_BUF_FINISH_IOWAIT(bp);
+       } else {
+               xfs_buf_relse(bp);
+       }
+
+       return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer that has been prematurely
+ * unpinned by a forced shutdown of the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+       struct xfs_buf  *bp)
+{
+       if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+               trace_xfs_bdstrat_shut(bp, _RET_IP_);
+               /*
+                * Metadata write that didn't get logged but
+                * was written delayed anyway. These aren't associated
+                * with a transaction, and can be ignored.
+                */
+               if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+                       return xfs_bioerror_relse(bp);
+               else
+                       return xfs_bioerror(bp);
+       }
+
+       xfs_buf_iorequest(bp);
+       return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem.  Typically user data goes through this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp)
+{
+       if (XFS_FORCED_SHUTDOWN(mp)) {
+               trace_xfs_bdstrat_shut(bp, _RET_IP_);
+               xfs_bioerror_relse(bp);
+               return;
+       }
+
+       xfs_buf_iorequest(bp);
+}
+
 STATIC void
 _xfs_buf_ioend(
        xfs_buf_t               *bp,
@@ -1324,7 +1453,7 @@ xfs_buf_iomove(
        xfs_buf_t               *bp,    /* buffer to process            */
        size_t                  boff,   /* starting buffer offset       */
        size_t                  bsize,  /* length to copy               */
-       caddr_t                 data,   /* data address                 */
+       void                    *data,  /* data address                 */
        xfs_buf_rw_t            mode)   /* read/write/zero flag         */
 {
        size_t                  bend, cpoff, csize;
@@ -1406,8 +1535,8 @@ xfs_alloc_bufhash(
 
        btp->bt_hashshift = external ? 3 : 8;   /* 8 or 256 buckets */
        btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
-       btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
-                                       sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
+       btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
+                                        sizeof(xfs_bufhash_t));
        for (i = 0; i < (1 << btp->bt_hashshift); i++) {
                spin_lock_init(&btp->bt_hash[i].bh_lock);
                INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1418,7 +1547,7 @@ STATIC void
 xfs_free_bufhash(
        xfs_buftarg_t           *btp)
 {
-       kmem_free(btp->bt_hash);
+       kmem_free_large(btp->bt_hash);
        btp->bt_hash = NULL;
 }
 
@@ -1623,6 +1752,11 @@ xfs_buf_delwri_queue(
                list_del(&bp->b_list);
        }
 
+       if (list_empty(dwq)) {
+               /* start xfsbufd as it is about to have something to do */
+               wake_up_process(bp->b_target->bt_task);
+       }
+
        bp->b_flags |= _XBF_DELWRI_Q;
        list_add_tail(&bp->b_list, dwq);
        bp->b_queuetime = jiffies;
@@ -1654,6 +1788,35 @@ xfs_buf_delwri_dequeue(
        trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
 }
 
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time
+ * xfsbufd sees the buffer, it is guaranteed to be considered old enough
+ * to flush.
+ */
+void
+xfs_buf_delwri_promote(
+       struct xfs_buf  *bp)
+{
+       struct xfs_buftarg *btp = bp->b_target;
+       long            age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+       ASSERT(bp->b_flags & XBF_DELWRI);
+       ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+       /*
+        * Check the buffer age before locking the delayed write queue as we
+        * don't need to promote buffers that are already past the flush age.
+        */
+       if (bp->b_queuetime < jiffies - age)
+               return;
+       bp->b_queuetime = jiffies - age;
+       spin_lock(&btp->bt_delwrite_lock);
+       list_move(&bp->b_list, &btp->bt_delwrite_queue);
+       spin_unlock(&btp->bt_delwrite_lock);
+}
+
 STATIC void
 xfs_buf_runall_queues(
        struct workqueue_struct *queue)
@@ -1672,6 +1835,8 @@ xfsbufd_wakeup(
        list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
                if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
                        continue;
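+               /* nothing queued for this target, so no need to wake its xfsbufd */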
+               if (list_empty(&btp->bt_delwrite_queue))
+                       continue;
                set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
                wake_up_process(btp->bt_task);
        }
@@ -1722,20 +1887,53 @@ xfs_buf_delwri_split(
 
 }
 
+/*
+ * The compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values.
+ */
+static int
+xfs_buf_cmp(
+       void            *priv,
+       struct list_head *a,
+       struct list_head *b)
+{
+       struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
+       struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
+       xfs_daddr_t             diff;
+
+       diff = ap->b_bn - bp->b_bn;
+       if (diff < 0)
+               return -1;
+       if (diff > 0)
+               return 1;
+       return 0;
+}
+
+void
+xfs_buf_delwri_sort(
+       xfs_buftarg_t   *target,
+       struct list_head *list)
+{
+       list_sort(NULL, list, xfs_buf_cmp);
+}
+
 STATIC int
 xfsbufd(
        void            *data)
 {
-       struct list_head tmp;
-       xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
-       int             count;
-       xfs_buf_t       *bp;
+       xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
 
        current->flags |= PF_MEMALLOC;
 
        set_freezable();
 
        do {
+               long    age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+               long    tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+               int     count = 0;
+               struct list_head tmp;
+
                if (unlikely(freezing(current))) {
                        set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
                        refrigerator();
@@ -1743,17 +1941,16 @@ xfsbufd(
                        clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
                }
 
-               schedule_timeout_interruptible(
-                       xfs_buf_timer_centisecs * msecs_to_jiffies(10));
+               /* sleep for a long time if there is nothing to do. */
+               if (list_empty(&target->bt_delwrite_queue))
+                       tout = MAX_SCHEDULE_TIMEOUT;
+               schedule_timeout_interruptible(tout);
 
-               xfs_buf_delwri_split(target, &tmp,
-                               xfs_buf_age_centisecs * msecs_to_jiffies(10));
-
-               count = 0;
+               xfs_buf_delwri_split(target, &tmp, age);
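+               /* sort by disk address so writes are issued in block order */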
+               list_sort(NULL, &tmp, xfs_buf_cmp);
                while (!list_empty(&tmp)) {
-                       bp = list_entry(tmp.next, xfs_buf_t, b_list);
-                       ASSERT(target == bp->b_target);
-
+                       struct xfs_buf *bp;
+                       bp = list_first_entry(&tmp, struct xfs_buf, b_list);
                        list_del_init(&bp->b_list);
                        xfs_buf_iostrategy(bp);
                        count++;
@@ -1779,42 +1976,45 @@ xfs_flush_buftarg(
        xfs_buftarg_t   *target,
        int             wait)
 {
-       struct list_head tmp;
-       xfs_buf_t       *bp, *n;
+       xfs_buf_t       *bp;
        int             pincount = 0;
+       LIST_HEAD(tmp_list);
+       LIST_HEAD(wait_list);
 
        xfs_buf_runall_queues(xfsconvertd_workqueue);
        xfs_buf_runall_queues(xfsdatad_workqueue);
        xfs_buf_runall_queues(xfslogd_workqueue);
 
        set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-       pincount = xfs_buf_delwri_split(target, &tmp, 0);
+       pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
 
        /*
-        * Dropped the delayed write list lock, now walk the temporary list
+        * Dropped the delayed write list lock, now walk the temporary list.
+        * All I/O is issued asynchronously, and if we need to wait for
+        * completion we do that after all the I/O has been issued.
         */
-       list_for_each_entry_safe(bp, n, &tmp, b_list) {
+       list_sort(NULL, &tmp_list, xfs_buf_cmp);
+       while (!list_empty(&tmp_list)) {
+               bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
                ASSERT(target == bp->b_target);
-               if (wait)
+               list_del_init(&bp->b_list);
+               if (wait) {
                        bp->b_flags &= ~XBF_ASYNC;
-               else
-                       list_del_init(&bp->b_list);
-
+                       list_add(&bp->b_list, &wait_list);
+               }
                xfs_buf_iostrategy(bp);
        }
 
-       if (wait)
+       if (wait) {
+               /* Expedite and wait for IO to complete. */
                blk_run_address_space(target->bt_mapping);
+               while (!list_empty(&wait_list)) {
+                       bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
-       /*
-        * Remaining list items must be flushed before returning
-        */
-       while (!list_empty(&tmp)) {
-               bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
-               list_del_init(&bp->b_list);
-               xfs_iowait(bp);
-               xfs_buf_relse(bp);
+                       list_del_init(&bp->b_list);
+                       xfs_iowait(bp);
+                       xfs_buf_relse(bp);
+               }
        }
 
        return pincount;