xfs: remove i_iocount
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7f7b42469ea7c9653903d0ffbe00769d40a168eb..ee63c4fb3639f0ae59c733c7a556dfbeef370a8f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -149,10 +149,6 @@ xfs_file_fsync(
 
        xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
-       xfs_ilock(ip, XFS_IOLOCK_SHARED);
-       xfs_ioend_wait(ip);
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
        if (mp->m_flags & XFS_MOUNT_BARRIER) {
                /*
                 * If we have an RT and/or log subvolume we need to make sure
@@ -317,7 +313,19 @@ xfs_file_aio_read(
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
-       if (unlikely(ioflags & IO_ISDIRECT)) {
+       /*
+        * Locking is a bit tricky here. If we take an exclusive lock
+        * for direct IO, we effectively serialise all new concurrent
+        * read IO to this file and block it behind IO that is currently in
+        * progress because IO in progress holds the IO lock shared. We only
+        * need to hold the lock exclusive to blow away the page cache, so
+        * only take the lock exclusively if the page cache needs invalidation.
+        * This allows the normal direct IO case of no page cache pages to
+        * proceed concurrently without serialisation.
+        */
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+               xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
                if (inode->i_mapping->nrpages) {
@@ -330,8 +338,7 @@ xfs_file_aio_read(
                        }
                }
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-       } else
-               xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       }
 
        trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
 
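
The hunk above replaces unconditional exclusive locking for direct IO reads
with an optimistic scheme. In outline (a condensed sketch of the patched
xfs_file_aio_read() using only the helpers visible in this diff; the
page-cache flush body and error handling are elided):

	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);	/* optimistic: shared */
	if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
		/*
		 * Invalidating the page cache requires the exclusive
		 * lock, and the iolock cannot be upgraded in place:
		 * drop shared, retake exclusive.
		 */
		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
		if (inode->i_mapping->nrpages) {
			/* recheck under the exclusive lock, then flush
			 * and invalidate the cached pages (elided) */
		}
		/* back to shared for the read itself */
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	}

The recheck of i_mapping->nrpages after relocking is what makes the
optimistic path safe: another task may have emptied the page cache while
no lock was held, in which case the flush can be skipped entirely.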
@@ -407,11 +414,13 @@ xfs_aio_write_isize_update(
  */
 STATIC void
 xfs_aio_write_newsize_update(
-       struct xfs_inode        *ip)
+       struct xfs_inode        *ip,
+       xfs_fsize_t             new_size)
 {
-       if (ip->i_new_size) {
+       if (new_size == ip->i_new_size) {
                xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-               ip->i_new_size = 0;
+               if (new_size == ip->i_new_size)
+                       ip->i_new_size = 0;
                if (ip->i_d.di_size > ip->i_size)
                        ip->i_d.di_size = ip->i_size;
                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
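
The rewritten xfs_aio_write_newsize_update() is a double-checked update:
the unlocked comparison keeps the common case cheap, and the recheck under
XFS_ILOCK_EXCL makes the clear safe against concurrent writers. A worked
interleaving (hypothetical sizes, two writers A and B extending the same
inode) shows why the caller's new_size is now passed in:

	/*
	 * A: write_checks sets i_new_size = 4096  (new_size_A = 4096)
	 * B: write_checks sets i_new_size = 8192  (new_size_B = 8192)
	 * A completes: new_size_A != i_new_size -> A leaves it alone
	 * B completes: new_size_B == i_new_size -> B clears it
	 *
	 * The old code cleared i_new_size whenever it was non-zero, so
	 * A's completion would have zeroed it while B's larger write
	 * was still in flight.
	 */

The second comparison under the ilock closes the remaining window where a
racing writer installs a larger i_new_size between the unlocked check and
the lock acquisition.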
@@ -462,7 +471,7 @@ xfs_file_splice_write(
        ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
 
        xfs_aio_write_isize_update(inode, ppos, ret);
-       xfs_aio_write_newsize_update(ip);
+       xfs_aio_write_newsize_update(ip, new_size);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return ret;
 }
@@ -659,6 +668,7 @@ xfs_file_aio_write_checks(
        struct file             *file,
        loff_t                  *pos,
        size_t                  *count,
+       xfs_fsize_t             *new_sizep,
        int                     *iolock)
 {
        struct inode            *inode = file->f_mapping->host;
@@ -666,6 +676,8 @@ xfs_file_aio_write_checks(
        xfs_fsize_t             new_size;
        int                     error = 0;
 
+       *new_sizep = 0;
+restart:
        error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
        if (error) {
                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
@@ -673,20 +685,41 @@ xfs_file_aio_write_checks(
                return error;
        }
 
-       new_size = *pos + *count;
-       if (new_size > ip->i_size)
-               ip->i_new_size = new_size;
-
        if (likely(!(file->f_mode & FMODE_NOCMTIME)))
                file_update_time(file);
 
        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
-        * write.
+        * write. There is no need to issue zeroing if another in-flight IO ends
+        * at or before this one. If zeroing is needed and we are currently
+        * holding the iolock shared, we need to update it to exclusive which
+        * involves dropping all locks and relocking to maintain correct locking
+        * order. If we do this, restart the function to ensure all checks and
+        * values are still valid.
         */
-       if (*pos > ip->i_size)
+       if ((ip->i_new_size && *pos > ip->i_new_size) ||
+           (!ip->i_new_size && *pos > ip->i_size)) {
+               if (*iolock == XFS_IOLOCK_SHARED) {
+                       xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+                       *iolock = XFS_IOLOCK_EXCL;
+                       xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+                       goto restart;
+               }
                error = -xfs_zero_eof(ip, *pos, ip->i_size);
+       }
+
+       /*
+        * If this IO extends beyond EOF, we may need to update ip->i_new_size.
+        * We have already zeroed space beyond EOF (if necessary).  Only update
+        * ip->i_new_size if this IO ends beyond any other in-flight writes.
+        */
+       new_size = *pos + *count;
+       if (new_size > ip->i_size) {
+               if (new_size > ip->i_new_size)
+                       ip->i_new_size = new_size;
+               *new_sizep = new_size;
+       }
 
        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
        if (error)
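
Upgrading XFS_IOLOCK_SHARED to XFS_IOLOCK_EXCL cannot be done atomically,
so the new code drops every lock, retakes both in the stronger mode, and
jumps back to restart: so that generic_write_checks() and the EOF tests
are re-run against state that may have changed while nothing was held.
The pattern, condensed from the hunk above (a sketch, error paths elided):

	restart:
		error = generic_write_checks(file, pos, count, ...);
		...
		if ((ip->i_new_size && *pos > ip->i_new_size) ||
		    (!ip->i_new_size && *pos > ip->i_size)) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				/* can't upgrade in place: drop, relock, redo */
				xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
				goto restart;
			}
			error = -xfs_zero_eof(ip, *pos, ip->i_size);
		}

Note the zeroing condition itself: when another in-flight write already
extends the file to or past this write's offset (ip->i_new_size covers
*pos), zeroing is skipped, since that writer is responsible for the range.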
@@ -721,7 +754,7 @@ xfs_file_aio_write_checks(
  * the dio layer.  To avoid the problem with aio, we also need to wait for
  * outstanding IOs to complete so that unwritten extent conversion is completed
  * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ * hitting it with a big hammer (i.e. inode_dio_wait()).
  *
  * Returns with locks held indicated by @iolock and errors indicated by
  * negative return values.
@@ -733,6 +766,7 @@ xfs_file_dio_aio_write(
        unsigned long           nr_segs,
        loff_t                  pos,
        size_t                  ocount,
+       xfs_fsize_t             *new_size,
        int                     *iolock)
 {
        struct file             *file = iocb->ki_filp;
@@ -753,13 +787,20 @@ xfs_file_dio_aio_write(
        if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
                unaligned_io = 1;
 
-       if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+       /*
+        * We don't need to take an exclusive lock unless the page cache needs
+        * to be invalidated or unaligned IO is being executed. We don't need to
+        * consider the EOF extension case here because
+        * xfs_file_aio_write_checks() will relock the inode as necessary for
+        * EOF zeroing cases and fill out the new inode size as appropriate.
+        */
+       if (unaligned_io || mapping->nrpages)
                *iolock = XFS_IOLOCK_EXCL;
        else
                *iolock = XFS_IOLOCK_SHARED;
        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-       ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+       ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
        if (ret)
                return ret;
 
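
The unaligned_io test works because mp->m_blockmask is the filesystem
block size minus one, so (x & mp->m_blockmask) is non-zero exactly when x
is not block-aligned. A standalone userspace illustration (assuming a
4096-byte block size; the table of IOs is made up for the demo):

	#include <stdio.h>

	int main(void)
	{
		unsigned long blockmask = 4096 - 1;	/* like mp->m_blockmask */
		struct { long pos, count; } io[] = {
			{ 0,    4096 },	/* both ends block-aligned */
			{ 4096, 4096 },	/* both ends block-aligned */
			{ 4096, 512  },	/* sub-block end -> unaligned */
			{ 100,  4096 },	/* sub-block start -> unaligned */
		};

		for (int i = 0; i < 4; i++) {
			int unaligned = (io[i].pos & blockmask) ||
					((io[i].pos + io[i].count) & blockmask);
			printf("pos=%ld count=%ld -> %s\n",
			       io[i].pos, io[i].count,
			       unaligned ? "unaligned: exclusive iolock"
					 : "aligned: shared iolock ok");
		}
		return 0;
	}

Only the sub-block cases force XFS_IOLOCK_EXCL here; with this patch an
aligned direct write into a clean mapping that does not need EOF zeroing
stays on the shared lock, which is what allows concurrent direct writers.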
@@ -776,7 +817,7 @@ xfs_file_dio_aio_write(
         * otherwise demote the lock if we had to flush cached pages
         */
        if (unaligned_io)
-               xfs_ioend_wait(ip);
+               inode_dio_wait(inode);
        else if (*iolock == XFS_IOLOCK_EXCL) {
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
                *iolock = XFS_IOLOCK_SHARED;
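
The wait helper changing from xfs_ioend_wait() to inode_dio_wait() is the
substance of this commit: with i_iocount removed, XFS drains in-flight
direct IO through the generic per-inode counter maintained by the VFS
direct-IO code. Condensed view of the post-patch branch (sketch):

	if (unaligned_io) {
		/*
		 * Stay exclusive and wait for in-flight direct IO to
		 * drain, so unwritten extent conversion on overlapping
		 * blocks completes before this IO maps them (see the
		 * "big hammer" comment above xfs_file_dio_aio_write()).
		 */
		inode_dio_wait(inode);
	} else if (*iolock == XFS_IOLOCK_EXCL) {
		/* cache invalidation done; aligned IO can run shared */
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
		*iolock = XFS_IOLOCK_SHARED;
	}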
@@ -798,6 +839,7 @@ xfs_file_buffered_aio_write(
        unsigned long           nr_segs,
        loff_t                  pos,
        size_t                  ocount,
+       xfs_fsize_t             *new_size,
        int                     *iolock)
 {
        struct file             *file = iocb->ki_filp;
@@ -811,7 +853,7 @@ xfs_file_buffered_aio_write(
        *iolock = XFS_IOLOCK_EXCL;
        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-       ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+       ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
        if (ret)
                return ret;
 
@@ -851,6 +893,7 @@ xfs_file_aio_write(
        ssize_t                 ret;
        int                     iolock;
        size_t                  ocount = 0;
+       xfs_fsize_t             new_size = 0;
 
        XFS_STATS_INC(xs_write_calls);
 
@@ -870,10 +913,10 @@ xfs_file_aio_write(
 
        if (unlikely(file->f_flags & O_DIRECT))
                ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-                                               ocount, &iolock);
+                                               ocount, &new_size, &iolock);
        else
                ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-                                               ocount, &iolock);
+                                               ocount, &new_size, &iolock);
 
        xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
 
@@ -894,7 +937,7 @@ xfs_file_aio_write(
        }
 
 out_unlock:
-       xfs_aio_write_newsize_update(ip);
+       xfs_aio_write_newsize_update(ip, new_size);
        xfs_rw_iunlock(ip, iolock);
        return ret;
 }
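
Taken together, the new_size plumbing added by this patch threads one
value through the whole write path (call-flow sketch, not verbatim code):

	/*
	 * xfs_file_aio_write()
	 *   -> xfs_file_{dio,buffered}_aio_write(..., &new_size, &iolock)
	 *        -> xfs_file_aio_write_checks(..., &new_size, iolock)
	 *             sets ip->i_new_size and *new_sizep when this write
	 *             ends beyond EOF and beyond other in-flight writes
	 *   -> xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret)
	 *   -> xfs_aio_write_newsize_update(ip, new_size)
	 *        clears ip->i_new_size only if it still equals new_size
	 *   -> xfs_rw_iunlock(ip, iolock)
	 */

Passing new_size back up instead of leaving it implicit in the inode is
what lets xfs_aio_write_newsize_update() distinguish "this write was the
furthest-extending one" from "a concurrent write has since gone further".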