Merge tag 'v2.6.38' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

[mv-sheeva.git] / fs / xfs / linux-2.6 / xfs_sync.c
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c

index afb0d7cfad1cceafef852098d955cd8dfb42bee3..e22f0057d21fa8d2d3e04c11a2e62438ac17a3e9 100644 (file)
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
  {
         struct inode            *inode = VFS_I(ip);
  
+       ASSERT(rcu_read_lock_held());
+
+       /*
+        * check for stale RCU freed inode
+        *
+        * If the inode has been reallocated, it doesn't matter if it's not in
+        * the AG we are walking - we are walking for writeback, so if it
+        * passes all the "valid inode" checks and is dirty, then we'll write
+        * it back anyway.  If it has been reallocated and is still being
+        * initialised, the XFS_INEW check below will catch it.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (!ip->i_ino)
+               goto out_unlock_noent;
+
+       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+               goto out_unlock_noent;
+       spin_unlock(&ip->i_flags_lock);
+
         /* nothing to sync during shutdown */
         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                 return EFSCORRUPTED;
  
-       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-       if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-               return ENOENT;
-
         /* If we can't grab the inode, it must on it's way to reclaim. */
         if (!igrab(inode))
                 return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
  
         /* inode is valid */
         return 0;
+
+out_unlock_noent:
+       spin_unlock(&ip->i_flags_lock);
+       return ENOENT;
  }
  
  STATIC int
@@ -98,12 +118,12 @@ restart:
                 int             error = 0;
                 int             i;
  
-               read_lock(&pag->pag_ici_lock);
+               rcu_read_lock();
                 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                         (void **)batch, first_index,
                                         XFS_LOOKUP_BATCH);
                 if (!nr_found) {
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
                         break;
                 }
  
@@ -118,18 +138,26 @@ restart:
                                 batch[i] = NULL;
  
                         /*
-                        * Update the index for the next lookup. Catch overflows
-                        * into the next AG range which can occur if we have inodes
-                        * in the last block of the AG and we are currently
-                        * pointing to the last inode.
+                        * Update the index for the next lookup. Catch
+                        * overflows into the next AG range which can occur if
+                        * we have inodes in the last block of the AG and we
+                        * are currently pointing to the last inode.
+                        *
+                        * Because we may see inodes that are from the wrong AG
+                        * due to RCU freeing and reallocation, only update the
+                        * index if it lies in this AG. It was a race that led
+                        * us to see this inode, so another lookup from the
+                        * same index will not find it again.
                          */
+                       if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                               continue;
                         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                         if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                 done = 1;
                 }
  
                 /* unlock now we've grabbed the inodes. */
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
  
                 for (i = 0; i < nr_found; i++) {
                         if (!batch[i])
@@ -334,7 +362,7 @@ xfs_quiesce_data(
  
         /* mark the log as covered if needed */
         if (xfs_log_need_covered(mp))
-               error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
+               error2 = xfs_fs_log_dummy(mp);
  
         /* flush data-only devices */
         if (mp->m_rtdev_targp)
@@ -475,13 +503,14 @@ xfs_sync_worker(
         int             error;
  
         if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-               xfs_log_force(mp, 0);
-               xfs_reclaim_inodes(mp, 0);
                 /* dgc: errors ignored here */
-               error = xfs_qm_sync(mp, SYNC_TRYLOCK);
                 if (mp->m_super->s_frozen == SB_UNFROZEN &&
                     xfs_log_need_covered(mp))
-                       error = xfs_fs_log_dummy(mp, 0);
+                       error = xfs_fs_log_dummy(mp);
+               else
+                       xfs_log_force(mp, 0);
+               xfs_reclaim_inodes(mp, 0);
+               error = xfs_qm_sync(mp, SYNC_TRYLOCK);
         }
         mp->m_sync_seq++;
         wake_up(&mp->m_wait_single_sync_task);
@@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag(
         struct xfs_perag *pag;
  
         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-       write_lock(&pag->pag_ici_lock);
+       spin_lock(&pag->pag_ici_lock);
         spin_lock(&ip->i_flags_lock);
         __xfs_inode_set_reclaim_tag(pag, ip);
         __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
         spin_unlock(&ip->i_flags_lock);
-       write_unlock(&pag->pag_ici_lock);
+       spin_unlock(&pag->pag_ici_lock);
         xfs_perag_put(pag);
  }
  
@@ -639,9 +668,14 @@ xfs_reclaim_inode_grab(
         struct xfs_inode        *ip,
         int                     flags)
  {
+       ASSERT(rcu_read_lock_held());
+
+       /* quick check for stale RCU freed inode */
+       if (!ip->i_ino)
+               return 1;
  
         /*
-        * do some unlocked checks first to avoid unnecceary lock traffic.
+        * do some unlocked checks first to avoid unnecessary lock traffic.
          * The first is a flush lock check, the second is a already in reclaim
          * check. Only do these checks if we are not going to block on locks.
          */
@@ -654,11 +688,16 @@ xfs_reclaim_inode_grab(
          * The radix tree lock here protects a thread in xfs_iget from racing
          * with us starting reclaim on the inode.  Once we have the
          * XFS_IRECLAIM flag set it will not touch us.
+        *
+        * Due to RCU lookup, we may find inodes that have been freed and only
+        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+        * aren't candidates for reclaim at all, so we must check that the
+        * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
          */
         spin_lock(&ip->i_flags_lock);
-       ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-       if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-               /* ignore as it is already under reclaim */
+       if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+           __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+               /* not a reclaim candidate. */
                 spin_unlock(&ip->i_flags_lock);
                 return 1;
         }
@@ -795,12 +834,12 @@ reclaim:
          * added to the tree assert that it's been there before to catch
          * problems with the inode life time early on.
          */
-       write_lock(&pag->pag_ici_lock);
+       spin_lock(&pag->pag_ici_lock);
         if (!radix_tree_delete(&pag->pag_ici_root,
                                 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
                 ASSERT(0);
         __xfs_inode_clear_reclaim(pag, ip);
-       write_unlock(&pag->pag_ici_lock);
+       spin_unlock(&pag->pag_ici_lock);
  
         /*
          * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +903,14 @@ restart:
                         struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                         int     i;
  
-                       write_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                         nr_found = radix_tree_gang_lookup_tag(
                                         &pag->pag_ici_root,
                                         (void **)batch, first_index,
                                         XFS_LOOKUP_BATCH,
                                         XFS_ICI_RECLAIM_TAG);
                         if (!nr_found) {
-                               write_unlock(&pag->pag_ici_lock);
+                               rcu_read_unlock();
                                 break;
                         }
  
@@ -891,14 +930,24 @@ restart:
                                  * occur if we have inodes in the last block of
                                  * the AG and we are currently pointing to the
                                  * last inode.
+                                *
+                                * Because we may see inodes that are from the
+                                * wrong AG due to RCU freeing and
+                                * reallocation, only update the index if it
+                                * lies in this AG. It was a race that led us
+                                * to see this inode, so another lookup from
+                                * the same index will not find it again.
                                  */
+                               if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                               pag->pag_agno)
+                                       continue;
                                 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                                 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                         done = 1;
                         }
  
                         /* unlock now we've grabbed the inodes. */
-                       write_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
  
                         for (i = 0; i < nr_found; i++) {
                                 if (!batch[i])