]> git.karo-electronics.de Git - mv-sheeva.git/blobdiff - fs/xfs/linux-2.6/xfs_sync.c
Merge tag 'v2.6.38' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
[mv-sheeva.git] / fs / xfs / linux-2.6 / xfs_sync.c
index afb0d7cfad1cceafef852098d955cd8dfb42bee3..e22f0057d21fa8d2d3e04c11a2e62438ac17a3e9 100644 (file)
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
        struct inode            *inode = VFS_I(ip);
 
+       ASSERT(rcu_read_lock_held());
+
+       /*
+        * check for stale RCU freed inode
+        *
+        * If the inode has been reallocated, it doesn't matter if it's not in
+        * the AG we are walking - we are walking for writeback, so if it
+        * passes all the "valid inode" checks and is dirty, then we'll write
+        * it back anyway.  If it has been reallocated and still being
+        * initialised, the XFS_INEW check below will catch it.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (!ip->i_ino)
+               goto out_unlock_noent;
+
+       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+               goto out_unlock_noent;
+       spin_unlock(&ip->i_flags_lock);
+
        /* nothing to sync during shutdown */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return EFSCORRUPTED;
 
-       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-       if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-               return ENOENT;
-
        /* If we can't grab the inode, it must on it's way to reclaim. */
        if (!igrab(inode))
                return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
 
        /* inode is valid */
        return 0;
+
+out_unlock_noent:
+       spin_unlock(&ip->i_flags_lock);
+       return ENOENT;
 }
 
 STATIC int
@@ -98,12 +118,12 @@ restart:
                int             error = 0;
                int             i;
 
-               read_lock(&pag->pag_ici_lock);
+               rcu_read_lock();
                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH);
                if (!nr_found) {
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
                        break;
                }
 
@@ -118,18 +138,26 @@ restart:
                                batch[i] = NULL;
 
                        /*
-                        * Update the index for the next lookup. Catch overflows
-                        * into the next AG range which can occur if we have inodes
-                        * in the last block of the AG and we are currently
-                        * pointing to the last inode.
+                        * Update the index for the next lookup. Catch
+                        * overflows into the next AG range which can occur if
+                        * we have inodes in the last block of the AG and we
+                        * are currently pointing to the last inode.
+                        *
+                        * Because we may see inodes that are from the wrong AG
+                        * due to RCU freeing and reallocation, only update the
+                        * index if it lies in this AG. It was a race that lead
+                        * us to see this inode, so another lookup from the
+                        * same index will not find it again.
                         */
+                       if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                               continue;
                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                done = 1;
                }
 
                /* unlock now we've grabbed the inodes. */
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
 
                for (i = 0; i < nr_found; i++) {
                        if (!batch[i])
@@ -334,7 +362,7 @@ xfs_quiesce_data(
 
        /* mark the log as covered if needed */
        if (xfs_log_need_covered(mp))
-               error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
+               error2 = xfs_fs_log_dummy(mp);
 
        /* flush data-only devices */
        if (mp->m_rtdev_targp)
@@ -475,13 +503,14 @@ xfs_sync_worker(
        int             error;
 
        if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-               xfs_log_force(mp, 0);
-               xfs_reclaim_inodes(mp, 0);
                /* dgc: errors ignored here */
-               error = xfs_qm_sync(mp, SYNC_TRYLOCK);
                if (mp->m_super->s_frozen == SB_UNFROZEN &&
                    xfs_log_need_covered(mp))
-                       error = xfs_fs_log_dummy(mp, 0);
+                       error = xfs_fs_log_dummy(mp);
+               else
+                       xfs_log_force(mp, 0);
+               xfs_reclaim_inodes(mp, 0);
+               error = xfs_qm_sync(mp, SYNC_TRYLOCK);
        }
        mp->m_sync_seq++;
        wake_up(&mp->m_wait_single_sync_task);
@@ -592,12 +621,12 @@ xfs_inode_set_reclaim_tag(
        struct xfs_perag *pag;
 
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-       write_lock(&pag->pag_ici_lock);
+       spin_lock(&pag->pag_ici_lock);
        spin_lock(&ip->i_flags_lock);
        __xfs_inode_set_reclaim_tag(pag, ip);
        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
        spin_unlock(&ip->i_flags_lock);
-       write_unlock(&pag->pag_ici_lock);
+       spin_unlock(&pag->pag_ici_lock);
        xfs_perag_put(pag);
 }
 
@@ -639,9 +668,14 @@ xfs_reclaim_inode_grab(
        struct xfs_inode        *ip,
        int                     flags)
 {
+       ASSERT(rcu_read_lock_held());
+
+       /* quick check for stale RCU freed inode */
+       if (!ip->i_ino)
+               return 1;
 
        /*
-        * do some unlocked checks first to avoid unnecceary lock traffic.
+        * do some unlocked checks first to avoid unnecessary lock traffic.
         * The first is a flush lock check, the second is a already in reclaim
         * check. Only do these checks if we are not going to block on locks.
         */
@@ -654,11 +688,16 @@ xfs_reclaim_inode_grab(
         * The radix tree lock here protects a thread in xfs_iget from racing
         * with us starting reclaim on the inode.  Once we have the
         * XFS_IRECLAIM flag set it will not touch us.
+        *
+        * Due to RCU lookup, we may find inodes that have been freed and only
+        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+        * aren't candidates for reclaim at all, so we must check the
+        * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
         */
        spin_lock(&ip->i_flags_lock);
-       ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-       if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-               /* ignore as it is already under reclaim */
+       if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+           __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+               /* not a reclaim candidate. */
                spin_unlock(&ip->i_flags_lock);
                return 1;
        }
@@ -795,12 +834,12 @@ reclaim:
         * added to the tree assert that it's been there before to catch
         * problems with the inode life time early on.
         */
-       write_lock(&pag->pag_ici_lock);
+       spin_lock(&pag->pag_ici_lock);
        if (!radix_tree_delete(&pag->pag_ici_root,
                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
                ASSERT(0);
        __xfs_inode_clear_reclaim(pag, ip);
-       write_unlock(&pag->pag_ici_lock);
+       spin_unlock(&pag->pag_ici_lock);
 
        /*
         * Here we do an (almost) spurious inode lock in order to coordinate
@@ -864,14 +903,14 @@ restart:
                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
                        int     i;
 
-                       write_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                        nr_found = radix_tree_gang_lookup_tag(
                                        &pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH,
                                        XFS_ICI_RECLAIM_TAG);
                        if (!nr_found) {
-                               write_unlock(&pag->pag_ici_lock);
+                               rcu_read_unlock();
                                break;
                        }
 
@@ -891,14 +930,24 @@ restart:
                                 * occur if we have inodes in the last block of
                                 * the AG and we are currently pointing to the
                                 * last inode.
+                                *
+                                * Because we may see inodes that are from the
+                                * wrong AG due to RCU freeing and
+                                * reallocation, only update the index if it
+                                * lies in this AG. It was a race that lead us
+                                * to see this inode, so another lookup from
+                                * the same index will not find it again.
                                 */
+                               if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                               pag->pag_agno)
+                                       continue;
                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
                                        done = 1;
                        }
 
                        /* unlock now we've grabbed the inodes. */
-                       write_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
 
                        for (i = 0; i < nr_found; i++) {
                                if (!batch[i])