]> git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - fs/dcache.c
Merge remote-tracking branch 'fuse/for-next'
[karo-tx-linux.git] / fs / dcache.c
index 96655f4f45749e530cee80a32c36fb8e6061164b..d70df2e0e0da8a6b41d9bfeba798fb4cc9481a80 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/rculist_bl.h>
 #include <linux/prefetch.h>
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
 #include "internal.h"
 #include "mount.h"
 
@@ -48,7 +49,7 @@
  *   - the dcache hash table
  * s_anon bl list spinlock protects:
  *   - the s_anon list (see __d_drop)
- * dcache_lru_lock protects:
+ * dentry->d_sb->s_dentry_lru_lock protects:
  *   - the dcache lru lists and counters
  * d_lock protects:
  *   - d_flags
@@ -63,7 +64,7 @@
  * Ordering:
  * dentry->d_inode->i_lock
  *   dentry->d_lock
- *     dcache_lru_lock
+ *     dentry->d_sb->s_dentry_lru_lock
  *     dcache_hash_bucket lock
  *     s_anon lock
  *
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * @lock: sequence lock
+ * @seq : sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a reader (even) or writer (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+       if (!(*seq & 1))        /* Even */
+               *seq = read_seqbegin(lock);
+       else                    /* Odd */
+               read_seqlock_excl(lock);
+}
+
+static inline int need_seqretry(seqlock_t *lock, int seq)
+{
+       return !(seq & 1) && read_seqretry(lock, seq);
+}
+
+static inline void done_seqretry(seqlock_t *lock, int seq)
+{
+       if (seq & 1)
+               read_sequnlock_excl(lock);
+}
+
 /*
  * This is the single most critical data structure when it comes
  * to the dcache: the hashtable for lookups. Somebody should try
@@ -117,23 +146,47 @@ struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
 };
 
-static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-static int get_nr_dentry(void)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We are expected to harvest
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
+static long get_nr_dentry(void)
 {
        int i;
-       int sum = 0;
+       long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry, i);
        return sum < 0 ? 0 : sum;
 }
 
+static long get_nr_dentry_unused(void)
+{
+       int i;
+       long sum = 0;
+       for_each_possible_cpu(i)
+               sum += per_cpu(nr_dentry_unused, i);
+       return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
                   size_t *lenp, loff_t *ppos)
 {
        dentry_stat.nr_dentry = get_nr_dentry();
-       return proc_dointvec(table, write, buffer, lenp, ppos);
+       dentry_stat.nr_unused = get_nr_dentry_unused();
+       return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -229,7 +282,7 @@ static void __d_free(struct rcu_head *head)
  */
 static void d_free(struct dentry *dentry)
 {
-       BUG_ON(dentry->d_lockref.count);
+       BUG_ON((int)dentry->d_lockref.count > 0);
        this_cpu_dec(nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);
@@ -304,50 +357,96 @@ static void dentry_unlink_inode(struct dentry * dentry)
 }
 
 /*
- * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
+ * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
+ * is in use - which includes both the "real" per-superblock
+ * LRU list _and_ the DCACHE_SHRINK_LIST use.
+ *
+ * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
+ * on the shrink list (ie not on the superblock LRU list).
+ *
+ * The per-cpu "nr_dentry_unused" counters are updated with
+ * the DCACHE_LRU_LIST bit.
+ *
+ * These helper functions make sure we always follow the
+ * rules. d_lock must be held by the caller.
  */
-static void dentry_lru_add(struct dentry *dentry)
+#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
+static void d_lru_add(struct dentry *dentry)
 {
-       if (list_empty(&dentry->d_lru)) {
-               spin_lock(&dcache_lru_lock);
-               list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-               dentry->d_sb->s_nr_dentry_unused++;
-               dentry_stat.nr_unused++;
-               spin_unlock(&dcache_lru_lock);
-       }
+       D_FLAG_VERIFY(dentry, 0);
+       dentry->d_flags |= DCACHE_LRU_LIST;
+       this_cpu_inc(nr_dentry_unused);
+       WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
-static void __dentry_lru_del(struct dentry *dentry)
+static void d_lru_del(struct dentry *dentry)
 {
+       D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+       dentry->d_flags &= ~DCACHE_LRU_LIST;
+       this_cpu_dec(nr_dentry_unused);
+       WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+}
+
+static void d_shrink_del(struct dentry *dentry)
+{
+       D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
        list_del_init(&dentry->d_lru);
-       dentry->d_flags &= ~DCACHE_SHRINK_LIST;
-       dentry->d_sb->s_nr_dentry_unused--;
-       dentry_stat.nr_unused--;
+       dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
+       this_cpu_dec(nr_dentry_unused);
+}
+
+static void d_shrink_add(struct dentry *dentry, struct list_head *list)
+{
+       D_FLAG_VERIFY(dentry, 0);
+       list_add(&dentry->d_lru, list);
+       dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
+       this_cpu_inc(nr_dentry_unused);
 }
 
 /*
- * Remove a dentry with references from the LRU.
+ * These can only be called with the LRU list lock held, i.e. from the
+ * isolate callbacks run by the list_lru walkers. "isolate" removes the
+ * dentry from the LRU lists entirely, while shrink_move moves it to the
+ * indicated private list.
  */
-static void dentry_lru_del(struct dentry *dentry)
+static void d_lru_isolate(struct dentry *dentry)
 {
-       if (!list_empty(&dentry->d_lru)) {
-               spin_lock(&dcache_lru_lock);
-               __dentry_lru_del(dentry);
-               spin_unlock(&dcache_lru_lock);
-       }
+       D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+       dentry->d_flags &= ~DCACHE_LRU_LIST;
+       this_cpu_dec(nr_dentry_unused);
+       list_del_init(&dentry->d_lru);
 }
 
-static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
 {
-       spin_lock(&dcache_lru_lock);
-       if (list_empty(&dentry->d_lru)) {
-               list_add_tail(&dentry->d_lru, list);
-               dentry->d_sb->s_nr_dentry_unused++;
-               dentry_stat.nr_unused++;
-       } else {
-               list_move_tail(&dentry->d_lru, list);
+       D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+       dentry->d_flags |= DCACHE_SHRINK_LIST;
+       list_move_tail(&dentry->d_lru, list);
+}
+
+/*
+ * dentry_lru_(add|del) must be called with d_lock held.
+ */
+static void dentry_lru_add(struct dentry *dentry)
+{
+       if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+               d_lru_add(dentry);
+}
+
+/*
+ * Remove a dentry with references from the LRU.
+ *
+ * If we are on the shrink list, then we can get to try_prune_one_dentry() and
+ * lose our last reference through the parent walk. In this case, we need to
+ * remove ourselves from the shrink list, not the LRU.
+ */
+static void dentry_lru_del(struct dentry *dentry)
+{
+       if (dentry->d_flags & DCACHE_LRU_LIST) {
+               if (dentry->d_flags & DCACHE_SHRINK_LIST)
+                       return d_shrink_del(dentry);
+               d_lru_del(dentry);
        }
-       spin_unlock(&dcache_lru_lock);
 }
 
 /**
@@ -443,7 +542,8 @@ EXPORT_SYMBOL(d_drop);
  * If ref is non-zero, then decrement the refcount too.
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
-static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
+static inline struct dentry *
+dentry_kill(struct dentry *dentry, int unlock_on_failure)
        __releases(dentry->d_lock)
 {
        struct inode *inode;
@@ -452,8 +552,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry, int ref)
        inode = dentry->d_inode;
        if (inode && !spin_trylock(&inode->i_lock)) {
 relock:
-               spin_unlock(&dentry->d_lock);
-               cpu_relax();
+               if (unlock_on_failure) {
+                       spin_unlock(&dentry->d_lock);
+                       cpu_relax();
+               }
                return dentry; /* try again with same dentry */
        }
        if (IS_ROOT(dentry))
@@ -466,13 +568,16 @@ relock:
                goto relock;
        }
 
-       if (ref)
-               dentry->d_lockref.count--;
+       /*
+        * The dentry is now unrecoverably dead to the world.
+        */
+       lockref_mark_dead(&dentry->d_lockref);
+
        /*
         * inform the fs via d_prune that this dentry is about to be
         * unhashed and destroyed.
         */
-       if (dentry->d_flags & DCACHE_OP_PRUNE)
+       if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
                dentry->d_op->d_prune(dentry);
 
        dentry_lru_del(dentry);
@@ -509,24 +614,22 @@ relock:
  */
 void dput(struct dentry *dentry)
 {
-       if (!dentry)
+       if (unlikely(!dentry))
                return;
 
 repeat:
-       if (dentry->d_lockref.count == 1)
-               might_sleep();
        if (lockref_put_or_lock(&dentry->d_lockref))
                return;
 
-       if (dentry->d_flags & DCACHE_OP_DELETE) {
+       /* Unreachable? Get rid of it */
+       if (unlikely(d_unhashed(dentry)))
+               goto kill_it;
+
+       if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
                if (dentry->d_op->d_delete(dentry))
                        goto kill_it;
        }
 
-       /* Unreachable? Get rid of it */
-       if (d_unhashed(dentry))
-               goto kill_it;
-
        dentry->d_flags |= DCACHE_REFERENCED;
        dentry_lru_add(dentry);
 
@@ -727,6 +830,14 @@ restart:
        hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
                spin_lock(&dentry->d_lock);
                if (!dentry->d_lockref.count) {
+                       /*
+                        * inform the fs via d_prune that this dentry
+                        * is about to be unhashed and destroyed.
+                        */
+                       if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+                           !d_unhashed(dentry))
+                               dentry->d_op->d_prune(dentry);
+
                        __dget_dlock(dentry);
                        __d_drop(dentry);
                        spin_unlock(&dentry->d_lock);
@@ -747,7 +858,7 @@ EXPORT_SYMBOL(d_prune_aliases);
  *
  * This may fail if locks cannot be acquired no problem, just try again.
  */
-static void try_prune_one_dentry(struct dentry *dentry)
+static struct dentry * try_prune_one_dentry(struct dentry *dentry)
        __releases(dentry->d_lock)
 {
        struct dentry *parent;
@@ -764,17 +875,18 @@ static void try_prune_one_dentry(struct dentry *dentry)
         * fragmentation.
         */
        if (!parent)
-               return;
+               return NULL;
        if (parent == dentry)
-               return;
+               return dentry;
 
        /* Prune ancestors. */
        dentry = parent;
        while (dentry) {
                if (lockref_put_or_lock(&dentry->d_lockref))
-                       return;
+                       return NULL;
                dentry = dentry_kill(dentry, 1);
        }
+       return NULL;
 }
 
 static void shrink_dentry_list(struct list_head *list)
@@ -786,83 +898,159 @@ static void shrink_dentry_list(struct list_head *list)
                dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
                if (&dentry->d_lru == list)
                        break; /* empty */
+
+               /*
+                * Get the dentry lock, and re-verify that the dentry is
+                * still on the shrinking list. If it is, we know that
+                * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
+                */
                spin_lock(&dentry->d_lock);
                if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
                        spin_unlock(&dentry->d_lock);
                        continue;
                }
 
+               /*
+                * The dispose list is isolated and dentries are not accounted
+                * to the LRU here, so we can simply remove it from the list
+                * here regardless of whether it is referenced or not.
+                */
+               d_shrink_del(dentry);
+
                /*
                 * We found an inuse dentry which was not removed from
-                * the LRU because of laziness during lookup.  Do not free
-                * it - just keep it off the LRU list.
+                * the LRU because of laziness during lookup. Do not free it.
                 */
                if (dentry->d_lockref.count) {
-                       dentry_lru_del(dentry);
                        spin_unlock(&dentry->d_lock);
                        continue;
                }
-
                rcu_read_unlock();
 
-               try_prune_one_dentry(dentry);
+               /*
+                * If 'try_prune_one_dentry()' returns a dentry, it will
+                * be the same one we passed in, and d_lock will
+                * have been held the whole time, so it will not
+                * have been added to any other lists. We failed
+                * to get the inode lock.
+                *
+                * We just add it back to the shrink list.
+                */
+               dentry = try_prune_one_dentry(dentry);
 
                rcu_read_lock();
+               if (dentry) {
+                       d_shrink_add(dentry, list);
+                       spin_unlock(&dentry->d_lock);
+               }
        }
        rcu_read_unlock();
 }
 
+static enum lru_status
+dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+{
+       struct list_head *freeable = arg;
+       struct dentry   *dentry = container_of(item, struct dentry, d_lru);
+
+
+       /*
+        * we are inverting the lru lock/dentry->d_lock here,
+        * so use a trylock. If we fail to get the lock, just skip
+        * it
+        */
+       if (!spin_trylock(&dentry->d_lock))
+               return LRU_SKIP;
+
+       /*
+        * Referenced dentries are still in use. If they have active
+        * counts, just remove them from the LRU. Otherwise give them
+        * another pass through the LRU.
+        */
+       if (dentry->d_lockref.count) {
+               d_lru_isolate(dentry);
+               spin_unlock(&dentry->d_lock);
+               return LRU_REMOVED;
+       }
+
+       if (dentry->d_flags & DCACHE_REFERENCED) {
+               dentry->d_flags &= ~DCACHE_REFERENCED;
+               spin_unlock(&dentry->d_lock);
+
+               /*
+                * The list move itself will be made by the common LRU code. At
+                * this point, we've dropped the dentry->d_lock but keep the
+                * lru lock. This is safe to do, since every list movement is
+                * protected by the lru lock even if both locks are held.
+                *
+                * This is guaranteed by the fact that all LRU management
+                * functions are intermediated by the LRU API calls like
+                * list_lru_add and list_lru_del. List movement in this file
+                * only ever occurs through these functions or through callbacks
+                * like this one, that are called from the LRU API.
+                *
+                * The only exceptions to this are functions like
+                * shrink_dentry_list, and code that first checks for the
+                * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
+                * operating only with stack provided lists after they are
+                * properly isolated from the main list.  It is thus, always a
+                * local access.
+                */
+               return LRU_ROTATE;
+       }
+
+       d_lru_shrink_move(dentry, freeable);
+       spin_unlock(&dentry->d_lock);
+
+       return LRU_REMOVED;
+}
+
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @count: number of entries to try to free
+ * @nr_to_scan : number of entries to try to free
+ * @nid: which node to scan for freeable entities
  *
- * Attempt to shrink the superblock dcache LRU by @count entries. This is
+ * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
  * done when we need more memory an called from the superblock shrinker
  * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-void prune_dcache_sb(struct super_block *sb, int count)
+long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
+                    int nid)
 {
-       struct dentry *dentry;
-       LIST_HEAD(referenced);
-       LIST_HEAD(tmp);
+       LIST_HEAD(dispose);
+       long freed;
 
-relock:
-       spin_lock(&dcache_lru_lock);
-       while (!list_empty(&sb->s_dentry_lru)) {
-               dentry = list_entry(sb->s_dentry_lru.prev,
-                               struct dentry, d_lru);
-               BUG_ON(dentry->d_sb != sb);
-
-               if (!spin_trylock(&dentry->d_lock)) {
-                       spin_unlock(&dcache_lru_lock);
-                       cpu_relax();
-                       goto relock;
-               }
+       freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
+                                      &dispose, &nr_to_scan);
+       shrink_dentry_list(&dispose);
+       return freed;
+}
 
-               if (dentry->d_flags & DCACHE_REFERENCED) {
-                       dentry->d_flags &= ~DCACHE_REFERENCED;
-                       list_move(&dentry->d_lru, &referenced);
-                       spin_unlock(&dentry->d_lock);
-               } else {
-                       list_move_tail(&dentry->d_lru, &tmp);
-                       dentry->d_flags |= DCACHE_SHRINK_LIST;
-                       spin_unlock(&dentry->d_lock);
-                       if (!--count)
-                               break;
-               }
-               cond_resched_lock(&dcache_lru_lock);
-       }
-       if (!list_empty(&referenced))
-               list_splice(&referenced, &sb->s_dentry_lru);
-       spin_unlock(&dcache_lru_lock);
+static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
+                                               spinlock_t *lru_lock, void *arg)
+{
+       struct list_head *freeable = arg;
+       struct dentry   *dentry = container_of(item, struct dentry, d_lru);
+
+       /*
+        * we are inverting the lru lock/dentry->d_lock here,
+        * so use a trylock. If we fail to get the lock, just skip
+        * it
+        */
+       if (!spin_trylock(&dentry->d_lock))
+               return LRU_SKIP;
 
-       shrink_dentry_list(&tmp);
+       d_lru_shrink_move(dentry, freeable);
+       spin_unlock(&dentry->d_lock);
+
+       return LRU_REMOVED;
 }
 
+
 /**
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
@@ -872,16 +1060,17 @@ relock:
  */
 void shrink_dcache_sb(struct super_block *sb)
 {
-       LIST_HEAD(tmp);
+       long freed;
 
-       spin_lock(&dcache_lru_lock);
-       while (!list_empty(&sb->s_dentry_lru)) {
-               list_splice_init(&sb->s_dentry_lru, &tmp);
-               spin_unlock(&dcache_lru_lock);
-               shrink_dentry_list(&tmp);
-               spin_lock(&dcache_lru_lock);
-       }
-       spin_unlock(&dcache_lru_lock);
+       do {
+               LIST_HEAD(dispose);
+
+               freed = list_lru_walk(&sb->s_dentry_lru,
+                       dentry_lru_isolate_shrink, &dispose, UINT_MAX);
+
+               this_cpu_sub(nr_dentry_unused, freed);
+               shrink_dentry_list(&dispose);
+       } while (freed > 0);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
@@ -911,7 +1100,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
                         * inform the fs that this dentry is about to be
                         * unhashed and destroyed.
                         */
-                       if (dentry->d_flags & DCACHE_OP_PRUNE)
+                       if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+                           !d_unhashed(dentry))
                                dentry->d_op->d_prune(dentry);
 
                        dentry_lru_del(dentry);
@@ -1000,7 +1190,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
  * the parenthood after dropping the lock and check
  * that the sequence number still matches.
  */
-static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
+static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
 {
        struct dentry *new = old->d_parent;
 
@@ -1014,7 +1204,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
         */
        if (new != old->d_parent ||
                 (old->d_flags & DCACHE_DENTRY_KILLED) ||
-                (!locked && read_seqretry(&rename_lock, seq))) {
+                need_seqretry(&rename_lock, seq)) {
                spin_unlock(&new->d_lock);
                new = NULL;
        }
@@ -1022,34 +1212,55 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
        return new;
 }
 
+/**
+ * enum d_walk_ret - action to take during tree walk
+ * @D_WALK_CONTINUE:   continue walk
+ * @D_WALK_QUIT:       quit walk
+ * @D_WALK_NORETRY:    continue walk, but quit instead of restarting it when
+ *                     a retry is needed
+ * @D_WALK_SKIP:       skip this dentry and its children
+ */
+enum d_walk_ret {
+       D_WALK_CONTINUE,
+       D_WALK_QUIT,
+       D_WALK_NORETRY,
+       D_WALK_SKIP,
+};
 
-/*
- * Search for at least 1 mount point in the dentry's subdirs.
- * We descend to the next level whenever the d_subdirs
- * list is non-empty and continue searching.
- */
 /**
- * have_submounts - check for mounts over a dentry
- * @parent: dentry to check.
+ * d_walk - walk the dentry tree
+ * @parent:    start of walk
+ * @data:      data passed to @enter() and @finish()
+ * @enter:     callback when first entering the dentry
+ * @finish:    callback when successfully finished the walk
  *
- * Return true if the parent or its subdirectories contain
- * a mount point
+ * The @enter() and @finish() callbacks are called with d_lock held.
  */
-int have_submounts(struct dentry *parent)
+static void d_walk(struct dentry *parent, void *data,
+                  enum d_walk_ret (*enter)(void *, struct dentry *),
+                  void (*finish)(void *))
 {
        struct dentry *this_parent;
        struct list_head *next;
-       unsigned seq;
-       int locked = 0;
+       unsigned seq = 0;
+       enum d_walk_ret ret;
+       bool retry = true;
 
-       seq = read_seqbegin(&rename_lock);
 again:
+       read_seqbegin_or_lock(&rename_lock, &seq);
        this_parent = parent;
-
-       if (d_mountpoint(parent))
-               goto positive;
        spin_lock(&this_parent->d_lock);
+
+       ret = enter(data, this_parent);
+       switch (ret) {
+       case D_WALK_CONTINUE:
+               break;
+       case D_WALK_QUIT:
+       case D_WALK_SKIP:
+               goto out_unlock;
+       case D_WALK_NORETRY:
+               retry = false;
+               break;
+       }
 repeat:
        next = this_parent->d_subdirs.next;
 resume:
@@ -1059,12 +1270,22 @@ resume:
                next = tmp->next;
 
                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-               /* Have we found a mount point ? */
-               if (d_mountpoint(dentry)) {
+
+               ret = enter(data, dentry);
+               switch (ret) {
+               case D_WALK_CONTINUE:
+                       break;
+               case D_WALK_QUIT:
                        spin_unlock(&dentry->d_lock);
-                       spin_unlock(&this_parent->d_lock);
-                       goto positive;
+                       goto out_unlock;
+               case D_WALK_NORETRY:
+                       retry = false;
+                       break;
+               case D_WALK_SKIP:
+                       spin_unlock(&dentry->d_lock);
+                       continue;
                }
+
                if (!list_empty(&dentry->d_subdirs)) {
                        spin_unlock(&this_parent->d_lock);
                        spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
@@ -1079,34 +1300,97 @@ resume:
         */
        if (this_parent != parent) {
                struct dentry *child = this_parent;
-               this_parent = try_to_ascend(this_parent, locked, seq);
+               this_parent = try_to_ascend(this_parent, seq);
                if (!this_parent)
                        goto rename_retry;
                next = child->d_u.d_child.next;
                goto resume;
        }
-       spin_unlock(&this_parent->d_lock);
-       if (!locked && read_seqretry(&rename_lock, seq))
+       if (need_seqretry(&rename_lock, seq)) {
+               spin_unlock(&this_parent->d_lock);
                goto rename_retry;
-       if (locked)
-               write_sequnlock(&rename_lock);
-       return 0; /* No mount points found in tree */
-positive:
-       if (!locked && read_seqretry(&rename_lock, seq))
-               goto rename_retry;
-       if (locked)
-               write_sequnlock(&rename_lock);
-       return 1;
+       }
+       if (finish)
+               finish(data);
+
+out_unlock:
+       spin_unlock(&this_parent->d_lock);
+       done_seqretry(&rename_lock, seq);
+       return;
 
 rename_retry:
-       if (locked)
-               goto again;
-       locked = 1;
-       write_seqlock(&rename_lock);
+       if (!retry)
+               return;
+       seq = 1;
        goto again;
 }
+
+/*
+ * Search for at least 1 mount point in the dentry's subdirs.
+ * We descend to the next level whenever the d_subdirs
+ * list is non-empty and continue searching.
+ */
+
+static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
+{
+       int *ret = data;
+       if (d_mountpoint(dentry)) {
+               *ret = 1;
+               return D_WALK_QUIT;
+       }
+       return D_WALK_CONTINUE;
+}
+
+/**
+ * have_submounts - check for mounts over a dentry
+ * @parent: dentry to check.
+ *
+ * Return true if the parent or its subdirectories contain
+ * a mount point
+ */
+int have_submounts(struct dentry *parent)
+{
+       int ret = 0;
+
+       d_walk(parent, &ret, check_mount, NULL);
+
+       return ret;
+}
 EXPORT_SYMBOL(have_submounts);
 
+/*
+ * Called by mount code to set a mountpoint and check if the mountpoint is
+ * reachable (e.g. NFS can unhash a directory dentry and then the complete
+ * subtree can become unreachable).
+ *
+ * Only one of check_submounts_and_drop() and d_set_mounted() must succeed.  For
+ * this reason take rename_lock and d_lock on dentry and ancestors.
+ */
+int d_set_mounted(struct dentry *dentry)
+{
+       struct dentry *p;
+       int ret = -ENOENT;
+       write_seqlock(&rename_lock);
+       for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
+               /* Need exclusion wrt. check_submounts_and_drop() */
+               spin_lock(&p->d_lock);
+               if (unlikely(d_unhashed(p))) {
+                       spin_unlock(&p->d_lock);
+                       goto out;
+               }
+               spin_unlock(&p->d_lock);
+       }
+       spin_lock(&dentry->d_lock);
+       if (!d_unlinked(dentry)) {
+               dentry->d_flags |= DCACHE_MOUNTED;
+               ret = 0;
+       }
+       spin_unlock(&dentry->d_lock);
+out:
+       write_sequnlock(&rename_lock);
+       return ret;
+}
+
 /*
  * Search the dentry child list of the specified parent,
  * and move any unused dentries to the end of the unused
@@ -1121,93 +1405,51 @@ EXPORT_SYMBOL(have_submounts);
  * drop the lock and return early due to latency
  * constraints.
  */
-static int select_parent(struct dentry *parent, struct list_head *dispose)
-{
-       struct dentry *this_parent;
-       struct list_head *next;
-       unsigned seq;
-       int found = 0;
-       int locked = 0;
 
-       seq = read_seqbegin(&rename_lock);
-again:
-       this_parent = parent;
-       spin_lock(&this_parent->d_lock);
-repeat:
-       next = this_parent->d_subdirs.next;
-resume:
-       while (next != &this_parent->d_subdirs) {
-               struct list_head *tmp = next;
-               struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
-               next = tmp->next;
+struct select_data {
+       struct dentry *start;
+       struct list_head dispose;
+       int found;
+};
 
-               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
+{
+       struct select_data *data = _data;
+       enum d_walk_ret ret = D_WALK_CONTINUE;
 
-               /*
-                * move only zero ref count dentries to the dispose list.
-                *
-                * Those which are presently on the shrink list, being processed
-                * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
-                * loop in shrink_dcache_parent() might not make any progress
-                * and loop forever.
-                */
-               if (dentry->d_lockref.count) {
-                       dentry_lru_del(dentry);
-               } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
-                       dentry_lru_move_list(dentry, dispose);
-                       dentry->d_flags |= DCACHE_SHRINK_LIST;
-                       found++;
-               }
-               /*
-                * We can return to the caller if we have found some (this
-                * ensures forward progress). We'll be coming back to find
-                * the rest.
-                */
-               if (found && need_resched()) {
-                       spin_unlock(&dentry->d_lock);
-                       goto out;
-               }
+       if (data->start == dentry)
+               goto out;
 
+       /*
+        * move only zero ref count dentries to the dispose list.
+        *
+        * Those which are presently on the shrink list, being processed
+        * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
+        * loop in shrink_dcache_parent() might not make any progress
+        * and loop forever.
+        */
+       if (dentry->d_lockref.count) {
+               dentry_lru_del(dentry);
+       } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
                /*
-                * Descend a level if the d_subdirs list is non-empty.
+                * We can't use d_lru_shrink_move() because we
+                * need to get the global LRU lock and do the
+                * LRU accounting.
                 */
-               if (!list_empty(&dentry->d_subdirs)) {
-                       spin_unlock(&this_parent->d_lock);
-                       spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
-                       this_parent = dentry;
-                       spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
-                       goto repeat;
-               }
-
-               spin_unlock(&dentry->d_lock);
+               d_lru_del(dentry);
+               d_shrink_add(dentry, &data->dispose);
+               data->found++;
+               ret = D_WALK_NORETRY;
        }
        /*
-        * All done at this level ... ascend and resume the search.
+        * We can return to the caller if we have found some (this
+        * ensures forward progress). We'll be coming back to find
+        * the rest.
         */
-       if (this_parent != parent) {
-               struct dentry *child = this_parent;
-               this_parent = try_to_ascend(this_parent, locked, seq);
-               if (!this_parent)
-                       goto rename_retry;
-               next = child->d_u.d_child.next;
-               goto resume;
-       }
+       if (data->found && need_resched())
+               ret = D_WALK_QUIT;
 out:
-       spin_unlock(&this_parent->d_lock);
-       if (!locked && read_seqretry(&rename_lock, seq))
-               goto rename_retry;
-       if (locked)
-               write_sequnlock(&rename_lock);
-       return found;
-
-rename_retry:
-       if (found)
-               return found;
-       if (locked)
-               goto again;
-       locked = 1;
-       write_seqlock(&rename_lock);
-       goto again;
+       return ret;
 }
 
 /**
@@ -1216,18 +1458,90 @@ rename_retry:
  *
  * Prune the dcache to remove unused children of the parent dentry.
  */
-void shrink_dcache_parent(struct dentry * parent)
+void shrink_dcache_parent(struct dentry *parent)
 {
-       LIST_HEAD(dispose);
-       int found;
+       struct select_data data;
 
-       while ((found = select_parent(parent, &dispose)) != 0) {
-               shrink_dentry_list(&dispose);
+       while (1) {
+               /* Start every pass with an empty list and a zero count. */
+               data.found = 0;
+               data.start = parent;
+               INIT_LIST_HEAD(&data.dispose);
+
+               d_walk(parent, &data, select_collect, NULL);
+               if (data.found == 0)
+                       break;
+               shrink_dentry_list(&data.dispose);
                 cond_resched();
         }
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
+static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
+{
+       struct select_data *sel = _data;
+
+       /* Anything but a mountpoint is handed on to select_collect(). */
+       if (!d_mountpoint(dentry))
+               return select_collect(_data, dentry);
+
+       sel->found = -EBUSY;
+       return D_WALK_QUIT;
+}
+
+static void check_and_drop(void *_data)
+{
+       struct select_data *sel = _data;
+
+       if (d_mountpoint(sel->start))
+               sel->found = -EBUSY;    /* start is a mountpoint: refuse */
+       else if (!sel->found)
+               __d_drop(sel->start);   /* found is still 0: unhash it */
+}
+
+/**
+ * check_submounts_and_drop - prune dcache, check for submounts and drop
+ * @dentry: dentry to prune and drop
+ *
+ * All done as a single atomic operation relative to has_unlinked_ancestor().
+ *
+ * Returns 0 if @dentry was successfully unhashed, or -EBUSY if there were
+ * submounts.
+ */
+int check_submounts_and_drop(struct dentry *dentry)
+{
+       struct select_data data;
+       int ret;
+
+       /* Negative dentries can be dropped without further checks */
+       if (!dentry->d_inode) {
+               d_drop(dentry);
+               return 0;
+       }
+
+       /*
+        * Each pass starts with a fresh dispose list and counter; repeat
+        * while the walk keeps finding dentries (data.found > 0).
+        */
+       do {
+               INIT_LIST_HEAD(&data.dispose);
+               data.start = dentry;
+               data.found = 0;
+
+               d_walk(dentry, &data, check_and_collect, check_and_drop);
+               ret = data.found;
+
+               if (!list_empty(&data.dispose))
+                       shrink_dentry_list(&data.dispose);
+
+               if (ret > 0)
+                       cond_resched();
+       } while (ret > 0);
+
+       return ret;
+}
+EXPORT_SYMBOL(check_submounts_and_drop);
+
 /**
  * __d_alloc   -       allocate a dcache entry
  * @sb: filesystem it will belong to
@@ -1486,6 +1800,32 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
 
 EXPORT_SYMBOL(d_instantiate_unique);
 
+/**
+ * d_instantiate_no_diralias - instantiate a non-aliased dentry
+ * @entry: dentry to complete
+ * @inode: inode to attach to this dentry
+ *
+ * Attach @inode to @entry.  If @inode is a directory that already has an
+ * alias, fail with -EBUSY instead.  Together with d_materialise_unique() this
+ * guarantees that a directory inode may never have more than one alias.
+ */
+int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
+{
+       int err = -EBUSY;
+
+       BUG_ON(!hlist_unhashed(&entry->d_alias));
+       spin_lock(&inode->i_lock);
+       if (!S_ISDIR(inode->i_mode) || hlist_empty(&inode->i_dentry)) {
+               __d_instantiate(entry, inode);
+               err = 0;
+       }
+       spin_unlock(&inode->i_lock);
+       if (!err)
+               security_d_instantiate(entry, inode);
+       return err;
+}
+EXPORT_SYMBOL(d_instantiate_no_diralias);
+
 struct dentry *d_make_root(struct inode *root_inode)
 {
        struct dentry *res = NULL;
@@ -2510,9 +2850,39 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
        return 0;
 }
 
+/**
+ * prepend_name - prepend "/" and a name in front of the buffer pointer
+ * @buffer: buffer pointer
+ * @buflen: allocated length of the buffer
+ * @name:   name string and length qstr structure
+ *
+ * With RCU path tracing this may race with d_move().  ACCESS_ONCE() makes
+ * sure the name pointer and the length are each fetched once, but they may
+ * not belong to the same name, so the length cannot be trusted: the name is
+ * copied byte-by-byte until the length is reached or a null byte is found.
+ * The caller's sequence number check retries after a d_move(), discarding
+ * any garbage such a mismatch left in the buffer.
+ *
+ * Returns 0, or -ENAMETOOLONG if "/" plus the name does not fit in @buflen.
+ */
 static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 {
-       return prepend(buffer, buflen, name->name, name->len);
+       const char *src = ACCESS_ONCE(name->name);
+       u32 left = ACCESS_ONCE(name->len);
+       char *dst;
+
+       if (*buflen < left + 1)
+               return -ENAMETOOLONG;
+       *buflen -= left + 1;
+       dst = *buffer -= left + 1;
+       *dst++ = '/';
+       for (; left; left--) {
+               char c = *src++;
+               if (c == '\0')
+                       break;
+               *dst++ = c;
+       }
+       return 0;
 }
 
 /**
@@ -2522,7 +2892,15 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  * @buffer: pointer to the end of the buffer
  * @buflen: pointer to buffer length
  *
- * Caller holds the rename_lock.
+ * The function will first try to write out the pathname without taking any
+ * lock other than the RCU read lock to make sure that dentries won't go away.
+ * It only checks the sequence number of the global rename_lock as any change
+ * in the dentry's d_seq will be preceded by changes in the rename_lock
+ * sequence number. If the sequence number had been changed, it will restart
+ * the whole pathname back-tracing sequence again by taking the rename_lock.
+ * In this case, there is no need to take the RCU read lock as the recursive
+ * parent pointer references will keep the dentry chain alive as long as no
+ * rename operation is performed.
  */
 static int prepend_path(const struct path *path,
                        const struct path *root,
@@ -2531,54 +2909,72 @@ static int prepend_path(const struct path *path,
        struct dentry *dentry = path->dentry;
        struct vfsmount *vfsmnt = path->mnt;
        struct mount *mnt = real_mount(vfsmnt);
-       bool slash = false;
        int error = 0;
+       unsigned seq = 0;
+       char *bptr;
+       int blen;
 
+       rcu_read_lock();
+restart:
+       /* A retry must start over from @path, with no stale error. */
+       dentry = path->dentry;
+       vfsmnt = path->mnt;
+       mnt = real_mount(vfsmnt);
+       error = 0;
+       bptr = *buffer;
+       blen = *buflen;
+       read_seqbegin_or_lock(&rename_lock, &seq);
        while (dentry != root->dentry || vfsmnt != root->mnt) {
                struct dentry * parent;
 
                if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
                        /* Global root? */
-                       if (!mnt_has_parent(mnt))
-                               goto global_root;
-                       dentry = mnt->mnt_mountpoint;
-                       mnt = mnt->mnt_parent;
-                       vfsmnt = &mnt->mnt;
-                       continue;
+                       if (mnt_has_parent(mnt)) {
+                               dentry = mnt->mnt_mountpoint;
+                               mnt = mnt->mnt_parent;
+                               vfsmnt = &mnt->mnt;
+                               continue;
+                       }
+                       /*
+                        * Filesystems needing to implement special "root names"
+                        * should do so with ->d_dname()
+                        */
+                       if (IS_ROOT(dentry) &&
+                          (dentry->d_name.len != 1 ||
+                           dentry->d_name.name[0] != '/')) {
+                               WARN(1, "Root dentry has weird name <%.*s>\n",
+                                    (int) dentry->d_name.len,
+                                    dentry->d_name.name);
+                       }
+                       if (!error)
+                               error = is_mounted(vfsmnt) ? 1 : 2;
+                       break;
                }
                parent = dentry->d_parent;
                prefetch(parent);
-               spin_lock(&dentry->d_lock);
-               error = prepend_name(buffer, buflen, &dentry->d_name);
-               spin_unlock(&dentry->d_lock);
-               if (!error)
-                       error = prepend(buffer, buflen, "/", 1);
+               error = prepend_name(&bptr, &blen, &dentry->d_name);
                if (error)
                        break;
 
-               slash = true;
                dentry = parent;
        }
+       if (!(seq & 1))
+               rcu_read_unlock();
+       if (need_seqretry(&rename_lock, seq)) {
+               seq = 1;
+               goto restart;
+       }
+       done_seqretry(&rename_lock, seq);
 
-       if (!error && !slash)
-               error = prepend(buffer, buflen, "/", 1);
-
-       return error;
-
-global_root:
-       /*
-        * Filesystems needing to implement special "root names"
-        * should do so with ->d_dname()
-        */
-       if (IS_ROOT(dentry) &&
-           (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
-               WARN(1, "Root dentry has weird name <%.*s>\n",
-                    (int) dentry->d_name.len, dentry->d_name.name);
-       }
-       if (!slash)
-               error = prepend(buffer, buflen, "/", 1);
-       if (!error)
-               error = is_mounted(vfsmnt) ? 1 : 2;
+       /* Nothing was prepended, so the result is a lone "/". */
+       if (error >= 0 && bptr == *buffer) {
+               if (--blen < 0)
+                       error = -ENAMETOOLONG;
+               else
+                       *--bptr = '/';
+       }
+       *buffer = bptr;
+       *buflen = blen;
        return error;
 }
 
@@ -2607,9 +2997,7 @@ char *__d_path(const struct path *path,
 
        prepend(&res, &buflen, "\0", 1);
        br_read_lock(&vfsmount_lock);
-       write_seqlock(&rename_lock);
        error = prepend_path(path, root, &res, &buflen);
-       write_sequnlock(&rename_lock);
        br_read_unlock(&vfsmount_lock);
 
        if (error < 0)
@@ -2628,9 +3016,7 @@ char *d_absolute_path(const struct path *path,
 
        prepend(&res, &buflen, "\0", 1);
        br_read_lock(&vfsmount_lock);
-       write_seqlock(&rename_lock);
        error = prepend_path(path, &root, &res, &buflen);
-       write_sequnlock(&rename_lock);
        br_read_unlock(&vfsmount_lock);
 
        if (error > 1)
@@ -2662,6 +3048,21 @@ static int prepend_unreachable(char **buffer, int *buflen)
        return prepend(buffer, buflen, "(unreachable)", 13);
 }
 
+/*
+ * Copy fs->root into *root, retrying while fs->seq reports a concurrent
+ * update.  Only the struct is copied; no reference is taken here.
+ */
+static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
+{
+       for (;;) {
+               unsigned seq = read_seqcount_begin(&fs->seq);
+
+               *root = fs->root;
+               if (!read_seqcount_retry(&fs->seq, seq))
+                       break;
+       }
+}
+
 /**
  * d_path - return the path of a dentry
  * @path: path to report
@@ -2694,15 +3090,15 @@ char *d_path(const struct path *path, char *buf, int buflen)
        if (path->dentry->d_op && path->dentry->d_op->d_dname)
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
-       get_fs_root(current->fs, &root);
+       rcu_read_lock();
+       get_fs_root_rcu(current->fs, &root);
        br_read_lock(&vfsmount_lock);
-       write_seqlock(&rename_lock);
        error = path_with_deleted(path, &root, &res, &buflen);
-       write_sequnlock(&rename_lock);
        br_read_unlock(&vfsmount_lock);
+       rcu_read_unlock();
+
        if (error < 0)
                res = ERR_PTR(error);
-       path_put(&root);
        return res;
 }
 EXPORT_SYMBOL(d_path);
@@ -2733,10 +3129,10 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
        char *end = buffer + buflen;
        /* these dentries are never renamed, so d_lock is not needed */
        if (prepend(&end, &buflen, " (deleted)", 11) ||
-           prepend_name(&end, &buflen, &dentry->d_name) ||
+           prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
            prepend(&end, &buflen, "/", 1))  
                end = ERR_PTR(-ENAMETOOLONG);
-       return end;  
+       return end;
 }
 
 /*
@@ -2744,30 +3140,47 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
  */
 static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
 {
-       char *end = buf + buflen;
-       char *retval;
+       struct dentry *start = dentry;
+       char *end, *retval;
+       int len, seq = 0;
+       int error = 0;
 
-       prepend(&end, &buflen, "\0", 1);
-       if (buflen < 1)
+       /* Room is needed for the terminating NUL and the '/' before it. */
+       if (buflen < 2)
                goto Elong;
+
+       rcu_read_lock();
+restart:
+       /* A retry must start over from the original dentry. */
+       dentry = start;
+       error = 0;
+       end = buf + buflen;
+       len = buflen;
+       prepend(&end, &len, "\0", 1);
        /* Get '/' right */
        retval = end-1;
        *retval = '/';
-
+       read_seqbegin_or_lock(&rename_lock, &seq);
        while (!IS_ROOT(dentry)) {
                struct dentry *parent = dentry->d_parent;
-               int error;
 
                prefetch(parent);
-               spin_lock(&dentry->d_lock);
-               error = prepend_name(&end, &buflen, &dentry->d_name);
-               spin_unlock(&dentry->d_lock);
-               if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
-                       goto Elong;
+               error = prepend_name(&end, &len, &dentry->d_name);
+               if (error)
+                       break;
 
                retval = end;
                dentry = parent;
        }
+       if (!(seq & 1))
+               rcu_read_unlock();
+       if (need_seqretry(&rename_lock, seq)) {
+               seq = 1;
+               goto restart;
+       }
+       done_seqretry(&rename_lock, seq);
+       if (error)
+               goto Elong;
        return retval;
 Elong:
        return ERR_PTR(-ENAMETOOLONG);
@@ -2775,13 +3183,8 @@ Elong:
 
+/* Thin wrapper around __dentry_path(); takes no lock of its own. */
 char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
 {
-       char *retval;
-
-       write_seqlock(&rename_lock);
-       retval = __dentry_path(dentry, buf, buflen);
-       write_sequnlock(&rename_lock);
-
-       return retval;
+       return __dentry_path(dentry, buf, buflen);
 }
 EXPORT_SYMBOL(dentry_path_raw);
 
@@ -2790,7 +3192,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
        char *p = NULL;
        char *retval;
 
-       write_seqlock(&rename_lock);
        if (d_unlinked(dentry)) {
                p = buf + buflen;
                if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2798,7 +3199,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
                buflen++;
        }
        retval = __dentry_path(dentry, buf, buflen);
-       write_sequnlock(&rename_lock);
        if (!IS_ERR(retval) && p)
                *p = '/';       /* restore '/' overriden with '\0' */
        return retval;
@@ -2806,6 +3206,23 @@ Elong:
        return ERR_PTR(-ENAMETOOLONG);
 }
 
+/*
+ * Copy fs->root and fs->pwd as one consistent pair, retrying while fs->seq
+ * reports a concurrent update.  No references are taken here.
+ */
+static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
+                                   struct path *pwd)
+{
+       for (;;) {
+               unsigned seq = read_seqcount_begin(&fs->seq);
+
+               *root = fs->root;
+               *pwd = fs->pwd;
+               if (!read_seqcount_retry(&fs->seq, seq))
+                       break;
+       }
+}
+
 /*
  * NOTE! The user-level library version returns a
  * character pointer. The kernel system call just
@@ -2828,25 +3240,25 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 {
        int error;
        struct path pwd, root;
-       char *page = (char *) __get_free_page(GFP_USER);
+       char *page = __getname();
 
        if (!page)
                return -ENOMEM;
 
-       get_fs_root_and_pwd(current->fs, &root, &pwd);
+       rcu_read_lock();
+       get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
 
        error = -ENOENT;
        br_read_lock(&vfsmount_lock);
-       write_seqlock(&rename_lock);
        if (!d_unlinked(pwd.dentry)) {
                unsigned long len;
-               char *cwd = page + PAGE_SIZE;
-               int buflen = PAGE_SIZE;
+               char *cwd = page + PATH_MAX;
+               int buflen = PATH_MAX;
 
                prepend(&cwd, &buflen, "\0", 1);
                error = prepend_path(&pwd, &root, &cwd, &buflen);
-               write_sequnlock(&rename_lock);
                br_read_unlock(&vfsmount_lock);
+               rcu_read_unlock();
 
                if (error < 0)
                        goto out;
@@ -2859,21 +3271,19 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
                }
 
                error = -ERANGE;
-               len = PAGE_SIZE + page - cwd;
+               len = PATH_MAX + page - cwd;
                if (len <= size) {
                        error = len;
                        if (copy_to_user(buf, cwd, len))
                                error = -EFAULT;
                }
        } else {
-               write_sequnlock(&rename_lock);
                br_read_unlock(&vfsmount_lock);
+               rcu_read_unlock();
        }
 
 out:
-       path_put(&pwd);
-       path_put(&root);
-       free_page((unsigned long) page);
+       __putname(page);
        return error;
 }
 
@@ -2919,68 +3329,32 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
        return result;
 }
 
-void d_genocide(struct dentry *root)
+/*
+ * d_walk() callback for d_genocide().  @data is the dentry the walk started
+ * from; it is left untouched.  Any other dentry that is unhashed or has no
+ * inode returns D_WALK_SKIP.  Otherwise DCACHE_GENOCIDE is set and
+ * d_lockref.count is dropped by one, at most once per dentry since the flag
+ * is tested first.
+ */
+static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
 {
-       struct dentry *this_parent;
-       struct list_head *next;
-       unsigned seq;
-       int locked = 0;
-
-       seq = read_seqbegin(&rename_lock);
-again:
-       this_parent = root;
-       spin_lock(&this_parent->d_lock);
-repeat:
-       next = this_parent->d_subdirs.next;
-resume:
-       while (next != &this_parent->d_subdirs) {
-               struct list_head *tmp = next;
-               struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
-               next = tmp->next;
+       struct dentry *root = data;
+       if (dentry != root) {
+               if (d_unhashed(dentry) || !dentry->d_inode)
+                       return D_WALK_SKIP;
 
-               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-               if (d_unhashed(dentry) || !dentry->d_inode) {
-                       spin_unlock(&dentry->d_lock);
-                       continue;
-               }
-               if (!list_empty(&dentry->d_subdirs)) {
-                       spin_unlock(&this_parent->d_lock);
-                       spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
-                       this_parent = dentry;
-                       spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
-                       goto repeat;
-               }
                if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
                        dentry->d_flags |= DCACHE_GENOCIDE;
                        dentry->d_lockref.count--;
                }
-               spin_unlock(&dentry->d_lock);
        }
-       if (this_parent != root) {
-               struct dentry *child = this_parent;
-               if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
-                       this_parent->d_flags |= DCACHE_GENOCIDE;
-                       this_parent->d_lockref.count--;
-               }
-               this_parent = try_to_ascend(this_parent, locked, seq);
-               if (!this_parent)
-                       goto rename_retry;
-               next = child->d_u.d_child.next;
-               goto resume;
-       }
-       spin_unlock(&this_parent->d_lock);
-       if (!locked && read_seqretry(&rename_lock, seq))
-               goto rename_retry;
-       if (locked)
-               write_sequnlock(&rename_lock);
-       return;
+       return D_WALK_CONTINUE;
+}
 
-rename_retry:
-       if (locked)
-               goto again;
-       locked = 1;
-       write_seqlock(&rename_lock);
-       goto again;
+/* Walk @parent with d_genocide_kill(); @parent is also the callback data. */
+void d_genocide(struct dentry *parent)
+{
+       d_walk(parent, parent, d_genocide_kill, NULL);
 }
 
 void d_tmpfile(struct dentry *dentry, struct inode *inode)