vfs: Lazily remove mounts on unlinked files and directories. v2

[karo-tx-linux.git] / fs / namei.c
diff --git a/fs/namei.c b/fs/namei.c

index 7720fbd5277bb8830a336cb08dc39fd0634c199f..a12c1d31d4c8ab87231be38387dd3165b447ddb4 100644 (file)
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -508,56 +508,78 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
  {
         struct fs_struct *fs = current->fs;
         struct dentry *parent = nd->path.dentry;
-       int want_root = 0;
  
         BUG_ON(!(nd->flags & LOOKUP_RCU));
-       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-               want_root = 1;
-               spin_lock(&fs->lock);
-               if (nd->root.mnt != fs->root.mnt ||
-                               nd->root.dentry != fs->root.dentry)
-                       goto err_root;
-       }
-       spin_lock(&parent->d_lock);
+
+       /*
+        * Get a reference to the parent first: we're
+        * going to make "path_put(nd->path)" valid in
+        * non-RCU context for "terminate_walk()".
+        *
+        * If this doesn't work, return immediately with
+        * RCU walking still active (and then we will do
+        * the RCU walk cleanup in terminate_walk()).
+        */
+       if (!lockref_get_not_dead(&parent->d_lockref))
+               return -ECHILD;
+
+       /*
+        * After the mntget(), terminate_walk() will do
+        * the right thing for non-RCU mode, and all our
+        * subsequent exit cases should unlock_rcu_walk()
+        * before returning.
+        */
+       mntget(nd->path.mnt);
+       nd->flags &= ~LOOKUP_RCU;
+
+       /*
+        * For a negative lookup, the lookup sequence point is the parent's
+        * sequence point, and it only needs to revalidate the parent dentry.
+        *
+        * For a positive lookup, we need to move both the parent and the
+        * dentry from the RCU domain to be properly refcounted. And the
+        * sequence number in the dentry validates *both* dentry counters,
+        * since we checked the sequence number of the parent after we got
+        * the child sequence number. So we know the parent must still
+        * be valid if the child sequence number is still valid.
+        */
         if (!dentry) {
-               if (!__d_rcu_to_refcount(parent, nd->seq))
-                       goto err_parent;
+               if (read_seqcount_retry(&parent->d_seq, nd->seq))
+                       goto out;
                 BUG_ON(nd->inode != parent->d_inode);
         } else {
-               if (dentry->d_parent != parent)
-                       goto err_parent;
-               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-               if (!__d_rcu_to_refcount(dentry, nd->seq))
-                       goto err_child;
-               /*
-                * If the sequence check on the child dentry passed, then
-                * the child has not been removed from its parent. This
-                * means the parent dentry must be valid and able to take
-                * a reference at this point.
-                */
-               BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
-               BUG_ON(!parent->d_lockref.count);
-               parent->d_lockref.count++;
-               spin_unlock(&dentry->d_lock);
+               if (!lockref_get_not_dead(&dentry->d_lockref))
+                       goto out;
+               if (read_seqcount_retry(&dentry->d_seq, nd->seq))
+                       goto drop_dentry;
         }
-       spin_unlock(&parent->d_lock);
-       if (want_root) {
+
+       /*
+        * Sequence counts matched. Now make sure that the root is
+        * still valid and get it if required.
+        */
+       if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+               spin_lock(&fs->lock);
+               if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
+                       goto unlock_and_drop_dentry;
                 path_get(&nd->root);
                 spin_unlock(&fs->lock);
         }
-       mntget(nd->path.mnt);
  
         unlock_rcu_walk();
-       nd->flags &= ~LOOKUP_RCU;
         return 0;
  
-err_child:
-       spin_unlock(&dentry->d_lock);
-err_parent:
-       spin_unlock(&parent->d_lock);
-err_root:
-       if (want_root)
-               spin_unlock(&fs->lock);
+unlock_and_drop_dentry:
+       spin_unlock(&fs->lock);
+drop_dentry:
+       unlock_rcu_walk();
+       dput(dentry);
+       goto drop_root_mnt;
+out:
+       unlock_rcu_walk();
+drop_root_mnt:
+       if (!(nd->flags & LOOKUP_ROOT))
+               nd->root.mnt = NULL;
         return -ECHILD;
  }
  
@@ -585,14 +607,16 @@ static int complete_walk(struct nameidata *nd)
                 nd->flags &= ~LOOKUP_RCU;
                 if (!(nd->flags & LOOKUP_ROOT))
                         nd->root.mnt = NULL;
-               spin_lock(&dentry->d_lock);
-               if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
-                       spin_unlock(&dentry->d_lock);
+
+               if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
+                       unlock_rcu_walk();
+                       return -ECHILD;
+               }
+               if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
                         unlock_rcu_walk();
+                       dput(dentry);
                         return -ECHILD;
                 }
-               BUG_ON(nd->inode != dentry->d_inode);
-               spin_unlock(&dentry->d_lock);
                 mntget(nd->path.mnt);
                 unlock_rcu_walk();
         }
@@ -636,29 +660,6 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
         }
  }
  
-static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
-{
-       int ret;
-
-       if (IS_ERR(link))
-               goto fail;
-
-       if (*link == '/') {
-               set_root(nd);
-               path_put(&nd->path);
-               nd->path = nd->root;
-               path_get(&nd->root);
-               nd->flags |= LOOKUP_JUMPED;
-       }
-       nd->inode = nd->path.dentry->d_inode;
-
-       ret = link_path_walk(link, nd);
-       return ret;
-fail:
-       path_put(&nd->path);
-       return PTR_ERR(link);
-}
-
  static void path_put_conditional(struct path *path, struct nameidata *nd)
  {
         dput(path->dentry);
@@ -850,7 +851,20 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
         error = 0;
         s = nd_get_link(nd);
         if (s) {
-               error = __vfs_follow_link(nd, s);
+               if (unlikely(IS_ERR(s))) {
+                       path_put(&nd->path);
+                       put_link(nd, link, *p);
+                       return PTR_ERR(s);
+               }
+               if (*s == '/') {
+                       set_root(nd);
+                       path_put(&nd->path);
+                       nd->path = nd->root;
+                       path_get(&nd->root);
+                       nd->flags |= LOOKUP_JUMPED;
+               }
+               nd->inode = nd->path.dentry->d_inode;
+               error = link_path_walk(s, nd);
                 if (unlikely(error))
                         put_link(nd, link, *p);
         }
@@ -2184,6 +2198,197 @@ user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
         return s;
  }
  
+/**
+ * mountpoint_last - look up last component for umount
+ * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
+ * @path: pointer to container for result
+ *
+ * This is a special lookup_last function just for umount. In this case, we
+ * need to resolve the path without doing any revalidation.
+ *
+ * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
+ * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
+ * in almost all cases, this lookup will be served out of the dcache. The only
+ * cases where it won't are if nd->last refers to a symlink or the path is
+ * bogus and it doesn't exist.
+ *
+ * Returns:
+ * -error: if there was an error during lookup. This includes -ENOENT if the
+ *         lookup found a negative dentry. The nd->path reference will also be
+ *         put in this case.
+ *
+ * 0:      if we successfully resolved nd->path and found it not to be a
+ *         symlink that needs to be followed. "path" will also be populated.
+ *         The nd->path reference will also be put.
+ *
+ * 1:      if we successfully resolved nd->last and found it to be a symlink
+ *         that needs to be followed. "path" will be populated with the path
+ *         to the link, and nd->path will *not* be put.
+ */
+static int
+mountpoint_last(struct nameidata *nd, struct path *path)
+{
+       int error = 0;
+       struct dentry *dentry;
+       struct dentry *dir = nd->path.dentry;
+
+       /* If we're in rcuwalk, drop out of it to handle last component */
+       if (nd->flags & LOOKUP_RCU) {
+               if (unlazy_walk(nd, NULL)) {
+                       error = -ECHILD;
+                       goto out;
+               }
+       }
+
+       nd->flags &= ~LOOKUP_PARENT;
+
+       if (unlikely(nd->last_type != LAST_NORM)) {
+               error = handle_dots(nd, nd->last_type);
+               if (error)
+                       goto out;
+               dentry = dget(nd->path.dentry);
+               goto done;
+       }
+
+       mutex_lock(&dir->d_inode->i_mutex);
+       dentry = d_lookup(dir, &nd->last);
+       if (!dentry) {
+               /*
+                * No cached dentry. Mounted dentries are pinned in the cache,
+                * so that means that this dentry is probably a symlink or the
+                * path doesn't actually point to a mounted dentry.
+                */
+               dentry = d_alloc(dir, &nd->last);
+               if (!dentry) {
+                       error = -ENOMEM;
+                       mutex_unlock(&dir->d_inode->i_mutex);
+                       goto out;
+               }
+               dentry = lookup_real(dir->d_inode, dentry, nd->flags);
+               error = PTR_ERR(dentry);
+               if (IS_ERR(dentry)) {
+                       mutex_unlock(&dir->d_inode->i_mutex);
+                       goto out;
+               }
+       }
+       mutex_unlock(&dir->d_inode->i_mutex);
+
+done:
+       if (!dentry->d_inode) {
+               error = -ENOENT;
+               dput(dentry);
+               goto out;
+       }
+       path->dentry = dentry;
+       path->mnt = mntget(nd->path.mnt);
+       if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW))
+               return 1;
+       follow_mount(path);
+       error = 0;
+out:
+       terminate_walk(nd);
+       return error;
+}
+
+/**
+ * path_mountpoint - look up a path to be umounted
+ * @dfd:       directory file descriptor to start walk from
+ * @name:      full pathname to walk
+ * @flags:     lookup flags
+ *
+ * Look up the given name, but don't attempt to revalidate the last component.
+ * Returns 0 and "path" will be valid on success; Returns error otherwise.
+ */
+static int
+path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
+{
+       struct file *base = NULL;
+       struct nameidata nd;
+       int err;
+
+       err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
+       if (unlikely(err))
+               return err;
+
+       current->total_link_count = 0;
+       err = link_path_walk(name, &nd);
+       if (err)
+               goto out;
+
+       err = mountpoint_last(&nd, path);
+       while (err > 0) {
+               void *cookie;
+               struct path link = *path;
+               err = may_follow_link(&link, &nd);
+               if (unlikely(err))
+                       break;
+               nd.flags |= LOOKUP_PARENT;
+               err = follow_link(&link, &nd, &cookie);
+               if (err)
+                       break;
+               err = mountpoint_last(&nd, path);
+               put_link(&nd, &link, cookie);
+       }
+out:
+       if (base)
+               fput(base);
+
+       if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
+               path_put(&nd.root);
+
+       return err;
+}
+
+static int
+filename_mountpoint(int dfd, struct filename *s, struct path *path,
+                       unsigned int flags)
+{
+       int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
+       if (unlikely(error == -ECHILD))
+               error = path_mountpoint(dfd, s->name, path, flags);
+       if (unlikely(error == -ESTALE))
+               error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
+       if (likely(!error))
+               audit_inode(s, path->dentry, 0);
+       return error;
+}
+
+/**
+ * user_path_mountpoint_at - lookup a path from userland in order to umount it
+ * @dfd:       directory file descriptor
+ * @name:      pathname from userland
+ * @flags:     lookup flags
+ * @path:      pointer to container to hold result
+ *
+ * A umount is a special case for path walking. We're not actually interested
+ * in the inode in this situation, and ESTALE errors can be a problem. We
+ * simply want to track down the dentry and vfsmount attached at the mountpoint
+ * and avoid revalidating the last component.
+ *
+ * Returns 0 and populates "path" on success.
+ */
+int
+user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
+                       struct path *path)
+{
+       struct filename *s = getname(name);
+       int error;
+       if (IS_ERR(s))
+               return PTR_ERR(s);
+       error = filename_mountpoint(dfd, s, path, flags);
+       putname(s);
+       return error;
+}
+
+int
+kern_path_mountpoint(int dfd, const char *name, struct path *path,
+                       unsigned int flags)
+{
+       struct filename s = {.name = name};
+       return filename_mountpoint(dfd, &s, path, flags);
+}
+EXPORT_SYMBOL(kern_path_mountpoint);
+
  /*
   * It's inline, so penalty for filesystems that don't use sticky bit is
   * minimal.
@@ -2451,6 +2656,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
         int acc_mode;
         int create_error = 0;
         struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
+       bool excl;
  
         BUG_ON(dentry->d_inode);
  
@@ -2464,10 +2670,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
         if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
                 mode &= ~current_umask();
  
-       if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {
+       excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
+       if (excl)
                 open_flag &= ~O_TRUNC;
-               *opened |= FILE_CREATED;
-       }
  
         /*
          * Checking write permission is tricky, bacuse we don't know if we are
@@ -2520,12 +2725,6 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                 goto out;
         }
  
-       acc_mode = op->acc_mode;
-       if (*opened & FILE_CREATED) {
-               fsnotify_create(dir, dentry);
-               acc_mode = MAY_OPEN;
-       }
-
         if (error) {    /* returned 1, that is */
                 if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
                         error = -EIO;
@@ -2535,9 +2734,19 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                         dput(dentry);
                         dentry = file->f_path.dentry;
                 }
-               if (create_error && dentry->d_inode == NULL) {
-                       error = create_error;
-                       goto out;
+               if (*opened & FILE_CREATED)
+                       fsnotify_create(dir, dentry);
+               if (!dentry->d_inode) {
+                       WARN_ON(*opened & FILE_CREATED);
+                       if (create_error) {
+                               error = create_error;
+                               goto out;
+                       }
+               } else {
+                       if (excl && !(*opened & FILE_CREATED)) {
+                               error = -EEXIST;
+                               goto out;
+                       }
                 }
                 goto looked_up;
         }
@@ -2546,6 +2755,12 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
          * We didn't have the inode before the open, so check open permission
          * here.
          */
+       acc_mode = op->acc_mode;
+       if (*opened & FILE_CREATED) {
+               WARN_ON(!(open_flag & O_CREAT));
+               fsnotify_create(dir, dentry);
+               acc_mode = MAY_OPEN;
+       }
         error = may_open(&file->f_path, acc_mode, open_flag);
         if (error)
                 fput(file);
@@ -3332,6 +3547,20 @@ void dentry_unhash(struct dentry *dentry)
         spin_unlock(&dentry->d_lock);
  }
  
+static bool covered(struct vfsmount *mnt, struct dentry *dentry)
+{
+       /* test to see if a dentry is covered with a mount in
+        * the current mount namespace.
+        */
+       bool is_covered;
+
+       rcu_read_lock();
+       is_covered = d_mountpoint(dentry) && __lookup_mnt(mnt, dentry, 1);
+       rcu_read_unlock();
+
+       return is_covered;
+}
+
  int vfs_rmdir(struct inode *dir, struct dentry *dentry)
  {
         int error = may_delete(dir, dentry, 1);
@@ -3345,10 +3574,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
         dget(dentry);
         mutex_lock(&dentry->d_inode->i_mutex);
  
-       error = -EBUSY;
-       if (d_mountpoint(dentry))
-               goto out;
-
         error = security_inode_rmdir(dir, dentry);
         if (error)
                 goto out;
@@ -3360,6 +3585,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
  
         dentry->d_inode->i_flags |= S_DEAD;
         dont_mount(dentry);
+       detach_mounts(dentry);
  
  out:
         mutex_unlock(&dentry->d_inode->i_mutex);
@@ -3407,6 +3633,9 @@ retry:
                 error = -ENOENT;
                 goto exit3;
         }
+       error = -EBUSY;
+       if (covered(nd.path.mnt, dentry))
+               goto exit3;
         error = security_path_rmdir(&nd.path, dentry);
         if (error)
                 goto exit3;
@@ -3442,14 +3671,12 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
                 return -EPERM;
  
         mutex_lock(&dentry->d_inode->i_mutex);
-       if (d_mountpoint(dentry))
-               error = -EBUSY;
-       else {
-               error = security_inode_unlink(dir, dentry);
+       error = security_inode_unlink(dir, dentry);
+       if (!error) {
+               error = dir->i_op->unlink(dir, dentry);
                 if (!error) {
-                       error = dir->i_op->unlink(dir, dentry);
-                       if (!error)
-                               dont_mount(dentry);
+                       dont_mount(dentry);
+                       detach_mounts(dentry);
                 }
         }
         mutex_unlock(&dentry->d_inode->i_mutex);
@@ -3501,6 +3728,9 @@ retry:
                 inode = dentry->d_inode;
                 if (!inode)
                         goto slashes;
+               error = -EBUSY;
+               if (covered(nd.path.mnt, dentry))
+                       goto exit2;
                 ihold(inode);
                 error = security_path_unlink(&nd.path, dentry);
                 if (error)
@@ -3773,10 +4003,6 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
         if (target)
                 mutex_lock(&target->i_mutex);
  
-       error = -EBUSY;
-       if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
-               goto out;
-
         error = -EMLINK;
         if (max_links && !target && new_dir != old_dir &&
             new_dir->i_nlink >= max_links)
@@ -3791,6 +4017,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
         if (target) {
                 target->i_flags |= S_DEAD;
                 dont_mount(new_dentry);
+               detach_mounts(new_dentry);
         }
  out:
         if (target)
@@ -3816,16 +4043,14 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
         if (target)
                 mutex_lock(&target->i_mutex);
  
-       error = -EBUSY;
-       if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-               goto out;
-
         error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
         if (error)
                 goto out;
  
-       if (target)
+       if (target) {
                 dont_mount(new_dentry);
+               detach_mounts(new_dentry);
+       }
         if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
                 d_move(old_dentry, new_dentry);
  out:
@@ -3949,6 +4174,11 @@ retry:
         error = -ENOTEMPTY;
         if (new_dentry == trap)
                 goto exit5;
+       error = -EBUSY;
+       if (covered(oldnd.path.mnt, old_dentry))
+               goto exit5;
+       if (covered(newnd.path.mnt, new_dentry))
+               goto exit5;
  
         error = security_path_rename(&oldnd.path, old_dentry,
                                      &newnd.path, new_dentry);
@@ -4024,11 +4254,6 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
         return res;
  }
  
-int vfs_follow_link(struct nameidata *nd, const char *link)
-{
-       return __vfs_follow_link(nd, link);
-}
-
  /* get the link contents into pagecache */
  static char *page_getlink(struct dentry * dentry, struct page **ppage)
  {
@@ -4140,7 +4365,6 @@ EXPORT_SYMBOL(vfs_path_lookup);
  EXPORT_SYMBOL(inode_permission);
  EXPORT_SYMBOL(unlock_rename);
  EXPORT_SYMBOL(vfs_create);
-EXPORT_SYMBOL(vfs_follow_link);
  EXPORT_SYMBOL(vfs_link);
  EXPORT_SYMBOL(vfs_mkdir);
  EXPORT_SYMBOL(vfs_mknod);