[CVE-2009-0029] System call wrappers part 02

[karo-tx-linux.git] / kernel / cgroup.c
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index 4c475ce4e22212dcab53e60f3c9174d8e4b927cd..c29831076e7a2e156e584725453c51568682eb93 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -271,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
  
         rcu_read_lock();
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup *cgrp = cg->subsys[i]->cgroup;
+               struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
                 if (atomic_dec_and_test(&cgrp->count) &&
                     notify_on_release(cgrp)) {
                         if (taskexit)
@@ -594,6 +594,13 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
         return;
  }
  
+/* RCU callback: queued by call_rcu() in cgroup_diput() to free a cgroup. */
+static void free_cgroup_rcu(struct rcu_head *obj)
+{
+       /* @obj is the rcu_head member embedded in the cgroup to release */
+       kfree(container_of(obj, struct cgroup, rcu_head));
+}
+
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
  {
         /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -619,11 +626,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                 cgrp->root->number_of_cgroups--;
                 mutex_unlock(&cgroup_mutex);
  
-               /* Drop the active superblock reference that we took when we
-                * created the cgroup */
+               /*
+                * Drop the active superblock reference that we took when we
+                * created the cgroup
+                */
                 deactivate_super(cgrp->root->sb);
  
-               kfree(cgrp);
+               call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
         }
         iput(inode);
  }
@@ -713,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                         BUG_ON(cgrp->subsys[i]);
                         BUG_ON(!dummytop->subsys[i]);
                         BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
+                       mutex_lock(&ss->hierarchy_mutex);
                         cgrp->subsys[i] = dummytop->subsys[i];
                         cgrp->subsys[i]->cgroup = cgrp;
                         list_move(&ss->sibling, &root->subsys_list);
                         ss->root = root;
                         if (ss->bind)
                                 ss->bind(ss, cgrp);
-
+                       mutex_unlock(&ss->hierarchy_mutex);
                 } else if (bit & removed_bits) {
                         /* We're removing this subsystem */
                         BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                         BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+                       mutex_lock(&ss->hierarchy_mutex);
                         if (ss->bind)
                                 ss->bind(ss, dummytop);
                         dummytop->subsys[i]->cgroup = dummytop;
                         cgrp->subsys[i] = NULL;
                         subsys[i]->root = &rootnode;
                         list_move(&ss->sibling, &rootnode.subsys_list);
+                       mutex_unlock(&ss->hierarchy_mutex);
                 } else if (bit & final_bits) {
                         /* Subsystem state should already exist */
                         BUG_ON(!cgrp->subsys[i]);
@@ -1134,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
   * @buf: the buffer to write the path into
   * @buflen: the length of the buffer
   *
- * Called with cgroup_mutex held. Writes path of cgroup into buf.
- * Returns 0 on success, -errno on error.
+ * Called with cgroup_mutex held or else with an RCU-protected cgroup
+ * reference.  Writes path of cgroup into buf.  Returns 0 on success,
+ * -errno on error.
   */
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
  {
         char *start;
+       struct dentry *dentry = rcu_dereference(cgrp->dentry);
  
-       if (cgrp == dummytop) {
+       if (!dentry || cgrp == dummytop) {
                 /*
                  * Inactive subsystems have no dentry for their root
                  * cgroup
@@ -1154,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
  
         *--start = '\0';
         for (;;) {
-               int len = cgrp->dentry->d_name.len;
+               int len = dentry->d_name.len;
                 if ((start -= len) < buf)
                         return -ENAMETOOLONG;
                 memcpy(start, cgrp->dentry->d_name.name, len);
                 cgrp = cgrp->parent;
                 if (!cgrp)
                         break;
+               dentry = rcu_dereference(cgrp->dentry);
                 if (!cgrp->parent)
                         continue;
                 if (--start < buf)
@@ -1663,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
         if (!error) {
                 dentry->d_fsdata = cgrp;
                 inc_nlink(parent->d_inode);
-               cgrp->dentry = dentry;
+               rcu_assign_pointer(cgrp->dentry, dentry);
                 dget(dentry);
         }
         dput(dentry);
@@ -2007,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
   */
  static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
  {
-       int n = 0;
+       int n = 0, pid;
         struct cgroup_iter it;
         struct task_struct *tsk;
         cgroup_iter_start(cgrp, &it);
         while ((tsk = cgroup_iter_next(cgrp, &it))) {
                 if (unlikely(n == npids))
                         break;
-               pidarray[n++] = task_pid_vnr(tsk);
+               pid = task_pid_vnr(tsk);
+               if (pid > 0)
+                       pidarray[n++] = pid;
         }
         cgroup_iter_end(cgrp, &it);
         return n;
@@ -2316,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
                                struct cgroup *cgrp)
  {
         css->cgroup = cgrp;
-       atomic_set(&css->refcnt, 0);
+       atomic_set(&css->refcnt, 1);
         css->flags = 0;
         if (cgrp == dummytop)
                 set_bit(CSS_ROOT, &css->flags);
@@ -2324,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
         cgrp->subsys[ss->subsys_id] = css;
  }
  
+static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
+{
+       int idx;
+
+       /* Fixed ascending order; idx is also passed as the lockdep subclass */
+       for (idx = 0; idx < CGROUP_SUBSYS_COUNT; idx++) {
+               if (subsys[idx]->root != root)
+                       continue;
+               mutex_lock_nested(&subsys[idx]->hierarchy_mutex, idx);
+       }
+}
+
+/* Unlock the hierarchy_mutex of every subsystem attached to @root. */
+static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
+{
+       int idx;
+
+       for (idx = 0; idx < CGROUP_SUBSYS_COUNT; idx++) {
+               if (subsys[idx]->root == root)
+                       mutex_unlock(&subsys[idx]->hierarchy_mutex);
+       }
+}
+
  /*
   * cgroup_create - create a cgroup
   * @parent: cgroup that will be parent of the new cgroup
@@ -2372,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                 init_cgroup_css(css, ss, cgrp);
         }
  
+       cgroup_lock_hierarchy(root);
         list_add(&cgrp->sibling, &cgrp->parent->children);
+       cgroup_unlock_hierarchy(root);
         root->number_of_cgroups++;
  
         err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2423,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
  {
         /* Check the reference count on each subsystem. Since we
          * already established that there are no tasks in the
-        * cgroup, if the css refcount is also 0, then there should
+        * cgroup, if the css refcount is also 1, then there should
          * be no outstanding references, so the subsystem is safe to
          * destroy. We scan across all subsystems rather than using
          * the per-hierarchy linked list of mounted subsystems since
@@ -2444,12 +2486,62 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
                  * matter, since it can only happen if the cgroup
                  * has been deleted and hence no longer needs the
                  * release agent to be called anyway. */
-               if (css && atomic_read(&css->refcnt))
+               if (css && (atomic_read(&css->refcnt) > 1))
                         return 1;
         }
         return 0;
  }
  
+/*
+ * Atomically mark all (or else none) of the cgroup's CSS objects as
+ * CSS_REMOVED. Returns true on success, or false if any CSS is busy
+ * (refcnt > 1). Runs with IRQs off; call with cgroup_mutex held.
+ */
+
+static int cgroup_clear_css_refs(struct cgroup *cgrp)
+{
+       struct cgroup_subsys *ss;
+       unsigned long flags;
+       bool failed = false;
+       local_irq_save(flags);
+       for_each_subsys(cgrp->root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+               int refcnt;
+               do {
+                       /* Only a CSS whose refcnt is exactly 1 can be removed */
+                       refcnt = atomic_read(&css->refcnt);
+                       if (refcnt > 1) {
+                               failed = true;
+                               goto done;
+                       }
+                       BUG_ON(!refcnt);
+                       /*
+                        * Drop the refcnt to 0 while we check other
+                        * subsystems. This will cause any racing
+                        * css_tryget() to spin until we set the
+                        * CSS_REMOVED bits or abort
+                        */
+               } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+       }
+ done:
+       for_each_subsys(cgrp->root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+               if (failed) {
+                       /*
+                        * Abort: put back the refcnt of every CSS that
+                        * the first pass dropped from 1 to 0
+                        */
+                       if (!atomic_read(&css->refcnt))
+                               atomic_set(&css->refcnt, 1);
+               } else {
+                       /* Commit the fact that the CSS is removed */
+                       set_bit(CSS_REMOVED, &css->flags);
+               }
+       }
+       local_irq_restore(flags);
+       return !failed;
+}
+
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
  {
         struct cgroup *cgrp = dentry->d_fsdata;
@@ -2480,7 +2572,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
  
         if (atomic_read(&cgrp->count)
             || !list_empty(&cgrp->children)
-           || cgroup_has_css_refs(cgrp)) {
+           || !cgroup_clear_css_refs(cgrp)) {
                 mutex_unlock(&cgroup_mutex);
                 return -EBUSY;
         }
@@ -2490,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
         if (!list_empty(&cgrp->release_list))
                 list_del(&cgrp->release_list);
         spin_unlock(&release_list_lock);
-       /* delete my sibling from parent->children */
+
+       cgroup_lock_hierarchy(cgrp->root);
+       /* delete this cgroup from parent->children */
         list_del(&cgrp->sibling);
+       cgroup_unlock_hierarchy(cgrp->root);
+
         spin_lock(&cgrp->dentry->d_lock);
         d = dget(cgrp->dentry);
         spin_unlock(&d->d_lock);
@@ -2533,6 +2629,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
          * need to invoke fork callbacks here. */
         BUG_ON(!list_empty(&init_task.tasks));
  
+       mutex_init(&ss->hierarchy_mutex);
         ss->active = 1;
  }
  
@@ -3031,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
  {
         struct cgroup *cgrp = css->cgroup;
         rcu_read_lock();
-       if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
+       if ((atomic_dec_return(&css->refcnt) == 1) &&
+           notify_on_release(cgrp)) {
                 set_bit(CGRP_RELEASABLE, &cgrp->flags);
                 check_for_release(cgrp);
         }