]> git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - kernel/cgroup.c
cgroup: don't hold css_set_rwsem across css task iteration
[karo-tx-linux.git] / kernel / cgroup.c
index 56e2b772b143ae411380f6afb21cf8a7f2b523e5..0c5b0e605c474db402938444c7abb62ba1a05398 100644 (file)
@@ -216,6 +216,7 @@ static struct cftype cgroup_legacy_base_files[];
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned long ss_mask);
+static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
                      bool visible);
@@ -593,6 +594,7 @@ struct css_set init_css_set = {
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
+       .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count       = 1;    /* 1 for init_css_set */
@@ -675,8 +677,9 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
  * css_set, @from_cset can be NULL.  If @task is being disassociated
  * instead of moved, @to_cset can be NULL.
  *
- * This function automatically handles populated_cnt updates but the caller
- * is responsible for managing @from_cset and @to_cset's reference counts.
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
  */
 static void css_set_move_task(struct task_struct *task,
                              struct css_set *from_cset, struct css_set *to_cset,
@@ -685,7 +688,22 @@ static void css_set_move_task(struct task_struct *task,
        lockdep_assert_held(&css_set_rwsem);
 
        if (from_cset) {
+               struct css_task_iter *it, *pos;
+
                WARN_ON_ONCE(list_empty(&task->cg_list));
+
+               /*
+                * @task is leaving, advance task iterators which are
+                * pointing to it so that they can resume at the next
+                * position.  Advancing an iterator might remove it from
+                * the list, use safe walk.  See css_task_iter_advance*()
+                * for details.
+                */
+               list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+                                        iters_node)
+                       if (it->task_pos == &task->cg_list)
+                               css_task_iter_advance(it);
+
                list_del_init(&task->cg_list);
                if (!css_set_populated(from_cset))
                        css_set_update_populated(from_cset, false);
@@ -1019,6 +1037,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->mg_preload_node);
        INIT_LIST_HEAD(&cset->mg_node);
+       INIT_LIST_HEAD(&cset->task_iters);
        INIT_HLIST_NODE(&cset->hlist);
 
        /* Copy the set of subsystem state objects generated in
@@ -3804,6 +3823,8 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
        struct cgrp_cset_link *link;
        struct css_set *cset;
 
+       lockdep_assert_held(&css_set_rwsem);
+
        /* Advance to the next non-empty css_set */
        do {
                l = l->next;
@@ -3831,12 +3852,36 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 
        it->tasks_head = &cset->tasks;
        it->mg_tasks_head = &cset->mg_tasks;
+
+       /*
+        * We don't keep css_sets locked across iteration steps and thus
+        * need to take steps to ensure that iteration can be resumed after
+        * the lock is re-acquired.  Iteration is performed at two levels -
+        * css_sets and tasks in them.
+        *
+        * Once created, a css_set never leaves its cgroup lists, so a
+        * pinned css_set is guaranteed to stay put and we can resume
+        * iteration afterwards.
+        *
+        * Tasks may leave @cset across iteration steps.  This is resolved
+        * by registering each iterator with the css_set currently being
+        * walked and making css_set_move_task() advance iterators whose
+        * next task is leaving.
+        */
+       if (it->cur_cset) {
+               list_del(&it->iters_node);
+               put_css_set_locked(it->cur_cset);
+       }
+       get_css_set(cset);
+       it->cur_cset = cset;
+       list_add(&it->iters_node, &cset->task_iters);
 }
 
 static void css_task_iter_advance(struct css_task_iter *it)
 {
        struct list_head *l = it->task_pos;
 
+       lockdep_assert_held(&css_set_rwsem);
        WARN_ON_ONCE(!l);
 
        /*
@@ -3864,19 +3909,16 @@ static void css_task_iter_advance(struct css_task_iter *it)
  * css_task_iter_next() to walk through the tasks until the function
  * returns NULL.  On completion of iteration, css_task_iter_end() must be
  * called.
- *
- * Note that this function acquires a lock which is released when the
- * iteration finishes.  The caller can't sleep while iteration is in
- * progress.
  */
 void css_task_iter_start(struct cgroup_subsys_state *css,
                         struct css_task_iter *it)
-       __acquires(css_set_rwsem)
 {
        /* no one should try to iterate before mounting cgroups */
        WARN_ON_ONCE(!use_task_css_set_links);
 
-       down_read(&css_set_rwsem);
+       memset(it, 0, sizeof(*it));
+
+       down_write(&css_set_rwsem);
 
        it->ss = css->ss;
 
@@ -3888,6 +3930,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
        it->cset_head = it->cset_pos;
 
        css_task_iter_advance_css_set(it);
+
+       up_write(&css_set_rwsem);
 }
 
 /**
@@ -3900,14 +3944,22 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
-       struct task_struct *res;
-
        if (!it->cset_pos)
                return NULL;
 
-       res = list_entry(it->task_pos, struct task_struct, cg_list);
+       if (it->cur_task)
+               put_task_struct(it->cur_task);
+
+       down_write(&css_set_rwsem);
+
+       it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list);
+       get_task_struct(it->cur_task);
+
        css_task_iter_advance(it);
-       return res;
+
+       up_write(&css_set_rwsem);
+
+       return it->cur_task;
 }
 
 /**
@@ -3917,9 +3969,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
  * Finish task iteration started by css_task_iter_start().
  */
 void css_task_iter_end(struct css_task_iter *it)
-       __releases(css_set_rwsem)
 {
-       up_read(&css_set_rwsem);
+       if (it->cur_cset) {
+               down_write(&css_set_rwsem);
+               list_del(&it->iters_node);
+               put_css_set_locked(it->cur_cset);
+               up_write(&css_set_rwsem);
+       }
+
+       if (it->cur_task)
+               put_task_struct(it->cur_task);
 }
 
 /**