From c9592d3bb64e8e595459bffbc2425415c6f4b5e3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 17 Nov 2011 10:41:40 +1100
Subject: [PATCH] cgroup: Fix task counter common ancestor logic

The task counter subsystem was written assuming that the
can_attach_task/attach_task/cancel_attach_task calls are serialized per
task. This is true when we attach a single task, but not when we attach
a whole thread group, in which case the sequence is:

	for each thread
		if (can_attach_task() < 0)
			goto rollback

	for each thread
		attach_task()

	rollback:
	for each thread
		cancel_attach_task()

The common ancestor used by task_counter_attach_task() can thus change
between these calls for a given task. This breaks when the tasks in the
thread group do not all originate from the same cgroup: the uncharge
made in attach_task() or the rollback made in cancel_attach_task() then
propagates to the wrong counters.

This can even break seriously in some scenarios. For example, with $PID
being the pid of a multithreaded process:

	mkdir /dev/cgroup/cgroup0
	echo $PID > /dev/cgroup/cgroup0/cgroup.procs
	echo $PID > /dev/cgroup/tasks
	echo $PID > /dev/cgroup/cgroup0/cgroup.procs

On the last move, attach_task() is called on the thread leader with the
wrong common_ancestor, leading to a crash because we uncharge a
res_counter that doesn't exist:

[ 149.805063] BUG: unable to handle kernel NULL pointer dereference at 0000000000000040
[ 149.806013] IP: [] __lock_acquire+0x62/0x15d0
[ 149.806013] PGD 51d38067 PUD 5119e067 PMD 0
[ 149.806013] Oops: 0000 [#1] PREEMPT SMP
[ 149.806013] Dumping ftrace buffer:
[ 149.806013]    (ftrace buffer empty)
[ 149.806013] CPU 3
[ 149.806013] Modules linked in:
[ 149.806013]
[ 149.806013] Pid: 1111, comm: spread_thread_g Not tainted 3.1.0-rc3+ #165 FUJITSU SIEMENS AMD690VM-FMH/AMD690VM-FMH
[ 149.806013] RIP: 0010:[] [] __lock_acquire+0x62/0x15d0
[ 149.806013] RSP: 0018:ffff880051479b38 EFLAGS: 00010046
[ 149.806013] RAX: 0000000000000046 RBX: 0000000000000040 RCX: 0000000000000000
[ 149.868002] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000040
[ 149.868002] RBP: ffff880051479c08 R08: 0000000000000002 R09: 0000000000000001
[ 149.868002] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002
[ 149.868002] R13: 0000000000000000 R14: 0000000000000000 R15: ffff880051f120a0
[ 149.868002] FS: 00007f1e01dd7700(0000) GS:ffff880057d80000(0000) knlGS:0000000000000000
[ 149.868002] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 149.868002] CR2: 0000000000000040 CR3: 0000000051c59000 CR4: 00000000000006e0
[ 149.868002] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 149.868002] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 149.868002] Process spread_thread_g (pid: 1111, threadinfo ffff880051478000, task ffff880051f120a0)
[ 149.868002] Stack:
[ 149.868002]  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 149.868002]  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 149.868002]  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 149.868002] Call Trace:
[ 149.868002]  [] lock_acquire+0xa2/0x1a0
[ 149.868002]  [] ? res_counter_uncharge_until+0x4c/0xb0
[ 149.868002]  [] _raw_spin_lock+0x3b/0x50
[ 149.868002]  [] ? res_counter_uncharge_until+0x4c/0xb0
[ 149.868002]  [] res_counter_uncharge_until+0x4c/0xb0
[ 149.868002]  [] task_counter_attach_task+0x44/0x50
[ 149.868002]  [] cgroup_attach_proc+0x5ad/0x720
[ 149.868002]  [] ? cgroup_attach_proc+0x79/0x720
[ 149.868002]  [] attach_task_by_pid+0x8f/0x220
[ 149.868002]  [] ? attach_task_by_pid+0xf0/0x220
[ 149.868002]  [] ? attach_task_by_pid+0xf0/0x220
[ 149.868002]  [] cgroup_procs_write+0x28/0x40
[ 149.868002]  [] cgroup_file_write+0x209/0x2f0
[ 149.868002]  [] ? apparmor_file_permission+0x18/0x20
[ 149.868002]  [] ? security_file_permission+0x23/0x90
[ 149.868002]  [] vfs_write+0xc8/0x190
[ 149.868002]  [] sys_write+0x51/0x90
[ 149.868002]  [] system_call_fastpath+0x16/0x1b

To solve this, cache the original cgroup of each thread of the thread
group in the flex array and pass it to can_attach_task(), attach_task()
and cancel_attach_task(), so that the correct common ancestor between
the old and new cgroup can be safely retrieved for each task.

This is inspired by a previous patch from Li Zefan:
"[PATCH] cgroups: don't cache common ancestor in task counter subsys".

Reported-by: Ben Blum
Reported-by: Li Zefan
Signed-off-by: Frederic Weisbecker
Cc: Paul Menage
Cc: Tim Hockin
Cc: Tejun Heo
Cc: KAMEZAWA Hiroyuki
Signed-off-by: Andrew Morton
---
 Documentation/cgroups/cgroups.txt |  3 +-
 include/linux/cgroup.h            |  2 +-
 kernel/cgroup.c                   | 48 +++++++++++++++++++------------
 kernel/cgroup_task_counter.c      | 26 ++++++++++-------
 4 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 3fa646f6c6d6..7df0e5b6f4cd 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -623,7 +623,8 @@ function, so that the subsystem can implement a rollback. If not, not
 necessary. This will be called only about subsystems whose can_attach()
 operation have succeeded.
 
-void cancel_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+void cancel_attach_task(struct cgroup *cgrp, struct cgroup *old_cgrp,
+			struct task_struct *tsk)
 (cgroup_mutex held by caller)
 
 As cancel_attach, but for operations that must be cancelled once per
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ddc13eb76b09..8dada1dbc73e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -475,7 +475,7 @@ struct cgroup_subsys {
 			  struct task_struct *tsk);
 	void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			      struct task_struct *tsk);
-	void (*cancel_attach_task)(struct cgroup *cgrp,
+	void (*cancel_attach_task)(struct cgroup *cgrp, struct cgroup *old_cgrp,
 				   struct task_struct *tsk);
 	void (*pre_attach)(struct cgroup *cgrp);
 	void (*attach_task)(struct cgroup *cgrp, struct cgroup *old_cgrp,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 893fc3d5c876..19a4faff2331 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1885,7 +1885,7 @@ out:
 			break;
 
 		if (ss->cancel_attach_task)
-			ss->cancel_attach_task(cgrp, tsk);
+			ss->cancel_attach_task(cgrp, oldcgrp, tsk);
 		if (ss->cancel_attach)
 			ss->cancel_attach(ss, cgrp, tsk);
 	}
@@ -1983,6 +1983,11 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
 	return 0;
 }
 
+struct task_cgroup {
+	struct task_struct *tsk;
+	struct cgroup *oldcgrp;
+};
+
 /**
  * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
  * @cgrp: the cgroup to attach to
@@ -2003,6 +2008,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	/* threadgroup list cursor and array */
 	struct task_struct *tsk;
 	struct flex_array *group;
+	struct task_cgroup *tc;
 	/*
 	 * we need to make sure we have css_sets for all the tasks we're
 	 * going to move -before- we actually start moving them, so that in
@@ -2020,7 +2026,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	group_size = get_nr_threads(leader);
 	/* flex_array supports very large thread-groups better than kmalloc. */
-	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+	group = flex_array_alloc(sizeof(struct task_cgroup), group_size,
 				 GFP_KERNEL);
 	if (!group)
 		return -ENOMEM;
@@ -2047,14 +2053,18 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	tsk = leader;
 	i = 0;
 	do {
+		struct task_cgroup tsk_cgrp;
+
 		/* as per above, nr_threads may decrease, but not increase. */
 		BUG_ON(i >= group_size);
 		get_task_struct(tsk);
+		tsk_cgrp.tsk = tsk;
+		tsk_cgrp.oldcgrp = task_cgroup_from_root(tsk, root);
 		/*
 		 * saying GFP_ATOMIC has no effect here because we did prealloc
 		 * earlier, but it's good form to communicate our expectations.
 		 */
-		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+		retval = flex_array_put(group, i, &tsk_cgrp, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
 	} while_each_thread(leader, tsk);
@@ -2077,14 +2087,13 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		if (ss->can_attach_task) {
 			/* run on each task in the threadgroup. */
 			for (i = 0; i < group_size; i++) {
-				tsk = flex_array_get_ptr(group, i);
-				oldcgrp = task_cgroup_from_root(tsk, root);
-
+				tc = flex_array_get(group, i);
 				retval = ss->can_attach_task(cgrp,
-							     oldcgrp, tsk);
+							     tc->oldcgrp,
+							     tc->tsk);
 				if (retval) {
 					failed_ss = ss;
-					failed_task = tsk;
+					failed_task = tc->tsk;
 					goto out_cancel_attach;
 				}
 			}
@@ -2097,10 +2106,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	INIT_LIST_HEAD(&newcg_list);
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
+		tc = flex_array_get(group, i);
+		tsk = tc->tsk;
 		/* nothing to do if this task is already in the cgroup */
-		oldcgrp = task_cgroup_from_root(tsk, root);
-		if (cgrp == oldcgrp)
+		if (cgrp == tc->oldcgrp)
 			continue;
 		/* get old css_set pointer */
 		task_lock(tsk);
@@ -2136,9 +2145,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 			ss->pre_attach(cgrp);
 	}
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
+		tc = flex_array_get(group, i);
+		tsk = tc->tsk;
+		oldcgrp = tc->oldcgrp;
 		/* leave current thread as it is if it's already there */
-		oldcgrp = task_cgroup_from_root(tsk, root);
 		if (cgrp == oldcgrp)
 			continue;
 		/* if the thread is PF_EXITING, it can just get skipped. */
@@ -2151,7 +2161,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 			}
 		} else if (retval == -ESRCH) {
 			if (ss->cancel_attach_task)
-				ss->cancel_attach_task(cgrp, tsk);
+				ss->cancel_attach_task(cgrp, oldcgrp, tsk);
 		} else {
 			BUG_ON(1);
 		}
@@ -2188,10 +2198,10 @@ out_cancel_attach:
 			if (ss->cancel_attach_task &&
 			    (ss != failed_ss || failed_task)) {
 				for (i = 0; i < group_size; i++) {
-					tsk = flex_array_get_ptr(group, i);
-					if (tsk == failed_task)
+					tc = flex_array_get(group, i);
+					if (tc->tsk == failed_task)
 						break;
-					ss->cancel_attach_task(cgrp, tsk);
+					ss->cancel_attach_task(cgrp, tc->oldcgrp, tc->tsk);
 				}
 			}
 
@@ -2206,8 +2216,8 @@ out_cancel_attach:
 	}
 	/* clean up the array of referenced threads in the group. */
 	for (i = 0; i < group_size; i++) {
-		tsk = flex_array_get_ptr(group, i);
-		put_task_struct(tsk);
+		tc = flex_array_get(group, i);
+		put_task_struct(tc->tsk);
 	}
 out_free_group_list:
 	flex_array_free(group);
diff --git a/kernel/cgroup_task_counter.c b/kernel/cgroup_task_counter.c
index d3b2a8290fef..c04340d0ef98 100644
--- a/kernel/cgroup_task_counter.c
+++ b/kernel/cgroup_task_counter.c
@@ -94,12 +94,6 @@ static void task_counter_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	res_counter_uncharge(cgroup_task_res_counter(old_cgrp), 1);
 }
 
-/*
- * Protected amongst can_attach_task/attach_task/cancel_attach_task by
- * cgroup mutex
- */
-static struct res_counter *common_ancestor;
-
 /*
  * This does more than just probing the ability to attach to the dest cgroup.
  * We can not just _check_ if we can attach to the destination and do the real
@@ -111,9 +105,10 @@ static int task_counter_can_attach_task(struct cgroup *cgrp,
 					struct cgroup *old_cgrp,
 					struct task_struct *tsk)
 {
+	int err;
+	struct res_counter *common_ancestor;
 	struct res_counter *res = cgroup_task_res_counter(cgrp);
 	struct res_counter *old_res = cgroup_task_res_counter(old_cgrp);
-	int err;
 
 	/*
 	 * When moving a task from a cgroup to another, we don't want
@@ -138,10 +133,15 @@ static int task_counter_can_attach_task(struct cgroup *cgrp,
 
 /* Uncharge the dest cgroup that we charged in task_counter_can_attach_task() */
 static void task_counter_cancel_attach_task(struct cgroup *cgrp,
+					    struct cgroup *old_cgrp,
 					    struct task_struct *tsk)
 {
-	res_counter_uncharge_until(cgroup_task_res_counter(cgrp),
-				   common_ancestor, 1);
+	struct res_counter *common_ancestor;
+	struct res_counter *res = cgroup_task_res_counter(cgrp);
+	struct res_counter *old_res = cgroup_task_res_counter(old_cgrp);
+
+	common_ancestor = res_counter_common_ancestor(res, old_res);
+	res_counter_uncharge_until(res, common_ancestor, 1);
 }
 
 /*
@@ -155,8 +155,12 @@ static void task_counter_attach_task(struct cgroup *cgrp,
 					struct cgroup *old_cgrp,
 					struct task_struct *tsk)
 {
-	res_counter_uncharge_until(cgroup_task_res_counter(old_cgrp),
-				   common_ancestor, 1);
+	struct res_counter *common_ancestor;
+	struct res_counter *res = cgroup_task_res_counter(cgrp);
+	struct res_counter *old_res = cgroup_task_res_counter(old_cgrp);
+
+	common_ancestor = res_counter_common_ancestor(res, old_res);
+	res_counter_uncharge_until(old_res, common_ancestor, 1);
 }
 
 static u64 task_counter_read_u64(struct cgroup *cgrp, struct cftype *cft)
-- 
2.39.2
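
For readers unfamiliar with hierarchical res_counters, the standalone
userspace sketch below models the uncharge-to-common-ancestor semantics
the patch relies on. The struct counter type and the charge(),
uncharge_until() and common_ancestor() helpers are simplified stand-ins
written for illustration only, not the kernel's res_counter API, and the
example assumes the same two-level hierarchy as the reproducer above.

/* Minimal userspace model; not kernel code. */
#include <assert.h>
#include <stdio.h>

struct counter {
	unsigned long usage;
	struct counter *parent;
};

/* Charge @val into @c and every ancestor up to the root. */
static void charge(struct counter *c, unsigned long val)
{
	for (; c; c = c->parent)
		c->usage += val;
}

/* Uncharge @val from @c up the hierarchy, stopping before @stop. */
static void uncharge_until(struct counter *c, struct counter *stop,
			   unsigned long val)
{
	for (; c && c != stop; c = c->parent)
		c->usage -= val;
}

/* Deepest counter that is an ancestor of (or equal to) both @a and @b. */
static struct counter *common_ancestor(struct counter *a, struct counter *b)
{
	struct counter *pa, *pb;

	for (pa = a; pa; pa = pa->parent)
		for (pb = b; pb; pb = pb->parent)
			if (pa == pb)
				return pa;
	return NULL;
}

int main(void)
{
	struct counter root = { 0, NULL };
	struct counter cgroup0 = { 0, &root };
	struct counter *anc;

	/* One task sits in cgroup0: both cgroup0 and root count it. */
	charge(&cgroup0, 1);

	/*
	 * Move that task to the root cgroup, as in the reproducer above:
	 * only the part of the hierarchy below the common ancestor must
	 * be uncharged, and that ancestor has to be derived from the
	 * task's own old cgroup (here cgroup0), not from a stale cache.
	 */
	anc = common_ancestor(&root, &cgroup0);
	uncharge_until(&cgroup0, anc, 1);

	assert(root.usage == 1 && cgroup0.usage == 0);
	printf("root=%lu cgroup0=%lu\n", root.usage, cgroup0.usage);
	return 0;
}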