X-Git-Url: https://git.karo-electronics.de/?a=blobdiff_plain;f=kernel%2Fcgroup.c;h=dcb06e181ce4f6178a02bd82e3bc6d898faf533b;hb=1fed1b2e36ba1aa0257004a97e75bbdb70f216b5;hp=0e2c401ed7b954e633a4742a0332e343cac87423;hpb=a2bed8209a3afc3b2cf1c28383fb48155c1fea46;p=linux-beck.git diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0e2c401ed7b9..dcb06e181ce4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -70,15 +70,6 @@ #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) -/* - * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file - * creation/removal and hierarchy changing operations including cgroup - * creation, removal, css association and controller rebinding. This outer - * lock is needed mainly to resolve the circular dependency between kernfs - * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. - */ -static DEFINE_MUTEX(cgroup_tree_mutex); - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -100,8 +91,8 @@ static DECLARE_RWSEM(css_set_rwsem); #endif /* - * Protects cgroup_idr so that IDs can be released without grabbing - * cgroup_mutex. + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); @@ -111,11 +102,10 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutexes_or_rcu_locked() \ +#define cgroup_assert_mutex_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex), \ - "cgroup_[tree_]mutex or RCU read lock required"); + "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -186,11 +176,12 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); +static bool cgroup_has_live_children(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); -static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -203,9 +194,9 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, int ret; idr_preload(gfp_mask); - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } @@ -214,23 +205,32 @@ static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); +} + +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct 
cgroup, self); + return NULL; } /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and @@ -243,16 +243,15 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, { if (ss) return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_tree_mutex) || lockdep_is_held(&cgroup_mutex)); else - return &cgrp->dummy_css; + return &cgrp->self; } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effctive css, which is defined * as the matching css of the nearest ancestor including self which has @ss @@ -265,14 +264,14 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, lockdep_assert_held(&cgroup_mutex); if (!ss) - return &cgrp->dummy_css; + return &cgrp->self; if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; - while (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ss->id))) - cgrp = cgrp->parent; + while (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + cgrp = cgroup_parent(cgrp); return cgroup_css(cgrp, ss); } @@ -283,11 +282,10 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } -struct cgroup_subsys_state *seq_css(struct seq_file *seq) +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct kernfs_open_file *of = seq->private; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = seq_cft(seq); + struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). @@ -300,9 +298,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq) if (cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else - return &cgrp->dummy_css; + return &cgrp->self; } -EXPORT_SYMBOL_GPL(seq_css); +EXPORT_SYMBOL_GPL(of_css); /** * cgroup_is_descendant - test ancestry @@ -318,7 +316,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) while (cgrp) { if (cgrp == ancestor) return true; - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } return false; } @@ -348,7 +346,6 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex)))) { } \ else @@ -381,29 +378,12 @@ static int notify_on_release(const struct cgroup *cgrp) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->children, sibling) \ - if (({ lockdep_assert_held(&cgroup_tree_mutex); \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the mutex should be later unlocked. 
On - * failure returns false with no lock held. - */ -static bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_dead(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} - /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -439,7 +419,7 @@ struct cgrp_cset_link { * reference-counted, to improve performance when child cgroups * haven't been created. */ -static struct css_set init_css_set = { +struct css_set init_css_set = { .refcount = ATOMIC_INIT(1), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .tasks = LIST_HEAD_INIT(init_css_set.tasks), @@ -483,7 +463,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (cgrp->populated_kn) kernfs_notify(cgrp->populated_kn); - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -887,11 +867,10 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); BUG_ON(atomic_read(&root->nr_cgrps)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); @@ -917,7 +896,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); @@ -1035,77 +1013,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write) mode |= S_IWUSR; return mode; } -static void cgroup_free_fn(struct work_struct *work) +static void cgroup_get(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - - atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); - - if (cgrp->parent) { - /* - * We get a ref to the parent, and put the ref when this - * cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - cgroup_put(cgrp->parent); - kernfs_put(cgrp->kn); - kfree(cgrp); - } else { - /* - * This is root cgroup's refcnt reaching zero, which - * indicates that the root should be released. - */ - cgroup_destroy_root(cgrp->root); - } + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + css_get(&cgrp->self); } -static void cgroup_free_rcu(struct rcu_head *head) +static void cgroup_put(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - - INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - queue_work(cgroup_destroy_wq, &cgrp->destroy_work); + css_put(&cgrp->self); } -static void cgroup_get(struct cgroup *cgrp) +/** + * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper undoes cgroup_kn_lock_live() and should be invoked before + * the method finishes if locking succeeded. Note that once this function + * returns the cgroup returned by cgroup_kn_lock_live() may become + * inaccessible any time. If the caller intends to continue to access the + * cgroup, it should pin it before invoking this function. 
+ */ +static void cgroup_kn_unlock(struct kernfs_node *kn) { - WARN_ON_ONCE(cgroup_is_dead(cgrp)); - WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); - atomic_inc(&cgrp->refcnt); + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); } -static void cgroup_put(struct cgroup *cgrp) +/** + * cgroup_kn_lock_live - locking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper is to be used by a cgroup kernfs method currently servicing + * @kn. It breaks the active protection, performs cgroup locking and + * verifies that the associated cgroup is alive. Returns the cgroup if + * alive; otherwise, %NULL. A successful return should be undone by a + * matching cgroup_kn_unlock() invocation. + * + * Any cgroup kernfs method implementation which requires locking the + * associated cgroup should use this helper. It avoids nesting cgroup + * locking under kernfs active protection and allows all kernfs operations + * including self-removal. + */ +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) { - if (!atomic_dec_and_test(&cgrp->refcnt)) - return; - if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) - return; + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; /* - * XXX: cgrp->id is only used to look up css's. As cgroup and - * css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. cgroup liveliness check alone provides enough + * protection against removal. Ensure @cgrp stays accessible and + * break the active_ref protection. */ - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; + cgroup_get(cgrp); + kernfs_break_active_protection(kn); - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); + mutex_lock(&cgroup_mutex); + + if (!cgroup_is_dead(cgrp)) + return cgrp; + + cgroup_kn_unlock(kn); + return NULL; } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; - lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1134,7 +1130,6 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) struct cgroup_subsys *ss; int ssid, i, ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); for_each_subsys(ss, ssid) { @@ -1172,11 +1167,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. 
*/ - mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) if (ss_mask & (1 << ssid)) cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - mutex_lock(&cgroup_mutex); for_each_subsys(ss, ssid) { struct cgroup_root *src_root; @@ -1414,7 +1407,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* See what subsystems are wanted */ @@ -1440,7 +1432,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.children)) { + if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } @@ -1460,7 +1452,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) kfree(opts.release_agent); kfree(opts.name); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -1521,14 +1512,13 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) struct cgroup_subsys *ss; int ssid; - atomic_set(&cgrp->refcnt, 1); - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->self.sibling); + INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); - cgrp->dummy_css.cgroup = cgrp; + cgrp->self.cgroup = cgrp; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -1563,7 +1553,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) struct css_set *cset; int i, ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); @@ -1571,6 +1560,10 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + if (ret) + goto out; + /* * We're accessing css_set_count without locking css_set_rwsem here, * but that's OK - it can only be increased by someone holding @@ -1579,11 +1572,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto out; + goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) - goto out; + goto cancel_ref; root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED, @@ -1619,7 +1612,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) link_css_set(&tmp_links, cset, root_cgrp); up_write(&css_set_rwsem); - BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); @@ -1631,6 +1624,8 @@ destroy_root: root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); +cancel_ref: + percpu_ref_cancel_init(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -1653,14 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); if (ret) goto out_unlock; -retry: + /* look for a matching existing root */ if (!opts.subsys_mask && !opts.none && !opts.name) { cgrp_dfl_root_visible = true; @@ -1710,19 +1704,18 @@ 
retry: } /* - * A root's lifetime is governed by its root cgroup. Zero - * ref indicate that the root is being destroyed. Wait for - * destruction to complete so that the subsystems are free. - * We can use wait_queue for the wait but this path is - * super cold. Let's just sleep for a bit and retry. + * A root's lifetime is governed by its root cgroup. + * tryget_live failure indicate that the root is being + * destroyed. Wait for destruction to complete so that the + * subsystems are free. We can use wait_queue for the wait + * but this path is super cold. Let's just sleep for a bit + * and retry. */ - if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { + if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); msleep(10); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - goto retry; + ret = restart_syscall(); + goto out_free; } ret = 0; @@ -1753,8 +1746,7 @@ retry: out_unlock: mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - +out_free: kfree(opts.release_agent); kfree(opts.name); @@ -1772,7 +1764,16 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - cgroup_put(&root->cgrp); + /* + * If @root doesn't have any mounts or children, start killing it. + * This prevents new mounts by disabling percpu_ref_tryget_live(). + * cgroup_mount() may wait for @root's release. + */ + if (cgroup_has_live_children(&root->cgrp)) + cgroup_put(&root->cgrp); + else + percpu_ref_kill(&root->cgrp.self.refcnt); + kernfs_kill_sb(sb); } @@ -1894,7 +1895,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) /** * cgroup_task_migrate - move a task from one cgroup to another. - * @old_cgrp; the cgroup @tsk is being migrated from + * @old_cgrp: the cgroup @tsk is being migrated from * @tsk: the task being migrated * @new_cset: the new css_set @tsk is being attached to * @@ -2026,7 +2027,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && dst_cgrp->child_subsys_mask) return -EBUSY; @@ -2240,13 +2241,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, * function to attach either it or all tasks in its threadgroup. Will lock * cgroup_mutex and threadgroup. 
*/ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; + struct cgroup *cgrp; + pid_t pid; int ret; - if (!cgroup_lock_live_group(cgrp)) + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; retry_find_task: @@ -2312,8 +2320,8 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: - mutex_unlock(&cgroup_mutex); - return ret; + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; } /** @@ -2347,43 +2355,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 pid) +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, pid, false); + return __cgroup_procs_write(of, buf, nbytes, off, false); } -static int cgroup_procs_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 tgid) +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, tgid, true); + return __cgroup_procs_write(of, buf, nbytes, off, true); } -static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cgroup_root *root = css->cgroup->root; + struct cgroup *cgrp; + + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); - if (!cgroup_lock_live_group(css->cgroup)) + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(root->release_agent_path, buffer, - sizeof(root->release_agent_path)); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); - mutex_unlock(&cgroup_mutex); - return 0; + cgroup_kn_unlock(of->kn); + return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); - mutex_unlock(&cgroup_mutex); return 0; } @@ -2427,7 +2436,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); return 0; } @@ -2456,7 +2465,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct css_set *src_cset; int ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ @@ -2538,31 +2546,34 @@ out_finish: } /* change the enabled child controllers for a cgroup in the default hierarchy */ -static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_subtree_control_write(struct 
kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { - unsigned int enable_req = 0, disable_req = 0, enable, disable; - struct cgroup *cgrp = dummy_css->cgroup, *child; + unsigned int enable = 0, disable = 0; + struct cgroup *cgrp, *child; struct cgroup_subsys *ss; - char *tok, *p; + char *tok; int ssid, ret; /* - * Parse input - white space separated list of subsystem names - * prefixed with either + or -. + * Parse input - space separated list of subsystem names prefixed + * with either + or -. */ - p = buffer; - while ((tok = strsep(&p, " \t\n"))) { + buf = strstrip(buf); + while ((tok = strsep(&buf, " "))) { + if (tok[0] == '\0') + continue; for_each_subsys(ss, ssid) { if (ss->disabled || strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { - enable_req |= 1 << ssid; - disable_req &= ~(1 << ssid); + enable |= 1 << ssid; + disable &= ~(1 << ssid); } else if (*tok == '-') { - disable_req |= 1 << ssid; - enable_req &= ~(1 << ssid); + disable |= 1 << ssid; + enable &= ~(1 << ssid); } else { return -EINVAL; } @@ -2572,19 +2583,9 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, return -EINVAL; } - /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs - * active_ref. cgroup_lock_live_group() already provides enough - * protection. Ensure @cgrp stays accessible and break the - * active_ref protection. - */ - cgroup_get(cgrp); - kernfs_break_active_protection(cgrp->control_kn); -retry: - enable = enable_req; - disable = disable_req; - - mutex_lock(&cgroup_tree_mutex); + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) + return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { @@ -2600,25 +2601,28 @@ retry: * cases, wait till it's gone using offline_waitq. */ cgroup_for_each_live_child(child, cgrp) { - wait_queue_t wait; + DEFINE_WAIT(wait); if (!cgroup_css(child, ss)) continue; + cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&cgroup_tree_mutex); + cgroup_kn_unlock(of->kn); schedule(); finish_wait(&child->offline_waitq, &wait); - goto retry; + cgroup_put(child); + + return restart_syscall(); } /* unavailable or not enabled on the parent? */ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { ret = -ENOENT; - goto out_unlock_tree; + goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->child_subsys_mask & (1 << ssid))) { @@ -2630,7 +2634,7 @@ retry: cgroup_for_each_live_child(child, cgrp) { if (child->child_subsys_mask & (1 << ssid)) { ret = -EBUSY; - goto out_unlock_tree; + goto out_unlock; } } } @@ -2638,19 +2642,14 @@ retry: if (!enable && !disable) { ret = 0; - goto out_unlock_tree; - } - - if (!cgroup_lock_live_group(cgrp)) { - ret = -ENODEV; - goto out_unlock_tree; + goto out_unlock; } /* * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. 
*/ - if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { ret = -EBUSY; goto out_unlock; } @@ -2691,12 +2690,8 @@ retry: kernfs_activate(cgrp->kn); ret = 0; out_unlock: - mutex_unlock(&cgroup_mutex); -out_unlock_tree: - mutex_unlock(&cgroup_tree_mutex); - kernfs_unbreak_active_protection(cgrp->control_kn); - cgroup_put(cgrp); - return ret; + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; err_undo_css: cgrp->child_subsys_mask &= ~enable; @@ -2729,6 +2724,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (cft->write) + return cft->write(of, buf, nbytes, off); + /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and @@ -2739,9 +2737,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); - if (cft->write_string) { - ret = cft->write_string(css, cft, strstrip(buf)); - } else if (cft->write_u64) { + if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) @@ -2751,8 +2747,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); - } else if (cft->trigger) { - ret = cft->trigger(css, (unsigned int)cft->private); } else { ret = -EINVAL; } @@ -2829,20 +2823,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return -EPERM; /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_tree_mutex. + * protection. Break them before grabbing cgroup_mutex. */ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); @@ -2885,9 +2877,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return ret; } - if (cft->seq_show == cgroup_subtree_control_show) - cgrp->control_kn = kn; - else if (cft->seq_show == cgroup_populated_show) + if (cft->seq_show == cgroup_populated_show) cgrp->populated_kn = kn; return 0; } @@ -2909,7 +2899,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? 
*/ @@ -2917,9 +2907,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if (is_add) { @@ -2944,7 +2934,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) struct cgroup_subsys_state *css; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { @@ -3012,7 +3002,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) static int cgroup_rm_cftypes_locked(struct cftype *cfts) { - lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); if (!cfts || !cfts[0].ss) return -ENOENT; @@ -3038,9 +3028,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) { int ret; - mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_tree_mutex); + mutex_unlock(&cgroup_mutex); return ret; } @@ -3069,14 +3059,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_tree_mutex); + mutex_unlock(&cgroup_mutex); return ret; } @@ -3116,7 +3106,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been removed. Once a cgroup is removed, @@ -3138,11 +3128,11 @@ css_next_child(struct cgroup_subsys_state *pos_css, * cgroup is removed or iteration and removal race. */ if (!pos) { - next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); } else if (likely(!cgroup_is_dead(pos))) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { - list_for_each_entry_rcu(next, &cgrp->children, sibling) + list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) if (next->serial_nr > pos->serial_nr) break; } @@ -3152,12 +3142,12 @@ css_next_child(struct cgroup_subsys_state *pos_css, * the next sibling; however, it might have @ss disabled. If so, * fast-forward to the next enabled one. 
*/ - while (&next->sibling != &cgrp->children) { + while (&next->self.sibling != &cgrp->self.children) { struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); if (next_css) return next_css; - next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(next->self.sibling.next, struct cgroup, self.sibling); } return NULL; } @@ -3182,7 +3172,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -3195,10 +3185,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return next; - pos = css_parent(pos); + pos = pos->parent; } return NULL; @@ -3222,7 +3212,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); do { last = pos; @@ -3269,7 +3259,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -3280,12 +3270,27 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - return css_parent(pos); + return pos->parent; +} + +static bool cgroup_has_live_children(struct cgroup *cgrp) +{ + struct cgroup *child; + + rcu_read_lock(); + list_for_each_entry_rcu(child, &cgrp->self.children, self.sibling) { + if (!cgroup_is_dead(child)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + return false; } /** @@ -3447,7 +3452,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->dummy_css, &it); + css_task_iter_start(&from->self, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -3712,7 +3717,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -3774,7 +3779,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) /* * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. 
*/ rcu_read_lock(); @@ -3786,7 +3791,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: @@ -3957,7 +3962,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, - .write_u64 = cgroup_procs_write, + .write = cgroup_procs_write, .mode = S_IRUGO | S_IWUSR, }, { @@ -3985,7 +3990,7 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.subtree_control", .flags = CFTYPE_ONLY_ON_DFL, .seq_show = cgroup_subtree_control_show, - .write_string = cgroup_subtree_control_write, + .write = cgroup_subtree_control_write, }, { .name = "cgroup.populated", @@ -4006,7 +4011,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, - .write_u64 = cgroup_tasks_write, + .write = cgroup_tasks_write, .mode = S_IRUGO | S_IWUSR, }, { @@ -4019,7 +4024,7 @@ static struct cftype cgroup_base_files[] = { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, + .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ @@ -4063,9 +4068,9 @@ err: * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs - * and thus css_tryget() is guaranteed to fail, the css can be offlined - * by invoking offline_css(). After offlining, the base ref is put. - * Implemented in css_killed_work_fn(). + * and thus css_tryget_online() is guaranteed to fail, the css can be + * offlined by invoking offline_css(). After offlining, the base ref is + * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the @@ -4084,11 +4089,37 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - if (css->parent) - css_put(css->parent); + if (css->ss) { + /* css free path */ + if (css->parent) + css_put(css->parent); - css->ss->css_free(css); - cgroup_put(cgrp); + css->ss->css_free(css); + cgroup_put(cgrp); + } else { + /* cgroup free path */ + atomic_dec(&cgrp->root->nr_cgrps); + cgroup_pidlist_destroy_all(cgrp); + + if (cgroup_parent(cgrp)) { + /* + * We get a ref to the parent, and put the ref when + * this cgroup is being freed, so it's guaranteed + * that the parent won't be destroyed before its + * children. + */ + cgroup_put(cgroup_parent(cgrp)); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is root cgroup's refcnt reaching zero, + * which indicates that the root should be + * released. 
+ */ + cgroup_destroy_root(cgrp->root); + } + } } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4100,26 +4131,55 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) queue_work(cgroup_destroy_wq, &css->destroy_work); } +static void css_release_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + mutex_lock(&cgroup_mutex); + + list_del_rcu(&css->sibling); + + if (ss) { + /* css release path */ + cgroup_idr_remove(&ss->css_idr, css->id); + } else { + /* cgroup release path */ + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + } + + mutex_unlock(&cgroup_mutex); + + call_rcu(&css->rcu_head, css_free_rcu_fn); +} + static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); - call_rcu(&css->rcu_head, css_free_rcu_fn); + INIT_WORK(&css->destroy_work, css_release_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_and_link_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, struct cgroup *cgrp) { + cgroup_get(cgrp); + + memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; - css->flags = 0; + INIT_LIST_HEAD(&css->sibling); + INIT_LIST_HEAD(&css->children); - if (cgrp->parent) - css->parent = cgroup_css(cgrp->parent, ss); - else - css->flags |= CSS_ROOT; + if (cgroup_parent(cgrp)) { + css->parent = cgroup_css(cgroup_parent(cgrp), ss); + css_get(css->parent); + } BUG_ON(cgroup_css(cgrp, ss)); } @@ -4130,14 +4190,12 @@ static int online_css(struct cgroup_subsys_state *css) struct cgroup_subsys *ss = css->ss; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (ss->css_online) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; - css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; @@ -4148,7 +4206,6 @@ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) @@ -4158,7 +4215,6 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; - css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); @@ -4175,34 +4231,42 @@ static void offline_css(struct cgroup_subsys_state *css) */ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - struct cgroup *parent = cgrp->parent; + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); - css = ss->css_alloc(cgroup_css(parent, ss)); + css = ss->css_alloc(parent_css); if (IS_ERR(css)) return PTR_ERR(css); - init_css(css, ss, cgrp); - cgroup_get(cgrp); - css_get(css->parent); + init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release); if (err) goto err_free_css; + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); + if (err < 0) + goto err_free_percpu_ref; + css->id = err; + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) - goto 
err_free_percpu_ref; + goto err_free_id; + + /* @css is ready to be brought online now, make it visible */ + list_add_tail_rcu(&css->sibling, &parent_css->children); + cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) - goto err_clear_dir; + goto err_list_del; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { + cgroup_parent(parent)) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) @@ -4212,8 +4276,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; -err_clear_dir: +err_list_del: + list_del_rcu(&css->sibling); cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_id: + cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); err_free_css: @@ -4221,55 +4288,45 @@ err_free_css: return err; } -/** - * cgroup_create - create a cgroup - * @parent: cgroup that will be parent of the new cgroup - * @name: name of the new cgroup - * @mode: mode to set on new cgroup - */ -static long cgroup_create(struct cgroup *parent, const char *name, - umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *cgrp; - struct cgroup_root *root = parent->root; - int ssid, err; + struct cgroup *parent, *cgrp; + struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; + int ssid, ret; + + parent = cgroup_kn_lock_live(parent_kn); + if (!parent) + return -ENODEV; + root = parent->root; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) - return -ENOMEM; - - mutex_lock(&cgroup_tree_mutex); - - /* - * Only live parents can have children. Note that the liveliness - * check isn't strictly necessary because cgroup_mkdir() and - * cgroup_rmdir() are fully synchronized by i_mutex; however, do it - * anyway so that locking is contained inside cgroup proper and we - * don't get nasty surprises if we ever grow another caller. - */ - if (!cgroup_lock_live_group(parent)) { - err = -ENODEV; - goto err_unlock_tree; + if (!cgrp) { + ret = -ENOMEM; + goto out_unlock; } + ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + if (ret) + goto out_free_cgrp; + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. 
*/ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { - err = -ENOMEM; - goto err_unlock; + ret = -ENOMEM; + goto out_cancel_ref; } init_cgroup_housekeeping(cgrp); - cgrp->parent = parent; - cgrp->dummy_css.parent = &parent->dummy_css; - cgrp->root = parent->root; + cgrp->self.parent = &parent->self; + cgrp->root = root; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4280,8 +4337,8 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { - err = PTR_ERR(kn); - goto err_free_id; + ret = PTR_ERR(kn); + goto out_free_id; } cgrp->kn = kn; @@ -4294,7 +4351,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -4304,20 +4361,20 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_kn_set_ugid(kn); - if (err) - goto err_destroy; + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; - err = cgroup_addrm_files(cgrp, cgroup_base_files, true); - if (err) - goto err_destroy; + ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (ret) + goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { if (parent->child_subsys_mask & (1 << ssid)) { - err = create_css(cgrp, ss); - if (err) - goto err_destroy; + ret = create_css(cgrp, ss); + if (ret) + goto out_destroy; } } @@ -4330,86 +4387,38 @@ static long cgroup_create(struct cgroup *parent, const char *name, kernfs_activate(kn); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - - return 0; + ret = 0; + goto out_unlock; -err_free_id: +out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); -err_unlock: - mutex_unlock(&cgroup_mutex); -err_unlock_tree: - mutex_unlock(&cgroup_tree_mutex); +out_cancel_ref: + percpu_ref_cancel_init(&cgrp->self.refcnt); +out_free_cgrp: kfree(cgrp); - return err; +out_unlock: + cgroup_kn_unlock(parent_kn); + return ret; -err_destroy: +out_destroy: cgroup_destroy_locked(cgrp); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - return err; -} - -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - struct cgroup *parent = parent_kn->priv; - int ret; - - /* - * cgroup_create() grabs cgroup_tree_mutex which nests outside - * kernfs active_ref and cgroup_create() already synchronizes - * properly against removal through cgroup_lock_live_group(). - * Break it before calling cgroup_create(). - */ - cgroup_get(parent); - kernfs_break_active_protection(parent_kn); - - ret = cgroup_create(parent, name, mode); - - kernfs_unbreak_active_protection(parent_kn); - cgroup_put(parent); - return ret; + goto out_unlock; } /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. Tell the subsystem to + * initate destruction and put the css ref from kill_css(). 
*/ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - struct cgroup *cgrp = css->cgroup; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - - /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. - */ offline_css(css); - - /* - * If @cgrp is marked dead, it's waiting for refs of all css's to - * be disabled before proceeding to the second phase of cgroup - * destruction. If we are the last one, kick it off. - */ - if (!cgrp->nr_css && cgroup_is_dead(cgrp)) - cgroup_destroy_css_killed(cgrp); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - /* - * Put the css refs from kill_css(). Each css holds an extra - * reference to the cgroup's dentry and cgroup removal proceeds - * regardless of css refs. On the last put of each css, whenever - * that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ css_put(css); } @@ -4429,12 +4438,12 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. + * asynchronously once css_tryget_online() is guaranteed to fail and when + * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { - lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* * This must happen before css is disassociated with its cgroup. @@ -4451,7 +4460,7 @@ static void kill_css(struct cgroup_subsys_state *css) /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via - * css_tryget(). We can't simply call percpu_ref_kill() and + * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * @@ -4467,9 +4476,9 @@ static void kill_css(struct cgroup_subsys_state *css) * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget() won't succeed by the time ->css_offline() is - * invoked. To satisfy all the requirements, destruction is implemented in - * the following two steps. + * guarantee that css_tryget_online() won't succeed by the time + * ->css_offline() is invoked. To satisfy all the requirements, + * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of @@ -4488,12 +4497,10 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *child; struct cgroup_subsys_state *css; bool empty; int ssid; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -4507,19 +4514,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Make sure there's no live children. 
We can't test ->children - * emptiness as dead children linger on it while being destroyed; - * otherwise, "rmdir parent/child parent" may fail with -EBUSY. + * Make sure there's no live children. We can't test emptiness of + * ->self.children as dead children linger on it while being + * drained; otherwise, "rmdir parent/child parent" may fail. */ - empty = true; - rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { - empty = cgroup_is_dead(child); - if (!empty) - break; - } - rcu_read_unlock(); - if (!empty) + if (cgroup_has_live_children(cgrp)) return -EBUSY; /* @@ -4531,16 +4530,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ set_bit(CGRP_DEAD, &cgrp->flags); - /* - * Initiate massacre of all css's. cgroup_destroy_css_killed() - * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. This - * involves removing the subsystem's files, drop cgroup_mutex. - */ - mutex_unlock(&cgroup_mutex); + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); - mutex_lock(&cgroup_mutex); /* CGRP_DEAD is set, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); @@ -4549,85 +4541,44 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) raw_spin_unlock(&release_list_lock); /* - * If @cgrp has css's attached, the second stage of cgroup - * destruction is kicked off from css_killed_work_fn() after the - * refs of all attached css's are killed. If @cgrp doesn't have - * any css, we kick it off here. + * Remove @cgrp directory along with the base files. @cgrp has an + * extra ref on its kn. */ - if (!cgrp->nr_css) - cgroup_destroy_css_killed(cgrp); + kernfs_remove(cgrp->kn); - /* remove @cgrp directory along with the base files */ - mutex_unlock(&cgroup_mutex); + set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); + check_for_release(cgroup_parent(cgrp)); - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_from_dir(). Those are supported by RCU protecting - * clearing of cgrp->kn->priv backpointer, which should happen - * after all files under it have been removed. - */ - kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - - mutex_lock(&cgroup_mutex); + /* put the base reference */ + percpu_ref_kill(&cgrp->self.refcnt); return 0; }; -/** - * cgroup_destroy_css_killed - the second step of cgroup destruction - * @work: cgroup->destroy_free_work - * - * This function is invoked from a work item for a cgroup which is being - * destroyed after all css's are offlined and performs the rest of - * destruction. This is the second step of destruction described in the - * comment above cgroup_destroy_locked(). - */ -static void cgroup_destroy_css_killed(struct cgroup *cgrp) -{ - struct cgroup *parent = cgrp->parent; - - lockdep_assert_held(&cgroup_tree_mutex); - lockdep_assert_held(&cgroup_mutex); - - /* delete this cgroup from parent->children */ - list_del_rcu(&cgrp->sibling); - - cgroup_put(cgrp); - - set_bit(CGRP_RELEASABLE, &parent->flags); - check_for_release(parent); -} - static int cgroup_rmdir(struct kernfs_node *kn) { - struct cgroup *cgrp = kn->priv; + struct cgroup *cgrp; int ret = 0; - /* - * This is self-destruction but @kn can't be removed while this - * callback is in progress. Let's break active protection. 
Once - * the protection is broken, @cgrp can be destroyed at any point. - * Pin it so that it stays accessible. - */ - cgroup_get(cgrp); - kernfs_break_active_protection(kn); + cgrp = cgroup_kn_lock_live(kn); + if (!cgrp) + return 0; + cgroup_get(cgrp); /* for @kn->priv clearing */ - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); + ret = cgroup_destroy_locked(cgrp); + + cgroup_kn_unlock(kn); /* - * @cgrp might already have been destroyed while we're trying to - * grab the mutexes. + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. */ - if (!cgroup_is_dead(cgrp)) - ret = cgroup_destroy_locked(cgrp); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); + if (!ret) + RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); - kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); return ret; } @@ -4640,15 +4591,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .rename = cgroup_rename, }; -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) +static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); + idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ @@ -4656,7 +4607,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, &cgrp_dfl_root.cgrp); + init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + + /* + * Root csses are never destroyed and we can't initialize + * percpu_ref during early init. Disable refcnting. 
+ */ + css->flags |= CSS_NO_REF; + + if (early) { + /* allocation can't be done safely during early init */ + css->id = 1; + } else { + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); + BUG_ON(css->id < 0); + } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4676,7 +4641,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) cgrp_dfl_root.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); } /** @@ -4693,6 +4657,8 @@ int __init cgroup_init_early(void) int i; init_cgroup_root(&cgrp_dfl_root, &opts); + cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { @@ -4707,7 +4673,7 @@ int __init cgroup_init_early(void) ss->name = cgroup_subsys_name[i]; if (ss->early_init) - cgroup_init_subsys(ss); + cgroup_init_subsys(ss, true); } return 0; } @@ -4726,7 +4692,6 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ @@ -4736,11 +4701,18 @@ int __init cgroup_init(void) BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { - if (!ss->early_init) - cgroup_init_subsys(ss); + if (ss->early_init) { + struct cgroup_subsys_state *css = + init_css_set.subsys[ss->id]; + + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, + GFP_KERNEL); + BUG_ON(css->id < 0); + } else { + cgroup_init_subsys(ss, false); + } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); @@ -5029,7 +5001,7 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { + list_empty(&cgrp->cset_links) && !cgroup_has_live_children(cgrp)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue @@ -5146,7 +5118,7 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * @@ -5154,8 +5126,8 @@ __setup("cgroup_disable=", cgroup_disable); * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup_subsys_state *css = NULL; @@ -5171,13 +5143,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See destroy_locked() for details. + * protected for this access. See cgroup_rmdir() for details. 
*/ cgrp = rcu_dereference(kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); - if (!css || !css_tryget(css)) + if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); @@ -5194,14 +5166,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { - struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); - - cgrp = idr_find(&ss->root->cgroup_idr, id); - if (cgrp) - return cgroup_css(cgrp, ss); - return NULL; + return idr_find(&ss->css_idr, id); } #ifdef CONFIG_CGROUP_DEBUG
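
A recurring pattern in this patch is that struct cgroup no longer carries its own ->parent pointer: the hierarchy now hangs off the embedded "self" css, and the new cgroup_parent() helper recovers the owning cgroup from a css with container_of(). The standalone program below is a minimal sketch of that trick; the stripped-down struct css and struct cgroup are hypothetical stand-ins for the kernel types, kept just small enough to make the pointer arithmetic visible.

/* Userspace sketch of the container_of() step behind cgroup_parent(). */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct css {                    /* stand-in for struct cgroup_subsys_state */
        struct css *parent;
};

struct cgroup {
        int id;
        struct css self;        /* the cgroup's own css, as in the patch */
};

static struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        struct css *parent_css = cgrp->self.parent;

        /* step from the parent's embedded css back to its cgroup */
        if (parent_css)
                return container_of(parent_css, struct cgroup, self);
        return NULL;            /* the root has no parent css */
}

int main(void)
{
        struct cgroup root = { .id = 1, .self.parent = NULL };
        struct cgroup child = { .id = 2, .self.parent = &root.self };

        printf("child's parent id: %d\n", cgroup_parent(&child)->id);
        printf("root has parent: %s\n", cgroup_parent(&root) ? "yes" : "no");
        return 0;
}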
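
The heart of the patch is switching cgroup refcnting from a bare atomic_t to the percpu_ref embedded in cgrp->self, so cgroups and csses share one two-phase lifetime: percpu_ref_kill() makes percpu_ref_tryget_live() fail from then on (which is why cgroup_mount() now backs off with restart_syscall() instead of the old goto retry loop), while the release callback runs only after the last reference is dropped. The single-threaded model below sketches that protocol with C11 atomics. ref_init(), ref_tryget_live(), ref_kill() and ref_put() are invented analogues of the percpu_ref API, and the naive flag-then-increment check deliberately ignores the per-CPU counters and RCU grace periods the real primitive needs to be safe under concurrency.

/* Single-threaded model of the kill/tryget_live/release protocol. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct ref {
        atomic_long count;              /* all references, incl. the base one */
        atomic_bool dying;              /* set by ref_kill() */
        void (*release)(struct ref *ref);
};

static void ref_init(struct ref *r, void (*release)(struct ref *))
{
        atomic_init(&r->count, 1);      /* the "base" reference */
        atomic_init(&r->dying, false);
        r->release = release;
}

/* percpu_ref_tryget_live() analogue: fails once the ref was killed. */
static bool ref_tryget_live(struct ref *r)
{
        if (atomic_load(&r->dying))
                return false;
        atomic_fetch_add(&r->count, 1);
        return true;
}

static void ref_put(struct ref *r)
{
        if (atomic_fetch_sub(&r->count, 1) == 1)
                r->release(r);          /* last reference gone */
}

/* percpu_ref_kill() analogue: block new live gets, drop the base ref. */
static void ref_kill(struct ref *r)
{
        atomic_store(&r->dying, true);
        ref_put(r);
}

static void root_release(struct ref *r)
{
        (void)r;
        printf("release: destroy the root now\n");
}

int main(void)
{
        struct ref root;

        ref_init(&root, root_release);

        if (ref_tryget_live(&root))     /* an existing mount pins the root */
                printf("mount: pinned root\n");

        ref_kill(&root);                /* cgroup_kill_sb() on the last sb */

        if (!ref_tryget_live(&root))    /* a new mount must fail and retry */
                printf("mount: root dying, restart_syscall()\n");

        ref_put(&root);                 /* last pin dropped -> release runs */
        return 0;
}

The ordering in main() also mirrors why cgroup_kill_sb() only kills the base ref when there are no live children or other users: killing disables new pins, but existing references keep the structure alive until the final put runs the release path.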
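
cgroup_subtree_control_write() changes shape as well: it becomes a cftype->write() method that strstrip()s the raw buffer and then walks space-separated "+name"/"-name" tokens with strsep(), skipping the empty tokens that consecutive spaces produce and letting a later token override an earlier one. The program below lifts that parsing loop into a self-contained userspace form; the three-entry subsys_name[] table is an invented stand-in for the kernel's subsystem list, strstrip() is assumed to have run already, and -1 stands in for -EINVAL.

/* Standalone version of the "+subsys -subsys" parsing loop. */
#define _DEFAULT_SOURCE         /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

static const char *subsys_name[] = { "cpu", "memory", "io" }; /* hypothetical */
#define NR_SUBSYS (sizeof(subsys_name) / sizeof(subsys_name[0]))

static int parse_subtree_control(char *buf, unsigned *enable, unsigned *disable)
{
        char *tok;

        *enable = *disable = 0;
        while ((tok = strsep(&buf, " "))) {
                int ssid, found = 0;

                if (tok[0] == '\0')     /* consecutive spaces: empty token */
                        continue;
                for (ssid = 0; ssid < (int)NR_SUBSYS; ssid++) {
                        if (strcmp(tok + 1, subsys_name[ssid]))
                                continue;
                        if (tok[0] == '+') {
                                *enable |= 1u << ssid;
                                *disable &= ~(1u << ssid); /* later wins */
                        } else if (tok[0] == '-') {
                                *disable |= 1u << ssid;
                                *enable &= ~(1u << ssid);
                        } else {
                                return -1;      /* -EINVAL in the kernel */
                        }
                        found = 1;
                        break;
                }
                if (!found)             /* unknown subsystem name */
                        return -1;
        }
        return 0;
}

int main(void)
{
        char buf[] = "+cpu -memory +io";
        unsigned enable, disable;

        if (parse_subtree_control(buf, &enable, &disable) == 0)
                printf("enable=0x%x disable=0x%x\n", enable, disable);
        return 0;
}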