kernel/cgroup.c

   1 /*
   2  *  Generic process-grouping system.
   3  *
   4  *  Based originally on the cpuset system, extracted by Paul Menage
   5  *  Copyright (C) 2006 Google, Inc
   6  *
   7  *  Notifications support
   8  *  Copyright (C) 2009 Nokia Corporation
   9  *  Author: Kirill A. Shutemov
  10  *
  11  *  Copyright notices from the original cpuset code:
  12  *  --------------------------------------------------
  13  *  Copyright (C) 2003 BULL SA.
  14  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  15  *
  16  *  Portions derived from Patrick Mochel's sysfs code.
  17  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  18  *
  19  *  2003-10-10 Written by Simon Derr.
  20  *  2003-10-22 Updates by Stephen Hemminger.
  21  *  2004 May-July Rework by Paul Jackson.
  22  *  ---------------------------------------------------
  23  *
  24  *  This file is subject to the terms and conditions of the GNU General Public
  25  *  License.  See the file COPYING in the main directory of the Linux
  26  *  distribution for more details.
  27  */
  28
  29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  30
  31 #include <linux/cgroup.h>
  32 #include <linux/cred.h>
  33 #include <linux/ctype.h>
  34 #include <linux/errno.h>
  35 #include <linux/init_task.h>
  36 #include <linux/kernel.h>
  37 #include <linux/list.h>
  38 #include <linux/mm.h>
  39 #include <linux/mutex.h>
  40 #include <linux/mount.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/proc_fs.h>
  43 #include <linux/rcupdate.h>
  44 #include <linux/sched.h>
  45 #include <linux/slab.h>
  46 #include <linux/spinlock.h>
  47 #include <linux/rwsem.h>
  48 #include <linux/string.h>
  49 #include <linux/sort.h>
  50 #include <linux/kmod.h>
  51 #include <linux/delayacct.h>
  52 #include <linux/cgroupstats.h>
  53 #include <linux/hashtable.h>
  54 #include <linux/pid_namespace.h>
  55 #include <linux/idr.h>
  56 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
  57 #include <linux/kthread.h>
  58 #include <linux/delay.h>
  59
  60 #include <linux/atomic.h>
  61
/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls.
 * Expiring in the middle is a performance problem, not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/*
 * Upper bound for a cgroup file name: the subsystem-name limit plus the
 * cftype-name limit plus 2 extra bytes.
 * NOTE(review): the 2 extra bytes are presumably a separator and the
 * terminating NUL - confirm at the point of use.
 */
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)
  72
/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_rwsem protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */
#ifdef CONFIG_PROVE_RCU
/* external linkage plus GPL-only export, so code outside this file can name them */
DEFINE_MUTEX(cgroup_mutex);
DECLARE_RWSEM(css_set_rwsem);
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_rwsem);
#else
/* without CONFIG_PROVE_RCU both locks stay private to this file */
static DEFINE_MUTEX(cgroup_mutex);
static DECLARE_RWSEM(css_set_rwsem);
#endif
  92
/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.  It is taken with spin_lock_bh() by the
 * cgroup_idr_*() wrappers in this file.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);
 104
/*
 * Lockdep assertion: complain with the message below unless the caller is
 * inside an RCU read-side section or holds cgroup_mutex.
 *
 * NOTE(review): the expansion already ends in ';', so the usual
 * "cgroup_assert_mutex_or_rcu_locked();" yields an extra empty statement.
 * Do not use it as the sole body of an unbraced if/else.
 */
#define cgroup_assert_mutex_or_rcu_locked()				\
	rcu_lockdep_assert(rcu_read_lock_held() ||			\
			   lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_mutex or RCU read lock required");
 109
/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain, so that flushing waits only for
 * pidlist work.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 123
/*
 * generate an array of cgroup subsystem pointers: every SUBSYS(x) pulled in
 * from <linux/cgroup_subsys.h> becomes a designated initializer that stores
 * &x_cgrp_subsys at index x_cgrp_id
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * array of cgroup subsystem names, indexed the same way; each entry is the
 * stringified SUBSYS() argument
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
 137
/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
struct cgroup_root cgrp_dfl_root;

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_root_visible;

/*
 * The list of hierarchy roots (walked with for_each_root()) and a counter
 * kept alongside it.
 * NOTE(review): cgroup_root_count is presumably the number of entries on
 * cgroup_roots - the code maintaining it is outside this chunk.
 */

static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to cgroups.  It
 * guarantees cgroups with bigger numbers are newer than those with smaller
 * numbers.  Also, as cgroups are always appended to the parent's
 * ->children list, it guarantees that sibling cgroups are always sorted in
 * the ascending serial number order on the list.  Protected by
 * cgroup_mutex.
 */
static u64 cgroup_serial_nr_next = 1;
 168
/*
 * This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call.  This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
static int need_forkexit_callback __read_mostly;

/* defined later in this file; declared here so earlier code can refer to it */
static struct cftype cgroup_base_files[];

/* forward declarations of helpers that are defined further down */
static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
			     unsigned int ss_mask);
static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 188
 189 /* IDR wrappers which synchronize using cgroup_idr_lock */
 190 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 191                             gfp_t gfp_mask)
 192 {
 193         int ret;
 194
 195         idr_preload(gfp_mask);
 196         spin_lock_bh(&cgroup_idr_lock);
 197         ret = idr_alloc(idr, ptr, start, end, gfp_mask);
 198         spin_unlock_bh(&cgroup_idr_lock);
 199         idr_preload_end();
 200         return ret;
 201 }
 202
 203 static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
 204 {
 205         void *ret;
 206
 207         spin_lock_bh(&cgroup_idr_lock);
 208         ret = idr_replace(idr, ptr, id);
 209         spin_unlock_bh(&cgroup_idr_lock);
 210         return ret;
 211 }
 212
 213 static void cgroup_idr_remove(struct idr *idr, int id)
 214 {
 215         spin_lock_bh(&cgroup_idr_lock);
 216         idr_remove(idr, id);
 217         spin_unlock_bh(&cgroup_idr_lock);
 218 }
 219
 220 /**
 221  * cgroup_css - obtain a cgroup's css for the specified subsystem
 222  * @cgrp: the cgroup of interest
 223  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 224  *
 225  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 226  * function must be called either under cgroup_mutex or rcu_read_lock() and
 227  * the caller is responsible for pinning the returned css if it wants to
 228  * keep accessing it outside the said locks.  This function may return
 229  * %NULL if @cgrp doesn't have @subsys_id enabled.
 230  */
 231 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 232                                               struct cgroup_subsys *ss)
 233 {
 234         if (ss)
 235                 return rcu_dereference_check(cgrp->subsys[ss->id],
 236                                         lockdep_is_held(&cgroup_mutex));
 237         else
 238                 return &cgrp->self;
 239 }
 240
 241 /**
 242  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 243  * @cgrp: the cgroup of interest
 244  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 245  *
 246  * Similar to cgroup_css() but returns the effctive css, which is defined
 247  * as the matching css of the nearest ancestor including self which has @ss
 248  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 249  * function is guaranteed to return non-NULL css.
 250  */
 251 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 252                                                 struct cgroup_subsys *ss)
 253 {
 254         lockdep_assert_held(&cgroup_mutex);
 255
 256         if (!ss)
 257                 return &cgrp->self;
 258
 259         if (!(cgrp->root->subsys_mask & (1 << ss->id)))
 260                 return NULL;
 261
 262         while (cgrp->parent &&
 263                !(cgrp->parent->child_subsys_mask & (1 << ss->id)))
 264                 cgrp = cgrp->parent;
 265
 266         return cgroup_css(cgrp, ss);
 267 }
 268
 269 /* convenient tests for these bits */
 270 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 271 {
 272         return test_bit(CGRP_DEAD, &cgrp->flags);
 273 }
 274
 275 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 276 {
 277         struct cgroup *cgrp = of->kn->parent->priv;
 278         struct cftype *cft = of_cft(of);
 279
 280         /*
 281          * This is open and unprotected implementation of cgroup_css().
 282          * seq_css() is only called from a kernfs file operation which has
 283          * an active reference on the file.  Because all the subsystem
 284          * files are drained before a css is disassociated with a cgroup,
 285          * the matching css from the cgroup's subsys table is guaranteed to
 286          * be and stay valid until the enclosing operation is complete.
 287          */
 288         if (cft->ss)
 289                 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 290         else
 291                 return &cgrp->self;
 292 }
 293 EXPORT_SYMBOL_GPL(of_css);
 294
 295 /**
 296  * cgroup_is_descendant - test ancestry
 297  * @cgrp: the cgroup to be tested
 298  * @ancestor: possible ancestor of @cgrp
 299  *
 300  * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
 301  * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
 302  * and @ancestor are accessible.
 303  */
 304 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 305 {
 306         while (cgrp) {
 307                 if (cgrp == ancestor)
 308                         return true;
 309                 cgrp = cgrp->parent;
 310         }
 311         return false;
 312 }
 313
 314 static int cgroup_is_releasable(const struct cgroup *cgrp)
 315 {
 316         const int bits =
 317                 (1 << CGRP_RELEASABLE) |
 318                 (1 << CGRP_NOTIFY_ON_RELEASE);
 319         return (cgrp->flags & bits) == bits;
 320 }
 321
 322 static int notify_on_release(const struct cgroup *cgrp)
 323 {
 324         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 325 }
 326
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex; the dereference is lockdep-checked
 * against it.
 *
 * Slots of @cgrp->subsys[] that are NULL are skipped: the statement that
 * follows the macro becomes the "else" branch and therefore only runs for
 * non-NULL @css.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
 341
/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Must be called under cgroup_mutex, which cgroup_e_css() asserts.
 * Subsystems for which cgroup_e_css() returns %NULL are skipped; the
 * statement following the macro runs as the "else" branch.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
			;						\
		else
 355
/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * @ss is assigned inside the loop condition; the "|| true" keeps the loop
 * running even if a cgroup_subsys[] slot happens to be NULL, so the body
 * runs exactly CGROUP_SUBSYS_COUNT times.
 */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 364
/* iterate across the hierarchies: every cgroup_root linked on cgroup_roots */
#define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)
 368
/*
 * iterate over child cgrps, lock should be held throughout iteration
 *
 * cgroup_mutex is lockdep-asserted on every step.  Children for which
 * cgroup_is_dead() is true are skipped; the statement following the macro
 * runs as the "else" branch for the remaining ones.
 */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->children, sibling)	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else
 376
/*
 * the list of cgroups eligible for automatic release.  Protected by
 * release_list_lock.
 */
static LIST_HEAD(release_list);
static DEFINE_RAW_SPINLOCK(release_list_lock);
/* work item whose handler is cgroup_release_agent() */
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
 384
/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 */
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/*
	 * list of cgrp_cset_links anchored at cgrp->cset_links; also used
	 * to chain freshly allocated links before they are attached
	 */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
};
 404
/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted.  It contains a pointer to the root state
 * for each subsystem.  Also used to anchor the list of css_sets.  Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
struct css_set init_css_set = {
	.refcount		= ATOMIC_INIT(1),
	/* every list head starts out empty, i.e. pointing at itself */
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
};

/* bumped in find_css_set() and dropped in put_css_set_locked() */
static int css_set_count	= 1;	/* 1 for init_css_set */
 422
 423 /**
 424  * cgroup_update_populated - updated populated count of a cgroup
 425  * @cgrp: the target cgroup
 426  * @populated: inc or dec populated count
 427  *
 428  * @cgrp is either getting the first task (css_set) or losing the last.
 429  * Update @cgrp->populated_cnt accordingly.  The count is propagated
 430  * towards root so that a given cgroup's populated_cnt is zero iff the
 431  * cgroup and all its descendants are empty.
 432  *
 433  * @cgrp's interface file "cgroup.populated" is zero if
 434  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
 435  * changes from or to zero, userland is notified that the content of the
 436  * interface file has changed.  This can be used to detect when @cgrp and
 437  * its descendants become populated or empty.
 438  */
 439 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 440 {
 441         lockdep_assert_held(&css_set_rwsem);
 442
 443         do {
 444                 bool trigger;
 445
 446                 if (populated)
 447                         trigger = !cgrp->populated_cnt++;
 448                 else
 449                         trigger = !--cgrp->populated_cnt;
 450
 451                 if (!trigger)
 452                         break;
 453
 454                 if (cgrp->populated_kn)
 455                         kernfs_notify(cgrp->populated_kn);
 456                 cgrp = cgrp->parent;
 457         } while (cgrp);
 458 }
 459
/*
 * hash table for cgroup groups.  This improves the performance to find
 * an existing css_set.  This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 *
 * Entries are keyed by css_set_hash() of the css_set's subsys[] array.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 467
 468 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 469 {
 470         unsigned long key = 0UL;
 471         struct cgroup_subsys *ss;
 472         int i;
 473
 474         for_each_subsys(ss, i)
 475                 key += (unsigned long)css[i];
 476         key = (key >> 16) ^ key;
 477
 478         return key;
 479 }
 480
 481 static void put_css_set_locked(struct css_set *cset, bool taskexit)
 482 {
 483         struct cgrp_cset_link *link, *tmp_link;
 484         struct cgroup_subsys *ss;
 485         int ssid;
 486
 487         lockdep_assert_held(&css_set_rwsem);
 488
 489         if (!atomic_dec_and_test(&cset->refcount))
 490                 return;
 491
 492         /* This css_set is dead. unlink it and release cgroup refcounts */
 493         for_each_subsys(ss, ssid)
 494                 list_del(&cset->e_cset_node[ssid]);
 495         hash_del(&cset->hlist);
 496         css_set_count--;
 497
 498         list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 499                 struct cgroup *cgrp = link->cgrp;
 500
 501                 list_del(&link->cset_link);
 502                 list_del(&link->cgrp_link);
 503
 504                 /* @cgrp can't go away while we're holding css_set_rwsem */
 505                 if (list_empty(&cgrp->cset_links)) {
 506                         cgroup_update_populated(cgrp, false);
 507                         if (notify_on_release(cgrp)) {
 508                                 if (taskexit)
 509                                         set_bit(CGRP_RELEASABLE, &cgrp->flags);
 510                                 check_for_release(cgrp);
 511                         }
 512                 }
 513
 514                 kfree(link);
 515         }
 516
 517         kfree_rcu(cset, rcu_head);
 518 }
 519
 520 static void put_css_set(struct css_set *cset, bool taskexit)
 521 {
 522         /*
 523          * Ensure that the refcount doesn't hit zero while any readers
 524          * can see it. Similar to atomic_dec_and_lock(), but for an
 525          * rwlock
 526          */
 527         if (atomic_add_unless(&cset->refcount, -1, 1))
 528                 return;
 529
 530         down_write(&css_set_rwsem);
 531         put_css_set_locked(cset, taskexit);
 532         up_write(&css_set_rwsem);
 533 }
 534
 535 /*
 536  * refcounted get/put for css_set objects
 537  */
 538 static inline void get_css_set(struct css_set *cset)
 539 {
 540         atomic_inc(&cset->refcount);
 541 }
 542
 543 /**
 544  * compare_css_sets - helper function for find_existing_css_set().
 545  * @cset: candidate css_set being tested
 546  * @old_cset: existing css_set for a task
 547  * @new_cgrp: cgroup that's being entered by the task
 548  * @template: desired set of css pointers in css_set (pre-calculated)
 549  *
 550  * Returns true if "cset" matches "old_cset" except for the hierarchy
 551  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 552  */
 553 static bool compare_css_sets(struct css_set *cset,
 554                              struct css_set *old_cset,
 555                              struct cgroup *new_cgrp,
 556                              struct cgroup_subsys_state *template[])
 557 {
 558         struct list_head *l1, *l2;
 559
 560         /*
 561          * On the default hierarchy, there can be csets which are
 562          * associated with the same set of cgroups but different csses.
 563          * Let's first ensure that csses match.
 564          */
 565         if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
 566                 return false;
 567
 568         /*
 569          * Compare cgroup pointers in order to distinguish between
 570          * different cgroups in hierarchies.  As different cgroups may
 571          * share the same effective css, this comparison is always
 572          * necessary.
 573          */
 574         l1 = &cset->cgrp_links;
 575         l2 = &old_cset->cgrp_links;
 576         while (1) {
 577                 struct cgrp_cset_link *link1, *link2;
 578                 struct cgroup *cgrp1, *cgrp2;
 579
 580                 l1 = l1->next;
 581                 l2 = l2->next;
 582                 /* See if we reached the end - both lists are equal length. */
 583                 if (l1 == &cset->cgrp_links) {
 584                         BUG_ON(l2 != &old_cset->cgrp_links);
 585                         break;
 586                 } else {
 587                         BUG_ON(l2 == &old_cset->cgrp_links);
 588                 }
 589                 /* Locate the cgroups associated with these links. */
 590                 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
 591                 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
 592                 cgrp1 = link1->cgrp;
 593                 cgrp2 = link2->cgrp;
 594                 /* Hierarchies should be linked in the same order. */
 595                 BUG_ON(cgrp1->root != cgrp2->root);
 596
 597                 /*
 598                  * If this hierarchy is the hierarchy of the cgroup
 599                  * that's changing, then we need to check that this
 600                  * css_set points to the new cgroup; if it's any other
 601                  * hierarchy, then this css_set should point to the
 602                  * same cgroup as the old css_set.
 603                  */
 604                 if (cgrp1->root == new_cgrp->root) {
 605                         if (cgrp1 != new_cgrp)
 606                                 return false;
 607                 } else {
 608                         if (cgrp1 != cgrp2)
 609                                 return false;
 610                 }
 611         }
 612         return true;
 613 }
 614
 615 /**
 616  * find_existing_css_set - init css array and find the matching css_set
 617  * @old_cset: the css_set that we're using before the cgroup transition
 618  * @cgrp: the cgroup that we're moving into
 619  * @template: out param for the new set of csses, should be clear on entry
 620  */
 621 static struct css_set *find_existing_css_set(struct css_set *old_cset,
 622                                         struct cgroup *cgrp,
 623                                         struct cgroup_subsys_state *template[])
 624 {
 625         struct cgroup_root *root = cgrp->root;
 626         struct cgroup_subsys *ss;
 627         struct css_set *cset;
 628         unsigned long key;
 629         int i;
 630
 631         /*
 632          * Build the set of subsystem state objects that we want to see in the
 633          * new css_set. while subsystems can change globally, the entries here
 634          * won't change, so no need for locking.
 635          */
 636         for_each_subsys(ss, i) {
 637                 if (root->subsys_mask & (1UL << i)) {
 638                         /*
 639                          * @ss is in this hierarchy, so we want the
 640                          * effective css from @cgrp.
 641                          */
 642                         template[i] = cgroup_e_css(cgrp, ss);
 643                 } else {
 644                         /*
 645                          * @ss is not in this hierarchy, so we don't want
 646                          * to change the css.
 647                          */
 648                         template[i] = old_cset->subsys[i];
 649                 }
 650         }
 651
 652         key = css_set_hash(template);
 653         hash_for_each_possible(css_set_table, cset, hlist, key) {
 654                 if (!compare_css_sets(cset, old_cset, cgrp, template))
 655                         continue;
 656
 657                 /* This css_set matches what we need */
 658                 return cset;
 659         }
 660
 661         /* No existing cgroup group matched */
 662         return NULL;
 663 }
 664
 665 static void free_cgrp_cset_links(struct list_head *links_to_free)
 666 {
 667         struct cgrp_cset_link *link, *tmp_link;
 668
 669         list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
 670                 list_del(&link->cset_link);
 671                 kfree(link);
 672         }
 673 }
 674
 675 /**
 676  * allocate_cgrp_cset_links - allocate cgrp_cset_links
 677  * @count: the number of links to allocate
 678  * @tmp_links: list_head the allocated links are put on
 679  *
 680  * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 681  * through ->cset_link.  Returns 0 on success or -errno.
 682  */
 683 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 684 {
 685         struct cgrp_cset_link *link;
 686         int i;
 687
 688         INIT_LIST_HEAD(tmp_links);
 689
 690         for (i = 0; i < count; i++) {
 691                 link = kzalloc(sizeof(*link), GFP_KERNEL);
 692                 if (!link) {
 693                         free_cgrp_cset_links(tmp_links);
 694                         return -ENOMEM;
 695                 }
 696                 list_add(&link->cset_link, tmp_links);
 697         }
 698         return 0;
 699 }
 700
 701 /**
 702  * link_css_set - a helper function to link a css_set to a cgroup
 703  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 704  * @cset: the css_set to be linked
 705  * @cgrp: the destination cgroup
 706  */
 707 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 708                          struct cgroup *cgrp)
 709 {
 710         struct cgrp_cset_link *link;
 711
 712         BUG_ON(list_empty(tmp_links));
 713
 714         if (cgroup_on_dfl(cgrp))
 715                 cset->dfl_cgrp = cgrp;
 716
 717         link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
 718         link->cset = cset;
 719         link->cgrp = cgrp;
 720
 721         if (list_empty(&cgrp->cset_links))
 722                 cgroup_update_populated(cgrp, true);
 723         list_move(&link->cset_link, &cgrp->cset_links);
 724
 725         /*
 726          * Always add links to the tail of the list so that the list
 727          * is sorted by order of hierarchy creation
 728          */
 729         list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 730 }
 731
 732 /**
 733  * find_css_set - return a new css_set with one cgroup updated
 734  * @old_cset: the baseline css_set
 735  * @cgrp: the cgroup to be updated
 736  *
 737  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 738  * substituted into the appropriate hierarchy.
 739  */
 740 static struct css_set *find_css_set(struct css_set *old_cset,
 741                                     struct cgroup *cgrp)
 742 {
 743         struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
 744         struct css_set *cset;
 745         struct list_head tmp_links;
 746         struct cgrp_cset_link *link;
 747         struct cgroup_subsys *ss;
 748         unsigned long key;
 749         int ssid;
 750
 751         lockdep_assert_held(&cgroup_mutex);
 752
 753         /* First see if we already have a cgroup group that matches
 754          * the desired set */
 755         down_read(&css_set_rwsem);
 756         cset = find_existing_css_set(old_cset, cgrp, template);
 757         if (cset)
 758                 get_css_set(cset);
 759         up_read(&css_set_rwsem);
 760
 761         if (cset)
 762                 return cset;
 763
 764         cset = kzalloc(sizeof(*cset), GFP_KERNEL);
 765         if (!cset)
 766                 return NULL;
 767
 768         /* Allocate all the cgrp_cset_link objects that we'll need */
 769         if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
 770                 kfree(cset);
 771                 return NULL;
 772         }
 773
 774         atomic_set(&cset->refcount, 1);
 775         INIT_LIST_HEAD(&cset->cgrp_links);
 776         INIT_LIST_HEAD(&cset->tasks);
 777         INIT_LIST_HEAD(&cset->mg_tasks);
 778         INIT_LIST_HEAD(&cset->mg_preload_node);
 779         INIT_LIST_HEAD(&cset->mg_node);
 780         INIT_HLIST_NODE(&cset->hlist);
 781
 782         /* Copy the set of subsystem state objects generated in
 783          * find_existing_css_set() */
 784         memcpy(cset->subsys, template, sizeof(cset->subsys));
 785
 786         down_write(&css_set_rwsem);
 787         /* Add reference counts and links from the new css_set. */
 788         list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 789                 struct cgroup *c = link->cgrp;
 790
 791                 if (c->root == cgrp->root)
 792                         c = cgrp;
 793                 link_css_set(&tmp_links, cset, c);
 794         }
 795
 796         BUG_ON(!list_empty(&tmp_links));
 797
 798         css_set_count++;
 799
 800         /* Add @cset to the hash table */
 801         key = css_set_hash(cset->subsys);
 802         hash_add(css_set_table, &cset->hlist, key);
 803
 804         for_each_subsys(ss, ssid)
 805                 list_add_tail(&cset->e_cset_node[ssid],
 806                               &cset->subsys[ssid]->cgroup->e_csets[ssid]);
 807
 808         up_write(&css_set_rwsem);
 809
 810         return cset;
 811 }
 812
 813 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 814 {
 815         struct cgroup *root_cgrp = kf_root->kn->priv;
 816
 817         return root_cgrp->root;
 818 }
 819
 820 static int cgroup_init_root_id(struct cgroup_root *root)
 821 {
 822         int id;
 823
 824         lockdep_assert_held(&cgroup_mutex);
 825
 826         id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
 827         if (id < 0)
 828                 return id;
 829
 830         root->hierarchy_id = id;
 831         return 0;
 832 }
 833
 834 static void cgroup_exit_root_id(struct cgroup_root *root)
 835 {
 836         lockdep_assert_held(&cgroup_mutex);
 837
 838         if (root->hierarchy_id) {
 839                 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 840                 root->hierarchy_id = 0;
 841         }
 842 }
 843
 844 static void cgroup_free_root(struct cgroup_root *root)
 845 {
 846         if (root) {
 847                 /* hierarhcy ID shoulid already have been released */
 848                 WARN_ON_ONCE(root->hierarchy_id);
 849
 850                 idr_destroy(&root->cgroup_idr);
 851                 kfree(root);
 852         }
 853 }
 854
 855 static void cgroup_destroy_root(struct cgroup_root *root)
 856 {
 857         struct cgroup *cgrp = &root->cgrp;
 858         struct cgrp_cset_link *link, *tmp_link;
 859
 860         mutex_lock(&cgroup_mutex);
 861
 862         BUG_ON(atomic_read(&root->nr_cgrps));
 863         BUG_ON(!list_empty(&cgrp->children));
 864
 865         /* Rebind all subsystems back to the default hierarchy */
 866         rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
 867
 868         /*
 869          * Release all the links from cset_links to this hierarchy's
 870          * root cgroup
 871          */
 872         down_write(&css_set_rwsem);
 873
 874         list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 875                 list_del(&link->cset_link);
 876                 list_del(&link->cgrp_link);
 877                 kfree(link);
 878         }
 879         up_write(&css_set_rwsem);
 880
 881         if (!list_empty(&root->root_list)) {
 882                 list_del(&root->root_list);
 883                 cgroup_root_count--;
 884         }
 885
 886         cgroup_exit_root_id(root);
 887
 888         mutex_unlock(&cgroup_mutex);
 889
 890         kernfs_destroy_root(root->kf_root);
 891         cgroup_free_root(root);
 892 }
 893
 894 /* look up cgroup associated with given css_set on the specified hierarchy */
 895 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 896                                             struct cgroup_root *root)
 897 {
 898         struct cgroup *res = NULL;
 899
 900         lockdep_assert_held(&cgroup_mutex);
 901         lockdep_assert_held(&css_set_rwsem);
 902
 903         if (cset == &init_css_set) {
 904                 res = &root->cgrp;
 905         } else {
 906                 struct cgrp_cset_link *link;
 907
 908                 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 909                         struct cgroup *c = link->cgrp;
 910
 911                         if (c->root == root) {
 912                                 res = c;
 913                                 break;
 914                         }
 915                 }
 916         }
 917
 918         BUG_ON(!res);
 919         return res;
 920 }
 921
 922 /*
 923  * Return the cgroup for "task" from the given hierarchy. Must be
 924  * called with cgroup_mutex and css_set_rwsem held.
 925  */
 926 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 927                                             struct cgroup_root *root)
 928 {
 929         /*
 930          * No need to lock the task - since we hold cgroup_mutex the
 931          * task can't change groups, so the only thing that can happen
 932          * is that it exits and its css is set back to init_css_set.
 933          */
 934         return cset_cgroup_from_root(task_css_set(task), root);
 935 }
 936
 937 /*
 938  * A task must hold cgroup_mutex to modify cgroups.
 939  *
 940  * Any task can increment and decrement the count field without lock.
 941  * So in general, code holding cgroup_mutex can't rely on the count
 942  * field not changing.  However, if the count goes to zero, then only
 943  * cgroup_attach_task() can increment it again.  Because a count of zero
 944  * means that no tasks are currently attached, therefore there is no
 945  * way a task attached to that cgroup can fork (the other way to
 946  * increment the count).  So code holding cgroup_mutex can safely
 947  * assume that if the count is zero, it will stay zero. Similarly, if
 948  * a task holds cgroup_mutex on a cgroup with zero count, it
 949  * knows that the cgroup won't be removed, as cgroup_rmdir()
 950  * needs that mutex.
 951  *
 952  * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 953  * (usually) take cgroup_mutex.  These are the two most performance
 954  * critical pieces of code here.  The exception occurs on cgroup_exit(),
 955  * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
 956  * is taken, and if the cgroup count is zero, a usermode call made
 957  * to the release agent with the name of the cgroup (path relative to
 958  * the root of cgroup file system) as the argument.
 959  *
 960  * A cgroup can only be deleted if both its 'count' of using tasks
 961  * is zero, and its list of 'children' cgroups is empty.  Since all
 962  * tasks in the system use _some_ cgroup, and since there is always at
 963  * least one task in the system (init, pid == 1), therefore, root cgroup
 964  * always has either children cgroups and/or using tasks.  So we don't
 965  * need a special hack to ensure that root cgroup cannot be deleted.
 966  *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task().
 969  */
 970
 971 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
 972 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 973 static const struct file_operations proc_cgroupstats_operations;
 974
 975 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 976                               char *buf)
 977 {
 978         if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
 979             !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
 980                 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
 981                          cft->ss->name, cft->name);
 982         else
 983                 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 984         return buf;
 985 }
 986
 987 /**
 988  * cgroup_file_mode - deduce file mode of a control file
 989  * @cft: the control file in question
 990  *
 991  * returns cft->mode if ->mode is not 0
 992  * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
 993  * returns S_IRUGO if it has only a read handler
 994  * returns S_IWUSR if it has only a write hander
 995  */
 996 static umode_t cgroup_file_mode(const struct cftype *cft)
 997 {
 998         umode_t mode = 0;
 999
1000         if (cft->mode)
1001                 return cft->mode;
1002
1003         if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1004                 mode |= S_IRUGO;
1005
1006         if (cft->write_u64 || cft->write_s64 || cft->write)
1007                 mode |= S_IWUSR;
1008
1009         return mode;
1010 }
1011
1012 static void cgroup_free_fn(struct work_struct *work)
1013 {
1014         struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
1015
1016         atomic_dec(&cgrp->root->nr_cgrps);
1017         cgroup_pidlist_destroy_all(cgrp);
1018
1019         if (cgrp->parent) {
1020                 /*
1021                  * We get a ref to the parent, and put the ref when this
1022                  * cgroup is being freed, so it's guaranteed that the
1023                  * parent won't be destroyed before its children.
1024                  */
1025                 cgroup_put(cgrp->parent);
1026                 kernfs_put(cgrp->kn);
1027                 kfree(cgrp);
1028         } else {
1029                 /*
1030                  * This is root cgroup's refcnt reaching zero, which
1031                  * indicates that the root should be released.
1032                  */
1033                 cgroup_destroy_root(cgrp->root);
1034         }
1035 }
1036
/*
 * RCU callback scheduled by cgroup_put() once the last reference to a
 * cgroup is gone.  It does not free anything itself; it only queues
 * cgroup_free_fn() on cgroup_destroy_wq.  cgroup_free_fn() may end up in
 * cgroup_destroy_root(), which takes cgroup_mutex, so the real release is
 * deferred to a work item (presumably because this callback's context
 * cannot sleep).
 */
static void cgroup_free_rcu(struct rcu_head *head)
{
        struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);

        INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
        queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
}
1044
1045 static void cgroup_get(struct cgroup *cgrp)
1046 {
1047         WARN_ON_ONCE(cgroup_is_dead(cgrp));
1048         WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
1049         atomic_inc(&cgrp->refcnt);
1050 }
1051
1052 static void cgroup_put(struct cgroup *cgrp)
1053 {
1054         if (!atomic_dec_and_test(&cgrp->refcnt))
1055                 return;
1056         if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
1057                 return;
1058
1059         cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
1060         cgrp->id = -1;
1061
1062         call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
1063 }
1064
1065 /**
1066  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1067  * @kn: the kernfs_node being serviced
1068  *
1069  * This helper undoes cgroup_kn_lock_live() and should be invoked before
1070  * the method finishes if locking succeeded.  Note that once this function
1071  * returns the cgroup returned by cgroup_kn_lock_live() may become
1072  * inaccessible any time.  If the caller intends to continue to access the
1073  * cgroup, it should pin it before invoking this function.
1074  */
1075 static void cgroup_kn_unlock(struct kernfs_node *kn)
1076 {
1077         struct cgroup *cgrp;
1078
1079         if (kernfs_type(kn) == KERNFS_DIR)
1080                 cgrp = kn->priv;
1081         else
1082                 cgrp = kn->parent->priv;
1083
1084         mutex_unlock(&cgroup_mutex);
1085
1086         kernfs_unbreak_active_protection(kn);
1087         cgroup_put(cgrp);
1088 }
1089
1090 /**
1091  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1092  * @kn: the kernfs_node being serviced
1093  *
1094  * This helper is to be used by a cgroup kernfs method currently servicing
1095  * @kn.  It breaks the active protection, performs cgroup locking and
1096  * verifies that the associated cgroup is alive.  Returns the cgroup if
1097  * alive; otherwise, %NULL.  A successful return should be undone by a
1098  * matching cgroup_kn_unlock() invocation.
1099  *
1100  * Any cgroup kernfs method implementation which requires locking the
1101  * associated cgroup should use this helper.  It avoids nesting cgroup
1102  * locking under kernfs active protection and allows all kernfs operations
1103  * including self-removal.
1104  */
1105 static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
1106 {
1107         struct cgroup *cgrp;
1108
1109         if (kernfs_type(kn) == KERNFS_DIR)
1110                 cgrp = kn->priv;
1111         else
1112                 cgrp = kn->parent->priv;
1113
1114         /*
1115          * We're gonna grab cgroup_mutex which nests outside kernfs
1116          * active_ref.  cgroup liveliness check alone provides enough
1117          * protection against removal.  Ensure @cgrp stays accessible and
1118          * break the active_ref protection.
1119          */
1120         cgroup_get(cgrp);
1121         kernfs_break_active_protection(kn);
1122
1123         mutex_lock(&cgroup_mutex);
1124
1125         if (!cgroup_is_dead(cgrp))
1126                 return cgrp;
1127
1128         cgroup_kn_unlock(kn);
1129         return NULL;
1130 }
1131
1132 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1133 {
1134         char name[CGROUP_FILE_NAME_MAX];
1135
1136         lockdep_assert_held(&cgroup_mutex);
1137         kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1138 }
1139
1140 /**
1141  * cgroup_clear_dir - remove subsys files in a cgroup directory
1142  * @cgrp: target cgroup
1143  * @subsys_mask: mask of the subsystem ids whose files should be removed
1144  */
1145 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
1146 {
1147         struct cgroup_subsys *ss;
1148         int i;
1149
1150         for_each_subsys(ss, i) {
1151                 struct cftype *cfts;
1152
1153                 if (!(subsys_mask & (1 << i)))
1154                         continue;
1155                 list_for_each_entry(cfts, &ss->cfts, node)
1156                         cgroup_addrm_files(cgrp, cfts, false);
1157         }
1158 }
1159
/*
 * rebind_subsystems - move subsystems onto a different hierarchy root
 * @dst_root: root the subsystems are moved to
 * @ss_mask: bitmask, indexed by subsystem id, of the subsystems to move
 *
 * Must be called with cgroup_mutex held.
 *
 * Returns 0 on success.  Returns -EBUSY, before anything is modified, if
 * a selected subsystem has child csses under its current root or if the
 * move is between two roots neither of which is cgrp_dfl_root.  If
 * cgroup_populate_dir() fails and @dst_root is not cgrp_dfl_root its
 * error is returned; a failure while moving to cgrp_dfl_root is at most
 * warned about and the rebind proceeds.
 */
static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
{
        struct cgroup_subsys *ss;
        int ssid, i, ret;

        lockdep_assert_held(&cgroup_mutex);

        /* pass 1: verify that every selected subsystem may be moved */
        for_each_subsys(ss, ssid) {
                if (!(ss_mask & (1 << ssid)))
                        continue;

                /* if @ss has non-root csses attached to it, can't move */
                if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
                        return -EBUSY;

                /* can't move between two non-dummy roots either */
                if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                        return -EBUSY;
        }

        /* create the subsystems' files in the destination root directory */
        ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
        if (ret) {
                if (dst_root != &cgrp_dfl_root)
                        return ret;

                /*
                 * Rebinding back to the default root is not allowed to
                 * fail.  Using both default and non-default roots should
                 * be rare.  Moving subsystems back and forth even more so.
                 * Just warn about it and continue.
                 */
                if (cgrp_dfl_root_visible) {
                        pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
                                ret, ss_mask);
                        pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
                }
        }

        /*
         * Nothing can fail from this point on.  Remove files for the
         * removed subsystems and rebind each subsystem.
         */
        for_each_subsys(ss, ssid)
                if (ss_mask & (1 << ssid))
                        cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);

        for_each_subsys(ss, ssid) {
                struct cgroup_root *src_root;
                struct cgroup_subsys_state *css;
                struct css_set *cset;

                if (!(ss_mask & (1 << ssid)))
                        continue;

                src_root = ss->root;
                css = cgroup_css(&src_root->cgrp, ss);

                /* the source must have the css, the destination must not */
                WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));

                /* hand the root css over from the source to the destination */
                RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
                rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
                ss->root = dst_root;
                css->cgroup = &dst_root->cgrp;

                /*
                 * Move the e_cset_node for this subsystem of every css_set
                 * in the hash table onto the destination root cgroup's
                 * e_csets list.
                 */
                down_write(&css_set_rwsem);
                hash_for_each(css_set_table, i, cset, hlist)
                        list_move_tail(&cset->e_cset_node[ss->id],
                                       &dst_root->cgrp.e_csets[ss->id]);
                up_write(&css_set_rwsem);

                src_root->subsys_mask &= ~(1 << ssid);
                src_root->cgrp.child_subsys_mask &= ~(1 << ssid);

                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
                if (dst_root != &cgrp_dfl_root)
                        dst_root->cgrp.child_subsys_mask |= 1 << ssid;

                /* optional subsystem callback, invoked once @ss is rebound */
                if (ss->bind)
                        ss->bind(css);
        }

        kernfs_activate(dst_root->cgrp.kn);
        return 0;
}
1245
1246 static int cgroup_show_options(struct seq_file *seq,
1247                                struct kernfs_root *kf_root)
1248 {
1249         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1250         struct cgroup_subsys *ss;
1251         int ssid;
1252
1253         for_each_subsys(ss, ssid)
1254                 if (root->subsys_mask & (1 << ssid))
1255                         seq_printf(seq, ",%s", ss->name);
1256         if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1257                 seq_puts(seq, ",sane_behavior");
1258         if (root->flags & CGRP_ROOT_NOPREFIX)
1259                 seq_puts(seq, ",noprefix");
1260         if (root->flags & CGRP_ROOT_XATTR)
1261                 seq_puts(seq, ",xattr");
1262
1263         spin_lock(&release_agent_path_lock);
1264         if (strlen(root->release_agent_path))
1265                 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1266         spin_unlock(&release_agent_path_lock);
1267
1268         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1269                 seq_puts(seq, ",clone_children");
1270         if (strlen(root->name))
1271                 seq_printf(seq, ",name=%s", root->name);
1272         return 0;
1273 }
1274
/*
 * Mount options as parsed by parse_cgroupfs_options().
 *
 * @release_agent and @name, when non-NULL, are kstrndup()'d by the parser
 * and are not freed by it; cgroup_remount() kfree()s both when done.
 */
struct cgroup_sb_opts {
        /* bitmask of requested subsystems, bit N <=> subsystem id N */
        unsigned int subsys_mask;
        /* CGRP_ROOT_* flags requested through mount options */
        unsigned int flags;
        /* value of "release_agent=", NULL if not given */
        char *release_agent;
        /* "clone_children" was given */
        bool cpuset_clone_children;
        /* value of "name=", NULL if not given */
        char *name;
        /* User explicitly requested empty subsystem */
        bool none;
};
1284
1285 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1286 {
1287         char *token, *o = data;
1288         bool all_ss = false, one_ss = false;
1289         unsigned int mask = -1U;
1290         struct cgroup_subsys *ss;
1291         int i;
1292
1293 #ifdef CONFIG_CPUSETS
1294         mask = ~(1U << cpuset_cgrp_id);
1295 #endif
1296
1297         memset(opts, 0, sizeof(*opts));
1298
1299         while ((token = strsep(&o, ",")) != NULL) {
1300                 if (!*token)
1301                         return -EINVAL;
1302                 if (!strcmp(token, "none")) {
1303                         /* Explicitly have no subsystems */
1304                         opts->none = true;
1305                         continue;
1306                 }
1307                 if (!strcmp(token, "all")) {
1308                         /* Mutually exclusive option 'all' + subsystem name */
1309                         if (one_ss)
1310                                 return -EINVAL;
1311                         all_ss = true;
1312                         continue;
1313                 }
1314                 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1315                         opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1316                         continue;
1317                 }
1318                 if (!strcmp(token, "noprefix")) {
1319                         opts->flags |= CGRP_ROOT_NOPREFIX;
1320                         continue;
1321                 }
1322                 if (!strcmp(token, "clone_children")) {
1323                         opts->cpuset_clone_children = true;
1324                         continue;
1325                 }
1326                 if (!strcmp(token, "xattr")) {
1327                         opts->flags |= CGRP_ROOT_XATTR;
1328                         continue;
1329                 }
1330                 if (!strncmp(token, "release_agent=", 14)) {
1331                         /* Specifying two release agents is forbidden */
1332                         if (opts->release_agent)
1333                                 return -EINVAL;
1334                         opts->release_agent =
1335                                 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1336                         if (!opts->release_agent)
1337                                 return -ENOMEM;
1338                         continue;
1339                 }
1340                 if (!strncmp(token, "name=", 5)) {
1341                         const char *name = token + 5;
1342                         /* Can't specify an empty name */
1343                         if (!strlen(name))
1344                                 return -EINVAL;
1345                         /* Must match [\w.-]+ */
1346                         for (i = 0; i < strlen(name); i++) {
1347                                 char c = name[i];
1348                                 if (isalnum(c))
1349                                         continue;
1350                                 if ((c == '.') || (c == '-') || (c == '_'))
1351                                         continue;
1352                                 return -EINVAL;
1353                         }
1354                         /* Specifying two names is forbidden */
1355                         if (opts->name)
1356                                 return -EINVAL;
1357                         opts->name = kstrndup(name,
1358                                               MAX_CGROUP_ROOT_NAMELEN - 1,
1359                                               GFP_KERNEL);
1360                         if (!opts->name)
1361                                 return -ENOMEM;
1362
1363                         continue;
1364                 }
1365
1366                 for_each_subsys(ss, i) {
1367                         if (strcmp(token, ss->name))
1368                                 continue;
1369                         if (ss->disabled)
1370                                 continue;
1371
1372                         /* Mutually exclusive option 'all' + subsystem name */
1373                         if (all_ss)
1374                                 return -EINVAL;
1375                         opts->subsys_mask |= (1 << i);
1376                         one_ss = true;
1377
1378                         break;
1379                 }
1380                 if (i == CGROUP_SUBSYS_COUNT)
1381                         return -ENOENT;
1382         }
1383
1384         /* Consistency checks */
1385
1386         if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1387                 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1388
1389                 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1390                     opts->cpuset_clone_children || opts->release_agent ||
1391                     opts->name) {
1392                         pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1393                         return -EINVAL;
1394                 }
1395         } else {
1396                 /*
1397                  * If the 'all' option was specified select all the
1398                  * subsystems, otherwise if 'none', 'name=' and a subsystem
1399                  * name options were not specified, let's default to 'all'
1400                  */
1401                 if (all_ss || (!one_ss && !opts->none && !opts->name))
1402                         for_each_subsys(ss, i)
1403                                 if (!ss->disabled)
1404                                         opts->subsys_mask |= (1 << i);
1405
1406                 /*
1407                  * We either have to specify by name or by subsystems. (So
1408                  * all empty hierarchies must have a name).
1409                  */
1410                 if (!opts->subsys_mask && !opts->name)
1411                         return -EINVAL;
1412         }
1413
1414         /*
1415          * Option noprefix was introduced just for backward compatibility
1416          * with the old cpuset, so we allow noprefix only if mounting just
1417          * the cpuset subsystem.
1418          */
1419         if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1420                 return -EINVAL;
1421
1422
1423         /* Can't specify "none" and some subsystems */
1424         if (opts->subsys_mask && opts->none)
1425                 return -EINVAL;
1426
1427         return 0;
1428 }
1429
1430 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1431 {
1432         int ret = 0;
1433         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1434         struct cgroup_sb_opts opts;
1435         unsigned int added_mask, removed_mask;
1436
1437         if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1438                 pr_err("sane_behavior: remount is not allowed\n");
1439                 return -EINVAL;
1440         }
1441
1442         mutex_lock(&cgroup_mutex);
1443
1444         /* See what subsystems are wanted */
1445         ret = parse_cgroupfs_options(data, &opts);
1446         if (ret)
1447                 goto out_unlock;
1448
1449         if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1450                 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1451                         task_tgid_nr(current), current->comm);
1452
1453         added_mask = opts.subsys_mask & ~root->subsys_mask;
1454         removed_mask = root->subsys_mask & ~opts.subsys_mask;
1455
1456         /* Don't allow flags or name to change at remount */
1457         if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1458             (opts.name && strcmp(opts.name, root->name))) {
1459                 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1460                        opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1461                        root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1462                 ret = -EINVAL;
1463                 goto out_unlock;
1464         }
1465
1466         /* remounting is not allowed for populated hierarchies */
1467         if (!list_empty(&root->cgrp.children)) {
1468                 ret = -EBUSY;
1469                 goto out_unlock;
1470         }
1471
1472         ret = rebind_subsystems(root, added_mask);
1473         if (ret)
1474                 goto out_unlock;
1475
1476         rebind_subsystems(&cgrp_dfl_root, removed_mask);
1477
1478         if (opts.release_agent) {
1479                 spin_lock(&release_agent_path_lock);
1480                 strcpy(root->release_agent_path, opts.release_agent);
1481                 spin_unlock(&release_agent_path_lock);
1482         }
1483  out_unlock:
1484         kfree(opts.release_agent);
1485         kfree(opts.name);
1486         mutex_unlock(&cgroup_mutex);
1487         return ret;
1488 }
1489
/*
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in other
 * words after the first mount.
 *
 * This flag records whether those lists are being maintained.  It is set
 * to true, under css_set_rwsem, by cgroup_enable_task_cg_lists().
 */
static bool use_task_css_set_links __read_mostly;
1497
/*
 * Start maintaining the per-css_set task lists.
 *
 * On the first call this sets use_task_css_set_links and walks every
 * thread in the system, linking each one that is not exiting onto the
 * ->tasks list of its css_set and taking a css_set reference for it.
 * Later calls find the flag already set and return without doing
 * anything.  The whole operation runs under css_set_rwsem held for
 * writing.
 */
static void cgroup_enable_task_cg_lists(void)
{
        struct task_struct *p, *g;

        down_write(&css_set_rwsem);

        /* already enabled by an earlier call, nothing left to do */
        if (use_task_css_set_links)
                goto out_unlock;

        use_task_css_set_links = true;

        /*
         * We need tasklist_lock because RCU is not safe against
         * while_each_thread(). Besides, a forking task that has passed
         * cgroup_post_fork() without seeing use_task_css_set_links = 1
         * is not guaranteed to have its child immediately visible in the
         * tasklist if we walk through it with RCU.
         */
        read_lock(&tasklist_lock);
        do_each_thread(g, p) {
                /* no task is expected to be linked or moved off init_css_set yet */
                WARN_ON_ONCE(!list_empty(&p->cg_list) ||
                             task_css_set(p) != &init_css_set);

                /*
                 * We should check if the process is exiting, otherwise
                 * it will race with cgroup_exit() in that the list
                 * entry won't be deleted though the process has exited.
                 * Do it while holding siglock so that we don't end up
                 * racing against cgroup_exit().
                 */
                spin_lock_irq(&p->sighand->siglock);
                if (!(p->flags & PF_EXITING)) {
                        struct css_set *cset = task_css_set(p);

                        /* each linked task holds its own css_set reference */
                        list_add(&p->cg_list, &cset->tasks);
                        get_css_set(cset);
                }
                spin_unlock_irq(&p->sighand->siglock);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
out_unlock:
        up_write(&css_set_rwsem);
}
1541
1542 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1543 {
1544         struct cgroup_subsys *ss;
1545         int ssid;
1546
1547         atomic_set(&cgrp->refcnt, 1);
1548         INIT_LIST_HEAD(&cgrp->sibling);
1549         INIT_LIST_HEAD(&cgrp->children);
1550         INIT_LIST_HEAD(&cgrp->cset_links);
1551         INIT_LIST_HEAD(&cgrp->release_list);
1552         INIT_LIST_HEAD(&cgrp->pidlists);
1553         mutex_init(&cgrp->pidlist_mutex);
1554         cgrp->self.cgroup = cgrp;
1555
1556         for_each_subsys(ss, ssid)
1557                 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1558
1559         init_waitqueue_head(&cgrp->offline_waitq);
1560 }
1561
1562 static void init_cgroup_root(struct cgroup_root *root,
1563                              struct cgroup_sb_opts *opts)
1564 {
1565         struct cgroup *cgrp = &root->cgrp;
1566
1567         INIT_LIST_HEAD(&root->root_list);
1568         atomic_set(&root->nr_cgrps, 1);
1569         cgrp->root = root;
1570         init_cgroup_housekeeping(cgrp);
1571         idr_init(&root->cgroup_idr);
1572
1573         root->flags = opts->flags;
1574         if (opts->release_agent)
1575                 strcpy(root->release_agent_path, opts->release_agent);
1576         if (opts->name)
1577                 strcpy(root->name, opts->name);
1578         if (opts->cpuset_clone_children)
1579                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1580 }
1581
/*
 * cgroup_setup_root - bring a new hierarchy root into service
 * @root: root to set up
 * @ss_mask: bitmask of the subsystems to bind to @root
 *
 * Must be called with cgroup_mutex held.  Allocates the root cgroup's ID
 * and the hierarchy ID, creates the kernfs root, adds the base files,
 * binds the subsystems in @ss_mask, puts @root on cgroup_roots and links
 * the root cgroup into every existing css_set.
 *
 * Returns 0 on success or a negative errno.  On failure the kernfs root
 * and the hierarchy ID acquired so far are released again; the root
 * cgroup's ID is not released here.
 */
static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
        struct css_set *cset;
        int i, ret;

        lockdep_assert_held(&cgroup_mutex);

        /* the [1, 2) range means the root cgroup can only receive ID 1 */
        ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
        if (ret < 0)
                goto out;
        root_cgrp->id = ret;

        /*
         * We're accessing css_set_count without locking css_set_rwsem here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_lock, and that's us. The worst that can happen is that we
         * have some link structures left over
         */
        ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
        if (ret)
                goto out;

        ret = cgroup_init_root_id(root);
        if (ret)
                goto out;

        /* created deactivated; kernfs_activate() is called at the end */
        root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
                goto exit_root_id;
        }
        root_cgrp->kn = root->kf_root->kn;

        ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
        if (ret)
                goto destroy_root;

        ret = rebind_subsystems(root, ss_mask);
        if (ret)
                goto destroy_root;

        /*
         * There must be no failure case after here, since rebinding takes
         * care of subsystems' refcounts, which are explicitly dropped in
         * the failure exit path.
         */
        list_add(&root->root_list, &cgroup_roots);
        cgroup_root_count++;

        /*
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
        down_write(&css_set_rwsem);
        hash_for_each(css_set_table, i, cset, hlist)
                link_css_set(&tmp_links, cset, root_cgrp);
        up_write(&css_set_rwsem);

        BUG_ON(!list_empty(&root_cgrp->children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);

        kernfs_activate(root_cgrp->kn);
        ret = 0;
        /* success also goes through "out" to free any left-over links */
        goto out;

destroy_root:
        kernfs_destroy_root(root->kf_root);
        root->kf_root = NULL;
exit_root_id:
        cgroup_exit_root_id(root);
out:
        free_cgrp_cset_links(&tmp_links);
        return ret;
}
1660
1661 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1662                          int flags, const char *unused_dev_name,
1663                          void *data)
1664 {
1665         struct cgroup_root *root;
1666         struct cgroup_sb_opts opts;
1667         struct dentry *dentry;
1668         int ret;
1669         bool new_sb;
1670
1671         /*
1672          * The first time anyone tries to mount a cgroup, enable the list
1673          * linking each css_set to its tasks and fix up all existing tasks.
1674          */
1675         if (!use_task_css_set_links)
1676                 cgroup_enable_task_cg_lists();
1677
1678         mutex_lock(&cgroup_mutex);
1679
1680         /* First find the desired set of subsystems */
1681         ret = parse_cgroupfs_options(data, &opts);
1682         if (ret)
1683                 goto out_unlock;
1684
1685         /* look for a matching existing root */
1686         if (!opts.subsys_mask && !opts.none && !opts.name) {
1687                 cgrp_dfl_root_visible = true;
1688                 root = &cgrp_dfl_root;
1689                 cgroup_get(&root->cgrp);
1690                 ret = 0;
1691                 goto out_unlock;
1692         }
1693
1694         for_each_root(root) {
1695                 bool name_match = false;
1696
1697                 if (root == &cgrp_dfl_root)
1698                         continue;
1699
1700                 /*
1701                  * If we asked for a name then it must match.  Also, if
1702                  * name matches but sybsys_mask doesn't, we should fail.
1703                  * Remember whether name matched.
1704                  */
1705                 if (opts.name) {
1706                         if (strcmp(opts.name, root->name))
1707                                 continue;
1708                         name_match = true;
1709                 }
1710
1711                 /*
1712                  * If we asked for subsystems (or explicitly for no
1713                  * subsystems) then they must match.
1714                  */
1715                 if ((opts.subsys_mask || opts.none) &&
1716                     (opts.subsys_mask != root->subsys_mask)) {
1717                         if (!name_match)
1718                                 continue;
1719                         ret = -EBUSY;
1720                         goto out_unlock;
1721                 }
1722
1723                 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1724                         if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1725                                 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1726                                 ret = -EINVAL;
1727                                 goto out_unlock;
1728                         } else {
1729                                 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1730                         }
1731                 }
1732
1733                 /*
1734                  * A root's lifetime is governed by its root cgroup.  Zero
1735                  * ref indicate that the root is being destroyed.  Wait for
1736                  * destruction to complete so that the subsystems are free.
1737                  * We can use wait_queue for the wait but this path is
1738                  * super cold.  Let's just sleep for a bit and retry.
1739                  */
1740                 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1741                         mutex_unlock(&cgroup_mutex);
1742                         msleep(10);
1743                         ret = restart_syscall();
1744                         goto out_free;
1745                 }
1746
1747                 ret = 0;
1748                 goto out_unlock;
1749         }
1750
1751         /*
1752          * No such thing, create a new one.  name= matching without subsys
1753          * specification is allowed for already existing hierarchies but we
1754          * can't create new one without subsys specification.
1755          */
1756         if (!opts.subsys_mask && !opts.none) {
1757                 ret = -EINVAL;
1758                 goto out_unlock;
1759         }
1760
1761         root = kzalloc(sizeof(*root), GFP_KERNEL);
1762         if (!root) {
1763                 ret = -ENOMEM;
1764                 goto out_unlock;
1765         }
1766
1767         init_cgroup_root(root, &opts);
1768
1769         ret = cgroup_setup_root(root, opts.subsys_mask);
1770         if (ret)
1771                 cgroup_free_root(root);
1772
1773 out_unlock:
1774         mutex_unlock(&cgroup_mutex);
1775 out_free:
1776         kfree(opts.release_agent);
1777         kfree(opts.name);
1778
1779         if (ret)
1780                 return ERR_PTR(ret);
1781
1782         dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
1783         if (IS_ERR(dentry) || !new_sb)
1784                 cgroup_put(&root->cgrp);
1785         return dentry;
1786 }
1787
1788 static void cgroup_kill_sb(struct super_block *sb)
1789 {
1790         struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1791         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1792
1793         cgroup_put(&root->cgrp);
1794         kernfs_kill_sb(sb);
1795 }
1796
1797 static struct file_system_type cgroup_fs_type = {
1798         .name = "cgroup",
1799         .mount = cgroup_mount,
1800         .kill_sb = cgroup_kill_sb,
1801 };
1802
1803 static struct kobject *cgroup_kobj;
1804
1805 /**
1806  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1807  * @task: target task
1808  * @buf: the buffer to write the path into
1809  * @buflen: the length of the buffer
1810  *
1811  * Determine @task's cgroup on the first (the one with the lowest non-zero
1812  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
1813  * function grabs cgroup_mutex and shouldn't be used inside locks used by
1814  * cgroup controller callbacks.
1815  *
1816  * Return value is the same as kernfs_path().
1817  */
1818 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1819 {
1820         struct cgroup_root *root;
1821         struct cgroup *cgrp;
1822         int hierarchy_id = 1;
1823         char *path = NULL;
1824
1825         mutex_lock(&cgroup_mutex);
1826         down_read(&css_set_rwsem);
1827
1828         root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1829
1830         if (root) {
1831                 cgrp = task_cgroup_from_root(task, root);
1832                 path = cgroup_path(cgrp, buf, buflen);
1833         } else {
1834                 /* if no hierarchy exists, everyone is in "/" */
1835                 if (strlcpy(buf, "/", buflen) < buflen)
1836                         path = buf;
1837         }
1838
1839         up_read(&css_set_rwsem);
1840         mutex_unlock(&cgroup_mutex);
1841         return path;
1842 }
1843 EXPORT_SYMBOL_GPL(task_cgroup_path);
1844
1845 /* used to track tasks and other necessary states during migration */
1846 struct cgroup_taskset {
1847         /* the src and dst cset list running through cset->mg_node */
1848         struct list_head        src_csets;
1849         struct list_head        dst_csets;
1850
1851         /*
1852          * Fields for cgroup_taskset_*() iteration.
1853          *
1854          * Before migration is committed, the target migration tasks are on
1855          * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
1856          * the csets on ->dst_csets.  ->csets point to either ->src_csets
1857          * or ->dst_csets depending on whether migration is committed.
1858          *
1859          * ->cur_csets and ->cur_task point to the current task position
1860          * during iteration.
1861          */
1862         struct list_head        *csets;
1863         struct css_set          *cur_cset;
1864         struct task_struct      *cur_task;
1865 };
1866
1867 /**
1868  * cgroup_taskset_first - reset taskset and return the first task
1869  * @tset: taskset of interest
1870  *
1871  * @tset iteration is initialized and the first task is returned.
1872  */
1873 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1874 {
1875         tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1876         tset->cur_task = NULL;
1877
1878         return cgroup_taskset_next(tset);
1879 }
1880
1881 /**
1882  * cgroup_taskset_next - iterate to the next task in taskset
1883  * @tset: taskset of interest
1884  *
1885  * Return the next task in @tset.  Iteration must have been initialized
1886  * with cgroup_taskset_first().
1887  */
1888 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1889 {
1890         struct css_set *cset = tset->cur_cset;
1891         struct task_struct *task = tset->cur_task;
1892
1893         while (&cset->mg_node != tset->csets) {
1894                 if (!task)
1895                         task = list_first_entry(&cset->mg_tasks,
1896                                                 struct task_struct, cg_list);
1897                 else
1898                         task = list_next_entry(task, cg_list);
1899
1900                 if (&task->cg_list != &cset->mg_tasks) {
1901                         tset->cur_cset = cset;
1902                         tset->cur_task = task;
1903                         return task;
1904                 }
1905
1906                 cset = list_next_entry(cset, mg_node);
1907                 task = NULL;
1908         }
1909
1910         return NULL;
1911 }
1912
1913 /**
1914  * cgroup_task_migrate - move a task from one cgroup to another.
1915  * @old_cgrp: the cgroup @tsk is being migrated from
1916  * @tsk: the task being migrated
1917  * @new_cset: the new css_set @tsk is being attached to
1918  *
1919  * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1920  */
1921 static void cgroup_task_migrate(struct cgroup *old_cgrp,
1922                                 struct task_struct *tsk,
1923                                 struct css_set *new_cset)
1924 {
1925         struct css_set *old_cset;
1926
1927         lockdep_assert_held(&cgroup_mutex);
1928         lockdep_assert_held(&css_set_rwsem);
1929
1930         /*
1931          * We are synchronized through threadgroup_lock() against PF_EXITING
1932          * setting such that we can't race against cgroup_exit() changing the
1933          * css_set to init_css_set and dropping the old one.
1934          */
1935         WARN_ON_ONCE(tsk->flags & PF_EXITING);
1936         old_cset = task_css_set(tsk);
1937
1938         get_css_set(new_cset);
1939         rcu_assign_pointer(tsk->cgroups, new_cset);
1940
1941         /*
1942          * Use move_tail so that cgroup_taskset_first() still returns the
1943          * leader after migration.  This works because cgroup_migrate()
1944          * ensures that the dst_cset of the leader is the first on the
1945          * tset's dst_csets list.
1946          */
1947         list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1948
1949         /*
1950          * We just gained a reference on old_cset by taking it from the
1951          * task. As trading it for new_cset is protected by cgroup_mutex,
1952          * we're safe to drop it here; it will be freed under RCU.
1953          */
1954         set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1955         put_css_set_locked(old_cset, false);
1956 }
1957
1958 /**
1959  * cgroup_migrate_finish - cleanup after attach
1960  * @preloaded_csets: list of preloaded css_sets
1961  *
1962  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
1963  * those functions for details.
1964  */
1965 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1966 {
1967         struct css_set *cset, *tmp_cset;
1968
1969         lockdep_assert_held(&cgroup_mutex);
1970
1971         down_write(&css_set_rwsem);
1972         list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1973                 cset->mg_src_cgrp = NULL;
1974                 cset->mg_dst_cset = NULL;
1975                 list_del_init(&cset->mg_preload_node);
1976                 put_css_set_locked(cset, false);
1977         }
1978         up_write(&css_set_rwsem);
1979 }
1980
1981 /**
1982  * cgroup_migrate_add_src - add a migration source css_set
1983  * @src_cset: the source css_set to add
1984  * @dst_cgrp: the destination cgroup
1985  * @preloaded_csets: list of preloaded css_sets
1986  *
1987  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
1988  * @src_cset and add it to @preloaded_csets, which should later be cleaned
1989  * up by cgroup_migrate_finish().
1990  *
1991  * This function may be called without holding threadgroup_lock even if the
1992  * target is a process.  Threads may be created and destroyed but as long
1993  * as cgroup_mutex is not dropped, no new css_set can be put into play and
1994  * the preloaded css_sets are guaranteed to cover all migrations.
1995  */
1996 static void cgroup_migrate_add_src(struct css_set *src_cset,
1997                                    struct cgroup *dst_cgrp,
1998                                    struct list_head *preloaded_csets)
1999 {
2000         struct cgroup *src_cgrp;
2001
2002         lockdep_assert_held(&cgroup_mutex);
2003         lockdep_assert_held(&css_set_rwsem);
2004
2005         src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2006
2007         if (!list_empty(&src_cset->mg_preload_node))
2008                 return;
2009
2010         WARN_ON(src_cset->mg_src_cgrp);
2011         WARN_ON(!list_empty(&src_cset->mg_tasks));
2012         WARN_ON(!list_empty(&src_cset->mg_node));
2013
2014         src_cset->mg_src_cgrp = src_cgrp;
2015         get_css_set(src_cset);
2016         list_add(&src_cset->mg_preload_node, preloaded_csets);
2017 }
2018
2019 /**
2020  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2021  * @dst_cgrp: the destination cgroup (may be %NULL)
2022  * @preloaded_csets: list of preloaded source css_sets
2023  *
2024  * Tasks are about to be moved to @dst_cgrp and all the source css_sets
2025  * have been preloaded to @preloaded_csets.  This function looks up and
2026  * pins all destination css_sets, links each to its source, and append them
2027  * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
2028  * source css_set is assumed to be its cgroup on the default hierarchy.
2029  *
2030  * This function must be called after cgroup_migrate_add_src() has been
2031  * called on each migration source css_set.  After migration is performed
2032  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2033  * @preloaded_csets.
2034  */
2035 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2036                                       struct list_head *preloaded_csets)
2037 {
2038         LIST_HEAD(csets);
2039         struct css_set *src_cset, *tmp_cset;
2040
2041         lockdep_assert_held(&cgroup_mutex);
2042
2043         /*
2044          * Except for the root, child_subsys_mask must be zero for a cgroup
2045          * with tasks so that child cgroups don't compete against tasks.
2046          */
2047         if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent &&
2048             dst_cgrp->child_subsys_mask)
2049                 return -EBUSY;
2050
2051         /* look up the dst cset for each src cset and link it to src */
2052         list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
2053                 struct css_set *dst_cset;
2054
2055                 dst_cset = find_css_set(src_cset,
2056                                         dst_cgrp ?: src_cset->dfl_cgrp);
2057                 if (!dst_cset)
2058                         goto err;
2059
2060                 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2061
2062                 /*
2063                  * If src cset equals dst, it's noop.  Drop the src.
2064                  * cgroup_migrate() will skip the cset too.  Note that we
2065                  * can't handle src == dst as some nodes are used by both.
2066                  */
2067                 if (src_cset == dst_cset) {
2068                         src_cset->mg_src_cgrp = NULL;
2069                         list_del_init(&src_cset->mg_preload_node);
2070                         put_css_set(src_cset, false);
2071                         put_css_set(dst_cset, false);
2072                         continue;
2073                 }
2074
2075                 src_cset->mg_dst_cset = dst_cset;
2076
2077                 if (list_empty(&dst_cset->mg_preload_node))
2078                         list_add(&dst_cset->mg_preload_node, &csets);
2079                 else
2080                         put_css_set(dst_cset, false);
2081         }
2082
2083         list_splice_tail(&csets, preloaded_csets);
2084         return 0;
2085 err:
2086         cgroup_migrate_finish(&csets);
2087         return -ENOMEM;
2088 }
2089
2090 /**
2091  * cgroup_migrate - migrate a process or task to a cgroup
2092  * @cgrp: the destination cgroup
2093  * @leader: the leader of the process or the task to migrate
2094  * @threadgroup: whether @leader points to the whole process or a single task
2095  *
2096  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
2097  * process, the caller must be holding threadgroup_lock of @leader.  The
2098  * caller is also responsible for invoking cgroup_migrate_add_src() and
2099  * cgroup_migrate_prepare_dst() on the targets before invoking this
2100  * function and following up with cgroup_migrate_finish().
2101  *
2102  * As long as a controller's ->can_attach() doesn't fail, this function is
2103  * guaranteed to succeed.  This means that, excluding ->can_attach()
2104  * failure, when migrating multiple targets, the success or failure can be
2105  * decided for all targets by invoking group_migrate_prepare_dst() before
2106  * actually starting migrating.
2107  */
2108 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2109                           bool threadgroup)
2110 {
2111         struct cgroup_taskset tset = {
2112                 .src_csets      = LIST_HEAD_INIT(tset.src_csets),
2113                 .dst_csets      = LIST_HEAD_INIT(tset.dst_csets),
2114                 .csets          = &tset.src_csets,
2115         };
2116         struct cgroup_subsys_state *css, *failed_css = NULL;
2117         struct css_set *cset, *tmp_cset;
2118         struct task_struct *task, *tmp_task;
2119         int i, ret;
2120
2121         /*
2122          * Prevent freeing of tasks while we take a snapshot. Tasks that are
2123          * already PF_EXITING could be freed from underneath us unless we
2124          * take an rcu_read_lock.
2125          */
2126         down_write(&css_set_rwsem);
2127         rcu_read_lock();
2128         task = leader;
2129         do {
2130                 /* @task either already exited or can't exit until the end */
2131                 if (task->flags & PF_EXITING)
2132                         goto next;
2133
2134                 /* leave @task alone if post_fork() hasn't linked it yet */
2135                 if (list_empty(&task->cg_list))
2136                         goto next;
2137
2138                 cset = task_css_set(task);
2139                 if (!cset->mg_src_cgrp)
2140                         goto next;
2141
2142                 /*
2143                  * cgroup_taskset_first() must always return the leader.
2144                  * Take care to avoid disturbing the ordering.
2145                  */
2146                 list_move_tail(&task->cg_list, &cset->mg_tasks);
2147                 if (list_empty(&cset->mg_node))
2148                         list_add_tail(&cset->mg_node, &tset.src_csets);
2149                 if (list_empty(&cset->mg_dst_cset->mg_node))
2150                         list_move_tail(&cset->mg_dst_cset->mg_node,
2151                                        &tset.dst_csets);
2152         next:
2153                 if (!threadgroup)
2154                         break;
2155         } while_each_thread(leader, task);
2156         rcu_read_unlock();
2157         up_write(&css_set_rwsem);
2158
2159         /* methods shouldn't be called if no task is actually migrating */
2160         if (list_empty(&tset.src_csets))
2161                 return 0;
2162
2163         /* check that we can legitimately attach to the cgroup */
2164         for_each_e_css(css, i, cgrp) {
2165                 if (css->ss->can_attach) {
2166                         ret = css->ss->can_attach(css, &tset);
2167                         if (ret) {
2168                                 failed_css = css;
2169                                 goto out_cancel_attach;
2170                         }
2171                 }
2172         }
2173
2174         /*
2175          * Now that we're guaranteed success, proceed to move all tasks to
2176          * the new cgroup.  There are no failure cases after here, so this
2177          * is the commit point.
2178          */
2179         down_write(&css_set_rwsem);
2180         list_for_each_entry(cset, &tset.src_csets, mg_node) {
2181                 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2182                         cgroup_task_migrate(cset->mg_src_cgrp, task,
2183                                             cset->mg_dst_cset);
2184         }
2185         up_write(&css_set_rwsem);
2186
2187         /*
2188          * Migration is committed, all target tasks are now on dst_csets.
2189          * Nothing is sensitive to fork() after this point.  Notify
2190          * controllers that migration is complete.
2191          */
2192         tset.csets = &tset.dst_csets;
2193
2194         for_each_e_css(css, i, cgrp)
2195                 if (css->ss->attach)
2196                         css->ss->attach(css, &tset);
2197
2198         ret = 0;
2199         goto out_release_tset;
2200
2201 out_cancel_attach:
2202         for_each_e_css(css, i, cgrp) {
2203                 if (css == failed_css)
2204                         break;
2205                 if (css->ss->cancel_attach)
2206                         css->ss->cancel_attach(css, &tset);
2207         }
2208 out_release_tset:
2209         down_write(&css_set_rwsem);
2210         list_splice_init(&tset.dst_csets, &tset.src_csets);
2211         list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2212                 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2213                 list_del_init(&cset->mg_node);
2214         }
2215         up_write(&css_set_rwsem);
2216         return ret;
2217 }
2218
2219 /**
2220  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2221  * @dst_cgrp: the cgroup to attach to
2222  * @leader: the task or the leader of the threadgroup to be attached
2223  * @threadgroup: attach the whole threadgroup?
2224  *
2225  * Call holding cgroup_mutex and threadgroup_lock of @leader.
2226  */
2227 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2228                               struct task_struct *leader, bool threadgroup)
2229 {
2230         LIST_HEAD(preloaded_csets);
2231         struct task_struct *task;
2232         int ret;
2233
2234         /* look up all src csets */
2235         down_read(&css_set_rwsem);
2236         rcu_read_lock();
2237         task = leader;
2238         do {
2239                 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2240                                        &preloaded_csets);
2241                 if (!threadgroup)
2242                         break;
2243         } while_each_thread(leader, task);
2244         rcu_read_unlock();
2245         up_read(&css_set_rwsem);
2246
2247         /* prepare dst csets and commit */
2248         ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2249         if (!ret)
2250                 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2251
2252         cgroup_migrate_finish(&preloaded_csets);
2253         return ret;
2254 }
2255
2256 /*
2257  * Find the task_struct of the task to attach by vpid and pass it along to the
2258  * function to attach either it or all tasks in its threadgroup. Will lock
2259  * cgroup_mutex and threadgroup.
2260  */
2261 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2262                                     size_t nbytes, loff_t off, bool threadgroup)
2263 {
2264         struct task_struct *tsk;
2265         const struct cred *cred = current_cred(), *tcred;
2266         struct cgroup *cgrp;
2267         pid_t pid;
2268         int ret;
2269
2270         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2271                 return -EINVAL;
2272
2273         cgrp = cgroup_kn_lock_live(of->kn);
2274         if (!cgrp)
2275                 return -ENODEV;
2276
2277 retry_find_task:
2278         rcu_read_lock();
2279         if (pid) {
2280                 tsk = find_task_by_vpid(pid);
2281                 if (!tsk) {
2282                         rcu_read_unlock();
2283                         ret = -ESRCH;
2284                         goto out_unlock_cgroup;
2285                 }
2286                 /*
2287                  * even if we're attaching all tasks in the thread group, we
2288                  * only need to check permissions on one of them.
2289                  */
2290                 tcred = __task_cred(tsk);
2291                 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2292                     !uid_eq(cred->euid, tcred->uid) &&
2293                     !uid_eq(cred->euid, tcred->suid)) {
2294                         rcu_read_unlock();
2295                         ret = -EACCES;
2296                         goto out_unlock_cgroup;
2297                 }
2298         } else
2299                 tsk = current;
2300
2301         if (threadgroup)
2302                 tsk = tsk->group_leader;
2303
2304         /*
2305          * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2306          * trapped in a cpuset, or RT worker may be born in a cgroup
2307          * with no rt_runtime allocated.  Just say no.
2308          */
2309         if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2310                 ret = -EINVAL;
2311                 rcu_read_unlock();
2312                 goto out_unlock_cgroup;
2313         }
2314
2315         get_task_struct(tsk);
2316         rcu_read_unlock();
2317
2318         threadgroup_lock(tsk);
2319         if (threadgroup) {
2320                 if (!thread_group_leader(tsk)) {
2321                         /*
2322                          * a race with de_thread from another thread's exec()
2323                          * may strip us of our leadership, if this happens,
2324                          * there is no choice but to throw this task away and
2325                          * try again; this is
2326                          * "double-double-toil-and-trouble-check locking".
2327                          */
2328                         threadgroup_unlock(tsk);
2329                         put_task_struct(tsk);
2330                         goto retry_find_task;
2331                 }
2332         }
2333
2334         ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2335
2336         threadgroup_unlock(tsk);
2337
2338         put_task_struct(tsk);
2339 out_unlock_cgroup:
2340         cgroup_kn_unlock(of->kn);
2341         return ret ?: nbytes;
2342 }
2343
2344 /**
2345  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2346  * @from: attach to all cgroups of a given task
2347  * @tsk: the task to be attached
2348  */
2349 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2350 {
2351         struct cgroup_root *root;
2352         int retval = 0;
2353
2354         mutex_lock(&cgroup_mutex);
2355         for_each_root(root) {
2356                 struct cgroup *from_cgrp;
2357
2358                 if (root == &cgrp_dfl_root)
2359                         continue;
2360
2361                 down_read(&css_set_rwsem);
2362                 from_cgrp = task_cgroup_from_root(from, root);
2363                 up_read(&css_set_rwsem);
2364
2365                 retval = cgroup_attach_task(from_cgrp, tsk, false);
2366                 if (retval)
2367                         break;
2368         }
2369         mutex_unlock(&cgroup_mutex);
2370
2371         return retval;
2372 }
2373 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2374
2375 static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2376                                   char *buf, size_t nbytes, loff_t off)
2377 {
2378         return __cgroup_procs_write(of, buf, nbytes, off, false);
2379 }
2380
2381 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2382                                   char *buf, size_t nbytes, loff_t off)
2383 {
2384         return __cgroup_procs_write(of, buf, nbytes, off, true);
2385 }
2386
2387 static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2388                                           char *buf, size_t nbytes, loff_t off)
2389 {
2390         struct cgroup *cgrp;
2391
2392         BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2393
2394         cgrp = cgroup_kn_lock_live(of->kn);
2395         if (!cgrp)
2396                 return -ENODEV;
2397         spin_lock(&release_agent_path_lock);
2398         strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2399                 sizeof(cgrp->root->release_agent_path));
2400         spin_unlock(&release_agent_path_lock);
2401         cgroup_kn_unlock(of->kn);
2402         return nbytes;
2403 }
2404
2405 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2406 {
2407         struct cgroup *cgrp = seq_css(seq)->cgroup;
2408
2409         spin_lock(&release_agent_path_lock);
2410         seq_puts(seq, cgrp->root->release_agent_path);
2411         spin_unlock(&release_agent_path_lock);
2412         seq_putc(seq, '\n');
2413         return 0;
2414 }
2415
2416 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2417 {
2418         struct cgroup *cgrp = seq_css(seq)->cgroup;
2419
2420         seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2421         return 0;
2422 }
2423
2424 static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2425 {
2426         struct cgroup_subsys *ss;
2427         bool printed = false;
2428         int ssid;
2429
2430         for_each_subsys(ss, ssid) {
2431                 if (ss_mask & (1 << ssid)) {
2432                         if (printed)
2433                                 seq_putc(seq, ' ');
2434                         seq_printf(seq, "%s", ss->name);
2435                         printed = true;
2436                 }
2437         }
2438         if (printed)
2439                 seq_putc(seq, '\n');
2440 }
2441
2442 /* show controllers which are currently attached to the default hierarchy */
2443 static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2444 {
2445         struct cgroup *cgrp = seq_css(seq)->cgroup;
2446
2447         cgroup_print_ss_mask(seq, cgrp->root->subsys_mask);
2448         return 0;
2449 }
2450
2451 /* show controllers which are enabled from the parent */
2452 static int cgroup_controllers_show(struct seq_file *seq, void *v)
2453 {
2454         struct cgroup *cgrp = seq_css(seq)->cgroup;
2455
2456         cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask);
2457         return 0;
2458 }
2459
2460 /* show controllers which are enabled for a given cgroup's children */
2461 static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2462 {
2463         struct cgroup *cgrp = seq_css(seq)->cgroup;
2464
2465         cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
2466         return 0;
2467 }
2468
2469 /**
2470  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2471  * @cgrp: root of the subtree to update csses for
2472  *
2473  * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2474  * css associations need to be updated accordingly.  This function looks up
2475  * all css_sets which are attached to the subtree, creates the matching
2476  * updated css_sets and migrates the tasks to the new ones.
2477  */
2478 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2479 {
2480         LIST_HEAD(preloaded_csets);
2481         struct cgroup_subsys_state *css;
2482         struct css_set *src_cset;
2483         int ret;
2484
2485         lockdep_assert_held(&cgroup_mutex);
2486
2487         /* look up all csses currently attached to @cgrp's subtree */
2488         down_read(&css_set_rwsem);
2489         css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2490                 struct cgrp_cset_link *link;
2491
2492                 /* self is not affected by child_subsys_mask change */
2493                 if (css->cgroup == cgrp)
2494                         continue;
2495
2496                 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2497                         cgroup_migrate_add_src(link->cset, cgrp,
2498                                                &preloaded_csets);
2499         }
2500         up_read(&css_set_rwsem);
2501
2502         /* NULL dst indicates self on default hierarchy */
2503         ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2504         if (ret)
2505                 goto out_finish;
2506
2507         list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2508                 struct task_struct *last_task = NULL, *task;
2509
2510                 /* src_csets precede dst_csets, break on the first dst_cset */
2511                 if (!src_cset->mg_src_cgrp)
2512                         break;
2513
2514                 /*
2515                  * All tasks in src_cset need to be migrated to the
2516                  * matching dst_cset.  Empty it process by process.  We
2517                  * walk tasks but migrate processes.  The leader might even
2518                  * belong to a different cset but such src_cset would also
2519                  * be among the target src_csets because the default
2520                  * hierarchy enforces per-process membership.
2521                  */
2522                 while (true) {
2523                         down_read(&css_set_rwsem);
2524                         task = list_first_entry_or_null(&src_cset->tasks,
2525                                                 struct task_struct, cg_list);
2526                         if (task) {
2527                                 task = task->group_leader;
2528                                 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2529                                 get_task_struct(task);
2530                         }
2531                         up_read(&css_set_rwsem);
2532
2533                         if (!task)
2534                                 break;
2535
2536                         /* guard against possible infinite loop */
2537                         if (WARN(last_task == task,
2538                                  "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2539                                 goto out_finish;
2540                         last_task = task;
2541
2542                         threadgroup_lock(task);
2543                         /* raced against de_thread() from another thread? */
2544                         if (!thread_group_leader(task)) {
2545                                 threadgroup_unlock(task);
2546                                 put_task_struct(task);
2547                                 continue;
2548                         }
2549
2550                         ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2551
2552                         threadgroup_unlock(task);
2553                         put_task_struct(task);
2554
2555                         if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2556                                 goto out_finish;
2557                 }
2558         }
2559
2560 out_finish:
2561         cgroup_migrate_finish(&preloaded_csets);
2562         return ret;
2563 }
2564
/*
 * Change the enabled child controllers for a cgroup in the default
 * hierarchy.
 *
 * @buf holds a space separated list of subsystem names, each prefixed with
 * '+' (enable) or '-' (disable).  When the same subsystem appears more
 * than once, the last occurrence wins.
 *
 * Returns @nbytes on success.  Errors visible in this function:
 *   -EINVAL  a token names no enabled subsystem or lacks a +/- prefix
 *   -ENODEV  cgroup_kn_lock_live() failed
 *   -ENOENT  the controller isn't in cgrp_dfl_root.subsys_mask or isn't
 *            enabled in the parent's child_subsys_mask
 *   -EBUSY   a live child still has the controller enabled for its own
 *            children, or controllers are being enabled on a non-root
 *            cgroup which has css_sets linked to it
 * plus whatever create_css() or cgroup_update_dfl_csses() return.  The
 * call may also bail out through restart_syscall() after waiting for a
 * child on its offline_waitq.
 */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                                            loff_t off)
{
        unsigned int enable = 0, disable = 0;
        struct cgroup *cgrp, *child;
        struct cgroup_subsys *ss;
        char *tok;
        int ssid, ret;

        /*
         * Parse input - space separated list of subsystem names prefixed
         * with either + or -.
         */
        buf = strstrip(buf);
        while ((tok = strsep(&buf, " "))) {
                /* consecutive spaces yield empty tokens, skip them */
                if (tok[0] == '\0')
                        continue;
                for_each_subsys(ss, ssid) {
                        /* tok[0] is the +/- prefix, the name starts at tok + 1 */
                        if (ss->disabled || strcmp(tok + 1, ss->name))
                                continue;

                        if (*tok == '+') {
                                enable |= 1 << ssid;
                                disable &= ~(1 << ssid);
                        } else if (*tok == '-') {
                                disable |= 1 << ssid;
                                enable &= ~(1 << ssid);
                        } else {
                                return -EINVAL;
                        }
                        break;
                }
                /* the loop ran to completion without matching any subsystem */
                if (ssid == CGROUP_SUBSYS_COUNT)
                        return -EINVAL;
        }

        cgrp = cgroup_kn_lock_live(of->kn);
        if (!cgrp)
                return -ENODEV;

        /* validate the request and drop bits which wouldn't change anything */
        for_each_subsys(ss, ssid) {
                if (enable & (1 << ssid)) {
                        /* already enabled, nothing to do for this one */
                        if (cgrp->child_subsys_mask & (1 << ssid)) {
                                enable &= ~(1 << ssid);
                                continue;
                        }

                        /*
                         * Because css offlining is asynchronous, userland
                         * might try to re-enable the same controller while
                         * the previous instance is still around.  In such
                         * cases, wait till it's gone using offline_waitq.
                         */
                        cgroup_for_each_live_child(child, cgrp) {
                                DEFINE_WAIT(wait);

                                if (!cgroup_css(child, ss))
                                        continue;

                                /* keep @child around while sleeping unlocked */
                                cgroup_get(child);
                                prepare_to_wait(&child->offline_waitq, &wait,
                                                TASK_UNINTERRUPTIBLE);
                                cgroup_kn_unlock(of->kn);
                                schedule();
                                finish_wait(&child->offline_waitq, &wait);
                                cgroup_put(child);

                                /* locks were dropped, start over from scratch */
                                return restart_syscall();
                        }

                        /* unavailable or not enabled on the parent? */
                        if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
                            (cgrp->parent &&
                             !(cgrp->parent->child_subsys_mask & (1 << ssid)))) {
                                ret = -ENOENT;
                                goto out_unlock;
                        }
                } else if (disable & (1 << ssid)) {
                        /* already disabled, nothing to do for this one */
                        if (!(cgrp->child_subsys_mask & (1 << ssid))) {
                                disable &= ~(1 << ssid);
                                continue;
                        }

                        /* a child has it enabled? */
                        cgroup_for_each_live_child(child, cgrp) {
                                if (child->child_subsys_mask & (1 << ssid)) {
                                        ret = -EBUSY;
                                        goto out_unlock;
                                }
                        }
                }
        }

        /* everything requested is already in effect */
        if (!enable && !disable) {
                ret = 0;
                goto out_unlock;
        }

        /*
         * Except for the root, child_subsys_mask must be zero for a cgroup
         * with tasks so that child cgroups don't compete against tasks.
         */
        if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) {
                ret = -EBUSY;
                goto out_unlock;
        }

        /*
         * Create csses for enables and update child_subsys_mask.  This
         * changes cgroup_e_css() results which in turn makes the
         * subsequent cgroup_update_dfl_csses() associate all tasks in the
         * subtree to the updated csses.
         */
        for_each_subsys(ss, ssid) {
                if (!(enable & (1 << ssid)))
                        continue;

                cgroup_for_each_live_child(child, cgrp) {
                        ret = create_css(child, ss);
                        if (ret)
                                goto err_undo_css;
                }
        }

        cgrp->child_subsys_mask |= enable;
        cgrp->child_subsys_mask &= ~disable;

        ret = cgroup_update_dfl_csses(cgrp);
        if (ret)
                goto err_undo_css;

        /* all tasks are now migrated away from the old csses, kill them */
        for_each_subsys(ss, ssid) {
                if (!(disable & (1 << ssid)))
                        continue;

                cgroup_for_each_live_child(child, cgrp)
                        kill_css(cgroup_css(child, ss));
        }

        kernfs_activate(cgrp->kn);
        ret = 0;
out_unlock:
        cgroup_kn_unlock(of->kn);
        /* a non-zero ret is returned as is, zero turns into the byte count */
        return ret ?: nbytes;

err_undo_css:
        /* put child_subsys_mask back to its state before this write */
        cgrp->child_subsys_mask &= ~enable;
        cgrp->child_subsys_mask |= disable;

        /* kill whichever csses exist for the controllers being enabled */
        for_each_subsys(ss, ssid) {
                if (!(enable & (1 << ssid)))
                        continue;

                cgroup_for_each_live_child(child, cgrp) {
                        struct cgroup_subsys_state *css = cgroup_css(child, ss);
                        if (css)
                                kill_css(css);
                }
        }
        goto out_unlock;
}
2729
2730 static int cgroup_populated_show(struct seq_file *seq, void *v)
2731 {
2732         seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2733         return 0;
2734 }
2735
2736 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2737                                  size_t nbytes, loff_t off)
2738 {
2739         struct cgroup *cgrp = of->kn->parent->priv;
2740         struct cftype *cft = of->kn->priv;
2741         struct cgroup_subsys_state *css;
2742         int ret;
2743
2744         if (cft->write)
2745                 return cft->write(of, buf, nbytes, off);
2746
2747         /*
2748          * kernfs guarantees that a file isn't deleted with operations in
2749          * flight, which means that the matching css is and stays alive and
2750          * doesn't need to be pinned.  The RCU locking is not necessary
2751          * either.  It's just for the convenience of using cgroup_css().
2752          */
2753         rcu_read_lock();
2754         css = cgroup_css(cgrp, cft->ss);
2755         rcu_read_unlock();
2756
2757         if (cft->write_u64) {
2758                 unsigned long long v;
2759                 ret = kstrtoull(buf, 0, &v);
2760                 if (!ret)
2761                         ret = cft->write_u64(css, cft, v);
2762         } else if (cft->write_s64) {
2763                 long long v;
2764                 ret = kstrtoll(buf, 0, &v);
2765                 if (!ret)
2766                         ret = cft->write_s64(css, cft, v);
2767         } else {
2768                 ret = -EINVAL;
2769         }
2770
2771         return ret ?: nbytes;
2772 }
2773
2774 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2775 {
2776         return seq_cft(seq)->seq_start(seq, ppos);
2777 }
2778
2779 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2780 {
2781         return seq_cft(seq)->seq_next(seq, v, ppos);
2782 }
2783
2784 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2785 {
2786         seq_cft(seq)->seq_stop(seq, v);
2787 }
2788
2789 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2790 {
2791         struct cftype *cft = seq_cft(m);
2792         struct cgroup_subsys_state *css = seq_css(m);
2793
2794         if (cft->seq_show)
2795                 return cft->seq_show(m, arg);
2796
2797         if (cft->read_u64)
2798                 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2799         else if (cft->read_s64)
2800                 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2801         else
2802                 return -EINVAL;
2803         return 0;
2804 }
2805
/*
 * kernfs ops for cftypes which don't provide seq_start (see
 * cgroup_init_cftypes()).  Only ->seq_show is wired up on the read side.
 * cftypes asking for a custom max_write_len get a private copy of this
 * table with atomic_write_len overridden.
 */
static struct kernfs_ops cgroup_kf_single_ops = {
        .atomic_write_len       = PAGE_SIZE,
        .write                  = cgroup_file_write,
        .seq_show               = cgroup_seqfile_show,
};
2811
/*
 * kernfs ops for cftypes which provide seq_start (see
 * cgroup_init_cftypes()).  The start/next/stop callbacks forward to the
 * cftype's own iterator methods.  cftypes asking for a custom
 * max_write_len get a private copy of this table with atomic_write_len
 * overridden.
 */
static struct kernfs_ops cgroup_kf_ops = {
        .atomic_write_len       = PAGE_SIZE,
        .write                  = cgroup_file_write,
        .seq_start              = cgroup_seqfile_start,
        .seq_next               = cgroup_seqfile_next,
        .seq_stop               = cgroup_seqfile_stop,
        .seq_show               = cgroup_seqfile_show,
};
2820
/*
 * cgroup_rename - Only allow simple rename of directories in place.
 * @kn: kernfs node being renamed
 * @new_parent: requested new parent, must be @kn's current parent
 * @new_name_str: the new name
 *
 * Returns -ENOTDIR if @kn is not a directory, -EIO if the rename would
 * move @kn under a different parent, -EPERM if cgroup_sane_behavior() is
 * true for the cgroup, and the result of kernfs_rename() otherwise.
 */
static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
                         const char *new_name_str)
{
        struct cgroup *cgrp = kn->priv;
        int ret;

        if (kernfs_type(kn) != KERNFS_DIR)
                return -ENOTDIR;
        if (kn->parent != new_parent)
                return -EIO;

        /*
         * This isn't a proper migration and its usefulness is very
         * limited.  Disallow if sane_behavior.
         */
        if (cgroup_sane_behavior(cgrp))
                return -EPERM;

        /*
         * We're gonna grab cgroup_mutex which nests outside kernfs
         * active_ref.  kernfs_rename() doesn't require active_ref
         * protection.  Break them before grabbing cgroup_mutex.
         */
        kernfs_break_active_protection(new_parent);
        kernfs_break_active_protection(kn);

        mutex_lock(&cgroup_mutex);

        ret = kernfs_rename(kn, new_parent, new_name_str);

        mutex_unlock(&cgroup_mutex);

        /* restore active protection in the reverse order of breaking it */
        kernfs_unbreak_active_protection(kn);
        kernfs_unbreak_active_protection(new_parent);
        return ret;
}
2860
2861 /* set uid and gid of cgroup dirs and files to that of the creator */
2862 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
2863 {
2864         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
2865                                .ia_uid = current_fsuid(),
2866                                .ia_gid = current_fsgid(), };
2867
2868         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
2869             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
2870                 return 0;
2871
2872         return kernfs_setattr(kn, &iattr);
2873 }
2874
2875 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2876 {
2877         char name[CGROUP_FILE_NAME_MAX];
2878         struct kernfs_node *kn;
2879         struct lock_class_key *key = NULL;
2880         int ret;
2881
2882 #ifdef CONFIG_DEBUG_LOCK_ALLOC
2883         key = &cft->lockdep_key;
2884 #endif
2885         kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2886                                   cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2887                                   NULL, false, key);
2888         if (IS_ERR(kn))
2889                 return PTR_ERR(kn);
2890
2891         ret = cgroup_kn_set_ugid(kn);
2892         if (ret) {
2893                 kernfs_remove(kn);
2894                 return ret;
2895         }
2896
2897         if (cft->seq_show == cgroup_populated_show)
2898                 cgrp->populated_kn = kn;
2899         return 0;
2900 }
2901
2902 /**
2903  * cgroup_addrm_files - add or remove files to a cgroup directory
2904  * @cgrp: the target cgroup
2905  * @cfts: array of cftypes to be added
2906  * @is_add: whether to add or remove
2907  *
2908  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2909  * For removals, this function never fails.  If addition fails, this
2910  * function doesn't remove files already added.  The caller is responsible
2911  * for cleaning up.
2912  */
2913 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2914                               bool is_add)
2915 {
2916         struct cftype *cft;
2917         int ret;
2918
2919         lockdep_assert_held(&cgroup_mutex);
2920
2921         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2922                 /* does cft->flags tell us to skip this file on @cgrp? */
2923                 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2924                         continue;
2925                 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2926                         continue;
2927                 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2928                         continue;
2929                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2930                         continue;
2931
2932                 if (is_add) {
2933                         ret = cgroup_add_file(cgrp, cft);
2934                         if (ret) {
2935                                 pr_warn("%s: failed to add %s, err=%d\n",
2936                                         __func__, cft->name, ret);
2937                                 return ret;
2938                         }
2939                 } else {
2940                         cgroup_rm_file(cgrp, cft);
2941                 }
2942         }
2943         return 0;
2944 }
2945
2946 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2947 {
2948         LIST_HEAD(pending);
2949         struct cgroup_subsys *ss = cfts[0].ss;
2950         struct cgroup *root = &ss->root->cgrp;
2951         struct cgroup_subsys_state *css;
2952         int ret = 0;
2953
2954         lockdep_assert_held(&cgroup_mutex);
2955
2956         /* add/rm files for all cgroups created before */
2957         css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2958                 struct cgroup *cgrp = css->cgroup;
2959
2960                 if (cgroup_is_dead(cgrp))
2961                         continue;
2962
2963                 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2964                 if (ret)
2965                         break;
2966         }
2967
2968         if (is_add && !ret)
2969                 kernfs_activate(root->kn);
2970         return ret;
2971 }
2972
2973 static void cgroup_exit_cftypes(struct cftype *cfts)
2974 {
2975         struct cftype *cft;
2976
2977         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2978                 /* free copy for custom atomic_write_len, see init_cftypes() */
2979                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2980                         kfree(cft->kf_ops);
2981                 cft->kf_ops = NULL;
2982                 cft->ss = NULL;
2983         }
2984 }
2985
2986 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2987 {
2988         struct cftype *cft;
2989
2990         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2991                 struct kernfs_ops *kf_ops;
2992
2993                 WARN_ON(cft->ss || cft->kf_ops);
2994
2995                 if (cft->seq_start)
2996                         kf_ops = &cgroup_kf_ops;
2997                 else
2998                         kf_ops = &cgroup_kf_single_ops;
2999
3000                 /*
3001                  * Ugh... if @cft wants a custom max_write_len, we need to
3002                  * make a copy of kf_ops to set its atomic_write_len.
3003                  */
3004                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3005                         kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3006                         if (!kf_ops) {
3007                                 cgroup_exit_cftypes(cfts);
3008                                 return -ENOMEM;
3009                         }
3010                         kf_ops->atomic_write_len = cft->max_write_len;
3011                 }
3012
3013                 cft->kf_ops = kf_ops;
3014                 cft->ss = ss;
3015         }
3016
3017         return 0;
3018 }
3019
3020 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3021 {
3022         lockdep_assert_held(&cgroup_mutex);
3023
3024         if (!cfts || !cfts[0].ss)
3025                 return -ENOENT;
3026
3027         list_del(&cfts->node);
3028         cgroup_apply_cftypes(cfts, false);
3029         cgroup_exit_cftypes(cfts);
3030         return 0;
3031 }
3032
3033 /**
3034  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
3035  * @cfts: zero-length name terminated array of cftypes
3036  *
3037  * Unregister @cfts.  Files described by @cfts are removed from all
3038  * existing cgroups and all future cgroups won't have them either.  This
3039  * function can be called anytime whether @cfts' subsys is attached or not.
3040  *
3041  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
3042  * registered.
3043  */
3044 int cgroup_rm_cftypes(struct cftype *cfts)
3045 {
3046         int ret;
3047
3048         mutex_lock(&cgroup_mutex);
3049         ret = cgroup_rm_cftypes_locked(cfts);
3050         mutex_unlock(&cgroup_mutex);
3051         return ret;
3052 }
3053
3054 /**
3055  * cgroup_add_cftypes - add an array of cftypes to a subsystem
3056  * @ss: target cgroup subsystem
3057  * @cfts: zero-length name terminated array of cftypes
3058  *
3059  * Register @cfts to @ss.  Files described by @cfts are created for all
3060  * existing cgroups to which @ss is attached and all future cgroups will
3061  * have them too.  This function can be called anytime whether @ss is
3062  * attached or not.
3063  *
3064  * Returns 0 on successful registration, -errno on failure.  Note that this
3065  * function currently returns 0 as long as @cfts registration is successful
3066  * even if some file creation attempts on existing cgroups fail.
3067  */
3068 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3069 {
3070         int ret;
3071
3072         if (!cfts || cfts[0].name[0] == '\0')
3073                 return 0;
3074
3075         ret = cgroup_init_cftypes(ss, cfts);
3076         if (ret)
3077                 return ret;
3078
3079         mutex_lock(&cgroup_mutex);
3080
3081         list_add_tail(&cfts->node, &ss->cfts);
3082         ret = cgroup_apply_cftypes(cfts, true);
3083         if (ret)
3084                 cgroup_rm_cftypes_locked(cfts);
3085
3086         mutex_unlock(&cgroup_mutex);
3087         return ret;
3088 }
3089
3090 /**
3091  * cgroup_task_count - count the number of tasks in a cgroup.
3092  * @cgrp: the cgroup in question
3093  *
3094  * Return the number of tasks in the cgroup.
3095  */
3096 static int cgroup_task_count(const struct cgroup *cgrp)
3097 {
3098         int count = 0;
3099         struct cgrp_cset_link *link;
3100
3101         down_read(&css_set_rwsem);
3102         list_for_each_entry(link, &cgrp->cset_links, cset_link)
3103                 count += atomic_read(&link->cset->refcount);
3104         up_read(&css_set_rwsem);
3105         return count;
3106 }
3107
/**
 * css_next_child - find the next child of a given css
 * @pos_css: the current position (%NULL to initiate traversal)
 * @parent_css: css whose children to walk
 *
 * This function returns the next child of @parent_css and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent_css and @pos_css are accessible.  The next sibling is
 * guaranteed to be returned regardless of their states.
 *
 * Children which have no css for @parent_css->ss are skipped.  Returns
 * %NULL once the end of the children list is reached.
 */
struct cgroup_subsys_state *
css_next_child(struct cgroup_subsys_state *pos_css,
               struct cgroup_subsys_state *parent_css)
{
        struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
        struct cgroup *cgrp = parent_css->cgroup;
        struct cgroup *next;

        cgroup_assert_mutex_or_rcu_locked();

        /*
         * @pos could already have been removed.  Once a cgroup is removed,
         * its ->sibling.next is no longer updated when its next sibling
         * changes.  As CGRP_DEAD assertion is serialized and happens
         * before the cgroup is taken off the ->sibling list, if we see it
         * unasserted, it's guaranteed that the next sibling hasn't
         * finished its grace period even if it's already removed, and thus
         * safe to dereference from this RCU critical section.  If
         * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
         * to be visible as %true here.
         *
         * If @pos is dead, its next pointer can't be dereferenced;
         * however, as each cgroup is given a monotonically increasing
         * unique serial number and always appended to the sibling list,
         * the next one can be found by walking the parent's children until
         * we see a cgroup with higher serial number than @pos's.  While
         * this path can be slower, it's taken only when either the current
         * cgroup is removed or iteration and removal race.
         */
        if (!pos) {
                /* start of traversal: first entry on the children list */
                next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
        } else if (likely(!cgroup_is_dead(pos))) {
                /* common case: follow @pos's own sibling link */
                next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
        } else {
                /* @pos is dead: rescan for the first higher serial number */
                list_for_each_entry_rcu(next, &cgrp->children, sibling)
                        if (next->serial_nr > pos->serial_nr)
                                break;
        }

        /*
         * @next, if not pointing to the head, can be dereferenced and is
         * the next sibling; however, it might have @ss disabled.  If so,
         * fast-forward to the next enabled one.
         */
        while (&next->sibling != &cgrp->children) {
                struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss);

                if (next_css)
                        return next_css;
                next = list_entry_rcu(next->sibling.next, struct cgroup, sibling);
        }
        /* wrapped around to the list head, no more children */
        return NULL;
}
3171
3172 /**
3173  * css_next_descendant_pre - find the next descendant for pre-order walk
3174  * @pos: the current position (%NULL to initiate traversal)
3175  * @root: css whose descendants to walk
3176  *
3177  * To be used by css_for_each_descendant_pre().  Find the next descendant
3178  * to visit for pre-order traversal of @root's descendants.  @root is
3179  * included in the iteration and the first node to be visited.
3180  *
3181  * While this function requires cgroup_mutex or RCU read locking, it
3182  * doesn't require the whole traversal to be contained in a single critical
3183  * section.  This function will return the correct next descendant as long
3184  * as both @pos and @root are accessible and @pos is a descendant of @root.
3185  */
3186 struct cgroup_subsys_state *
3187 css_next_descendant_pre(struct cgroup_subsys_state *pos,
3188                         struct cgroup_subsys_state *root)
3189 {
3190         struct cgroup_subsys_state *next;
3191
3192         cgroup_assert_mutex_or_rcu_locked();
3193
3194         /* if first iteration, visit @root */
3195         if (!pos)
3196                 return root;
3197
3198         /* visit the first child if exists */
3199         next = css_next_child(NULL, pos);
3200         if (next)
3201                 return next;
3202
3203         /* no child, visit my or the closest ancestor's next sibling */
3204         while (pos != root) {
3205                 next = css_next_child(pos, css_parent(pos));
3206                 if (next)
3207                         return next;
3208                 pos = css_parent(pos);
3209         }
3210
3211         return NULL;
3212 }
3213
3214 /**
3215  * css_rightmost_descendant - return the rightmost descendant of a css
3216  * @pos: css of interest
3217  *
3218  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
3219  * is returned.  This can be used during pre-order traversal to skip
3220  * subtree of @pos.
3221  *
3222  * While this function requires cgroup_mutex or RCU read locking, it
3223  * doesn't require the whole traversal to be contained in a single critical
3224  * section.  This function will return the correct rightmost descendant as
3225  * long as @pos is accessible.
3226  */
3227 struct cgroup_subsys_state *
3228 css_rightmost_descendant(struct cgroup_subsys_state *pos)
3229 {
3230         struct cgroup_subsys_state *last, *tmp;
3231
3232         cgroup_assert_mutex_or_rcu_locked();
3233
3234         do {
3235                 last = pos;
3236                 /* ->prev isn't RCU safe, walk ->next till the end */
3237                 pos = NULL;
3238                 css_for_each_child(tmp, last)
3239                         pos = tmp;
3240         } while (pos);
3241
3242         return last;
3243 }
3244
3245 static struct cgroup_subsys_state *
3246 css_leftmost_descendant(struct cgroup_subsys_state *pos)
3247 {
3248         struct cgroup_subsys_state *last;
3249
3250         do {
3251                 last = pos;
3252                 pos = css_next_child(NULL, pos);
3253         } while (pos);
3254
3255         return last;
3256 }
3257
3258 /**
3259  * css_next_descendant_post - find the next descendant for post-order walk
3260  * @pos: the current position (%NULL to initiate traversal)
3261  * @root: css whose descendants to walk
3262  *
3263  * To be used by css_for_each_descendant_post().  Find the next descendant
3264  * to visit for post-order traversal of @root's descendants.  @root is
3265  * included in the iteration and the last node to be visited.
3266  *
3267  * While this function requires cgroup_mutex or RCU read locking, it
3268  * doesn't require the whole traversal to be contained in a single critical
3269  * section.  This function will return the correct next descendant as long
3270  * as both @pos and @cgroup are accessible and @pos is a descendant of
3271  * @cgroup.
3272  */
3273 struct cgroup_subsys_state *
3274 css_next_descendant_post(struct cgroup_subsys_state *pos,
3275                          struct cgroup_subsys_state *root)
3276 {
3277         struct cgroup_subsys_state *next;
3278
3279         cgroup_assert_mutex_or_rcu_locked();
3280
3281         /* if first iteration, visit leftmost descendant which may be @root */
3282         if (!pos)
3283                 return css_leftmost_descendant(root);
3284
3285         /* if we visited @root, we're done */
3286         if (pos == root)
3287                 return NULL;
3288
3289         /* if there's an unvisited sibling, visit its leftmost descendant */
3290         next = css_next_child(pos, css_parent(pos));
3291         if (next)
3292                 return css_leftmost_descendant(next);
3293
3294         /* no sibling left, visit parent */
3295         return css_parent(pos);
3296 }
3297
3298 static bool cgroup_has_live_children(struct cgroup *cgrp)
3299 {
3300         struct cgroup *child;
3301
3302         rcu_read_lock();
3303         list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3304                 if (!cgroup_is_dead(child)) {
3305                         rcu_read_unlock();
3306                         return true;
3307                 }
3308         }
3309         rcu_read_unlock();
3310         return false;
3311 }
3312
3313 /**
3314  * css_advance_task_iter - advance a task itererator to the next css_set
3315  * @it: the iterator to advance
3316  *
3317  * Advance @it to the next css_set to walk.
3318  */
3319 static void css_advance_task_iter(struct css_task_iter *it)
3320 {
3321         struct list_head *l = it->cset_pos;
3322         struct cgrp_cset_link *link;
3323         struct css_set *cset;
3324
3325         /* Advance to the next non-empty css_set */
3326         do {
3327                 l = l->next;
3328                 if (l == it->cset_head) {
3329                         it->cset_pos = NULL;
3330                         return;
3331                 }
3332
3333                 if (it->ss) {
3334                         cset = container_of(l, struct css_set,
3335                                             e_cset_node[it->ss->id]);
3336                 } else {
3337                         link = list_entry(l, struct cgrp_cset_link, cset_link);
3338                         cset = link->cset;
3339                 }
3340         } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
3341
3342         it->cset_pos = l;
3343
3344         if (!list_empty(&cset->tasks))
3345                 it->task_pos = cset->tasks.next;
3346         else
3347                 it->task_pos = cset->mg_tasks.next;
3348
3349         it->tasks_head = &cset->tasks;
3350         it->mg_tasks_head = &cset->mg_tasks;
3351 }
3352
3353 /**
3354  * css_task_iter_start - initiate task iteration
3355  * @css: the css to walk tasks of
3356  * @it: the task iterator to use
3357  *
3358  * Initiate iteration through the tasks of @css.  The caller can call
3359  * css_task_iter_next() to walk through the tasks until the function
3360  * returns NULL.  On completion of iteration, css_task_iter_end() must be
3361  * called.
3362  *
3363  * Note that this function acquires a lock which is released when the
3364  * iteration finishes.  The caller can't sleep while iteration is in
3365  * progress.
3366  */
3367 void css_task_iter_start(struct cgroup_subsys_state *css,
3368                          struct css_task_iter *it)
3369         __acquires(css_set_rwsem)
3370 {
3371         /* no one should try to iterate before mounting cgroups */
3372         WARN_ON_ONCE(!use_task_css_set_links);
3373
3374         down_read(&css_set_rwsem);
3375
3376         it->ss = css->ss;
3377
3378         if (it->ss)
3379                 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3380         else
3381                 it->cset_pos = &css->cgroup->cset_links;
3382
3383         it->cset_head = it->cset_pos;
3384
3385         css_advance_task_iter(it);
3386 }
3387
3388 /**
3389  * css_task_iter_next - return the next task for the iterator
3390  * @it: the task iterator being iterated
3391  *
3392  * The "next" function for task iteration.  @it should have been
3393  * initialized via css_task_iter_start().  Returns NULL when the iteration
3394  * reaches the end.
3395  */
3396 struct task_struct *css_task_iter_next(struct css_task_iter *it)
3397 {
3398         struct task_struct *res;
3399         struct list_head *l = it->task_pos;
3400
3401         /* If the iterator cg is NULL, we have no tasks */
3402         if (!it->cset_pos)
3403                 return NULL;
3404         res = list_entry(l, struct task_struct, cg_list);
3405
3406         /*
3407          * Advance iterator to find next entry.  cset->tasks is consumed
3408          * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
3409          * next cset.
3410          */
3411         l = l->next;
3412
3413         if (l == it->tasks_head)
3414                 l = it->mg_tasks_head->next;
3415
3416         if (l == it->mg_tasks_head)
3417                 css_advance_task_iter(it);
3418         else
3419                 it->task_pos = l;
3420
3421         return res;
3422 }
3423
3424 /**
3425  * css_task_iter_end - finish task iteration
3426  * @it: the task iterator to finish
3427  *
3428  * Finish task iteration started by css_task_iter_start().
3429  */
3430 void css_task_iter_end(struct css_task_iter *it)
3431         __releases(css_set_rwsem)
3432 {
3433         up_read(&css_set_rwsem);
3434 }
3435
3436 /**
3437  * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
3438  * @to: cgroup to which the tasks will be moved
3439  * @from: cgroup in which the tasks currently reside
3440  *
3441  * Locking rules between cgroup_post_fork() and the migration path
3442  * guarantee that, if a task is forking while being migrated, the new child
3443  * is guaranteed to be either visible in the source cgroup after the
3444  * parent's migration is complete or put into the target cgroup.  No task
3445  * can slip out of migration through forking.
3446  */
3447 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3448 {
3449         LIST_HEAD(preloaded_csets);
3450         struct cgrp_cset_link *link;
3451         struct css_task_iter it;
3452         struct task_struct *task;
3453         int ret;
3454
3455         mutex_lock(&cgroup_mutex);
3456
3457         /* all tasks in @from are being moved, all csets are source */
3458         down_read(&css_set_rwsem);
3459         list_for_each_entry(link, &from->cset_links, cset_link)
3460                 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3461         up_read(&css_set_rwsem);
3462
3463         ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3464         if (ret)
3465                 goto out_err;
3466
3467         /*
3468          * Migrate tasks one-by-one until @form is empty.  This fails iff
3469          * ->can_attach() fails.
3470          */
3471         do {
3472                 css_task_iter_start(&from->self, &it);
3473                 task = css_task_iter_next(&it);
3474                 if (task)
3475                         get_task_struct(task);
3476                 css_task_iter_end(&it);
3477
3478                 if (task) {
3479                         ret = cgroup_migrate(to, task, false);
3480                         put_task_struct(task);
3481                 }
3482         } while (task && !ret);
3483 out_err:
3484         cgroup_migrate_finish(&preloaded_csets);
3485         mutex_unlock(&cgroup_mutex);
3486         return ret;
3487 }
3488
3489 /*
3490  * Stuff for reading the 'tasks'/'procs' files.
3491  *
3492  * Reading this file can return large amounts of data if a cgroup has
3493  * *lots* of attached tasks. So it may need several calls to read(),
3494  * but we cannot guarantee that the information we produce is correct
3495  * unless we produce it entirely atomically.
3496  *
3497  */
3498
/*
 * Selects which of the two pidlist files a request refers to.  The value
 * is stored in cftype->private of the respective file.
 */
enum cgroup_filetype {
        CGROUP_FILE_PROCS = 0,  /* listing holds tgids */
        CGROUP_FILE_TASKS = 1,  /* listing holds pids */
};
3504
3505 /*
3506  * A pidlist is a list of pids that virtually represents the contents of one
3507  * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3508  * a pair (one each for procs, tasks) for each pid namespace that's relevant
3509  * to the cgroup.
3510  */
3511 struct cgroup_pidlist {
3512         /*
3513          * used to find which pidlist is wanted. doesn't change as long as
3514          * this particular list stays in the list.
3515         */
3516         struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3517         /* array of xids */
3518         pid_t *list;
3519         /* how many elements the above list has */
3520         int length;
3521         /* each of these stored in a list by its cgroup */
3522         struct list_head links;
3523         /* pointer to the cgroup we belong to, for list removal purposes */
3524         struct cgroup *owner;
3525         /* for delayed destruction */
3526         struct delayed_work destroy_dwork;
3527 };
3528
3529 /*
3530  * The following two functions "fix" the issue where there are more pids
3531  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3532  * TODO: replace with a kernel-wide solution to this problem
3533  */
3534 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3535 static void *pidlist_allocate(int count)
3536 {
3537         if (PIDLIST_TOO_LARGE(count))
3538                 return vmalloc(count * sizeof(pid_t));
3539         else
3540                 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3541 }
3542
/*
 * Release an array obtained from pidlist_allocate().  The address itself
 * tells which allocator it came from, so no size has to be passed in.
 */
static void pidlist_free(void *p)
{
        if (!is_vmalloc_addr(p)) {
                kfree(p);
                return;
        }

        vfree(p);
}
3550
3551 /*
3552  * Used to destroy all pidlists lingering waiting for destroy timer.  None
3553  * should be left afterwards.
3554  */
3555 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3556 {
3557         struct cgroup_pidlist *l, *tmp_l;
3558
3559         mutex_lock(&cgrp->pidlist_mutex);
3560         list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3561                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3562         mutex_unlock(&cgrp->pidlist_mutex);
3563
3564         flush_workqueue(cgroup_pidlist_destroy_wq);
3565         BUG_ON(!list_empty(&cgrp->pidlists));
3566 }
3567
3568 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3569 {
3570         struct delayed_work *dwork = to_delayed_work(work);
3571         struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3572                                                 destroy_dwork);
3573         struct cgroup_pidlist *tofree = NULL;
3574
3575         mutex_lock(&l->owner->pidlist_mutex);
3576
3577         /*
3578          * Destroy iff we didn't get queued again.  The state won't change
3579          * as destroy_dwork can only be queued while locked.
3580          */
3581         if (!delayed_work_pending(dwork)) {
3582                 list_del(&l->links);
3583                 pidlist_free(l->list);
3584                 put_pid_ns(l->key.ns);
3585                 tofree = l;
3586         }
3587
3588         mutex_unlock(&l->owner->pidlist_mutex);
3589         kfree(tofree);
3590 }
3591
/*
 * pidlist_uniq - collapse runs of equal neighbours in a pid array
 * @list: array to compact in place
 * @length: number of entries in @list
 *
 * Only adjacent duplicates are removed, so @list is expected to be sorted.
 * Returns the number of entries kept; they occupy the front of @list.
 */
static int pidlist_uniq(pid_t *list, int length)
{
        int wr = 1;     /* slot where the next kept element goes */
        int rd;         /* element currently being examined */

        /* trivial edge cases first; no work needs to be done for either */
        if (length == 0 || length == 1)
                return length;

        /* element 0 is always kept, so start examining at element 1 */
        for (rd = 1; rd < length; rd++) {
                /* same as its predecessor: a duplicate, skip it */
                if (list[rd] == list[rd - 1])
                        continue;
                list[wr++] = list[rd];
        }

        return wr;
}
3621
/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs a differently sorted list,
 * making it impossible to use, for example, a single rbtree of member
 * tasks sorted by task pointer.  As pidlists can be fairly large,
 * allocating one per open file is dangerous, so cgroup had to implement a
 * shared pool of pidlists keyed by cgroup and namespace.
 *
 * All this extra complexity was caused by the original implementation
 * committing to an entirely unnecessary property.  In the long term, we
 * want to do away with it.  Explicitly scramble sort order if
 * sane_behavior so that no such expectation exists in the new interface.
 *
 * pid_fry() does the scrambling: it swaps every two consecutive bits of
 * @pid (bit 0 with bit 1, bit 2 with bit 3, ...), which is a non-identity
 * one-to-one mapping which disturbs sort order sufficiently.
 */
static pid_t pid_fry(pid_t pid)
{
        unsigned int bits = pid;
        unsigned int even = bits & 0x55555555u;  /* bits 0, 2, 4, ... */
        unsigned int odd = bits & 0xAAAAAAAAu;   /* bits 1, 3, 5, ... */

        /* each even bit moves up one position, each odd bit moves down */
        return (odd >> 1) | (even << 1);
}
3646
/*
 * Return @pid scrambled by pid_fry() when @cgrp is in sane_behavior mode,
 * and @pid unchanged otherwise.
 */
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
        return cgroup_sane_behavior(cgrp) ? pid_fry(pid) : pid;
}
3654
/*
 * sort() comparison callback ordering two pid_t values ascending.  The
 * result is the plain difference of the two pids.
 */
static int cmppid(const void *a, const void *b)
{
        const pid_t *lhs = a;
        const pid_t *rhs = b;

        return *lhs - *rhs;
}
3659
/*
 * sort() comparison callback like cmppid(), except that both pids are run
 * through pid_fry() before being compared.
 */
static int fried_cmppid(const void *a, const void *b)
{
        pid_t lhs = pid_fry(*(const pid_t *)a);
        pid_t rhs = pid_fry(*(const pid_t *)b);

        return lhs - rhs;
}
3664
3665 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3666                                                   enum cgroup_filetype type)
3667 {
3668         struct cgroup_pidlist *l;
3669         /* don't need task_nsproxy() if we're looking at ourself */
3670         struct pid_namespace *ns = task_active_pid_ns(current);
3671
3672         lockdep_assert_held(&cgrp->pidlist_mutex);
3673
3674         list_for_each_entry(l, &cgrp->pidlists, links)
3675                 if (l->key.type == type && l->key.ns == ns)
3676                         return l;
3677         return NULL;
3678 }
3679
3680 /*
3681  * find the appropriate pidlist for our purpose (given procs vs tasks)
3682  * returns with the lock on that pidlist already held, and takes care
3683  * of the use count, or returns NULL with no locks held if we're out of
3684  * memory.
3685  */
3686 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3687                                                 enum cgroup_filetype type)
3688 {
3689         struct cgroup_pidlist *l;
3690
3691         lockdep_assert_held(&cgrp->pidlist_mutex);
3692
3693         l = cgroup_pidlist_find(cgrp, type);
3694         if (l)
3695                 return l;
3696
3697         /* entry not found; create a new one */
3698         l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3699         if (!l)
3700                 return l;
3701
3702         INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3703         l->key.type = type;
3704         /* don't need task_nsproxy() if we're looking at ourself */
3705         l->key.ns = get_pid_ns(task_active_pid_ns(current));
3706         l->owner = cgrp;
3707         list_add(&l->links, &cgrp->pidlists);
3708         return l;
3709 }
3710
3711 /*
3712  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3713  */
3714 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3715                               struct cgroup_pidlist **lp)
3716 {
3717         pid_t *array;
3718         int length;
3719         int pid, n = 0; /* used for populating the array */
3720         struct css_task_iter it;
3721         struct task_struct *tsk;
3722         struct cgroup_pidlist *l;
3723
3724         lockdep_assert_held(&cgrp->pidlist_mutex);
3725
3726         /*
3727          * If cgroup gets more users after we read count, we won't have
3728          * enough space - tough.  This race is indistinguishable to the
3729          * caller from the case that the additional cgroup users didn't
3730          * show up until sometime later on.
3731          */
3732         length = cgroup_task_count(cgrp);
3733         array = pidlist_allocate(length);
3734         if (!array)
3735                 return -ENOMEM;
3736         /* now, populate the array */
3737         css_task_iter_start(&cgrp->self, &it);
3738         while ((tsk = css_task_iter_next(&it))) {
3739                 if (unlikely(n == length))
3740                         break;
3741                 /* get tgid or pid for procs or tasks file respectively */
3742                 if (type == CGROUP_FILE_PROCS)
3743                         pid = task_tgid_vnr(tsk);
3744                 else
3745                         pid = task_pid_vnr(tsk);
3746                 if (pid > 0) /* make sure to only use valid results */
3747                         array[n++] = pid;
3748         }
3749         css_task_iter_end(&it);
3750         length = n;
3751         /* now sort & (if procs) strip out duplicates */
3752         if (cgroup_sane_behavior(cgrp))
3753                 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3754         else
3755                 sort(array, length, sizeof(pid_t), cmppid, NULL);
3756         if (type == CGROUP_FILE_PROCS)
3757                 length = pidlist_uniq(array, length);
3758
3759         l = cgroup_pidlist_find_create(cgrp, type);
3760         if (!l) {
3761                 mutex_unlock(&cgrp->pidlist_mutex);
3762                 pidlist_free(array);
3763                 return -ENOMEM;
3764         }
3765
3766         /* store array, freeing old if necessary */
3767         pidlist_free(l->list);
3768         l->list = array;
3769         l->length = length;
3770         *lp = l;
3771         return 0;
3772 }
3773
3774 /**
3775  * cgroupstats_build - build and fill cgroupstats
3776  * @stats: cgroupstats to fill information into
3777  * @dentry: A dentry entry belonging to the cgroup for which stats have
3778  * been requested.
3779  *
3780  * Build and fill cgroupstats so that taskstats can export it to user
3781  * space.
3782  */
3783 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3784 {
3785         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3786         struct cgroup *cgrp;
3787         struct css_task_iter it;
3788         struct task_struct *tsk;
3789
3790         /* it should be kernfs_node belonging to cgroupfs and is a directory */
3791         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3792             kernfs_type(kn) != KERNFS_DIR)
3793                 return -EINVAL;
3794
3795         mutex_lock(&cgroup_mutex);
3796
3797         /*
3798          * We aren't being called from kernfs and there's no guarantee on
3799          * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
3800          * @kn->priv is RCU safe.  Let's do the RCU dancing.
3801          */
3802         rcu_read_lock();
3803         cgrp = rcu_dereference(kn->priv);
3804         if (!cgrp || cgroup_is_dead(cgrp)) {
3805                 rcu_read_unlock();
3806                 mutex_unlock(&cgroup_mutex);
3807                 return -ENOENT;
3808         }
3809         rcu_read_unlock();
3810
3811         css_task_iter_start(&cgrp->self, &it);
3812         while ((tsk = css_task_iter_next(&it))) {
3813                 switch (tsk->state) {
3814                 case TASK_RUNNING:
3815                         stats->nr_running++;
3816                         break;
3817                 case TASK_INTERRUPTIBLE:
3818                         stats->nr_sleeping++;
3819                         break;
3820                 case TASK_UNINTERRUPTIBLE:
3821                         stats->nr_uninterruptible++;
3822                         break;
3823                 case TASK_STOPPED:
3824                         stats->nr_stopped++;
3825                         break;
3826                 default:
3827                         if (delayacct_is_task_waiting_on_io(tsk))
3828                                 stats->nr_io_wait++;
3829                         break;
3830                 }
3831         }
3832         css_task_iter_end(&it);
3833
3834         mutex_unlock(&cgroup_mutex);
3835         return 0;
3836 }
3837
3838
3839 /*
3840  * seq_file methods for the tasks/procs files. The seq_file position is the
3841  * next pid to display; the seq_file iterator is a pointer to the pid
3842  * in the cgroup->l->list array.
3843  */
3844
3845 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3846 {
3847         /*
3848          * Initially we receive a position value that corresponds to
3849          * one more than the last pid shown (or 0 on the first call or
3850          * after a seek to the start). Use a binary-search to find the
3851          * next pid to display, if any
3852          */
3853         struct kernfs_open_file *of = s->private;
3854         struct cgroup *cgrp = seq_css(s)->cgroup;
3855         struct cgroup_pidlist *l;
3856         enum cgroup_filetype type = seq_cft(s)->private;
3857         int index = 0, pid = *pos;
3858         int *iter, ret;
3859
3860         mutex_lock(&cgrp->pidlist_mutex);
3861
3862         /*
3863          * !NULL @of->priv indicates that this isn't the first start()
3864          * after open.  If the matching pidlist is around, we can use that.
3865          * Look for it.  Note that @of->priv can't be used directly.  It
3866          * could already have been destroyed.
3867          */
3868         if (of->priv)
3869                 of->priv = cgroup_pidlist_find(cgrp, type);
3870
3871         /*
3872          * Either this is the first start() after open or the matching
3873          * pidlist has been destroyed inbetween.  Create a new one.
3874          */
3875         if (!of->priv) {
3876                 ret = pidlist_array_load(cgrp, type,
3877                                          (struct cgroup_pidlist **)&of->priv);
3878                 if (ret)
3879                         return ERR_PTR(ret);
3880         }
3881         l = of->priv;
3882
3883         if (pid) {
3884                 int end = l->length;
3885
3886                 while (index < end) {
3887                         int mid = (index + end) / 2;
3888                         if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3889                                 index = mid;
3890                                 break;
3891                         } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3892                                 index = mid + 1;
3893                         else
3894                                 end = mid;
3895                 }
3896         }
3897         /* If we're off the end of the array, we're done */
3898         if (index >= l->length)
3899                 return NULL;
3900         /* Update the abstract position to be the actual pid that we found */
3901         iter = l->list + index;
3902         *pos = cgroup_pid_fry(cgrp, *iter);
3903         return iter;
3904 }
3905
3906 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3907 {
3908         struct kernfs_open_file *of = s->private;
3909         struct cgroup_pidlist *l = of->priv;
3910
3911         if (l)
3912                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3913                                  CGROUP_PIDLIST_DESTROY_DELAY);
3914         mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3915 }
3916
3917 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3918 {
3919         struct kernfs_open_file *of = s->private;
3920         struct cgroup_pidlist *l = of->priv;
3921         pid_t *p = v;
3922         pid_t *end = l->list + l->length;
3923         /*
3924          * Advance to the next pid in the array. If this goes off the
3925          * end, we're done
3926          */
3927         p++;
3928         if (p >= end) {
3929                 return NULL;
3930         } else {
3931                 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3932                 return p;
3933         }
3934 }
3935
/*
 * seq_file ->show() for the pidlist files: print the pid @v points at as
 * one decimal number followed by a newline.
 */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
        int *slot = v;

        return seq_printf(s, "%d\n", *slot);
}
3940
3941 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3942                                          struct cftype *cft)
3943 {
3944         return notify_on_release(css->cgroup);
3945 }
3946
3947 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3948                                           struct cftype *cft, u64 val)
3949 {
3950         clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3951         if (val)
3952                 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3953         else
3954                 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3955         return 0;
3956 }
3957
3958 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3959                                       struct cftype *cft)
3960 {
3961         return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3962 }
3963
3964 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
3965                                        struct cftype *cft, u64 val)
3966 {
3967         if (val)
3968                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3969         else
3970                 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3971         return 0;
3972 }
3973
3974 static struct cftype cgroup_base_files[] = {
3975         {
3976                 .name = "cgroup.procs",
3977                 .seq_start = cgroup_pidlist_start,
3978                 .seq_next = cgroup_pidlist_next,
3979                 .seq_stop = cgroup_pidlist_stop,
3980                 .seq_show = cgroup_pidlist_show,
3981                 .private = CGROUP_FILE_PROCS,
3982                 .write = cgroup_procs_write,
3983                 .mode = S_IRUGO | S_IWUSR,
3984         },
3985         {
3986                 .name = "cgroup.clone_children",
3987                 .flags = CFTYPE_INSANE,
3988                 .read_u64 = cgroup_clone_children_read,
3989                 .write_u64 = cgroup_clone_children_write,
3990         },
3991         {
3992                 .name = "cgroup.sane_behavior",
3993                 .flags = CFTYPE_ONLY_ON_ROOT,
3994                 .seq_show = cgroup_sane_behavior_show,
3995         },
3996         {
3997                 .name = "cgroup.controllers",
3998                 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
3999                 .seq_show = cgroup_root_controllers_show,
4000         },
4001         {
4002                 .name = "cgroup.controllers",
4003                 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4004                 .seq_show = cgroup_controllers_show,
4005         },
4006         {
4007                 .name = "cgroup.subtree_control",
4008                 .flags = CFTYPE_ONLY_ON_DFL,
4009                 .seq_show = cgroup_subtree_control_show,
4010                 .write = cgroup_subtree_control_write,
4011         },
4012         {
4013                 .name = "cgroup.populated",
4014                 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4015                 .seq_show = cgroup_populated_show,
4016         },
4017
4018         /*
4019          * Historical crazy stuff.  These don't have "cgroup."  prefix and
4020          * don't exist if sane_behavior.  If you're depending on these, be
4021          * prepared to be burned.
4022          */
4023         {
4024                 .name = "tasks",
4025                 .flags = CFTYPE_INSANE,         /* use "procs" instead */
4026                 .seq_start = cgroup_pidlist_start,
4027                 .seq_next = cgroup_pidlist_next,
4028                 .seq_stop = cgroup_pidlist_stop,
4029                 .seq_show = cgroup_pidlist_show,
4030                 .private = CGROUP_FILE_TASKS,
4031                 .write = cgroup_tasks_write,
4032                 .mode = S_IRUGO | S_IWUSR,
4033         },
4034         {
4035                 .name = "notify_on_release",
4036                 .flags = CFTYPE_INSANE,
4037                 .read_u64 = cgroup_read_notify_on_release,
4038                 .write_u64 = cgroup_write_notify_on_release,
4039         },
4040         {
4041                 .name = "release_agent",
4042                 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
4043                 .seq_show = cgroup_release_agent_show,
4044                 .write = cgroup_release_agent_write,
4045                 .max_write_len = PATH_MAX - 1,
4046         },
4047         { }     /* terminate */
4048 };
4049
4050 /**
4051  * cgroup_populate_dir - create subsys files in a cgroup directory
4052  * @cgrp: target cgroup
4053  * @subsys_mask: mask of the subsystem ids whose files should be added
4054  *
4055  * On failure, no file is added.
4056  */
4057 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
4058 {
4059         struct cgroup_subsys *ss;
4060         int i, ret = 0;
4061
4062         /* process cftsets of each subsystem */
4063         for_each_subsys(ss, i) {
4064                 struct cftype *cfts;
4065
4066                 if (!(subsys_mask & (1 << i)))
4067                         continue;
4068
4069                 list_for_each_entry(cfts, &ss->cfts, node) {
4070                         ret = cgroup_addrm_files(cgrp, cfts, true);
4071                         if (ret < 0)
4072                                 goto err;
4073                 }
4074         }
4075         return 0;
4076 err:
4077         cgroup_clear_dir(cgrp, subsys_mask);
4078         return ret;
4079 }
4080
4081 /*
4082  * css destruction is four-stage process.
4083  *
4084  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
4085  *    Implemented in kill_css().
4086  *
4087  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4088  *    and thus css_tryget_online() is guaranteed to fail, the css can be
4089  *    offlined by invoking offline_css().  After offlining, the base ref is
4090  *    put.  Implemented in css_killed_work_fn().
4091  *
4092  * 3. When the percpu_ref reaches zero, the only possible remaining
4093  *    accessors are inside RCU read sections.  css_release() schedules the
4094  *    RCU callback.
4095  *
4096  * 4. After the grace period, the css can be freed.  Implemented in
4097  *    css_free_work_fn().
4098  *
4099  * It is actually hairier because both step 2 and 4 require process context
4100  * and thus involve punting to css->destroy_work adding two additional
4101  * steps to the already complex sequence.
4102  */
4103 static void css_free_work_fn(struct work_struct *work)
4104 {
4105         struct cgroup_subsys_state *css =
4106                 container_of(work, struct cgroup_subsys_state, destroy_work);
4107         struct cgroup *cgrp = css->cgroup;
4108
4109         if (css->parent)
4110                 css_put(css->parent);
4111
4112         css->ss->css_free(css);
4113         cgroup_put(cgrp);
4114 }
4115
4116 static void css_free_rcu_fn(struct rcu_head *rcu_head)
4117 {
4118         struct cgroup_subsys_state *css =
4119                 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4120
4121         INIT_WORK(&css->destroy_work, css_free_work_fn);
4122         queue_work(cgroup_destroy_wq, &css->destroy_work);
4123 }
4124
4125 static void css_release(struct percpu_ref *ref)
4126 {
4127         struct cgroup_subsys_state *css =
4128                 container_of(ref, struct cgroup_subsys_state, refcnt);
4129         struct cgroup_subsys *ss = css->ss;
4130
4131         cgroup_idr_remove(&ss->css_idr, css->id);
4132
4133         call_rcu(&css->rcu_head, css_free_rcu_fn);
4134 }
4135
4136 static void init_and_link_css(struct cgroup_subsys_state *css,
4137                               struct cgroup_subsys *ss, struct cgroup *cgrp)
4138 {
4139         cgroup_get(cgrp);
4140
4141         css->cgroup = cgrp;
4142         css->ss = ss;
4143         css->flags = 0;
4144
4145         if (cgrp->parent) {
4146                 css->parent = cgroup_css(cgrp->parent, ss);
4147                 css_get(css->parent);
4148         } else {
4149                 css->flags |= CSS_ROOT;
4150         }
4151
4152         BUG_ON(cgroup_css(cgrp, ss));
4153 }
4154
4155 /* invoke ->css_online() on a new CSS and mark it online if successful */
4156 static int online_css(struct cgroup_subsys_state *css)
4157 {
4158         struct cgroup_subsys *ss = css->ss;
4159         int ret = 0;
4160
4161         lockdep_assert_held(&cgroup_mutex);
4162
4163         if (ss->css_online)
4164                 ret = ss->css_online(css);
4165         if (!ret) {
4166                 css->flags |= CSS_ONLINE;
4167                 css->cgroup->nr_css++;
4168                 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4169         }
4170         return ret;
4171 }
4172
4173 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4174 static void offline_css(struct cgroup_subsys_state *css)
4175 {
4176         struct cgroup_subsys *ss = css->ss;
4177
4178         lockdep_assert_held(&cgroup_mutex);
4179
4180         if (!(css->flags & CSS_ONLINE))
4181                 return;
4182
4183         if (ss->css_offline)
4184                 ss->css_offline(css);
4185
4186         css->flags &= ~CSS_ONLINE;
4187         css->cgroup->nr_css--;
4188         RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4189
4190         wake_up_all(&css->cgroup->offline_waitq);
4191 }
4192
4193 /**
4194  * create_css - create a cgroup_subsys_state
4195  * @cgrp: the cgroup new css will be associated with
4196  * @ss: the subsys of new css
4197  *
4198  * Create a new css associated with @cgrp - @ss pair.  On success, the new
4199  * css is online and installed in @cgrp with all interface files created.
4200  * Returns 0 on success, -errno on failure.
4201  */
4202 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4203 {
4204         struct cgroup *parent = cgrp->parent;
4205         struct cgroup_subsys_state *css;
4206         int err;
4207
4208         lockdep_assert_held(&cgroup_mutex);
4209
4210         css = ss->css_alloc(cgroup_css(parent, ss));
4211         if (IS_ERR(css))
4212                 return PTR_ERR(css);
4213
4214         init_and_link_css(css, ss, cgrp);
4215
4216         err = percpu_ref_init(&css->refcnt, css_release);
4217         if (err)
4218                 goto err_free_css;
4219
4220         err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4221         if (err < 0)
4222                 goto err_free_percpu_ref;
4223         css->id = err;
4224
4225         err = cgroup_populate_dir(cgrp, 1 << ss->id);
4226         if (err)
4227                 goto err_free_id;
4228
4229         /* @css is ready to be brought online now, make it visible */
4230         cgroup_idr_replace(&ss->css_idr, css, css->id);
4231
4232         err = online_css(css);
4233         if (err)
4234                 goto err_clear_dir;
4235
4236         if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4237             parent->parent) {
4238                 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4239                         current->comm, current->pid, ss->name);
4240                 if (!strcmp(ss->name, "memory"))
4241                         pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4242                 ss->warned_broken_hierarchy = true;
4243         }
4244
4245         return 0;
4246
4247 err_clear_dir:
4248         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4249 err_free_id:
4250         cgroup_idr_remove(&ss->css_idr, css->id);
4251 err_free_percpu_ref:
4252         percpu_ref_cancel_init(&css->refcnt);
4253 err_free_css:
4254         call_rcu(&css->rcu_head, css_free_rcu_fn);
4255         return err;
4256 }
4257
4258 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4259                         umode_t mode)
4260 {
4261         struct cgroup *parent, *cgrp;
4262         struct cgroup_root *root;
4263         struct cgroup_subsys *ss;
4264         struct kernfs_node *kn;
4265         int ssid, ret;
4266
4267         parent = cgroup_kn_lock_live(parent_kn);
4268         if (!parent)
4269                 return -ENODEV;
4270         root = parent->root;
4271
4272         /* allocate the cgroup and its ID, 0 is reserved for the root */
4273         cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4274         if (!cgrp) {
4275                 ret = -ENOMEM;
4276                 goto out_unlock;
4277         }
4278
4279         /*
4280          * Temporarily set the pointer to NULL, so idr_find() won't return
4281          * a half-baked cgroup.
4282          */
4283         cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
4284         if (cgrp->id < 0) {
4285                 ret = -ENOMEM;
4286                 goto out_free_cgrp;
4287         }
4288
4289         init_cgroup_housekeeping(cgrp);
4290
4291         cgrp->parent = parent;
4292         cgrp->self.parent = &parent->self;
4293         cgrp->root = root;
4294
4295         if (notify_on_release(parent))
4296                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4297
4298         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4299                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4300
4301         /* create the directory */
4302         kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
4303         if (IS_ERR(kn)) {
4304                 ret = PTR_ERR(kn);
4305                 goto out_free_id;
4306         }
4307         cgrp->kn = kn;
4308
4309         /*
4310          * This extra ref will be put in cgroup_free_fn() and guarantees
4311          * that @cgrp->kn is always accessible.
4312          */
4313         kernfs_get(kn);
4314
4315         cgrp->serial_nr = cgroup_serial_nr_next++;
4316
4317         /* allocation complete, commit to creation */
4318         list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4319         atomic_inc(&root->nr_cgrps);
4320         cgroup_get(parent);
4321
4322         /*
4323          * @cgrp is now fully operational.  If something fails after this
4324          * point, it'll be released via the normal destruction path.
4325          */
4326         cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4327
4328         ret = cgroup_kn_set_ugid(kn);
4329         if (ret)
4330                 goto out_destroy;
4331
4332         ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4333         if (ret)
4334                 goto out_destroy;
4335
4336         /* let's create and online css's */
4337         for_each_subsys(ss, ssid) {
4338                 if (parent->child_subsys_mask & (1 << ssid)) {
4339                         ret = create_css(cgrp, ss);
4340                         if (ret)
4341                                 goto out_destroy;
4342                 }
4343         }
4344
4345         /*
4346          * On the default hierarchy, a child doesn't automatically inherit
4347          * child_subsys_mask from the parent.  Each is configured manually.
4348          */
4349         if (!cgroup_on_dfl(cgrp))
4350                 cgrp->child_subsys_mask = parent->child_subsys_mask;
4351
4352         kernfs_activate(kn);
4353
4354         ret = 0;
4355         goto out_unlock;
4356
4357 out_free_id:
4358         cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4359 out_free_cgrp:
4360         kfree(cgrp);
4361 out_unlock:
4362         cgroup_kn_unlock(parent_kn);
4363         return ret;
4364
4365 out_destroy:
4366         cgroup_destroy_locked(cgrp);
4367         goto out_unlock;
4368 }
4369
4370 /*
4371  * This is called when the refcnt of a css is confirmed to be killed.
4372  * css_tryget_online() is now guaranteed to fail.
4373  */
4374 static void css_killed_work_fn(struct work_struct *work)
4375 {
4376         struct cgroup_subsys_state *css =
4377                 container_of(work, struct cgroup_subsys_state, destroy_work);
4378         struct cgroup *cgrp = css->cgroup;
4379
4380         mutex_lock(&cgroup_mutex);
4381
4382         /*
4383          * css_tryget_online() is guaranteed to fail now.  Tell subsystems
4384          * to initate destruction.
4385          */
4386         offline_css(css);
4387
4388         /*
4389          * If @cgrp is marked dead, it's waiting for refs of all css's to
4390          * be disabled before proceeding to the second phase of cgroup
4391          * destruction.  If we are the last one, kick it off.
4392          */
4393         if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4394                 cgroup_destroy_css_killed(cgrp);
4395
4396         mutex_unlock(&cgroup_mutex);
4397
4398         /*
4399          * Put the css refs from kill_css().  Each css holds an extra
4400          * reference to the cgroup's dentry and cgroup removal proceeds
4401          * regardless of css refs.  On the last put of each css, whenever
4402          * that may be, the extra dentry ref is put so that dentry
4403          * destruction happens only after all css's are released.
4404          */
4405         css_put(css);
4406 }
4407
4408 /* css kill confirmation processing requires process context, bounce */
4409 static void css_killed_ref_fn(struct percpu_ref *ref)
4410 {
4411         struct cgroup_subsys_state *css =
4412                 container_of(ref, struct cgroup_subsys_state, refcnt);
4413
4414         INIT_WORK(&css->destroy_work, css_killed_work_fn);
4415         queue_work(cgroup_destroy_wq, &css->destroy_work);
4416 }
4417
4418 /**
4419  * kill_css - destroy a css
4420  * @css: css to destroy
4421  *
4422  * This function initiates destruction of @css by removing cgroup interface
4423  * files and putting its base reference.  ->css_offline() will be invoked
4424  * asynchronously once css_tryget_online() is guaranteed to fail and when
4425  * the reference count reaches zero, @css will be released.
4426  */
4427 static void kill_css(struct cgroup_subsys_state *css)
4428 {
4429         lockdep_assert_held(&cgroup_mutex);
4430
4431         /*
4432          * This must happen before css is disassociated with its cgroup.
4433          * See seq_css() for details.
4434          */
4435         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4436
4437         /*
4438          * Killing would put the base ref, but we need to keep it alive
4439          * until after ->css_offline().
4440          */
4441         css_get(css);
4442
4443         /*
4444          * cgroup core guarantees that, by the time ->css_offline() is
4445          * invoked, no new css reference will be given out via
4446          * css_tryget_online().  We can't simply call percpu_ref_kill() and
4447          * proceed to offlining css's because percpu_ref_kill() doesn't
4448          * guarantee that the ref is seen as killed on all CPUs on return.
4449          *
4450          * Use percpu_ref_kill_and_confirm() to get notifications as each
4451          * css is confirmed to be seen as killed on all CPUs.
4452          */
4453         percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4454 }
4455
4456 /**
4457  * cgroup_destroy_locked - the first stage of cgroup destruction
4458  * @cgrp: cgroup to be destroyed
4459  *
4460  * css's make use of percpu refcnts whose killing latency shouldn't be
4461  * exposed to userland and are RCU protected.  Also, cgroup core needs to
4462  * guarantee that css_tryget_online() won't succeed by the time
4463  * ->css_offline() is invoked.  To satisfy all the requirements,
4464  * destruction is implemented in the following two steps.
4465  *
4466  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
4467  *     userland visible parts and start killing the percpu refcnts of
4468  *     css's.  Set up so that the next stage will be kicked off once all
4469  *     the percpu refcnts are confirmed to be killed.
4470  *
4471  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4472  *     rest of destruction.  Once all cgroup references are gone, the
4473  *     cgroup is RCU-freed.
4474  *
4475  * This function implements s1.  After this step, @cgrp is gone as far as
4476  * the userland is concerned and a new cgroup with the same name may be
4477  * created.  As cgroup doesn't care about the names internally, this
4478  * doesn't cause any problem.
4479  */
4480 static int cgroup_destroy_locked(struct cgroup *cgrp)
4481         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4482 {
4483         struct cgroup_subsys_state *css;
4484         bool empty;
4485         int ssid;
4486
4487         lockdep_assert_held(&cgroup_mutex);
4488
4489         /*
4490          * css_set_rwsem synchronizes access to ->cset_links and prevents
4491          * @cgrp from being removed while put_css_set() is in progress.
4492          */
4493         down_read(&css_set_rwsem);
4494         empty = list_empty(&cgrp->cset_links);
4495         up_read(&css_set_rwsem);
4496         if (!empty)
4497                 return -EBUSY;
4498
4499         /*
4500          * Make sure there's no live children.  We can't test ->children
4501          * emptiness as dead children linger on it while being destroyed;
4502          * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4503          */
4504         if (cgroup_has_live_children(cgrp))
4505                 return -EBUSY;
4506
4507         /*
4508          * Mark @cgrp dead.  This prevents further task migration and child
4509          * creation by disabling cgroup_lock_live_group().  Note that
4510          * CGRP_DEAD assertion is depended upon by css_next_child() to
4511          * resume iteration after dropping RCU read lock.  See
4512          * css_next_child() for details.
4513          */
4514         set_bit(CGRP_DEAD, &cgrp->flags);
4515
4516         /*
4517          * Initiate massacre of all css's.  cgroup_destroy_css_killed()
4518          * will be invoked to perform the rest of destruction once the
4519          * percpu refs of all css's are confirmed to be killed.
4520          */
4521         for_each_css(css, ssid, cgrp)
4522                 kill_css(css);
4523
4524         /* CGRP_DEAD is set, remove from ->release_list for the last time */
4525         raw_spin_lock(&release_list_lock);
4526         if (!list_empty(&cgrp->release_list))
4527                 list_del_init(&cgrp->release_list);
4528         raw_spin_unlock(&release_list_lock);
4529
4530         /*
4531          * If @cgrp has css's attached, the second stage of cgroup
4532          * destruction is kicked off from css_killed_work_fn() after the
4533          * refs of all attached css's are killed.  If @cgrp doesn't have
4534          * any css, we kick it off here.
4535          */
4536         if (!cgrp->nr_css)
4537                 cgroup_destroy_css_killed(cgrp);
4538
4539         /*
4540          * Remove @cgrp directory along with the base files.  @cgrp has an
4541          * extra ref on its kn.
4542          */
4543         kernfs_remove(cgrp->kn);
4544
4545         return 0;
4546 };
4547
4548 /**
4549  * cgroup_destroy_css_killed - the second step of cgroup destruction
4550  * @cgrp: the cgroup whose csses have just finished offlining
4551  *
4552  * This function is invoked from a work item for a cgroup which is being
4553  * destroyed after all css's are offlined and performs the rest of
4554  * destruction.  This is the second step of destruction described in the
4555  * comment above cgroup_destroy_locked().
4556  */
4557 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4558 {
4559         struct cgroup *parent = cgrp->parent;
4560
4561         lockdep_assert_held(&cgroup_mutex);
4562
4563         /* delete this cgroup from parent->children */
4564         list_del_rcu(&cgrp->sibling);
4565
4566         cgroup_put(cgrp);
4567
4568         set_bit(CGRP_RELEASABLE, &parent->flags);
4569         check_for_release(parent);
4570 }
4571
4572 static int cgroup_rmdir(struct kernfs_node *kn)
4573 {
4574         struct cgroup *cgrp;
4575         int ret = 0;
4576
4577         cgrp = cgroup_kn_lock_live(kn);
4578         if (!cgrp)
4579                 return 0;
4580         cgroup_get(cgrp);       /* for @kn->priv clearing */
4581
4582         ret = cgroup_destroy_locked(cgrp);
4583
4584         cgroup_kn_unlock(kn);
4585
4586         /*
4587          * There are two control paths which try to determine cgroup from
4588          * dentry without going through kernfs - cgroupstats_build() and
4589          * css_tryget_online_from_dir().  Those are supported by RCU
4590          * protecting clearing of cgrp->kn->priv backpointer, which should
4591          * happen after all files under it have been removed.
4592          */
4593         if (!ret)
4594                 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4595
4596         cgroup_put(cgrp);
4597         return ret;
4598 }
4599
4600 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4601         .remount_fs             = cgroup_remount,
4602         .show_options           = cgroup_show_options,
4603         .mkdir                  = cgroup_mkdir,
4604         .rmdir                  = cgroup_rmdir,
4605         .rename                 = cgroup_rename,
4606 };
4607
4608 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4609 {
4610         struct cgroup_subsys_state *css;
4611
4612         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4613
4614         mutex_lock(&cgroup_mutex);
4615
4616         idr_init(&ss->css_idr);
4617         INIT_LIST_HEAD(&ss->cfts);
4618
4619         /* Create the root cgroup state for this subsystem */
4620         ss->root = &cgrp_dfl_root;
4621         css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4622         /* We don't handle early failures gracefully */
4623         BUG_ON(IS_ERR(css));
4624         init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4625         if (early) {
4626                 /* idr_alloc() can't be called safely during early init */
4627                 css->id = 1;
4628         } else {
4629                 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4630                 BUG_ON(css->id < 0);
4631         }
4632
4633         /* Update the init_css_set to contain a subsys
4634          * pointer to this state - since the subsystem is
4635          * newly registered, all tasks and hence the
4636          * init_css_set is in the subsystem's root cgroup. */
4637         init_css_set.subsys[ss->id] = css;
4638
4639         need_forkexit_callback |= ss->fork || ss->exit;
4640
4641         /* At system boot, before all subsystems have been
4642          * registered, no tasks have been forked, so we don't
4643          * need to invoke fork callbacks here. */
4644         BUG_ON(!list_empty(&init_task.tasks));
4645
4646         BUG_ON(online_css(css));
4647
4648         cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4649
4650         mutex_unlock(&cgroup_mutex);
4651 }
4652
4653 /**
4654  * cgroup_init_early - cgroup initialization at system boot
4655  *
4656  * Initialize cgroups at system boot, and initialize any
4657  * subsystems that request early init.
4658  */
4659 int __init cgroup_init_early(void)
4660 {
4661         static struct cgroup_sb_opts __initdata opts =
4662                 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4663         struct cgroup_subsys *ss;
4664         int i;
4665
4666         init_cgroup_root(&cgrp_dfl_root, &opts);
4667         RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4668
4669         for_each_subsys(ss, i) {
4670                 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4671                      "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4672                      i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4673                      ss->id, ss->name);
4674                 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4675                      "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4676
4677                 ss->id = i;
4678                 ss->name = cgroup_subsys_name[i];
4679
4680                 if (ss->early_init)
4681                         cgroup_init_subsys(ss, true);
4682         }
4683         return 0;
4684 }
4685
4686 /**
4687  * cgroup_init - cgroup initialization
4688  *
4689  * Register cgroup filesystem and /proc file, and initialize
4690  * any subsystems that didn't request early init.
4691  */
4692 int __init cgroup_init(void)
4693 {
4694         struct cgroup_subsys *ss;
4695         unsigned long key;
4696         int ssid, err;
4697
4698         BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4699
4700         mutex_lock(&cgroup_mutex);
4701
4702         /* Add init_css_set to the hash table */
4703         key = css_set_hash(init_css_set.subsys);
4704         hash_add(css_set_table, &init_css_set.hlist, key);
4705
4706         BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4707
4708         mutex_unlock(&cgroup_mutex);
4709
4710         for_each_subsys(ss, ssid) {
4711                 if (ss->early_init) {
4712                         struct cgroup_subsys_state *css =
4713                                 init_css_set.subsys[ss->id];
4714
4715                         css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4716                                                    GFP_KERNEL);
4717                         BUG_ON(css->id < 0);
4718                 } else {
4719                         cgroup_init_subsys(ss, false);
4720                 }
4721
4722                 list_add_tail(&init_css_set.e_cset_node[ssid],
4723                               &cgrp_dfl_root.cgrp.e_csets[ssid]);
4724
4725                 /*
4726                  * cftype registration needs kmalloc and can't be done
4727                  * during early_init.  Register base cftypes separately.
4728                  */
4729                 if (ss->base_cftypes)
4730                         WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4731         }
4732
4733         cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4734         if (!cgroup_kobj)
4735                 return -ENOMEM;
4736
4737         err = register_filesystem(&cgroup_fs_type);
4738         if (err < 0) {
4739                 kobject_put(cgroup_kobj);
4740                 return err;
4741         }
4742
4743         proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4744         return 0;
4745 }
4746
4747 static int __init cgroup_wq_init(void)
4748 {
4749         /*
4750          * There isn't much point in executing destruction path in
4751          * parallel.  Good chunk is serialized with cgroup_mutex anyway.
4752          * Use 1 for @max_active.
4753          *
4754          * We would prefer to do this in cgroup_init() above, but that
4755          * is called before init_workqueues(): so leave this until after.
4756          */
4757         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4758         BUG_ON(!cgroup_destroy_wq);
4759
4760         /*
4761          * Used to destroy pidlists and separate to serve as flush domain.
4762          * Cap @max_active to 1 too.
4763          */
4764         cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4765                                                     0, 1);
4766         BUG_ON(!cgroup_pidlist_destroy_wq);
4767
4768         return 0;
4769 }
4770 core_initcall(cgroup_wq_init);
4771
4772 /*
4773  * proc_cgroup_show()
4774  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
4775  *  - Used for /proc/<pid>/cgroup.
4776  */
4777
4778 /* TODO: Use a proper seq_file iterator */
4779 int proc_cgroup_show(struct seq_file *m, void *v)
4780 {
4781         struct pid *pid;
4782         struct task_struct *tsk;
4783         char *buf, *path;
4784         int retval;
4785         struct cgroup_root *root;
4786
4787         retval = -ENOMEM;
4788         buf = kmalloc(PATH_MAX, GFP_KERNEL);
4789         if (!buf)
4790                 goto out;
4791
4792         retval = -ESRCH;
4793         pid = m->private;
4794         tsk = get_pid_task(pid, PIDTYPE_PID);
4795         if (!tsk)
4796                 goto out_free;
4797
4798         retval = 0;
4799
4800         mutex_lock(&cgroup_mutex);
4801         down_read(&css_set_rwsem);
4802
4803         for_each_root(root) {
4804                 struct cgroup_subsys *ss;
4805                 struct cgroup *cgrp;
4806                 int ssid, count = 0;
4807
4808                 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4809                         continue;
4810
4811                 seq_printf(m, "%d:", root->hierarchy_id);
4812                 for_each_subsys(ss, ssid)
4813                         if (root->subsys_mask & (1 << ssid))
4814                                 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4815                 if (strlen(root->name))
4816                         seq_printf(m, "%sname=%s", count ? "," : "",
4817                                    root->name);
4818                 seq_putc(m, ':');
4819                 cgrp = task_cgroup_from_root(tsk, root);
4820                 path = cgroup_path(cgrp, buf, PATH_MAX);
4821                 if (!path) {
4822                         retval = -ENAMETOOLONG;
4823                         goto out_unlock;
4824                 }
4825                 seq_puts(m, path);
4826                 seq_putc(m, '\n');
4827         }
4828
4829 out_unlock:
4830         up_read(&css_set_rwsem);
4831         mutex_unlock(&cgroup_mutex);
4832         put_task_struct(tsk);
4833 out_free:
4834         kfree(buf);
4835 out:
4836         return retval;
4837 }
4838
4839 /* Display information about each subsystem and each hierarchy */
4840 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4841 {
4842         struct cgroup_subsys *ss;
4843         int i;
4844
4845         seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4846         /*
4847          * ideally we don't want subsystems moving around while we do this.
4848          * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4849          * subsys/hierarchy state.
4850          */
4851         mutex_lock(&cgroup_mutex);
4852
4853         for_each_subsys(ss, i)
4854                 seq_printf(m, "%s\t%d\t%d\t%d\n",
4855                            ss->name, ss->root->hierarchy_id,
4856                            atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4857
4858         mutex_unlock(&cgroup_mutex);
4859         return 0;
4860 }
4861
4862 static int cgroupstats_open(struct inode *inode, struct file *file)
4863 {
4864         return single_open(file, proc_cgroupstats_show, NULL);
4865 }
4866
4867 static const struct file_operations proc_cgroupstats_operations = {
4868         .open = cgroupstats_open,
4869         .read = seq_read,
4870         .llseek = seq_lseek,
4871         .release = single_release,
4872 };
4873
4874 /**
4875  * cgroup_fork - initialize cgroup related fields during copy_process()
4876  * @child: pointer to task_struct of forking parent process.
4877  *
4878  * A task is associated with the init_css_set until cgroup_post_fork()
4879  * attaches it to the parent's css_set.  Empty cg_list indicates that
4880  * @child isn't holding reference to its css_set.
4881  */
4882 void cgroup_fork(struct task_struct *child)
4883 {
4884         RCU_INIT_POINTER(child->cgroups, &init_css_set);
4885         INIT_LIST_HEAD(&child->cg_list);
4886 }
4887
4888 /**
4889  * cgroup_post_fork - called on a new task after adding it to the task list
4890  * @child: the task in question
4891  *
4892  * Adds the task to the list running through its css_set if necessary and
4893  * call the subsystem fork() callbacks.  Has to be after the task is
4894  * visible on the task list in case we race with the first call to
4895  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
4896  * list.
4897  */
4898 void cgroup_post_fork(struct task_struct *child)
4899 {
4900         struct cgroup_subsys *ss;
4901         int i;
4902
4903         /*
4904          * This may race against cgroup_enable_task_cg_links().  As that
4905          * function sets use_task_css_set_links before grabbing
4906          * tasklist_lock and we just went through tasklist_lock to add
4907          * @child, it's guaranteed that either we see the set
4908          * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
4909          * @child during its iteration.
4910          *
4911          * If we won the race, @child is associated with %current's
4912          * css_set.  Grabbing css_set_rwsem guarantees both that the
4913          * association is stable, and, on completion of the parent's
4914          * migration, @child is visible in the source of migration or
4915          * already in the destination cgroup.  This guarantee is necessary
4916          * when implementing operations which need to migrate all tasks of
4917          * a cgroup to another.
4918          *
4919          * Note that if we lose to cgroup_enable_task_cg_links(), @child
4920          * will remain in init_css_set.  This is safe because all tasks are
4921          * in the init_css_set before cg_links is enabled and there's no
4922          * operation which transfers all tasks out of init_css_set.
4923          */
4924         if (use_task_css_set_links) {
4925                 struct css_set *cset;
4926
4927                 down_write(&css_set_rwsem);
4928                 cset = task_css_set(current);
4929                 if (list_empty(&child->cg_list)) {
4930                         rcu_assign_pointer(child->cgroups, cset);
4931                         list_add(&child->cg_list, &cset->tasks);
4932                         get_css_set(cset);
4933                 }
4934                 up_write(&css_set_rwsem);
4935         }
4936
4937         /*
4938          * Call ss->fork().  This must happen after @child is linked on
4939          * css_set; otherwise, @child might change state between ->fork()
4940          * and addition to css_set.
4941          */
4942         if (need_forkexit_callback) {
4943                 for_each_subsys(ss, i)
4944                         if (ss->fork)
4945                                 ss->fork(child);
4946         }
4947 }
4948
4949 /**
4950  * cgroup_exit - detach cgroup from exiting task
4951  * @tsk: pointer to task_struct of exiting process
4952  *
4953  * Description: Detach cgroup from @tsk and release it.
4954  *
4955  * Note that cgroups marked notify_on_release force every task in
4956  * them to take the global cgroup_mutex mutex when exiting.
4957  * This could impact scaling on very large systems.  Be reluctant to
4958  * use notify_on_release cgroups where very high task exit scaling
4959  * is required on large systems.
4960  *
4961  * We set the exiting tasks cgroup to the root cgroup (top_cgroup).  We
4962  * call cgroup_exit() while the task is still competent to handle
4963  * notify_on_release(), then leave the task attached to the root cgroup in
4964  * each hierarchy for the remainder of its exit.  No need to bother with
4965  * init_css_set refcnting.  init_css_set never goes away and we can't race
4966  * with migration path - PF_EXITING is visible to migration path.
4967  */
4968 void cgroup_exit(struct task_struct *tsk)
4969 {
4970         struct cgroup_subsys *ss;
4971         struct css_set *cset;
4972         bool put_cset = false;
4973         int i;
4974
4975         /*
4976          * Unlink from @tsk from its css_set.  As migration path can't race
4977          * with us, we can check cg_list without grabbing css_set_rwsem.
4978          */
4979         if (!list_empty(&tsk->cg_list)) {
4980                 down_write(&css_set_rwsem);
4981                 list_del_init(&tsk->cg_list);
4982                 up_write(&css_set_rwsem);
4983                 put_cset = true;
4984         }
4985
4986         /* Reassign the task to the init_css_set. */
4987         cset = task_css_set(tsk);
4988         RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4989
4990         if (need_forkexit_callback) {
4991                 /* see cgroup_post_fork() for details */
4992                 for_each_subsys(ss, i) {
4993                         if (ss->exit) {
4994                                 struct cgroup_subsys_state *old_css = cset->subsys[i];
4995                                 struct cgroup_subsys_state *css = task_css(tsk, i);
4996
4997                                 ss->exit(css, old_css, tsk);
4998                         }
4999                 }
5000         }
5001
5002         if (put_cset)
5003                 put_css_set(cset, true);
5004 }
5005
5006 static void check_for_release(struct cgroup *cgrp)
5007 {
5008         if (cgroup_is_releasable(cgrp) &&
5009             list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
5010                 /*
5011                  * Control Group is currently removeable. If it's not
5012                  * already queued for a userspace notification, queue
5013                  * it now
5014                  */
5015                 int need_schedule_work = 0;
5016
5017                 raw_spin_lock(&release_list_lock);
5018                 if (!cgroup_is_dead(cgrp) &&
5019                     list_empty(&cgrp->release_list)) {
5020                         list_add(&cgrp->release_list, &release_list);
5021                         need_schedule_work = 1;
5022                 }
5023                 raw_spin_unlock(&release_list_lock);
5024                 if (need_schedule_work)
5025                         schedule_work(&release_agent_work);
5026         }
5027 }
5028
5029 /*
5030  * Notify userspace when a cgroup is released, by running the
5031  * configured release agent with the name of the cgroup (path
5032  * relative to the root of cgroup file system) as the argument.
5033  *
5034  * Most likely, this user command will try to rmdir this cgroup.
5035  *
5036  * This races with the possibility that some other task will be
5037  * attached to this cgroup before it is removed, or that some other
5038  * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
5039  * The presumed 'rmdir' will fail quietly if this cgroup is no longer
5040  * unused, and this cgroup will be reprieved from its death sentence,
5041  * to continue to serve a useful existence.  Next time it's released,
5042  * we will get notified again, if it still has 'notify_on_release' set.
5043  *
5044  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
5045  * means only wait until the task is successfully execve()'d.  The
5046  * separate release agent task is forked by call_usermodehelper(),
5047  * then control in this thread returns here, without waiting for the
5048  * release agent task.  We don't bother to wait because the caller of
5049  * this routine has no use for the exit status of the release agent
5050  * task, so no sense holding our caller up for that.
5051  */
5052 static void cgroup_release_agent(struct work_struct *work)
5053 {
5054         BUG_ON(work != &release_agent_work);
5055         mutex_lock(&cgroup_mutex);
5056         raw_spin_lock(&release_list_lock);
5057         while (!list_empty(&release_list)) {
5058                 char *argv[3], *envp[3];
5059                 int i;
5060                 char *pathbuf = NULL, *agentbuf = NULL, *path;
5061                 struct cgroup *cgrp = list_entry(release_list.next,
5062                                                     struct cgroup,
5063                                                     release_list);
5064                 list_del_init(&cgrp->release_list);
5065                 raw_spin_unlock(&release_list_lock);
5066                 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5067                 if (!pathbuf)
5068                         goto continue_free;
5069                 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5070                 if (!path)
5071                         goto continue_free;
5072                 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5073                 if (!agentbuf)
5074                         goto continue_free;
5075
5076                 i = 0;
5077                 argv[i++] = agentbuf;
5078                 argv[i++] = path;
5079                 argv[i] = NULL;
5080
5081                 i = 0;
5082                 /* minimal command environment */
5083                 envp[i++] = "HOME=/";
5084                 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5085                 envp[i] = NULL;
5086
5087                 /* Drop the lock while we invoke the usermode helper,
5088                  * since the exec could involve hitting disk and hence
5089                  * be a slow process */
5090                 mutex_unlock(&cgroup_mutex);
5091                 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5092                 mutex_lock(&cgroup_mutex);
5093  continue_free:
5094                 kfree(pathbuf);
5095                 kfree(agentbuf);
5096                 raw_spin_lock(&release_list_lock);
5097         }
5098         raw_spin_unlock(&release_list_lock);
5099         mutex_unlock(&cgroup_mutex);
5100 }
5101
5102 static int __init cgroup_disable(char *str)
5103 {
5104         struct cgroup_subsys *ss;
5105         char *token;
5106         int i;
5107
5108         while ((token = strsep(&str, ",")) != NULL) {
5109                 if (!*token)
5110                         continue;
5111
5112                 for_each_subsys(ss, i) {
5113                         if (!strcmp(token, ss->name)) {
5114                                 ss->disabled = 1;
5115                                 printk(KERN_INFO "Disabling %s control group"
5116                                         " subsystem\n", ss->name);
5117                                 break;
5118                         }
5119                 }
5120         }
5121         return 1;
5122 }
5123 __setup("cgroup_disable=", cgroup_disable);
5124
5125 /**
5126  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5127  * @dentry: directory dentry of interest
5128  * @ss: subsystem of interest
5129  *
5130  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5131  * to get the corresponding css and return it.  If such css doesn't exist
5132  * or can't be pinned, an ERR_PTR value is returned.
5133  */
5134 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
5135                                                        struct cgroup_subsys *ss)
5136 {
5137         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
5138         struct cgroup_subsys_state *css = NULL;
5139         struct cgroup *cgrp;
5140
5141         /* is @dentry a cgroup dir? */
5142         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
5143             kernfs_type(kn) != KERNFS_DIR)
5144                 return ERR_PTR(-EBADF);
5145
5146         rcu_read_lock();
5147
5148         /*
5149          * This path doesn't originate from kernfs and @kn could already
5150          * have been or be removed at any point.  @kn->priv is RCU
5151          * protected for this access.  See cgroup_rmdir() for details.
5152          */
5153         cgrp = rcu_dereference(kn->priv);
5154         if (cgrp)
5155                 css = cgroup_css(cgrp, ss);
5156
5157         if (!css || !css_tryget_online(css))
5158                 css = ERR_PTR(-ENOENT);
5159
5160         rcu_read_unlock();
5161         return css;
5162 }
5163
5164 /**
5165  * css_from_id - lookup css by id
5166  * @id: the cgroup id
5167  * @ss: cgroup subsys to be looked into
5168  *
5169  * Returns the css if there's valid one with @id, otherwise returns NULL.
5170  * Should be called under rcu_read_lock().
5171  */
5172 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5173 {
5174         WARN_ON_ONCE(!rcu_read_lock_held());
5175         return idr_find(&ss->css_idr, id);
5176 }
5177
5178 #ifdef CONFIG_CGROUP_DEBUG
5179 static struct cgroup_subsys_state *
5180 debug_css_alloc(struct cgroup_subsys_state *parent_css)
5181 {
5182         struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5183
5184         if (!css)
5185                 return ERR_PTR(-ENOMEM);
5186
5187         return css;
5188 }
5189
/*
 * css_free callback of the debug subsystem: release the css that
 * debug_css_alloc() allocated.
 */
static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
5194
5195 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5196                                 struct cftype *cft)
5197 {
5198         return cgroup_task_count(css->cgroup);
5199 }
5200
5201 static u64 current_css_set_read(struct cgroup_subsys_state *css,
5202                                 struct cftype *cft)
5203 {
5204         return (u64)(unsigned long)current->cgroups;
5205 }
5206
5207 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5208                                          struct cftype *cft)
5209 {
5210         u64 count;
5211
5212         rcu_read_lock();
5213         count = atomic_read(&task_css_set(current)->refcount);
5214         rcu_read_unlock();
5215         return count;
5216 }
5217
5218 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5219 {
5220         struct cgrp_cset_link *link;
5221         struct css_set *cset;
5222         char *name_buf;
5223
5224         name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
5225         if (!name_buf)
5226                 return -ENOMEM;
5227
5228         down_read(&css_set_rwsem);
5229         rcu_read_lock();
5230         cset = rcu_dereference(current->cgroups);
5231         list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5232                 struct cgroup *c = link->cgrp;
5233
5234                 cgroup_name(c, name_buf, NAME_MAX + 1);
5235                 seq_printf(seq, "Root %d group %s\n",
5236                            c->root->hierarchy_id, name_buf);
5237         }
5238         rcu_read_unlock();
5239         up_read(&css_set_rwsem);
5240         kfree(name_buf);
5241         return 0;
5242 }
5243
5244 #define MAX_TASKS_SHOWN_PER_CSS 25
5245 static int cgroup_css_links_read(struct seq_file *seq, void *v)
5246 {
5247         struct cgroup_subsys_state *css = seq_css(seq);
5248         struct cgrp_cset_link *link;
5249
5250         down_read(&css_set_rwsem);
5251         list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5252                 struct css_set *cset = link->cset;
5253                 struct task_struct *task;
5254                 int count = 0;
5255
5256                 seq_printf(seq, "css_set %p\n", cset);
5257
5258                 list_for_each_entry(task, &cset->tasks, cg_list) {
5259                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5260                                 goto overflow;
5261                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
5262                 }
5263
5264                 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
5265                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5266                                 goto overflow;
5267                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
5268                 }
5269                 continue;
5270         overflow:
5271                 seq_puts(seq, "  ...\n");
5272         }
5273         up_read(&css_set_rwsem);
5274         return 0;
5275 }
5276
5277 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5278 {
5279         return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5280 }
5281
5282 static struct cftype debug_files[] =  {
5283         {
5284                 .name = "taskcount",
5285                 .read_u64 = debug_taskcount_read,
5286         },
5287
5288         {
5289                 .name = "current_css_set",
5290                 .read_u64 = current_css_set_read,
5291         },
5292
5293         {
5294                 .name = "current_css_set_refcount",
5295                 .read_u64 = current_css_set_refcount_read,
5296         },
5297
5298         {
5299                 .name = "current_css_set_cg_links",
5300                 .seq_show = current_css_set_cg_links_read,
5301         },
5302
5303         {
5304                 .name = "cgroup_css_links",
5305                 .seq_show = cgroup_css_links_read,
5306         },
5307
5308         {
5309                 .name = "releasable",
5310                 .read_u64 = releasable_read,
5311         },
5312
5313         { }     /* terminate */
5314 };
5315
5316 struct cgroup_subsys debug_cgrp_subsys = {
5317         .css_alloc = debug_css_alloc,
5318         .css_free = debug_css_free,
5319         .base_cftypes = debug_files,
5320 };
5321 #endif /* CONFIG_CGROUP_DEBUG */