kernel/cgroup.c

   1 /*
   2  *  Generic process-grouping system.
   3  *
   4  *  Based originally on the cpuset system, extracted by Paul Menage
   5  *  Copyright (C) 2006 Google, Inc
   6  *
   7  *  Notifications support
   8  *  Copyright (C) 2009 Nokia Corporation
   9  *  Author: Kirill A. Shutemov
  10  *
  11  *  Copyright notices from the original cpuset code:
  12  *  --------------------------------------------------
  13  *  Copyright (C) 2003 BULL SA.
  14  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  15  *
  16  *  Portions derived from Patrick Mochel's sysfs code.
  17  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  18  *
  19  *  2003-10-10 Written by Simon Derr.
  20  *  2003-10-22 Updates by Stephen Hemminger.
  21  *  2004 May-July Rework by Paul Jackson.
  22  *  ---------------------------------------------------
  23  *
  24  *  This file is subject to the terms and conditions of the GNU General Public
  25  *  License.  See the file COPYING in the main directory of the Linux
  26  *  distribution for more details.
  27  */
  28
  29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  30
  31 #include <linux/cgroup.h>
  32 #include <linux/cred.h>
  33 #include <linux/ctype.h>
  34 #include <linux/errno.h>
  35 #include <linux/init_task.h>
  36 #include <linux/kernel.h>
  37 #include <linux/list.h>
  38 #include <linux/mm.h>
  39 #include <linux/mutex.h>
  40 #include <linux/mount.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/proc_fs.h>
  43 #include <linux/rcupdate.h>
  44 #include <linux/sched.h>
  45 #include <linux/slab.h>
  46 #include <linux/spinlock.h>
  47 #include <linux/rwsem.h>
  48 #include <linux/string.h>
  49 #include <linux/sort.h>
  50 #include <linux/kmod.h>
  51 #include <linux/delayacct.h>
  52 #include <linux/cgroupstats.h>
  53 #include <linux/hashtable.h>
  54 #include <linux/pid_namespace.h>
  55 #include <linux/idr.h>
  56 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
  57 #include <linux/kthread.h>
  58 #include <linux/delay.h>
  59
  60 #include <linux/atomic.h>
  61
/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls.
 * Expiring in the middle is a performance problem, not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/*
 * Size bound derived from MAX_CGROUP_TYPE_NAMELEN and MAX_CFTYPE_NAME.
 * NOTE(review): the extra 2 presumably covers a separator character and
 * the terminating NUL - confirm against the code that formats the name.
 */
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)
  72
/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_rwsem protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.  Without CONFIG_PROVE_RCU
 * both are file-local (static).
 */
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
DECLARE_RWSEM(css_set_rwsem);
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_rwsem);
#else
static DEFINE_MUTEX(cgroup_mutex);
static DECLARE_RWSEM(css_set_rwsem);
#endif

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.  The cgroup_idr_*() wrappers in this file take it
 * with spin_lock_bh().
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);
 104
 105 #define cgroup_assert_mutex_or_rcu_locked()                             \
 106         rcu_lockdep_assert(rcu_read_lock_held() ||                      \
 107                            lockdep_is_held(&cgroup_mutex),              \
 108                            "cgroup_mutex or RCU read lock required");
 109
/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 *
 * Only the pointer is defined here; the workqueue itself is created
 * elsewhere.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 *
 * Only the pointer is defined here; the workqueue itself is created
 * elsewhere.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 123
/*
 * Generate an array of cgroup subsystem pointers.
 *
 * <linux/cgroup_subsys.h> is included with SUBSYS() defined so that each
 * SUBSYS(foo) entry becomes the designated initializer
 * "[foo_cgrp_id] = &foo_cgrp_subsys,".
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * Array of cgroup subsystem names.
 *
 * Same header, included again with SUBSYS() redefined to stringify its
 * argument, so the name array uses the same foo_cgrp_id indices as
 * cgroup_subsys[].
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
 137
/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
struct cgroup_root cgrp_dfl_root;

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_root_visible;

/* The list of hierarchy roots */

static LIST_HEAD(cgroup_roots);
/*
 * find_css_set() allocates this many cgrp_cset_links for each new css_set.
 * NOTE(review): presumably the number of roots on cgroup_roots - confirm
 * where it is updated.
 */
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to cgroups.  It
 * guarantees cgroups with bigger numbers are newer than those with smaller
 * numbers.  Also, as cgroups are always appended to the parent's
 * ->children list, it guarantees that sibling cgroups are always sorted in
 * the ascending serial number order on the list.  Protected by
 * cgroup_mutex.
 */
static u64 cgroup_serial_nr_next = 1;

/*
 * This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call.  This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
static int need_forkexit_callback __read_mostly;
 175
/* Tentative declaration; the table's contents are defined elsewhere. */
static struct cftype cgroup_base_files[];

/*
 * Forward declarations of file-local helpers so they can be referenced
 * before their definitions.
 */
static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
			     unsigned int ss_mask);
static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 188
 189 /* IDR wrappers which synchronize using cgroup_idr_lock */
 190 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 191                             gfp_t gfp_mask)
 192 {
 193         int ret;
 194
 195         idr_preload(gfp_mask);
 196         spin_lock_bh(&cgroup_idr_lock);
 197         ret = idr_alloc(idr, ptr, start, end, gfp_mask);
 198         spin_unlock_bh(&cgroup_idr_lock);
 199         idr_preload_end();
 200         return ret;
 201 }
 202
 203 static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
 204 {
 205         void *ret;
 206
 207         spin_lock_bh(&cgroup_idr_lock);
 208         ret = idr_replace(idr, ptr, id);
 209         spin_unlock_bh(&cgroup_idr_lock);
 210         return ret;
 211 }
 212
/*
 * Remove @id from @idr while holding cgroup_idr_lock (taken with BHs
 * disabled, like the other cgroup_idr_*() wrappers).
 */
static void cgroup_idr_remove(struct idr *idr, int id)
{
	spin_lock_bh(&cgroup_idr_lock);
	idr_remove(idr, id);
	spin_unlock_bh(&cgroup_idr_lock);
}
 219
 220 /**
 221  * cgroup_css - obtain a cgroup's css for the specified subsystem
 222  * @cgrp: the cgroup of interest
 223  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 224  *
 225  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 226  * function must be called either under cgroup_mutex or rcu_read_lock() and
 227  * the caller is responsible for pinning the returned css if it wants to
 228  * keep accessing it outside the said locks.  This function may return
 229  * %NULL if @cgrp doesn't have @subsys_id enabled.
 230  */
 231 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 232                                               struct cgroup_subsys *ss)
 233 {
 234         if (ss)
 235                 return rcu_dereference_check(cgrp->subsys[ss->id],
 236                                         lockdep_is_held(&cgroup_mutex));
 237         else
 238                 return &cgrp->self;
 239 }
 240
 241 /**
 242  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 243  * @cgrp: the cgroup of interest
 244  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 245  *
 246  * Similar to cgroup_css() but returns the effctive css, which is defined
 247  * as the matching css of the nearest ancestor including self which has @ss
 248  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 249  * function is guaranteed to return non-NULL css.
 250  */
 251 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 252                                                 struct cgroup_subsys *ss)
 253 {
 254         lockdep_assert_held(&cgroup_mutex);
 255
 256         if (!ss)
 257                 return &cgrp->self;
 258
 259         if (!(cgrp->root->subsys_mask & (1 << ss->id)))
 260                 return NULL;
 261
 262         while (cgrp->parent &&
 263                !(cgrp->parent->child_subsys_mask & (1 << ss->id)))
 264                 cgrp = cgrp->parent;
 265
 266         return cgroup_css(cgrp, ss);
 267 }
 268
/*
 * Convenience test for the CGRP_DEAD bit: true if it is set in
 * @cgrp->flags.
 */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
	return test_bit(CGRP_DEAD, &cgrp->flags);
}
 274
 275 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 276 {
 277         struct cgroup *cgrp = of->kn->parent->priv;
 278         struct cftype *cft = of_cft(of);
 279
 280         /*
 281          * This is open and unprotected implementation of cgroup_css().
 282          * seq_css() is only called from a kernfs file operation which has
 283          * an active reference on the file.  Because all the subsystem
 284          * files are drained before a css is disassociated with a cgroup,
 285          * the matching css from the cgroup's subsys table is guaranteed to
 286          * be and stay valid until the enclosing operation is complete.
 287          */
 288         if (cft->ss)
 289                 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 290         else
 291                 return &cgrp->self;
 292 }
 293 EXPORT_SYMBOL_GPL(of_css);
 294
 295 /**
 296  * cgroup_is_descendant - test ancestry
 297  * @cgrp: the cgroup to be tested
 298  * @ancestor: possible ancestor of @cgrp
 299  *
 300  * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
 301  * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
 302  * and @ancestor are accessible.
 303  */
 304 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 305 {
 306         while (cgrp) {
 307                 if (cgrp == ancestor)
 308                         return true;
 309                 cgrp = cgrp->parent;
 310         }
 311         return false;
 312 }
 313
 314 static int cgroup_is_releasable(const struct cgroup *cgrp)
 315 {
 316         const int bits =
 317                 (1 << CGRP_RELEASABLE) |
 318                 (1 << CGRP_NOTIFY_ON_RELEASE);
 319         return (cgrp->flags & bits) == bits;
 320 }
 321
/* Returns non-zero if CGRP_NOTIFY_ON_RELEASE is set in @cgrp->flags. */
static int notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
 326
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex, which is the lockdep condition
 * handed to rcu_dereference_check().
 *
 * The empty-if/else construct skips indices whose css pointer is NULL and
 * makes the statement following the macro the body of the else branch.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
 341
/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex; cgroup_e_css(), which performs each
 * lookup, asserts that it is held.
 *
 * Indices for which cgroup_e_css() returns NULL are skipped; the statement
 * following the macro becomes the body of the else branch.
 */
#define for_each_e_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
			;						\
		else
 355
/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * Visits every index in [0, CGROUP_SUBSYS_COUNT) and loads
 * cgroup_subsys[@ssid] into @ss.  The "|| true" keeps the loop condition
 * true regardless of the loaded pointer, so the assignment never ends the
 * iteration early.
 */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 364
/*
 * Iterate across the hierarchies: walks the cgroup_roots list, with @root
 * as the cursor linked through ->root_list.
 */
#define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)
 368
/*
 * Iterate over child cgrps, lock should be held throughout iteration.
 *
 * Walks @cgrp->children through ->sibling.  Each step asserts via lockdep
 * that cgroup_mutex is held and skips children for which cgroup_is_dead()
 * is true; the statement following the macro becomes the body of the else
 * branch.
 */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->children, sibling)	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else
 376
/*
 * The list of cgroups eligible for automatic release.  Protected by
 * release_list_lock.
 *
 * release_agent_work is a statically declared work item whose handler is
 * cgroup_release_agent().  check_for_release() is forward-declared here
 * because put_css_set_locked() calls it before its definition.
 */
static LIST_HEAD(release_list);
static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
 384
/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 *
 * Links are preallocated by allocate_cgrp_cset_links(), filled in and
 * attached by link_css_set(), and freed in put_css_set_locked().
 */
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/* list of cgrp_cset_links anchored at cgrp->cset_links */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
};
 404
/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 *
 * The refcount starts at 1 and every embedded list head is statically
 * initialized to point at itself (i.e. empty).
 */
struct css_set init_css_set = {
	.refcount		= ATOMIC_INIT(1),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
};

/*
 * Number of css_sets; bumped in find_css_set() and dropped in
 * put_css_set_locked(), both under css_set_rwsem held for writing.
 */
static int css_set_count	= 1;	/* 1 for init_css_set */
 422
 423 /**
 424  * cgroup_update_populated - updated populated count of a cgroup
 425  * @cgrp: the target cgroup
 426  * @populated: inc or dec populated count
 427  *
 428  * @cgrp is either getting the first task (css_set) or losing the last.
 429  * Update @cgrp->populated_cnt accordingly.  The count is propagated
 430  * towards root so that a given cgroup's populated_cnt is zero iff the
 431  * cgroup and all its descendants are empty.
 432  *
 433  * @cgrp's interface file "cgroup.populated" is zero if
 434  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
 435  * changes from or to zero, userland is notified that the content of the
 436  * interface file has changed.  This can be used to detect when @cgrp and
 437  * its descendants become populated or empty.
 438  */
 439 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 440 {
 441         lockdep_assert_held(&css_set_rwsem);
 442
 443         do {
 444                 bool trigger;
 445
 446                 if (populated)
 447                         trigger = !cgrp->populated_cnt++;
 448                 else
 449                         trigger = !--cgrp->populated_cnt;
 450
 451                 if (!trigger)
 452                         break;
 453
 454                 if (cgrp->populated_kn)
 455                         kernfs_notify(cgrp->populated_kn);
 456                 cgrp = cgrp->parent;
 457         } while (cgrp);
 458 }
 459
/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 *
 * Keys are produced by css_set_hash(); the table is declared with
 * CSS_SET_HASH_BITS bits.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 467
 468 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 469 {
 470         unsigned long key = 0UL;
 471         struct cgroup_subsys *ss;
 472         int i;
 473
 474         for_each_subsys(ss, i)
 475                 key += (unsigned long)css[i];
 476         key = (key >> 16) ^ key;
 477
 478         return key;
 479 }
 480
 481 static void put_css_set_locked(struct css_set *cset, bool taskexit)
 482 {
 483         struct cgrp_cset_link *link, *tmp_link;
 484         struct cgroup_subsys *ss;
 485         int ssid;
 486
 487         lockdep_assert_held(&css_set_rwsem);
 488
 489         if (!atomic_dec_and_test(&cset->refcount))
 490                 return;
 491
 492         /* This css_set is dead. unlink it and release cgroup refcounts */
 493         for_each_subsys(ss, ssid)
 494                 list_del(&cset->e_cset_node[ssid]);
 495         hash_del(&cset->hlist);
 496         css_set_count--;
 497
 498         list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 499                 struct cgroup *cgrp = link->cgrp;
 500
 501                 list_del(&link->cset_link);
 502                 list_del(&link->cgrp_link);
 503
 504                 /* @cgrp can't go away while we're holding css_set_rwsem */
 505                 if (list_empty(&cgrp->cset_links)) {
 506                         cgroup_update_populated(cgrp, false);
 507                         if (notify_on_release(cgrp)) {
 508                                 if (taskexit)
 509                                         set_bit(CGRP_RELEASABLE, &cgrp->flags);
 510                                 check_for_release(cgrp);
 511                         }
 512                 }
 513
 514                 kfree(link);
 515         }
 516
 517         kfree_rcu(cset, rcu_head);
 518 }
 519
 520 static void put_css_set(struct css_set *cset, bool taskexit)
 521 {
 522         /*
 523          * Ensure that the refcount doesn't hit zero while any readers
 524          * can see it. Similar to atomic_dec_and_lock(), but for an
 525          * rwlock
 526          */
 527         if (atomic_add_unless(&cset->refcount, -1, 1))
 528                 return;
 529
 530         down_write(&css_set_rwsem);
 531         put_css_set_locked(cset, taskexit);
 532         up_write(&css_set_rwsem);
 533 }
 534
/*
 * refcounted get/put for css_set objects
 *
 * get_css_set() takes a reference by atomically incrementing
 * @cset->refcount; put_css_set() / put_css_set_locked() drop it.
 */
static inline void get_css_set(struct css_set *cset)
{
	atomic_inc(&cset->refcount);
}
 542
 543 /**
 544  * compare_css_sets - helper function for find_existing_css_set().
 545  * @cset: candidate css_set being tested
 546  * @old_cset: existing css_set for a task
 547  * @new_cgrp: cgroup that's being entered by the task
 548  * @template: desired set of css pointers in css_set (pre-calculated)
 549  *
 550  * Returns true if "cset" matches "old_cset" except for the hierarchy
 551  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 552  */
 553 static bool compare_css_sets(struct css_set *cset,
 554                              struct css_set *old_cset,
 555                              struct cgroup *new_cgrp,
 556                              struct cgroup_subsys_state *template[])
 557 {
 558         struct list_head *l1, *l2;
 559
 560         /*
 561          * On the default hierarchy, there can be csets which are
 562          * associated with the same set of cgroups but different csses.
 563          * Let's first ensure that csses match.
 564          */
 565         if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
 566                 return false;
 567
 568         /*
 569          * Compare cgroup pointers in order to distinguish between
 570          * different cgroups in hierarchies.  As different cgroups may
 571          * share the same effective css, this comparison is always
 572          * necessary.
 573          */
 574         l1 = &cset->cgrp_links;
 575         l2 = &old_cset->cgrp_links;
 576         while (1) {
 577                 struct cgrp_cset_link *link1, *link2;
 578                 struct cgroup *cgrp1, *cgrp2;
 579
 580                 l1 = l1->next;
 581                 l2 = l2->next;
 582                 /* See if we reached the end - both lists are equal length. */
 583                 if (l1 == &cset->cgrp_links) {
 584                         BUG_ON(l2 != &old_cset->cgrp_links);
 585                         break;
 586                 } else {
 587                         BUG_ON(l2 == &old_cset->cgrp_links);
 588                 }
 589                 /* Locate the cgroups associated with these links. */
 590                 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
 591                 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
 592                 cgrp1 = link1->cgrp;
 593                 cgrp2 = link2->cgrp;
 594                 /* Hierarchies should be linked in the same order. */
 595                 BUG_ON(cgrp1->root != cgrp2->root);
 596
 597                 /*
 598                  * If this hierarchy is the hierarchy of the cgroup
 599                  * that's changing, then we need to check that this
 600                  * css_set points to the new cgroup; if it's any other
 601                  * hierarchy, then this css_set should point to the
 602                  * same cgroup as the old css_set.
 603                  */
 604                 if (cgrp1->root == new_cgrp->root) {
 605                         if (cgrp1 != new_cgrp)
 606                                 return false;
 607                 } else {
 608                         if (cgrp1 != cgrp2)
 609                                 return false;
 610                 }
 611         }
 612         return true;
 613 }
 614
 615 /**
 616  * find_existing_css_set - init css array and find the matching css_set
 617  * @old_cset: the css_set that we're using before the cgroup transition
 618  * @cgrp: the cgroup that we're moving into
 619  * @template: out param for the new set of csses, should be clear on entry
 620  */
 621 static struct css_set *find_existing_css_set(struct css_set *old_cset,
 622                                         struct cgroup *cgrp,
 623                                         struct cgroup_subsys_state *template[])
 624 {
 625         struct cgroup_root *root = cgrp->root;
 626         struct cgroup_subsys *ss;
 627         struct css_set *cset;
 628         unsigned long key;
 629         int i;
 630
 631         /*
 632          * Build the set of subsystem state objects that we want to see in the
 633          * new css_set. while subsystems can change globally, the entries here
 634          * won't change, so no need for locking.
 635          */
 636         for_each_subsys(ss, i) {
 637                 if (root->subsys_mask & (1UL << i)) {
 638                         /*
 639                          * @ss is in this hierarchy, so we want the
 640                          * effective css from @cgrp.
 641                          */
 642                         template[i] = cgroup_e_css(cgrp, ss);
 643                 } else {
 644                         /*
 645                          * @ss is not in this hierarchy, so we don't want
 646                          * to change the css.
 647                          */
 648                         template[i] = old_cset->subsys[i];
 649                 }
 650         }
 651
 652         key = css_set_hash(template);
 653         hash_for_each_possible(css_set_table, cset, hlist, key) {
 654                 if (!compare_css_sets(cset, old_cset, cgrp, template))
 655                         continue;
 656
 657                 /* This css_set matches what we need */
 658                 return cset;
 659         }
 660
 661         /* No existing cgroup group matched */
 662         return NULL;
 663 }
 664
 665 static void free_cgrp_cset_links(struct list_head *links_to_free)
 666 {
 667         struct cgrp_cset_link *link, *tmp_link;
 668
 669         list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
 670                 list_del(&link->cset_link);
 671                 kfree(link);
 672         }
 673 }
 674
 675 /**
 676  * allocate_cgrp_cset_links - allocate cgrp_cset_links
 677  * @count: the number of links to allocate
 678  * @tmp_links: list_head the allocated links are put on
 679  *
 680  * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 681  * through ->cset_link.  Returns 0 on success or -errno.
 682  */
 683 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 684 {
 685         struct cgrp_cset_link *link;
 686         int i;
 687
 688         INIT_LIST_HEAD(tmp_links);
 689
 690         for (i = 0; i < count; i++) {
 691                 link = kzalloc(sizeof(*link), GFP_KERNEL);
 692                 if (!link) {
 693                         free_cgrp_cset_links(tmp_links);
 694                         return -ENOMEM;
 695                 }
 696                 list_add(&link->cset_link, tmp_links);
 697         }
 698         return 0;
 699 }
 700
 701 /**
 702  * link_css_set - a helper function to link a css_set to a cgroup
 703  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 704  * @cset: the css_set to be linked
 705  * @cgrp: the destination cgroup
 706  */
 707 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 708                          struct cgroup *cgrp)
 709 {
 710         struct cgrp_cset_link *link;
 711
 712         BUG_ON(list_empty(tmp_links));
 713
 714         if (cgroup_on_dfl(cgrp))
 715                 cset->dfl_cgrp = cgrp;
 716
 717         link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
 718         link->cset = cset;
 719         link->cgrp = cgrp;
 720
 721         if (list_empty(&cgrp->cset_links))
 722                 cgroup_update_populated(cgrp, true);
 723         list_move(&link->cset_link, &cgrp->cset_links);
 724
 725         /*
 726          * Always add links to the tail of the list so that the list
 727          * is sorted by order of hierarchy creation
 728          */
 729         list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 730 }
 731
 732 /**
 733  * find_css_set - return a new css_set with one cgroup updated
 734  * @old_cset: the baseline css_set
 735  * @cgrp: the cgroup to be updated
 736  *
 737  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 738  * substituted into the appropriate hierarchy.
 739  */
 740 static struct css_set *find_css_set(struct css_set *old_cset,
 741                                     struct cgroup *cgrp)
 742 {
 743         struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
 744         struct css_set *cset;
 745         struct list_head tmp_links;
 746         struct cgrp_cset_link *link;
 747         struct cgroup_subsys *ss;
 748         unsigned long key;
 749         int ssid;
 750
 751         lockdep_assert_held(&cgroup_mutex);
 752
 753         /* First see if we already have a cgroup group that matches
 754          * the desired set */
 755         down_read(&css_set_rwsem);
 756         cset = find_existing_css_set(old_cset, cgrp, template);
 757         if (cset)
 758                 get_css_set(cset);
 759         up_read(&css_set_rwsem);
 760
 761         if (cset)
 762                 return cset;
 763
 764         cset = kzalloc(sizeof(*cset), GFP_KERNEL);
 765         if (!cset)
 766                 return NULL;
 767
 768         /* Allocate all the cgrp_cset_link objects that we'll need */
 769         if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
 770                 kfree(cset);
 771                 return NULL;
 772         }
 773
 774         atomic_set(&cset->refcount, 1);
 775         INIT_LIST_HEAD(&cset->cgrp_links);
 776         INIT_LIST_HEAD(&cset->tasks);
 777         INIT_LIST_HEAD(&cset->mg_tasks);
 778         INIT_LIST_HEAD(&cset->mg_preload_node);
 779         INIT_LIST_HEAD(&cset->mg_node);
 780         INIT_HLIST_NODE(&cset->hlist);
 781
 782         /* Copy the set of subsystem state objects generated in
 783          * find_existing_css_set() */
 784         memcpy(cset->subsys, template, sizeof(cset->subsys));
 785
 786         down_write(&css_set_rwsem);
 787         /* Add reference counts and links from the new css_set. */
 788         list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 789                 struct cgroup *c = link->cgrp;
 790
 791                 if (c->root == cgrp->root)
 792                         c = cgrp;
 793                 link_css_set(&tmp_links, cset, c);
 794         }
 795
 796         BUG_ON(!list_empty(&tmp_links));
 797
 798         css_set_count++;
 799
 800         /* Add @cset to the hash table */
 801         key = css_set_hash(cset->subsys);
 802         hash_add(css_set_table, &cset->hlist, key);
 803
 804         for_each_subsys(ss, ssid)
 805                 list_add_tail(&cset->e_cset_node[ssid],
 806                               &cset->subsys[ssid]->cgroup->e_csets[ssid]);
 807
 808         up_write(&css_set_rwsem);
 809
 810         return cset;
 811 }
 812
 813 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 814 {
 815         struct cgroup *root_cgrp = kf_root->kn->priv;
 816
 817         return root_cgrp->root;
 818 }
 819
 820 static int cgroup_init_root_id(struct cgroup_root *root)
 821 {
 822         int id;
 823
 824         lockdep_assert_held(&cgroup_mutex);
 825
 826         id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
 827         if (id < 0)
 828                 return id;
 829
 830         root->hierarchy_id = id;
 831         return 0;
 832 }
 833
 834 static void cgroup_exit_root_id(struct cgroup_root *root)
 835 {
 836         lockdep_assert_held(&cgroup_mutex);
 837
 838         if (root->hierarchy_id) {
 839                 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 840                 root->hierarchy_id = 0;
 841         }
 842 }
 843
 844 static void cgroup_free_root(struct cgroup_root *root)
 845 {
 846         if (root) {
 847                 /* hierarhcy ID shoulid already have been released */
 848                 WARN_ON_ONCE(root->hierarchy_id);
 849
 850                 idr_destroy(&root->cgroup_idr);
 851                 kfree(root);
 852         }
 853 }
 854
 855 static void cgroup_destroy_root(struct cgroup_root *root)
 856 {
 857         struct cgroup *cgrp = &root->cgrp;
 858         struct cgrp_cset_link *link, *tmp_link;
 859
 860         mutex_lock(&cgroup_mutex);
 861
 862         BUG_ON(atomic_read(&root->nr_cgrps));
 863         BUG_ON(!list_empty(&cgrp->children));
 864
 865         /* Rebind all subsystems back to the default hierarchy */
 866         rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
 867
 868         /*
 869          * Release all the links from cset_links to this hierarchy's
 870          * root cgroup
 871          */
 872         down_write(&css_set_rwsem);
 873
 874         list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 875                 list_del(&link->cset_link);
 876                 list_del(&link->cgrp_link);
 877                 kfree(link);
 878         }
 879         up_write(&css_set_rwsem);
 880
 881         if (!list_empty(&root->root_list)) {
 882                 list_del(&root->root_list);
 883                 cgroup_root_count--;
 884         }
 885
 886         cgroup_exit_root_id(root);
 887
 888         mutex_unlock(&cgroup_mutex);
 889
 890         kernfs_destroy_root(root->kf_root);
 891         cgroup_free_root(root);
 892 }
 893
 894 /* look up cgroup associated with given css_set on the specified hierarchy */
 895 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 896                                             struct cgroup_root *root)
 897 {
 898         struct cgroup *res = NULL;
 899
 900         lockdep_assert_held(&cgroup_mutex);
 901         lockdep_assert_held(&css_set_rwsem);
 902
 903         if (cset == &init_css_set) {
 904                 res = &root->cgrp;
 905         } else {
 906                 struct cgrp_cset_link *link;
 907
 908                 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 909                         struct cgroup *c = link->cgrp;
 910
 911                         if (c->root == root) {
 912                                 res = c;
 913                                 break;
 914                         }
 915                 }
 916         }
 917
 918         BUG_ON(!res);
 919         return res;
 920 }
 921
 922 /*
 923  * Return the cgroup for "task" from the given hierarchy. Must be
 924  * called with cgroup_mutex and css_set_rwsem held.
 925  */
 926 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 927                                             struct cgroup_root *root)
 928 {
 929         /*
 930          * No need to lock the task - since we hold cgroup_mutex the
 931          * task can't change groups, so the only thing that can happen
 932          * is that it exits and its css is set back to init_css_set.
 933          */
 934         return cset_cgroup_from_root(task_css_set(task), root);
 935 }
 936
 937 /*
 938  * A task must hold cgroup_mutex to modify cgroups.
 939  *
 940  * Any task can increment and decrement the count field without lock.
 941  * So in general, code holding cgroup_mutex can't rely on the count
 942  * field not changing.  However, if the count goes to zero, then only
 943  * cgroup_attach_task() can increment it again.  Because a count of zero
 944  * means that no tasks are currently attached, therefore there is no
 945  * way a task attached to that cgroup can fork (the other way to
 946  * increment the count).  So code holding cgroup_mutex can safely
 947  * assume that if the count is zero, it will stay zero. Similarly, if
 948  * a task holds cgroup_mutex on a cgroup with zero count, it
 949  * knows that the cgroup won't be removed, as cgroup_rmdir()
 950  * needs that mutex.
 951  *
 952  * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 953  * (usually) take cgroup_mutex.  These are the two most performance
 954  * critical pieces of code here.  The exception occurs on cgroup_exit(),
 955  * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
 956  * is taken, and if the cgroup count is zero, a usermode call made
 957  * to the release agent with the name of the cgroup (path relative to
 958  * the root of cgroup file system) as the argument.
 959  *
 960  * A cgroup can only be deleted if both its 'count' of using tasks
 961  * is zero, and its list of 'children' cgroups is empty.  Since all
 962  * tasks in the system use _some_ cgroup, and since there is always at
 963  * least one task in the system (init, pid == 1), therefore, root cgroup
 964  * always has either children cgroups and/or using tasks.  So we don't
 965  * need a special hack to ensure that root cgroup cannot be deleted.
 966  *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task().
 969  */
 970
 971 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
 972 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 973 static const struct file_operations proc_cgroupstats_operations;
 974
 975 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 976                               char *buf)
 977 {
 978         if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
 979             !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
 980                 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
 981                          cft->ss->name, cft->name);
 982         else
 983                 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 984         return buf;
 985 }
 986
 987 /**
 988  * cgroup_file_mode - deduce file mode of a control file
 989  * @cft: the control file in question
 990  *
 991  * returns cft->mode if ->mode is not 0
 992  * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
 993  * returns S_IRUGO if it has only a read handler
 994  * returns S_IWUSR if it has only a write hander
 995  */
 996 static umode_t cgroup_file_mode(const struct cftype *cft)
 997 {
 998         umode_t mode = 0;
 999
1000         if (cft->mode)
1001                 return cft->mode;
1002
1003         if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1004                 mode |= S_IRUGO;
1005
1006         if (cft->write_u64 || cft->write_s64 || cft->write)
1007                 mode |= S_IWUSR;
1008
1009         return mode;
1010 }
1011
1012 static void cgroup_free_fn(struct work_struct *work)
1013 {
1014         struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
1015
1016         atomic_dec(&cgrp->root->nr_cgrps);
1017         cgroup_pidlist_destroy_all(cgrp);
1018
1019         if (cgrp->parent) {
1020                 /*
1021                  * We get a ref to the parent, and put the ref when this
1022                  * cgroup is being freed, so it's guaranteed that the
1023                  * parent won't be destroyed before its children.
1024                  */
1025                 cgroup_put(cgrp->parent);
1026                 kernfs_put(cgrp->kn);
1027                 kfree(cgrp);
1028         } else {
1029                 /*
1030                  * This is root cgroup's refcnt reaching zero, which
1031                  * indicates that the root should be released.
1032                  */
1033                 cgroup_destroy_root(cgrp->root);
1034         }
1035 }
1036
/*
 * RCU callback for the cgroup that embeds @head as its rcu_head.  It does
 * no teardown itself: it initialises the cgroup's destroy_work to run
 * cgroup_free_fn() and queues that work on cgroup_destroy_wq.
 */
static void cgroup_free_rcu(struct rcu_head *head)
{
        struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);

        INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
        queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
}
1044
1045 static void cgroup_get(struct cgroup *cgrp)
1046 {
1047         WARN_ON_ONCE(cgroup_is_dead(cgrp));
1048         WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
1049         atomic_inc(&cgrp->refcnt);
1050 }
1051
1052 static void cgroup_put(struct cgroup *cgrp)
1053 {
1054         if (!atomic_dec_and_test(&cgrp->refcnt))
1055                 return;
1056         if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
1057                 return;
1058
1059         /* delete this cgroup from parent->children */
1060         mutex_lock(&cgroup_mutex);
1061         list_del_rcu(&cgrp->sibling);
1062         mutex_unlock(&cgroup_mutex);
1063
1064         cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
1065         cgrp->id = -1;
1066
1067         call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
1068 }
1069
1070 /**
1071  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1072  * @kn: the kernfs_node being serviced
1073  *
1074  * This helper undoes cgroup_kn_lock_live() and should be invoked before
1075  * the method finishes if locking succeeded.  Note that once this function
1076  * returns the cgroup returned by cgroup_kn_lock_live() may become
1077  * inaccessible any time.  If the caller intends to continue to access the
1078  * cgroup, it should pin it before invoking this function.
1079  */
1080 static void cgroup_kn_unlock(struct kernfs_node *kn)
1081 {
1082         struct cgroup *cgrp;
1083
1084         if (kernfs_type(kn) == KERNFS_DIR)
1085                 cgrp = kn->priv;
1086         else
1087                 cgrp = kn->parent->priv;
1088
1089         mutex_unlock(&cgroup_mutex);
1090
1091         kernfs_unbreak_active_protection(kn);
1092         cgroup_put(cgrp);
1093 }
1094
1095 /**
1096  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1097  * @kn: the kernfs_node being serviced
1098  *
1099  * This helper is to be used by a cgroup kernfs method currently servicing
1100  * @kn.  It breaks the active protection, performs cgroup locking and
1101  * verifies that the associated cgroup is alive.  Returns the cgroup if
1102  * alive; otherwise, %NULL.  A successful return should be undone by a
1103  * matching cgroup_kn_unlock() invocation.
1104  *
1105  * Any cgroup kernfs method implementation which requires locking the
1106  * associated cgroup should use this helper.  It avoids nesting cgroup
1107  * locking under kernfs active protection and allows all kernfs operations
1108  * including self-removal.
1109  */
1110 static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
1111 {
1112         struct cgroup *cgrp;
1113
1114         if (kernfs_type(kn) == KERNFS_DIR)
1115                 cgrp = kn->priv;
1116         else
1117                 cgrp = kn->parent->priv;
1118
1119         /*
1120          * We're gonna grab cgroup_mutex which nests outside kernfs
1121          * active_ref.  cgroup liveliness check alone provides enough
1122          * protection against removal.  Ensure @cgrp stays accessible and
1123          * break the active_ref protection.
1124          */
1125         cgroup_get(cgrp);
1126         kernfs_break_active_protection(kn);
1127
1128         mutex_lock(&cgroup_mutex);
1129
1130         if (!cgroup_is_dead(cgrp))
1131                 return cgrp;
1132
1133         cgroup_kn_unlock(kn);
1134         return NULL;
1135 }
1136
1137 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1138 {
1139         char name[CGROUP_FILE_NAME_MAX];
1140
1141         lockdep_assert_held(&cgroup_mutex);
1142         kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1143 }
1144
1145 /**
1146  * cgroup_clear_dir - remove subsys files in a cgroup directory
1147  * @cgrp: target cgroup
1148  * @subsys_mask: mask of the subsystem ids whose files should be removed
1149  */
1150 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
1151 {
1152         struct cgroup_subsys *ss;
1153         int i;
1154
1155         for_each_subsys(ss, i) {
1156                 struct cftype *cfts;
1157
1158                 if (!(subsys_mask & (1 << i)))
1159                         continue;
1160                 list_for_each_entry(cfts, &ss->cfts, node)
1161                         cgroup_addrm_files(cgrp, cfts, false);
1162         }
1163 }
1164
1165 static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1166 {
1167         struct cgroup_subsys *ss;
1168         int ssid, i, ret;
1169
1170         lockdep_assert_held(&cgroup_mutex);
1171
1172         for_each_subsys(ss, ssid) {
1173                 if (!(ss_mask & (1 << ssid)))
1174                         continue;
1175
1176                 /* if @ss has non-root csses attached to it, can't move */
1177                 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
1178                         return -EBUSY;
1179
1180                 /* can't move between two non-dummy roots either */
1181                 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1182                         return -EBUSY;
1183         }
1184
1185         ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1186         if (ret) {
1187                 if (dst_root != &cgrp_dfl_root)
1188                         return ret;
1189
1190                 /*
1191                  * Rebinding back to the default root is not allowed to
1192                  * fail.  Using both default and non-default roots should
1193                  * be rare.  Moving subsystems back and forth even more so.
1194                  * Just warn about it and continue.
1195                  */
1196                 if (cgrp_dfl_root_visible) {
1197                         pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1198                                 ret, ss_mask);
1199                         pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1200                 }
1201         }
1202
1203         /*
1204          * Nothing can fail from this point on.  Remove files for the
1205          * removed subsystems and rebind each subsystem.
1206          */
1207         for_each_subsys(ss, ssid)
1208                 if (ss_mask & (1 << ssid))
1209                         cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1210
1211         for_each_subsys(ss, ssid) {
1212                 struct cgroup_root *src_root;
1213                 struct cgroup_subsys_state *css;
1214                 struct css_set *cset;
1215
1216                 if (!(ss_mask & (1 << ssid)))
1217                         continue;
1218
1219                 src_root = ss->root;
1220                 css = cgroup_css(&src_root->cgrp, ss);
1221
1222                 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1223
1224                 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1225                 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1226                 ss->root = dst_root;
1227                 css->cgroup = &dst_root->cgrp;
1228
1229                 down_write(&css_set_rwsem);
1230                 hash_for_each(css_set_table, i, cset, hlist)
1231                         list_move_tail(&cset->e_cset_node[ss->id],
1232                                        &dst_root->cgrp.e_csets[ss->id]);
1233                 up_write(&css_set_rwsem);
1234
1235                 src_root->subsys_mask &= ~(1 << ssid);
1236                 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1237
1238                 /* default hierarchy doesn't enable controllers by default */
1239                 dst_root->subsys_mask |= 1 << ssid;
1240                 if (dst_root != &cgrp_dfl_root)
1241                         dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1242
1243                 if (ss->bind)
1244                         ss->bind(css);
1245         }
1246
1247         kernfs_activate(dst_root->cgrp.kn);
1248         return 0;
1249 }
1250
1251 static int cgroup_show_options(struct seq_file *seq,
1252                                struct kernfs_root *kf_root)
1253 {
1254         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1255         struct cgroup_subsys *ss;
1256         int ssid;
1257
1258         for_each_subsys(ss, ssid)
1259                 if (root->subsys_mask & (1 << ssid))
1260                         seq_printf(seq, ",%s", ss->name);
1261         if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1262                 seq_puts(seq, ",sane_behavior");
1263         if (root->flags & CGRP_ROOT_NOPREFIX)
1264                 seq_puts(seq, ",noprefix");
1265         if (root->flags & CGRP_ROOT_XATTR)
1266                 seq_puts(seq, ",xattr");
1267
1268         spin_lock(&release_agent_path_lock);
1269         if (strlen(root->release_agent_path))
1270                 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1271         spin_unlock(&release_agent_path_lock);
1272
1273         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1274                 seq_puts(seq, ",clone_children");
1275         if (strlen(root->name))
1276                 seq_printf(seq, ",name=%s", root->name);
1277         return 0;
1278 }
1279
/* Result of parsing a cgroup mount option string. */
struct cgroup_sb_opts {
        /* one bit per requested subsystem, indexed by subsystem id */
        unsigned int subsys_mask;
        /* CGRP_ROOT_* flags requested via mount options */
        unsigned int flags;
        /* allocated copy of the "release_agent=" value, or NULL */
        char *release_agent;
        /* "clone_children" was given */
        bool cpuset_clone_children;
        /* allocated copy of the "name=" value, or NULL */
        char *name;
        /* User explicitly requested empty subsystem */
        bool none;
};
1289
1290 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1291 {
1292         char *token, *o = data;
1293         bool all_ss = false, one_ss = false;
1294         unsigned int mask = -1U;
1295         struct cgroup_subsys *ss;
1296         int i;
1297
1298 #ifdef CONFIG_CPUSETS
1299         mask = ~(1U << cpuset_cgrp_id);
1300 #endif
1301
1302         memset(opts, 0, sizeof(*opts));
1303
1304         while ((token = strsep(&o, ",")) != NULL) {
1305                 if (!*token)
1306                         return -EINVAL;
1307                 if (!strcmp(token, "none")) {
1308                         /* Explicitly have no subsystems */
1309                         opts->none = true;
1310                         continue;
1311                 }
1312                 if (!strcmp(token, "all")) {
1313                         /* Mutually exclusive option 'all' + subsystem name */
1314                         if (one_ss)
1315                                 return -EINVAL;
1316                         all_ss = true;
1317                         continue;
1318                 }
1319                 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1320                         opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1321                         continue;
1322                 }
1323                 if (!strcmp(token, "noprefix")) {
1324                         opts->flags |= CGRP_ROOT_NOPREFIX;
1325                         continue;
1326                 }
1327                 if (!strcmp(token, "clone_children")) {
1328                         opts->cpuset_clone_children = true;
1329                         continue;
1330                 }
1331                 if (!strcmp(token, "xattr")) {
1332                         opts->flags |= CGRP_ROOT_XATTR;
1333                         continue;
1334                 }
1335                 if (!strncmp(token, "release_agent=", 14)) {
1336                         /* Specifying two release agents is forbidden */
1337                         if (opts->release_agent)
1338                                 return -EINVAL;
1339                         opts->release_agent =
1340                                 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1341                         if (!opts->release_agent)
1342                                 return -ENOMEM;
1343                         continue;
1344                 }
1345                 if (!strncmp(token, "name=", 5)) {
1346                         const char *name = token + 5;
1347                         /* Can't specify an empty name */
1348                         if (!strlen(name))
1349                                 return -EINVAL;
1350                         /* Must match [\w.-]+ */
1351                         for (i = 0; i < strlen(name); i++) {
1352                                 char c = name[i];
1353                                 if (isalnum(c))
1354                                         continue;
1355                                 if ((c == '.') || (c == '-') || (c == '_'))
1356                                         continue;
1357                                 return -EINVAL;
1358                         }
1359                         /* Specifying two names is forbidden */
1360                         if (opts->name)
1361                                 return -EINVAL;
1362                         opts->name = kstrndup(name,
1363                                               MAX_CGROUP_ROOT_NAMELEN - 1,
1364                                               GFP_KERNEL);
1365                         if (!opts->name)
1366                                 return -ENOMEM;
1367
1368                         continue;
1369                 }
1370
1371                 for_each_subsys(ss, i) {
1372                         if (strcmp(token, ss->name))
1373                                 continue;
1374                         if (ss->disabled)
1375                                 continue;
1376
1377                         /* Mutually exclusive option 'all' + subsystem name */
1378                         if (all_ss)
1379                                 return -EINVAL;
1380                         opts->subsys_mask |= (1 << i);
1381                         one_ss = true;
1382
1383                         break;
1384                 }
1385                 if (i == CGROUP_SUBSYS_COUNT)
1386                         return -ENOENT;
1387         }
1388
1389         /* Consistency checks */
1390
1391         if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1392                 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1393
1394                 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1395                     opts->cpuset_clone_children || opts->release_agent ||
1396                     opts->name) {
1397                         pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1398                         return -EINVAL;
1399                 }
1400         } else {
1401                 /*
1402                  * If the 'all' option was specified select all the
1403                  * subsystems, otherwise if 'none', 'name=' and a subsystem
1404                  * name options were not specified, let's default to 'all'
1405                  */
1406                 if (all_ss || (!one_ss && !opts->none && !opts->name))
1407                         for_each_subsys(ss, i)
1408                                 if (!ss->disabled)
1409                                         opts->subsys_mask |= (1 << i);
1410
1411                 /*
1412                  * We either have to specify by name or by subsystems. (So
1413                  * all empty hierarchies must have a name).
1414                  */
1415                 if (!opts->subsys_mask && !opts->name)
1416                         return -EINVAL;
1417         }
1418
1419         /*
1420          * Option noprefix was introduced just for backward compatibility
1421          * with the old cpuset, so we allow noprefix only if mounting just
1422          * the cpuset subsystem.
1423          */
1424         if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1425                 return -EINVAL;
1426
1427
1428         /* Can't specify "none" and some subsystems */
1429         if (opts->subsys_mask && opts->none)
1430                 return -EINVAL;
1431
1432         return 0;
1433 }
1434
1435 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1436 {
1437         int ret = 0;
1438         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1439         struct cgroup_sb_opts opts;
1440         unsigned int added_mask, removed_mask;
1441
1442         if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1443                 pr_err("sane_behavior: remount is not allowed\n");
1444                 return -EINVAL;
1445         }
1446
1447         mutex_lock(&cgroup_mutex);
1448
1449         /* See what subsystems are wanted */
1450         ret = parse_cgroupfs_options(data, &opts);
1451         if (ret)
1452                 goto out_unlock;
1453
1454         if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1455                 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1456                         task_tgid_nr(current), current->comm);
1457
1458         added_mask = opts.subsys_mask & ~root->subsys_mask;
1459         removed_mask = root->subsys_mask & ~opts.subsys_mask;
1460
1461         /* Don't allow flags or name to change at remount */
1462         if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1463             (opts.name && strcmp(opts.name, root->name))) {
1464                 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1465                        opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1466                        root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1467                 ret = -EINVAL;
1468                 goto out_unlock;
1469         }
1470
1471         /* remounting is not allowed for populated hierarchies */
1472         if (!list_empty(&root->cgrp.children)) {
1473                 ret = -EBUSY;
1474                 goto out_unlock;
1475         }
1476
1477         ret = rebind_subsystems(root, added_mask);
1478         if (ret)
1479                 goto out_unlock;
1480
1481         rebind_subsystems(&cgrp_dfl_root, removed_mask);
1482
1483         if (opts.release_agent) {
1484                 spin_lock(&release_agent_path_lock);
1485                 strcpy(root->release_agent_path, opts.release_agent);
1486                 spin_unlock(&release_agent_path_lock);
1487         }
1488  out_unlock:
1489         kfree(opts.release_agent);
1490         kfree(opts.name);
1491         mutex_unlock(&cgroup_mutex);
1492         return ret;
1493 }
1494
1495 /*
1496  * To reduce the fork() overhead for systems that are not actually using
1497  * their cgroups capability, we don't maintain the lists running through
1498  * each css_set to its tasks until we see the list actually used - in other
1499  * words after the first mount.
1500  */
1501 static bool use_task_css_set_links __read_mostly;
1502
1503 static void cgroup_enable_task_cg_lists(void)
1504 {
1505         struct task_struct *p, *g;
1506
1507         down_write(&css_set_rwsem);
1508
1509         if (use_task_css_set_links)
1510                 goto out_unlock;
1511
1512         use_task_css_set_links = true;
1513
1514         /*
1515          * We need tasklist_lock because RCU is not safe against
1516          * while_each_thread(). Besides, a forking task that has passed
1517          * cgroup_post_fork() without seeing use_task_css_set_links = 1
1518          * is not guaranteed to have its child immediately visible in the
1519          * tasklist if we walk through it with RCU.
1520          */
1521         read_lock(&tasklist_lock);
1522         do_each_thread(g, p) {
1523                 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1524                              task_css_set(p) != &init_css_set);
1525
1526                 /*
1527                  * We should check if the process is exiting, otherwise
1528                  * it will race with cgroup_exit() in that the list
1529                  * entry won't be deleted though the process has exited.
1530                  * Do it while holding siglock so that we don't end up
1531                  * racing against cgroup_exit().
1532                  */
1533                 spin_lock_irq(&p->sighand->siglock);
1534                 if (!(p->flags & PF_EXITING)) {
1535                         struct css_set *cset = task_css_set(p);
1536
1537                         list_add(&p->cg_list, &cset->tasks);
1538                         get_css_set(cset);
1539                 }
1540                 spin_unlock_irq(&p->sighand->siglock);
1541         } while_each_thread(g, p);
1542         read_unlock(&tasklist_lock);
1543 out_unlock:
1544         up_write(&css_set_rwsem);
1545 }
1546
1547 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1548 {
1549         struct cgroup_subsys *ss;
1550         int ssid;
1551
1552         atomic_set(&cgrp->refcnt, 1);
1553         INIT_LIST_HEAD(&cgrp->sibling);
1554         INIT_LIST_HEAD(&cgrp->children);
1555         INIT_LIST_HEAD(&cgrp->cset_links);
1556         INIT_LIST_HEAD(&cgrp->release_list);
1557         INIT_LIST_HEAD(&cgrp->pidlists);
1558         mutex_init(&cgrp->pidlist_mutex);
1559         cgrp->self.cgroup = cgrp;
1560
1561         for_each_subsys(ss, ssid)
1562                 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1563
1564         init_waitqueue_head(&cgrp->offline_waitq);
1565 }
1566
1567 static void init_cgroup_root(struct cgroup_root *root,
1568                              struct cgroup_sb_opts *opts)
1569 {
1570         struct cgroup *cgrp = &root->cgrp;
1571
1572         INIT_LIST_HEAD(&root->root_list);
1573         atomic_set(&root->nr_cgrps, 1);
1574         cgrp->root = root;
1575         init_cgroup_housekeeping(cgrp);
1576         idr_init(&root->cgroup_idr);
1577
1578         root->flags = opts->flags;
1579         if (opts->release_agent)
1580                 strcpy(root->release_agent_path, opts->release_agent);
1581         if (opts->name)
1582                 strcpy(root->name, opts->name);
1583         if (opts->cpuset_clone_children)
1584                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1585 }
1586
/*
 * cgroup_setup_root - bring an initialized hierarchy root online
 * @root: root to set up
 * @ss_mask: subsystem mask handed to rebind_subsystems()
 *
 * Must be called with cgroup_mutex held.  In order: allocates the root
 * cgroup's id, preallocates css_set link structures, allocates the root
 * id, creates the kernfs root in deactivated state, adds
 * cgroup_base_files, rebinds the requested subsystems, puts @root on
 * cgroup_roots, links the root cgroup into every css_set in
 * css_set_table and finally activates the kernfs hierarchy.
 *
 * Returns 0 on success or the error of the failing step.  On failure the
 * kernfs root and root id obtained here are released again.
 * NOTE(review): the id allocated from root->cgroup_idr is not released in
 * this function - presumably the caller's cleanup handles it; confirm.
 */
static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	/* NOTE(review): the 1, 2 arguments presumably pin the id to 1 - confirm */
	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
	if (ret < 0)
		goto out;
	root_cgrp->id = ret;

	/*
	 * We're accessing css_set_count without locking css_set_rwsem here,
	 * but that's OK - it can only be increased by someone holding
	 * cgroup_lock, and that's us. The worst that can happen is that we
	 * have some link structures left over.
	 */
	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
	if (ret)
		goto out;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto out;

	/* created deactivated; only made visible by kernfs_activate() below */
	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;

	ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
	if (ret)
		goto destroy_root;

	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	down_write(&css_set_rwsem);
	hash_for_each(css_set_table, i, cset, hlist)
		link_css_set(&tmp_links, cset, root_cgrp);
	up_write(&css_set_rwsem);

	/* a root that was just set up must have no children yet */
	BUG_ON(!list_empty(&root_cgrp->children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	kernfs_activate(root_cgrp->kn);
	ret = 0;
	/* success also passes through "out" to free any unused links */
	goto out;

destroy_root:
	kernfs_destroy_root(root->kf_root);
	root->kf_root = NULL;
exit_root_id:
	cgroup_exit_root_id(root);
out:
	free_cgrp_cset_links(&tmp_links);
	return ret;
}
1665
/*
 * cgroup_mount - ->mount() handler of the "cgroup" filesystem type
 * @fs_type: filesystem type, passed through to kernfs_mount()
 * @flags: mount flags, passed through to kernfs_mount()
 * @unused_dev_name: ignored
 * @data: opaque mount data handed to parse_cgroupfs_options()
 *
 * Selects the hierarchy root to mount: the default root when neither
 * subsystems, "none" nor a name were requested; otherwise an existing
 * root matching the requested name/subsystems; otherwise a newly
 * allocated root.  For the default and existing roots a reference on the
 * root cgroup is taken here (NOTE(review): a new root presumably carries
 * its initial reference - confirm).  That reference is dropped again if
 * kernfs_mount() fails or merely reuses an existing superblock.
 *
 * Returns the dentry from kernfs_mount() or an ERR_PTR() value.
 */
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	struct cgroup_root *root;
	struct cgroup_sb_opts opts;
	struct dentry *dentry;
	int ret;
	bool new_sb;

	/*
	 * The first time anyone tries to mount a cgroup, enable the list
	 * linking each css_set to its tasks and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	mutex_lock(&cgroup_mutex);

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	/* look for a matching existing root */
	if (!opts.subsys_mask && !opts.none && !opts.name) {
		/* nothing specific requested: mount the default hierarchy */
		cgrp_dfl_root_visible = true;
		root = &cgrp_dfl_root;
		cgroup_get(&root->cgrp);
		ret = 0;
		goto out_unlock;
	}

	for_each_root(root) {
		bool name_match = false;

		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but subsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (opts.name) {
			if (strcmp(opts.name, root->name))
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
		if ((opts.subsys_mask || opts.none) &&
		    (opts.subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			ret = -EBUSY;
			goto out_unlock;
		}

		/* differing option flags are fatal only with sane_behavior */
		if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
				pr_err("sane_behavior: new mount options should match the existing superblock\n");
				ret = -EINVAL;
				goto out_unlock;
			} else {
				pr_warn("new mount options do not match the existing superblock, will be ignored\n");
			}
		}

		/*
		 * A root's lifetime is governed by its root cgroup.  Zero
		 * ref indicates that the root is being destroyed.  Wait for
		 * destruction to complete so that the subsystems are free.
		 * We can use wait_queue for the wait but this path is
		 * super cold.  Let's just sleep for a bit and retry.
		 */
		if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
			/* cgroup_mutex is dropped here, so skip out_unlock */
			mutex_unlock(&cgroup_mutex);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}

		ret = 0;
		goto out_unlock;
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create new one without subsys specification.
	 */
	if (!opts.subsys_mask && !opts.none) {
		ret = -EINVAL;
		goto out_unlock;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	init_cgroup_root(root, &opts);

	ret = cgroup_setup_root(root, opts.subsys_mask);
	if (ret)
		cgroup_free_root(root);

out_unlock:
	mutex_unlock(&cgroup_mutex);
out_free:
	/* the option strings are freed on every path, success included */
	kfree(opts.release_agent);
	kfree(opts.name);

	if (ret)
		return ERR_PTR(ret);

	dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
	/* only a newly created superblock keeps the root reference */
	if (IS_ERR(dentry) || !new_sb)
		cgroup_put(&root->cgrp);
	return dentry;
}
1792
1793 static void cgroup_kill_sb(struct super_block *sb)
1794 {
1795         struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1796         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1797
1798         cgroup_put(&root->cgrp);
1799         kernfs_kill_sb(sb);
1800 }
1801
/* the "cgroup" filesystem type: mounted by cgroup_mount(), torn down by cgroup_kill_sb() */
static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
};

/* NOTE(review): not referenced in this part of the file; presumably the cgroup sysfs kobject - confirm */
static struct kobject *cgroup_kobj;
1809
1810 /**
1811  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1812  * @task: target task
1813  * @buf: the buffer to write the path into
1814  * @buflen: the length of the buffer
1815  *
1816  * Determine @task's cgroup on the first (the one with the lowest non-zero
1817  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
1818  * function grabs cgroup_mutex and shouldn't be used inside locks used by
1819  * cgroup controller callbacks.
1820  *
1821  * Return value is the same as kernfs_path().
1822  */
1823 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1824 {
1825         struct cgroup_root *root;
1826         struct cgroup *cgrp;
1827         int hierarchy_id = 1;
1828         char *path = NULL;
1829
1830         mutex_lock(&cgroup_mutex);
1831         down_read(&css_set_rwsem);
1832
1833         root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1834
1835         if (root) {
1836                 cgrp = task_cgroup_from_root(task, root);
1837                 path = cgroup_path(cgrp, buf, buflen);
1838         } else {
1839                 /* if no hierarchy exists, everyone is in "/" */
1840                 if (strlcpy(buf, "/", buflen) < buflen)
1841                         path = buf;
1842         }
1843
1844         up_read(&css_set_rwsem);
1845         mutex_unlock(&cgroup_mutex);
1846         return path;
1847 }
1848 EXPORT_SYMBOL_GPL(task_cgroup_path);
1849
/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
	/* the src and dst cset list running through cset->mg_node */
	struct list_head	src_csets;
	struct list_head	dst_csets;

	/*
	 * Fields for cgroup_taskset_*() iteration.
	 *
	 * Before migration is committed, the target migration tasks are on
	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
	 * the csets on ->dst_csets.  ->csets points to either ->src_csets
	 * or ->dst_csets depending on whether migration is committed.
	 *
	 * ->cur_cset and ->cur_task point to the current position during
	 * iteration; ->cur_task is NULL when iteration has just been reset.
	 */
	struct list_head	*csets;
	struct css_set		*cur_cset;
	struct task_struct	*cur_task;
};
1871
1872 /**
1873  * cgroup_taskset_first - reset taskset and return the first task
1874  * @tset: taskset of interest
1875  *
1876  * @tset iteration is initialized and the first task is returned.
1877  */
1878 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1879 {
1880         tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1881         tset->cur_task = NULL;
1882
1883         return cgroup_taskset_next(tset);
1884 }
1885
1886 /**
1887  * cgroup_taskset_next - iterate to the next task in taskset
1888  * @tset: taskset of interest
1889  *
1890  * Return the next task in @tset.  Iteration must have been initialized
1891  * with cgroup_taskset_first().
1892  */
1893 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1894 {
1895         struct css_set *cset = tset->cur_cset;
1896         struct task_struct *task = tset->cur_task;
1897
1898         while (&cset->mg_node != tset->csets) {
1899                 if (!task)
1900                         task = list_first_entry(&cset->mg_tasks,
1901                                                 struct task_struct, cg_list);
1902                 else
1903                         task = list_next_entry(task, cg_list);
1904
1905                 if (&task->cg_list != &cset->mg_tasks) {
1906                         tset->cur_cset = cset;
1907                         tset->cur_task = task;
1908                         return task;
1909                 }
1910
1911                 cset = list_next_entry(cset, mg_node);
1912                 task = NULL;
1913         }
1914
1915         return NULL;
1916 }
1917
/**
 * cgroup_task_migrate - move a task from one cgroup to another.
 * @old_cgrp: the cgroup @tsk is being migrated from
 * @tsk: the task being migrated
 * @new_cset: the new css_set @tsk is being attached to
 *
 * Takes a reference on @new_cset, publishes it as @tsk's css_set, moves
 * @tsk to the tail of @new_cset->mg_tasks, marks @old_cgrp releasable and
 * drops the reference on the task's previous css_set.
 *
 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
 */
static void cgroup_task_migrate(struct cgroup *old_cgrp,
				struct task_struct *tsk,
				struct css_set *new_cset)
{
	struct css_set *old_cset;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_rwsem);

	/*
	 * We are synchronized through threadgroup_lock() against PF_EXITING
	 * setting such that we can't race against cgroup_exit() changing the
	 * css_set to init_css_set and dropping the old one.
	 */
	WARN_ON_ONCE(tsk->flags & PF_EXITING);
	old_cset = task_css_set(tsk);

	/* pin the new css_set before making it visible through the task */
	get_css_set(new_cset);
	rcu_assign_pointer(tsk->cgroups, new_cset);

	/*
	 * Use move_tail so that cgroup_taskset_first() still returns the
	 * leader after migration.  This works because cgroup_migrate()
	 * ensures that the dst_cset of the leader is the first on the
	 * tset's dst_csets list.
	 */
	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);

	/*
	 * We just gained a reference on old_cset by taking it from the
	 * task. As trading it for new_cset is protected by cgroup_mutex,
	 * we're safe to drop it here; it will be freed under RCU.
	 */
	set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
	put_css_set_locked(old_cset, false);
}
1962
1963 /**
1964  * cgroup_migrate_finish - cleanup after attach
1965  * @preloaded_csets: list of preloaded css_sets
1966  *
1967  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
1968  * those functions for details.
1969  */
1970 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1971 {
1972         struct css_set *cset, *tmp_cset;
1973
1974         lockdep_assert_held(&cgroup_mutex);
1975
1976         down_write(&css_set_rwsem);
1977         list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1978                 cset->mg_src_cgrp = NULL;
1979                 cset->mg_dst_cset = NULL;
1980                 list_del_init(&cset->mg_preload_node);
1981                 put_css_set_locked(cset, false);
1982         }
1983         up_write(&css_set_rwsem);
1984 }
1985
1986 /**
1987  * cgroup_migrate_add_src - add a migration source css_set
1988  * @src_cset: the source css_set to add
1989  * @dst_cgrp: the destination cgroup
1990  * @preloaded_csets: list of preloaded css_sets
1991  *
1992  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
1993  * @src_cset and add it to @preloaded_csets, which should later be cleaned
1994  * up by cgroup_migrate_finish().
1995  *
1996  * This function may be called without holding threadgroup_lock even if the
1997  * target is a process.  Threads may be created and destroyed but as long
1998  * as cgroup_mutex is not dropped, no new css_set can be put into play and
1999  * the preloaded css_sets are guaranteed to cover all migrations.
2000  */
2001 static void cgroup_migrate_add_src(struct css_set *src_cset,
2002                                    struct cgroup *dst_cgrp,
2003                                    struct list_head *preloaded_csets)
2004 {
2005         struct cgroup *src_cgrp;
2006
2007         lockdep_assert_held(&cgroup_mutex);
2008         lockdep_assert_held(&css_set_rwsem);
2009
2010         src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2011
2012         if (!list_empty(&src_cset->mg_preload_node))
2013                 return;
2014
2015         WARN_ON(src_cset->mg_src_cgrp);
2016         WARN_ON(!list_empty(&src_cset->mg_tasks));
2017         WARN_ON(!list_empty(&src_cset->mg_node));
2018
2019         src_cset->mg_src_cgrp = src_cgrp;
2020         get_css_set(src_cset);
2021         list_add(&src_cset->mg_preload_node, preloaded_csets);
2022 }
2023
2024 /**
2025  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2026  * @dst_cgrp: the destination cgroup (may be %NULL)
2027  * @preloaded_csets: list of preloaded source css_sets
2028  *
2029  * Tasks are about to be moved to @dst_cgrp and all the source css_sets
2030  * have been preloaded to @preloaded_csets.  This function looks up and
2031  * pins all destination css_sets, links each to its source, and append them
2032  * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
2033  * source css_set is assumed to be its cgroup on the default hierarchy.
2034  *
2035  * This function must be called after cgroup_migrate_add_src() has been
2036  * called on each migration source css_set.  After migration is performed
2037  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2038  * @preloaded_csets.
2039  */
2040 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2041                                       struct list_head *preloaded_csets)
2042 {
2043         LIST_HEAD(csets);
2044         struct css_set *src_cset, *tmp_cset;
2045
2046         lockdep_assert_held(&cgroup_mutex);
2047
2048         /*
2049          * Except for the root, child_subsys_mask must be zero for a cgroup
2050          * with tasks so that child cgroups don't compete against tasks.
2051          */
2052         if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent &&
2053             dst_cgrp->child_subsys_mask)
2054                 return -EBUSY;
2055
2056         /* look up the dst cset for each src cset and link it to src */
2057         list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
2058                 struct css_set *dst_cset;
2059
2060                 dst_cset = find_css_set(src_cset,
2061                                         dst_cgrp ?: src_cset->dfl_cgrp);
2062                 if (!dst_cset)
2063                         goto err;
2064
2065                 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2066
2067                 /*
2068                  * If src cset equals dst, it's noop.  Drop the src.
2069                  * cgroup_migrate() will skip the cset too.  Note that we
2070                  * can't handle src == dst as some nodes are used by both.
2071                  */
2072                 if (src_cset == dst_cset) {
2073                         src_cset->mg_src_cgrp = NULL;
2074                         list_del_init(&src_cset->mg_preload_node);
2075                         put_css_set(src_cset, false);
2076                         put_css_set(dst_cset, false);
2077                         continue;
2078                 }
2079
2080                 src_cset->mg_dst_cset = dst_cset;
2081
2082                 if (list_empty(&dst_cset->mg_preload_node))
2083                         list_add(&dst_cset->mg_preload_node, &csets);
2084                 else
2085                         put_css_set(dst_cset, false);
2086         }
2087
2088         list_splice_tail(&csets, preloaded_csets);
2089         return 0;
2090 err:
2091         cgroup_migrate_finish(&csets);
2092         return -ENOMEM;
2093 }
2094
/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @cgrp: the destination cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 *
 * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
 * process, the caller must be holding threadgroup_lock of @leader.  The
 * caller is also responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 *
 * Returns 0 on success (including when no task needed migrating) or the
 * error returned by a failing ->can_attach().
 */
static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
			  bool threadgroup)
{
	struct cgroup_taskset tset = {
		.src_csets	= LIST_HEAD_INIT(tset.src_csets),
		.dst_csets	= LIST_HEAD_INIT(tset.dst_csets),
		.csets		= &tset.src_csets,
	};
	struct cgroup_subsys_state *css, *failed_css = NULL;
	struct css_set *cset, *tmp_cset;
	struct task_struct *task, *tmp_task;
	int i, ret;

	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	down_write(&css_set_rwsem);
	rcu_read_lock();
	task = leader;
	do {
		/* @task either already exited or can't exit until the end */
		if (task->flags & PF_EXITING)
			goto next;

		/* leave @task alone if post_fork() hasn't linked it yet */
		if (list_empty(&task->cg_list))
			goto next;

		/* css_sets without a migration source are not being migrated */
		cset = task_css_set(task);
		if (!cset->mg_src_cgrp)
			goto next;

		/*
		 * cgroup_taskset_first() must always return the leader.
		 * Take care to avoid disturbing the ordering.
		 */
		list_move_tail(&task->cg_list, &cset->mg_tasks);
		if (list_empty(&cset->mg_node))
			list_add_tail(&cset->mg_node, &tset.src_csets);
		if (list_empty(&cset->mg_dst_cset->mg_node))
			list_move_tail(&cset->mg_dst_cset->mg_node,
				       &tset.dst_csets);
	next:
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	up_write(&css_set_rwsem);

	/* methods shouldn't be called if no task is actually migrating */
	if (list_empty(&tset.src_csets))
		return 0;

	/* check that we can legitimately attach to the cgroup */
	for_each_e_css(css, i, cgrp) {
		if (css->ss->can_attach) {
			ret = css->ss->can_attach(css, &tset);
			if (ret) {
				/* remembered so cancel_attach stops before it */
				failed_css = css;
				goto out_cancel_attach;
			}
		}
	}

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
	down_write(&css_set_rwsem);
	list_for_each_entry(cset, &tset.src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
			cgroup_task_migrate(cset->mg_src_cgrp, task,
					    cset->mg_dst_cset);
	}
	up_write(&css_set_rwsem);

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset.csets = &tset.dst_csets;

	for_each_e_css(css, i, cgrp)
		if (css->ss->attach)
			css->ss->attach(css, &tset);

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	/* undo ->can_attach() for the csses visited before the failing one */
	for_each_e_css(css, i, cgrp) {
		if (css == failed_css)
			break;
		if (css->ss->cancel_attach)
			css->ss->cancel_attach(css, &tset);
	}
out_release_tset:
	/* move tasks from ->mg_tasks back to ->tasks and empty the taskset lists */
	down_write(&css_set_rwsem);
	list_splice_init(&tset.dst_csets, &tset.src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	up_write(&css_set_rwsem);
	return ret;
}
2223
2224 /**
2225  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2226  * @dst_cgrp: the cgroup to attach to
2227  * @leader: the task or the leader of the threadgroup to be attached
2228  * @threadgroup: attach the whole threadgroup?
2229  *
2230  * Call holding cgroup_mutex and threadgroup_lock of @leader.
2231  */
2232 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2233                               struct task_struct *leader, bool threadgroup)
2234 {
2235         LIST_HEAD(preloaded_csets);
2236         struct task_struct *task;
2237         int ret;
2238
2239         /* look up all src csets */
2240         down_read(&css_set_rwsem);
2241         rcu_read_lock();
2242         task = leader;
2243         do {
2244                 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2245                                        &preloaded_csets);
2246                 if (!threadgroup)
2247                         break;
2248         } while_each_thread(leader, task);
2249         rcu_read_unlock();
2250         up_read(&css_set_rwsem);
2251
2252         /* prepare dst csets and commit */
2253         ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2254         if (!ret)
2255                 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2256
2257         cgroup_migrate_finish(&preloaded_csets);
2258         return ret;
2259 }
2260
/*
 * Find the task_struct of the task to attach by vpid and pass it along to the
 * function to attach either it or all tasks in its threadgroup. Will lock
 * cgroup_mutex and threadgroup.
 *
 * @of: open kernfs file; of->kn identifies the destination cgroup
 * @buf: user-supplied text holding the pid; 0 selects the current task
 * @nbytes: value returned on success
 * @off: not used by this function
 * @threadgroup: attach the whole thread group of the target instead of
 *		 just the one task
 *
 * Returns @nbytes on success, -EINVAL for an unparsable or negative pid or
 * a task that must not be moved, -ENODEV if the cgroup could not be
 * locked live, -ESRCH if no task has that pid, -EACCES if the caller's
 * euid is neither root nor the target's uid/suid, or the error from
 * cgroup_attach_task().
 */
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
				    size_t nbytes, loff_t off, bool threadgroup)
{
	struct task_struct *tsk;
	const struct cred *cred = current_cred(), *tcred;
	struct cgroup *cgrp;
	pid_t pid;
	int ret;

	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return -EINVAL;

	cgrp = cgroup_kn_lock_live(of->kn);
	if (!cgrp)
		return -ENODEV;

retry_find_task:
	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			rcu_read_unlock();
			ret = -ESRCH;
			goto out_unlock_cgroup;
		}
		/*
		 * even if we're attaching all tasks in the thread group, we
		 * only need to check permissions on one of them.
		 */
		tcred = __task_cred(tsk);
		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
		    !uid_eq(cred->euid, tcred->uid) &&
		    !uid_eq(cred->euid, tcred->suid)) {
			rcu_read_unlock();
			ret = -EACCES;
			goto out_unlock_cgroup;
		}
	} else
		tsk = current;

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
	 * trapped in a cpuset, or RT worker may be born in a cgroup
	 * with no rt_runtime allocated.  Just say no.
	 */
	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		rcu_read_unlock();
		goto out_unlock_cgroup;
	}

	/* pin @tsk before leaving the RCU read-side section */
	get_task_struct(tsk);
	rcu_read_unlock();

	threadgroup_lock(tsk);
	if (threadgroup) {
		if (!thread_group_leader(tsk)) {
			/*
			 * a race with de_thread from another thread's exec()
			 * may strip us of our leadership, if this happens,
			 * there is no choice but to throw this task away and
			 * try again; this is
			 * "double-double-toil-and-trouble-check locking".
			 */
			threadgroup_unlock(tsk);
			put_task_struct(tsk);
			goto retry_find_task;
		}
	}

	ret = cgroup_attach_task(cgrp, tsk, threadgroup);

	threadgroup_unlock(tsk);

	put_task_struct(tsk);
out_unlock_cgroup:
	cgroup_kn_unlock(of->kn);
	/* a zero ret means success: report the whole write as consumed */
	return ret ?: nbytes;
}
2348
2349 /**
2350  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2351  * @from: attach to all cgroups of a given task
2352  * @tsk: the task to be attached
2353  */
2354 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2355 {
2356         struct cgroup_root *root;
2357         int retval = 0;
2358
2359         mutex_lock(&cgroup_mutex);
2360         for_each_root(root) {
2361                 struct cgroup *from_cgrp;
2362
2363                 if (root == &cgrp_dfl_root)
2364                         continue;
2365
2366                 down_read(&css_set_rwsem);
2367                 from_cgrp = task_cgroup_from_root(from, root);
2368                 up_read(&css_set_rwsem);
2369
2370                 retval = cgroup_attach_task(from_cgrp, tsk, false);
2371                 if (retval)
2372                         break;
2373         }
2374         mutex_unlock(&cgroup_mutex);
2375
2376         return retval;
2377 }
2378 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2379
/*
 * Write handler that attaches the single task named in @buf: forwards to
 * __cgroup_procs_write() with threadgroup == false and returns its result.
 */
static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, nbytes, off, false);
}
2385
/*
 * Write handler that attaches the whole thread group of the task named in
 * @buf: forwards to __cgroup_procs_write() with threadgroup == true and
 * returns its result.
 */
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, nbytes, off, true);
}
2391
2392 static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2393                                           char *buf, size_t nbytes, loff_t off)
2394 {
2395         struct cgroup *cgrp;
2396
2397         BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2398
2399         cgrp = cgroup_kn_lock_live(of->kn);
2400         if (!cgrp)
2401                 return -ENODEV;
2402         spin_lock(&release_agent_path_lock);
2403         strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2404                 sizeof(cgrp->root->release_agent_path));
2405         spin_unlock(&release_agent_path_lock);
2406         cgroup_kn_unlock(of->kn);
2407         return nbytes;
2408 }
2409
2410 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2411 {
2412         struct cgroup *cgrp = seq_css(seq)->cgroup;
2413
2414         spin_lock(&release_agent_path_lock);
2415         seq_puts(seq, cgrp->root->release_agent_path);
2416         spin_unlock(&release_agent_path_lock);
2417         seq_putc(seq, '\n');
2418         return 0;
2419 }
2420
2421 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2422 {
2423         struct cgroup *cgrp = seq_css(seq)->cgroup;
2424
2425         seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2426         return 0;
2427 }
2428
2429 static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2430 {
2431         struct cgroup_subsys *ss;
2432         bool printed = false;
2433         int ssid;
2434
2435         for_each_subsys(ss, ssid) {
2436                 if (ss_mask & (1 << ssid)) {
2437                         if (printed)
2438                                 seq_putc(seq, ' ');
2439                         seq_printf(seq, "%s", ss->name);
2440                         printed = true;
2441                 }
2442         }
2443         if (printed)
2444                 seq_putc(seq, '\n');
2445 }
2446
2447 /* show controllers which are currently attached to the default hierarchy */
2448 static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2449 {
2450         struct cgroup *cgrp = seq_css(seq)->cgroup;
2451
2452         cgroup_print_ss_mask(seq, cgrp->root->subsys_mask);
2453         return 0;
2454 }
2455
2456 /* show controllers which are enabled from the parent */
2457 static int cgroup_controllers_show(struct seq_file *seq, void *v)
2458 {
2459         struct cgroup *cgrp = seq_css(seq)->cgroup;
2460
2461         cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask);
2462         return 0;
2463 }
2464
2465 /* show controllers which are enabled for a given cgroup's children */
2466 static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2467 {
2468         struct cgroup *cgrp = seq_css(seq)->cgroup;
2469
2470         cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
2471         return 0;
2472 }
2473
2474 /**
2475  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2476  * @cgrp: root of the subtree to update csses for
2477  *
2478  * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2479  * css associations need to be updated accordingly.  This function looks up
2480  * all css_sets which are attached to the subtree, creates the matching
2481  * updated css_sets and migrates the tasks to the new ones.
2482  */
2483 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2484 {
2485         LIST_HEAD(preloaded_csets);
2486         struct cgroup_subsys_state *css;
2487         struct css_set *src_cset;
2488         int ret;
2489
2490         lockdep_assert_held(&cgroup_mutex);
2491
2492         /* look up all csses currently attached to @cgrp's subtree */
2493         down_read(&css_set_rwsem);
2494         css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2495                 struct cgrp_cset_link *link;
2496
2497                 /* self is not affected by child_subsys_mask change */
2498                 if (css->cgroup == cgrp)
2499                         continue;
2500
2501                 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2502                         cgroup_migrate_add_src(link->cset, cgrp,
2503                                                &preloaded_csets);
2504         }
2505         up_read(&css_set_rwsem);
2506
2507         /* NULL dst indicates self on default hierarchy */
2508         ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2509         if (ret)
2510                 goto out_finish;
2511
2512         list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2513                 struct task_struct *last_task = NULL, *task;
2514
2515                 /* src_csets precede dst_csets, break on the first dst_cset */
2516                 if (!src_cset->mg_src_cgrp)
2517                         break;
2518
2519                 /*
2520                  * All tasks in src_cset need to be migrated to the
2521                  * matching dst_cset.  Empty it process by process.  We
2522                  * walk tasks but migrate processes.  The leader might even
2523                  * belong to a different cset but such src_cset would also
2524                  * be among the target src_csets because the default
2525                  * hierarchy enforces per-process membership.
2526                  */
2527                 while (true) {
2528                         down_read(&css_set_rwsem);
2529                         task = list_first_entry_or_null(&src_cset->tasks,
2530                                                 struct task_struct, cg_list);
2531                         if (task) {
2532                                 task = task->group_leader;
2533                                 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2534                                 get_task_struct(task);
2535                         }
2536                         up_read(&css_set_rwsem);
2537
2538                         if (!task)
2539                                 break;
2540
2541                         /* guard against possible infinite loop */
2542                         if (WARN(last_task == task,
2543                                  "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2544                                 goto out_finish;
2545                         last_task = task;
2546
2547                         threadgroup_lock(task);
2548                         /* raced against de_thread() from another thread? */
2549                         if (!thread_group_leader(task)) {
2550                                 threadgroup_unlock(task);
2551                                 put_task_struct(task);
2552                                 continue;
2553                         }
2554
2555                         ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2556
2557                         threadgroup_unlock(task);
2558                         put_task_struct(task);
2559
2560                         if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2561                                 goto out_finish;
2562                 }
2563         }
2564
2565 out_finish:
2566         cgroup_migrate_finish(&preloaded_csets);
2567         return ret;
2568 }
2569
2570 /* change the enabled child controllers for a cgroup in the default hierarchy */
     /*
      * @buf is parsed as a space separated list of "+name" / "-name" tokens.
      *
      * Returns @nbytes on success.  Error returns visible in this function:
      *   -EINVAL  a token lacks a +/- prefix or names no non-disabled subsystem
      *   -ENODEV  cgroup_kn_lock_live() failed
      *   -ENOENT  a controller to enable is missing from
      *            cgrp_dfl_root.subsys_mask or from the parent's
      *            child_subsys_mask
      *   -EBUSY   a controller to disable is still set in a live child's
      *            child_subsys_mask, or controllers are being enabled on a
      *            non-root cgroup whose cset_links list is not empty
      * plus any non-zero value from create_css() or cgroup_update_dfl_csses().
      * If a live child still has a css for a controller being enabled, the
      * function sleeps on that child's offline_waitq and returns the result
      * of restart_syscall().
      */
2571 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2572                                             char *buf, size_t nbytes,
2573                                             loff_t off)
2574 {
2575         unsigned int enable = 0, disable = 0;
2576         struct cgroup *cgrp, *child;
2577         struct cgroup_subsys *ss;
2578         char *tok;
2579         int ssid, ret;
2580
2581         /*
2582          * Parse input - space separated list of subsystem names prefixed
2583          * with either + or -.
2584          */
2585         buf = strstrip(buf);
2586         while ((tok = strsep(&buf, " "))) {
2587                 if (tok[0] == '\0')
2588                         continue;
2589                 for_each_subsys(ss, ssid) {
2590                         if (ss->disabled || strcmp(tok + 1, ss->name))
2591                                 continue;
2592
                             /* a later token for the same subsystem overrides an earlier one */
2593                         if (*tok == '+') {
2594                                 enable |= 1 << ssid;
2595                                 disable &= ~(1 << ssid);
2596                         } else if (*tok == '-') {
2597                                 disable |= 1 << ssid;
2598                                 enable &= ~(1 << ssid);
2599                         } else {
2600                                 return -EINVAL;
2601                         }
2602                         break;
2603                 }
                     /* the loop ran to completion: no subsystem matched this token */
2604                 if (ssid == CGROUP_SUBSYS_COUNT)
2605                         return -EINVAL;
2606         }
2607
2608         cgrp = cgroup_kn_lock_live(of->kn);
2609         if (!cgrp)
2610                 return -ENODEV;
2611
             /* drop requests that are already satisfied and validate the rest */
2612         for_each_subsys(ss, ssid) {
2613                 if (enable & (1 << ssid)) {
                             /* already enabled, nothing to do for this one */
2614                         if (cgrp->child_subsys_mask & (1 << ssid)) {
2615                                 enable &= ~(1 << ssid);
2616                                 continue;
2617                         }
2618
2619                         /*
2620                          * Because css offlining is asynchronous, userland
2621                          * might try to re-enable the same controller while
2622                          * the previous instance is still around.  In such
2623                          * cases, wait till it's gone using offline_waitq.
2624                          */
2625                         cgroup_for_each_live_child(child, cgrp) {
2626                                 DEFINE_WAIT(wait);
2627
2628                                 if (!cgroup_css(child, ss))
2629                                         continue;
2630
                                     /* hold @child across the unlock and the sleep */
2631                                 cgroup_get(child);
2632                                 prepare_to_wait(&child->offline_waitq, &wait,
2633                                                 TASK_UNINTERRUPTIBLE);
2634                                 cgroup_kn_unlock(of->kn);
2635                                 schedule();
2636                                 finish_wait(&child->offline_waitq, &wait);
2637                                 cgroup_put(child);
2638
2639                                 return restart_syscall();
2640                         }
2641
2642                         /* unavailable or not enabled on the parent? */
2643                         if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2644                             (cgrp->parent &&
2645                              !(cgrp->parent->child_subsys_mask & (1 << ssid)))) {
2646                                 ret = -ENOENT;
2647                                 goto out_unlock;
2648                         }
2649                 } else if (disable & (1 << ssid)) {
                             /* already disabled, nothing to do for this one */
2650                         if (!(cgrp->child_subsys_mask & (1 << ssid))) {
2651                                 disable &= ~(1 << ssid);
2652                                 continue;
2653                         }
2654
2655                         /* a child has it enabled? */
2656                         cgroup_for_each_live_child(child, cgrp) {
2657                                 if (child->child_subsys_mask & (1 << ssid)) {
2658                                         ret = -EBUSY;
2659                                         goto out_unlock;
2660                                 }
2661                         }
2662                 }
2663         }
2664
             /* every request turned out to be a no-op */
2665         if (!enable && !disable) {
2666                 ret = 0;
2667                 goto out_unlock;
2668         }
2669
2670         /*
2671          * Except for the root, child_subsys_mask must be zero for a cgroup
2672          * with tasks so that child cgroups don't compete against tasks.
2673          */
2674         if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) {
2675                 ret = -EBUSY;
2676                 goto out_unlock;
2677         }
2678
2679         /*
2680          * Create csses for enables and update child_subsys_mask.  This
2681          * changes cgroup_e_css() results which in turn makes the
2682          * subsequent cgroup_update_dfl_csses() associate all tasks in the
2683          * subtree to the updated csses.
2684          */
2685         for_each_subsys(ss, ssid) {
2686                 if (!(enable & (1 << ssid)))
2687                         continue;
2688
2689                 cgroup_for_each_live_child(child, cgrp) {
2690                         ret = create_css(child, ss);
2691                         if (ret)
2692                                 goto err_undo_css;
2693                 }
2694         }
2695
2696         cgrp->child_subsys_mask |= enable;
2697         cgrp->child_subsys_mask &= ~disable;
2698
2699         ret = cgroup_update_dfl_csses(cgrp);
2700         if (ret)
2701                 goto err_undo_css;
2702
2703         /* all tasks are now migrated away from the old csses, kill them */
2704         for_each_subsys(ss, ssid) {
2705                 if (!(disable & (1 << ssid)))
2706                         continue;
2707
2708                 cgroup_for_each_live_child(child, cgrp)
2709                         kill_css(cgroup_css(child, ss));
2710         }
2711
2712         kernfs_activate(cgrp->kn);
2713         ret = 0;
2714 out_unlock:
2715         cgroup_kn_unlock(of->kn);
             /* a non-zero @ret is returned as is, otherwise the whole write is consumed */
2716         return ret ?: nbytes;
2717
2718 err_undo_css:
             /*
              * Put the mask back to its previous value and kill whatever css
              * each live child has for the controllers that were being enabled.
              */
2719         cgrp->child_subsys_mask &= ~enable;
2720         cgrp->child_subsys_mask |= disable;
2721
2722         for_each_subsys(ss, ssid) {
2723                 if (!(enable & (1 << ssid)))
2724                         continue;
2725
2726                 cgroup_for_each_live_child(child, cgrp) {
2727                         struct cgroup_subsys_state *css = cgroup_css(child, ss);
2728                         if (css)
2729                                 kill_css(css);
2730                 }
2731         }
2732         goto out_unlock;
2733 }
2734
2735 static int cgroup_populated_show(struct seq_file *seq, void *v)
2736 {
2737         seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2738         return 0;
2739 }
2740
2741 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2742                                  size_t nbytes, loff_t off)
2743 {
2744         struct cgroup *cgrp = of->kn->parent->priv;
2745         struct cftype *cft = of->kn->priv;
2746         struct cgroup_subsys_state *css;
2747         int ret;
2748
2749         if (cft->write)
2750                 return cft->write(of, buf, nbytes, off);
2751
2752         /*
2753          * kernfs guarantees that a file isn't deleted with operations in
2754          * flight, which means that the matching css is and stays alive and
2755          * doesn't need to be pinned.  The RCU locking is not necessary
2756          * either.  It's just for the convenience of using cgroup_css().
2757          */
2758         rcu_read_lock();
2759         css = cgroup_css(cgrp, cft->ss);
2760         rcu_read_unlock();
2761
2762         if (cft->write_u64) {
2763                 unsigned long long v;
2764                 ret = kstrtoull(buf, 0, &v);
2765                 if (!ret)
2766                         ret = cft->write_u64(css, cft, v);
2767         } else if (cft->write_s64) {
2768                 long long v;
2769                 ret = kstrtoll(buf, 0, &v);
2770                 if (!ret)
2771                         ret = cft->write_s64(css, cft, v);
2772         } else {
2773                 ret = -EINVAL;
2774         }
2775
2776         return ret ?: nbytes;
2777 }
2778
2779 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2780 {
2781         return seq_cft(seq)->seq_start(seq, ppos);
2782 }
2783
2784 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2785 {
2786         return seq_cft(seq)->seq_next(seq, v, ppos);
2787 }
2788
2789 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2790 {
2791         seq_cft(seq)->seq_stop(seq, v);
2792 }
2793
2794 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2795 {
2796         struct cftype *cft = seq_cft(m);
2797         struct cgroup_subsys_state *css = seq_css(m);
2798
2799         if (cft->seq_show)
2800                 return cft->seq_show(m, arg);
2801
2802         if (cft->read_u64)
2803                 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2804         else if (cft->read_s64)
2805                 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2806         else
2807                 return -EINVAL;
2808         return 0;
2809 }
2810
2811 static struct kernfs_ops cgroup_kf_single_ops = {
             /*
              * kernfs ops selected by cgroup_init_cftypes() for cftypes that
              * do not define ->seq_start.  Only ->seq_show is wired up for
              * reads; writes go through cgroup_file_write().
              */
2812         .atomic_write_len       = PAGE_SIZE,
2813         .write                  = cgroup_file_write,
2814         .seq_show               = cgroup_seqfile_show,
2815 };
2816
2817 static struct kernfs_ops cgroup_kf_ops = {
             /*
              * kernfs ops selected by cgroup_init_cftypes() for cftypes that
              * define ->seq_start.  The start/next/stop hooks forward to the
              * cftype's own iterator methods.
              */
2818         .atomic_write_len       = PAGE_SIZE,
2819         .write                  = cgroup_file_write,
2820         .seq_start              = cgroup_seqfile_start,
2821         .seq_next               = cgroup_seqfile_next,
2822         .seq_stop               = cgroup_seqfile_stop,
2823         .seq_show               = cgroup_seqfile_show,
2824 };
2825
2826 /*
2827  * cgroup_rename - Only allow simple rename of directories in place.
2828  */
2829 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2830                          const char *new_name_str)
2831 {
2832         struct cgroup *cgrp = kn->priv;
2833         int ret;
2834
2835         if (kernfs_type(kn) != KERNFS_DIR)
2836                 return -ENOTDIR;
2837         if (kn->parent != new_parent)
2838                 return -EIO;
2839
2840         /*
2841          * This isn't a proper migration and its usefulness is very
2842          * limited.  Disallow if sane_behavior.
2843          */
2844         if (cgroup_sane_behavior(cgrp))
2845                 return -EPERM;
2846
2847         /*
2848          * We're gonna grab cgroup_mutex which nests outside kernfs
2849          * active_ref.  kernfs_rename() doesn't require active_ref
2850          * protection.  Break them before grabbing cgroup_mutex.
2851          */
2852         kernfs_break_active_protection(new_parent);
2853         kernfs_break_active_protection(kn);
2854
2855         mutex_lock(&cgroup_mutex);
2856
2857         ret = kernfs_rename(kn, new_parent, new_name_str);
2858
2859         mutex_unlock(&cgroup_mutex);
2860
2861         kernfs_unbreak_active_protection(kn);
2862         kernfs_unbreak_active_protection(new_parent);
2863         return ret;
2864 }
2865
2866 /* set uid and gid of cgroup dirs and files to that of the creator */
2867 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
2868 {
2869         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
2870                                .ia_uid = current_fsuid(),
2871                                .ia_gid = current_fsgid(), };
2872
2873         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
2874             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
2875                 return 0;
2876
2877         return kernfs_setattr(kn, &iattr);
2878 }
2879
2880 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2881 {
2882         char name[CGROUP_FILE_NAME_MAX];
2883         struct kernfs_node *kn;
2884         struct lock_class_key *key = NULL;
2885         int ret;
2886
2887 #ifdef CONFIG_DEBUG_LOCK_ALLOC
2888         key = &cft->lockdep_key;
2889 #endif
2890         kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2891                                   cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2892                                   NULL, false, key);
2893         if (IS_ERR(kn))
2894                 return PTR_ERR(kn);
2895
2896         ret = cgroup_kn_set_ugid(kn);
2897         if (ret) {
2898                 kernfs_remove(kn);
2899                 return ret;
2900         }
2901
2902         if (cft->seq_show == cgroup_populated_show)
2903                 cgrp->populated_kn = kn;
2904         return 0;
2905 }
2906
2907 /**
2908  * cgroup_addrm_files - add or remove files to a cgroup directory
2909  * @cgrp: the target cgroup
2910  * @cfts: array of cftypes to be added
2911  * @is_add: whether to add or remove
2912  *
2913  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2914  * For removals, this function never fails.  If addition fails, this
2915  * function doesn't remove files already added.  The caller is responsible
2916  * for cleaning up.
2917  */
2918 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2919                               bool is_add)
2920 {
2921         struct cftype *cft;
2922         int ret;
2923
2924         lockdep_assert_held(&cgroup_mutex);
2925
2926         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2927                 /* does cft->flags tell us to skip this file on @cgrp? */
2928                 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2929                         continue;
2930                 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2931                         continue;
2932                 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2933                         continue;
2934                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2935                         continue;
2936
2937                 if (is_add) {
2938                         ret = cgroup_add_file(cgrp, cft);
2939                         if (ret) {
2940                                 pr_warn("%s: failed to add %s, err=%d\n",
2941                                         __func__, cft->name, ret);
2942                                 return ret;
2943                         }
2944                 } else {
2945                         cgroup_rm_file(cgrp, cft);
2946                 }
2947         }
2948         return 0;
2949 }
2950
2951 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2952 {
2953         LIST_HEAD(pending);
2954         struct cgroup_subsys *ss = cfts[0].ss;
2955         struct cgroup *root = &ss->root->cgrp;
2956         struct cgroup_subsys_state *css;
2957         int ret = 0;
2958
2959         lockdep_assert_held(&cgroup_mutex);
2960
2961         /* add/rm files for all cgroups created before */
2962         css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2963                 struct cgroup *cgrp = css->cgroup;
2964
2965                 if (cgroup_is_dead(cgrp))
2966                         continue;
2967
2968                 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2969                 if (ret)
2970                         break;
2971         }
2972
2973         if (is_add && !ret)
2974                 kernfs_activate(root->kn);
2975         return ret;
2976 }
2977
2978 static void cgroup_exit_cftypes(struct cftype *cfts)
2979 {
2980         struct cftype *cft;
2981
2982         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2983                 /* free copy for custom atomic_write_len, see init_cftypes() */
2984                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2985                         kfree(cft->kf_ops);
2986                 cft->kf_ops = NULL;
2987                 cft->ss = NULL;
2988         }
2989 }
2990
2991 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2992 {
2993         struct cftype *cft;
2994
2995         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2996                 struct kernfs_ops *kf_ops;
2997
2998                 WARN_ON(cft->ss || cft->kf_ops);
2999
3000                 if (cft->seq_start)
3001                         kf_ops = &cgroup_kf_ops;
3002                 else
3003                         kf_ops = &cgroup_kf_single_ops;
3004
3005                 /*
3006                  * Ugh... if @cft wants a custom max_write_len, we need to
3007                  * make a copy of kf_ops to set its atomic_write_len.
3008                  */
3009                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3010                         kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3011                         if (!kf_ops) {
3012                                 cgroup_exit_cftypes(cfts);
3013                                 return -ENOMEM;
3014                         }
3015                         kf_ops->atomic_write_len = cft->max_write_len;
3016                 }
3017
3018                 cft->kf_ops = kf_ops;
3019                 cft->ss = ss;
3020         }
3021
3022         return 0;
3023 }
3024
3025 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3026 {
3027         lockdep_assert_held(&cgroup_mutex);
3028
3029         if (!cfts || !cfts[0].ss)
3030                 return -ENOENT;
3031
3032         list_del(&cfts->node);
3033         cgroup_apply_cftypes(cfts, false);
3034         cgroup_exit_cftypes(cfts);
3035         return 0;
3036 }
3037
3038 /**
3039  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
3040  * @cfts: zero-length name terminated array of cftypes
3041  *
3042  * Unregister @cfts.  Files described by @cfts are removed from all
3043  * existing cgroups and all future cgroups won't have them either.  This
3044  * function can be called anytime whether @cfts' subsys is attached or not.
3045  *
3046  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
3047  * registered.
3048  */
3049 int cgroup_rm_cftypes(struct cftype *cfts)
3050 {
3051         int ret;
3052
3053         mutex_lock(&cgroup_mutex);
3054         ret = cgroup_rm_cftypes_locked(cfts);
3055         mutex_unlock(&cgroup_mutex);
3056         return ret;
3057 }
3058
3059 /**
3060  * cgroup_add_cftypes - add an array of cftypes to a subsystem
3061  * @ss: target cgroup subsystem
3062  * @cfts: zero-length name terminated array of cftypes
3063  *
3064  * Register @cfts to @ss.  Files described by @cfts are created for all
3065  * existing cgroups to which @ss is attached and all future cgroups will
3066  * have them too.  This function can be called anytime whether @ss is
3067  * attached or not.
3068  *
3069  * Returns 0 on successful registration, -errno on failure.  Note that this
3070  * function currently returns 0 as long as @cfts registration is successful
3071  * even if some file creation attempts on existing cgroups fail.
3072  */
3073 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3074 {
3075         int ret;
3076
3077         if (!cfts || cfts[0].name[0] == '\0')
3078                 return 0;
3079
3080         ret = cgroup_init_cftypes(ss, cfts);
3081         if (ret)
3082                 return ret;
3083
3084         mutex_lock(&cgroup_mutex);
3085
3086         list_add_tail(&cfts->node, &ss->cfts);
3087         ret = cgroup_apply_cftypes(cfts, true);
3088         if (ret)
3089                 cgroup_rm_cftypes_locked(cfts);
3090
3091         mutex_unlock(&cgroup_mutex);
3092         return ret;
3093 }
3094
3095 /**
3096  * cgroup_task_count - count the number of tasks in a cgroup.
3097  * @cgrp: the cgroup in question
3098  *
3099  * Return the number of tasks in the cgroup.
3100  */
3101 static int cgroup_task_count(const struct cgroup *cgrp)
3102 {
3103         int count = 0;
3104         struct cgrp_cset_link *link;
3105
3106         down_read(&css_set_rwsem);
3107         list_for_each_entry(link, &cgrp->cset_links, cset_link)
3108                 count += atomic_read(&link->cset->refcount);
3109         up_read(&css_set_rwsem);
3110         return count;
3111 }
3112
3113 /**
3114  * css_next_child - find the next child of a given css
3115  * @pos_css: the current position (%NULL to initiate traversal)
3116  * @parent_css: css whose children to walk
3117  *
3118  * This function returns the next child of @parent_css and should be called
3119  * under either cgroup_mutex or RCU read lock.  The only requirement is
3120  * that @parent_css and @pos_css are accessible.  The next sibling is
3121  * guaranteed to be returned regardless of their states.
3122  */
3123 struct cgroup_subsys_state *
3124 css_next_child(struct cgroup_subsys_state *pos_css,
3125                struct cgroup_subsys_state *parent_css)
3126 {
3127         struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3128         struct cgroup *cgrp = parent_css->cgroup;
3129         struct cgroup *next;
3130
3131         cgroup_assert_mutex_or_rcu_locked();
3132
3133         /*
3134          * @pos could already have been removed.  Once a cgroup is removed,
3135          * its ->sibling.next is no longer updated when its next sibling
3136          * changes.  As CGRP_DEAD assertion is serialized and happens
3137          * before the cgroup is taken off the ->sibling list, if we see it
3138          * unasserted, it's guaranteed that the next sibling hasn't
3139          * finished its grace period even if it's already removed, and thus
3140          * safe to dereference from this RCU critical section.  If
3141          * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3142          * to be visible as %true here.
3143          *
3144          * If @pos is dead, its next pointer can't be dereferenced;
3145          * however, as each cgroup is given a monotonically increasing
3146          * unique serial number and always appended to the sibling list,
3147          * the next one can be found by walking the parent's children until
3148          * we see a cgroup with higher serial number than @pos's.  While
3149          * this path can be slower, it's taken only when either the current
3150          * cgroup is removed or iteration and removal race.
3151          */
3152         if (!pos) {
3153                 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3154         } else if (likely(!cgroup_is_dead(pos))) {
3155                 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3156         } else {
3157                 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3158                         if (next->serial_nr > pos->serial_nr)
3159                                 break;
3160         }
3161
3162         /*
3163          * @next, if not pointing to the head, can be dereferenced and is
3164          * the next sibling; however, it might have @ss disabled.  If so,
3165          * fast-forward to the next enabled one.
3166          */
3167         while (&next->sibling != &cgrp->children) {
3168                 struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss);
3169
3170                 if (next_css)
3171                         return next_css;
3172                 next = list_entry_rcu(next->sibling.next, struct cgroup, sibling);
3173         }
3174         return NULL;
3175 }
3176
3177 /**
3178  * css_next_descendant_pre - find the next descendant for pre-order walk
3179  * @pos: the current position (%NULL to initiate traversal)
3180  * @root: css whose descendants to walk
3181  *
3182  * To be used by css_for_each_descendant_pre().  Find the next descendant
3183  * to visit for pre-order traversal of @root's descendants.  @root is
3184  * included in the iteration and the first node to be visited.
3185  *
3186  * While this function requires cgroup_mutex or RCU read locking, it
3187  * doesn't require the whole traversal to be contained in a single critical
3188  * section.  This function will return the correct next descendant as long
3189  * as both @pos and @root are accessible and @pos is a descendant of @root.
3190  */
3191 struct cgroup_subsys_state *
3192 css_next_descendant_pre(struct cgroup_subsys_state *pos,
3193                         struct cgroup_subsys_state *root)
3194 {
3195         struct cgroup_subsys_state *next;
3196
3197         cgroup_assert_mutex_or_rcu_locked();
3198
3199         /* if first iteration, visit @root */
3200         if (!pos)
3201                 return root;
3202
3203         /* visit the first child if exists */
3204         next = css_next_child(NULL, pos);
3205         if (next)
3206                 return next;
3207
3208         /* no child, visit my or the closest ancestor's next sibling */
3209         while (pos != root) {
3210                 next = css_next_child(pos, css_parent(pos));
3211                 if (next)
3212                         return next;
3213                 pos = css_parent(pos);
3214         }
3215
3216         return NULL;
3217 }
3218
3219 /**
3220  * css_rightmost_descendant - return the rightmost descendant of a css
3221  * @pos: css of interest
3222  *
3223  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
3224  * is returned.  This can be used during pre-order traversal to skip
3225  * subtree of @pos.
3226  *
3227  * While this function requires cgroup_mutex or RCU read locking, it
3228  * doesn't require the whole traversal to be contained in a single critical
3229  * section.  This function will return the correct rightmost descendant as
3230  * long as @pos is accessible.
3231  */
3232 struct cgroup_subsys_state *
3233 css_rightmost_descendant(struct cgroup_subsys_state *pos)
3234 {
3235         struct cgroup_subsys_state *last, *tmp;
3236
3237         cgroup_assert_mutex_or_rcu_locked();
3238
3239         do {
3240                 last = pos;
3241                 /* ->prev isn't RCU safe, walk ->next till the end */
3242                 pos = NULL;
3243                 css_for_each_child(tmp, last)
3244                         pos = tmp;
3245         } while (pos);
3246
3247         return last;
3248 }
3249
3250 static struct cgroup_subsys_state *
3251 css_leftmost_descendant(struct cgroup_subsys_state *pos)
3252 {
3253         struct cgroup_subsys_state *last;
3254
3255         do {
3256                 last = pos;
3257                 pos = css_next_child(NULL, pos);
3258         } while (pos);
3259
3260         return last;
3261 }
3262
3263 /**
3264  * css_next_descendant_post - find the next descendant for post-order walk
3265  * @pos: the current position (%NULL to initiate traversal)
3266  * @root: css whose descendants to walk
3267  *
3268  * To be used by css_for_each_descendant_post().  Find the next descendant
3269  * to visit for post-order traversal of @root's descendants.  @root is
3270  * included in the iteration and the last node to be visited.
3271  *
3272  * While this function requires cgroup_mutex or RCU read locking, it
3273  * doesn't require the whole traversal to be contained in a single critical
3274  * section.  This function will return the correct next descendant as long
3275  * as both @pos and @cgroup are accessible and @pos is a descendant of
3276  * @cgroup.
3277  */
3278 struct cgroup_subsys_state *
3279 css_next_descendant_post(struct cgroup_subsys_state *pos,
3280                          struct cgroup_subsys_state *root)
3281 {
3282         struct cgroup_subsys_state *next;
3283
3284         cgroup_assert_mutex_or_rcu_locked();
3285
3286         /* if first iteration, visit leftmost descendant which may be @root */
3287         if (!pos)
3288                 return css_leftmost_descendant(root);
3289
3290         /* if we visited @root, we're done */
3291         if (pos == root)
3292                 return NULL;
3293
3294         /* if there's an unvisited sibling, visit its leftmost descendant */
3295         next = css_next_child(pos, css_parent(pos));
3296         if (next)
3297                 return css_leftmost_descendant(next);
3298
3299         /* no sibling left, visit parent */
3300         return css_parent(pos);
3301 }
3302
3303 static bool cgroup_has_live_children(struct cgroup *cgrp)
3304 {
3305         struct cgroup *child;
3306
3307         rcu_read_lock();
3308         list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3309                 if (!cgroup_is_dead(child)) {
3310                         rcu_read_unlock();
3311                         return true;
3312                 }
3313         }
3314         rcu_read_unlock();
3315         return false;
3316 }
3317
3318 /**
3319  * css_advance_task_iter - advance a task itererator to the next css_set
3320  * @it: the iterator to advance
3321  *
3322  * Advance @it to the next css_set to walk.
3323  */
3324 static void css_advance_task_iter(struct css_task_iter *it)
3325 {
3326         struct list_head *l = it->cset_pos;
3327         struct cgrp_cset_link *link;
3328         struct css_set *cset;
3329
3330         /* Advance to the next non-empty css_set */
3331         do {
3332                 l = l->next;
3333                 if (l == it->cset_head) {
3334                         it->cset_pos = NULL;
3335                         return;
3336                 }
3337
3338                 if (it->ss) {
3339                         cset = container_of(l, struct css_set,
3340                                             e_cset_node[it->ss->id]);
3341                 } else {
3342                         link = list_entry(l, struct cgrp_cset_link, cset_link);
3343                         cset = link->cset;
3344                 }
3345         } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
3346
3347         it->cset_pos = l;
3348
3349         if (!list_empty(&cset->tasks))
3350                 it->task_pos = cset->tasks.next;
3351         else
3352                 it->task_pos = cset->mg_tasks.next;
3353
3354         it->tasks_head = &cset->tasks;
3355         it->mg_tasks_head = &cset->mg_tasks;
3356 }
3357
3358 /**
3359  * css_task_iter_start - initiate task iteration
3360  * @css: the css to walk tasks of
3361  * @it: the task iterator to use
3362  *
3363  * Initiate iteration through the tasks of @css.  The caller can call
3364  * css_task_iter_next() to walk through the tasks until the function
3365  * returns NULL.  On completion of iteration, css_task_iter_end() must be
3366  * called.
3367  *
3368  * Note that this function acquires a lock which is released when the
3369  * iteration finishes.  The caller can't sleep while iteration is in
3370  * progress.
3371  */
3372 void css_task_iter_start(struct cgroup_subsys_state *css,
3373                          struct css_task_iter *it)
3374         __acquires(css_set_rwsem)
3375 {
3376         /* no one should try to iterate before mounting cgroups */
3377         WARN_ON_ONCE(!use_task_css_set_links);
3378
3379         down_read(&css_set_rwsem);
3380
3381         it->ss = css->ss;
3382
3383         if (it->ss)
3384                 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3385         else
3386                 it->cset_pos = &css->cgroup->cset_links;
3387
3388         it->cset_head = it->cset_pos;
3389
3390         css_advance_task_iter(it);
3391 }
3392
3393 /**
3394  * css_task_iter_next - return the next task for the iterator
3395  * @it: the task iterator being iterated
3396  *
3397  * The "next" function for task iteration.  @it should have been
3398  * initialized via css_task_iter_start().  Returns NULL when the iteration
3399  * reaches the end.
3400  */
3401 struct task_struct *css_task_iter_next(struct css_task_iter *it)
3402 {
3403         struct task_struct *res;
3404         struct list_head *l = it->task_pos;
3405
3406         /* If the iterator cg is NULL, we have no tasks */
3407         if (!it->cset_pos)
3408                 return NULL;
3409         res = list_entry(l, struct task_struct, cg_list);
3410
3411         /*
3412          * Advance iterator to find next entry.  cset->tasks is consumed
3413          * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
3414          * next cset.
3415          */
3416         l = l->next;
3417
3418         if (l == it->tasks_head)
3419                 l = it->mg_tasks_head->next;
3420
3421         if (l == it->mg_tasks_head)
3422                 css_advance_task_iter(it);
3423         else
3424                 it->task_pos = l;
3425
3426         return res;
3427 }
3428
/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().  This drops the
 * read lock on css_set_rwsem that css_task_iter_start() took; @it itself
 * is not accessed.
 */
void css_task_iter_end(struct css_task_iter *it)
        __releases(css_set_rwsem)
{
        up_read(&css_set_rwsem);
}
3440
3441 /**
3442  * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
3443  * @to: cgroup to which the tasks will be moved
3444  * @from: cgroup in which the tasks currently reside
3445  *
3446  * Locking rules between cgroup_post_fork() and the migration path
3447  * guarantee that, if a task is forking while being migrated, the new child
3448  * is guaranteed to be either visible in the source cgroup after the
3449  * parent's migration is complete or put into the target cgroup.  No task
3450  * can slip out of migration through forking.
3451  */
3452 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3453 {
3454         LIST_HEAD(preloaded_csets);
3455         struct cgrp_cset_link *link;
3456         struct css_task_iter it;
3457         struct task_struct *task;
3458         int ret;
3459
3460         mutex_lock(&cgroup_mutex);
3461
3462         /* all tasks in @from are being moved, all csets are source */
3463         down_read(&css_set_rwsem);
3464         list_for_each_entry(link, &from->cset_links, cset_link)
3465                 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3466         up_read(&css_set_rwsem);
3467
3468         ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3469         if (ret)
3470                 goto out_err;
3471
3472         /*
3473          * Migrate tasks one-by-one until @form is empty.  This fails iff
3474          * ->can_attach() fails.
3475          */
3476         do {
3477                 css_task_iter_start(&from->self, &it);
3478                 task = css_task_iter_next(&it);
3479                 if (task)
3480                         get_task_struct(task);
3481                 css_task_iter_end(&it);
3482
3483                 if (task) {
3484                         ret = cgroup_migrate(to, task, false);
3485                         put_task_struct(task);
3486                 }
3487         } while (task && !ret);
3488 out_err:
3489         cgroup_migrate_finish(&preloaded_csets);
3490         mutex_unlock(&cgroup_mutex);
3491         return ret;
3492 }
3493
3494 /*
3495  * Stuff for reading the 'tasks'/'procs' files.
3496  *
3497  * Reading this file can return large amounts of data if a cgroup has
3498  * *lots* of attached tasks. So it may need several calls to read(),
3499  * but we cannot guarantee that the information we produce is correct
3500  * unless we produce it entirely atomically.
3501  *
3502  */
3503
/*
 * Which pidlist file are we talking about?  The value is stored in the
 * ->private field of the matching cgroup_base_files[] entry and is part of
 * the key used to look pidlists up.
 */
enum cgroup_filetype {
        /* "cgroup.procs": pidlist_array_load() collects task_tgid_vnr() */
        CGROUP_FILE_PROCS,
        /* "tasks": pidlist_array_load() collects task_pid_vnr() */
        CGROUP_FILE_TASKS,
};
3509
/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
        /*
         * Used to find which pidlist is wanted.  Doesn't change as long as
         * this particular list stays in the list.  cgroup_pidlist_find()
         * matches on both members.
         */
        struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
        /* array of xids; allocated by pidlist_allocate(), freed by pidlist_free() */
        pid_t *list;
        /* how many elements the above list has */
        int length;
        /* each of these stored in a list by its cgroup (cgroup->pidlists) */
        struct list_head links;
        /* pointer to the cgroup we belong to, for list removal purposes */
        struct cgroup *owner;
        /* for delayed destruction; runs cgroup_pidlist_destroy_work_fn() */
        struct delayed_work destroy_dwork;
};
3533
3534 /*
3535  * The following two functions "fix" the issue where there are more pids
3536  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3537  * TODO: replace with a kernel-wide solution to this problem
3538  */
3539 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3540 static void *pidlist_allocate(int count)
3541 {
3542         if (PIDLIST_TOO_LARGE(count))
3543                 return vmalloc(count * sizeof(pid_t));
3544         else
3545                 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3546 }
3547
/*
 * Release an array obtained from pidlist_allocate().  The matching free
 * routine is selected by checking whether @p is a vmalloc address.
 */
static void pidlist_free(void *p)
{
        if (!is_vmalloc_addr(p)) {
                kfree(p);
                return;
        }

        vfree(p);
}
3555
/*
 * Used to destroy all pidlists lingering waiting for destroy timer.  None
 * should be left afterwards.
 */
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
        struct cgroup_pidlist *l, *tmp_l;

        /*
         * Re-queue the destroy work of every pidlist with zero delay.  The
         * queueing happens under pidlist_mutex, the same lock the work
         * function takes before looking at the pending state.
         */
        mutex_lock(&cgrp->pidlist_mutex);
        list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
                mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
        mutex_unlock(&cgrp->pidlist_mutex);

        /*
         * Wait for the queued work items to finish.  Each one unlinks its
         * pidlist from @cgrp->pidlists, so the list must be empty now.
         */
        flush_workqueue(cgroup_pidlist_destroy_wq);
        BUG_ON(!list_empty(&cgrp->pidlists));
}
3572
/*
 * Work function of cgroup_pidlist->destroy_dwork.  Unlinks the pidlist
 * from its owner, frees the pid array, drops the pid namespace reference
 * and frees the pidlist itself, unless the work got queued again in the
 * meantime.
 */
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
                                                destroy_dwork);
        /* stays NULL when destruction is skipped; kfree(NULL) is then a no-op */
        struct cgroup_pidlist *tofree = NULL;

        mutex_lock(&l->owner->pidlist_mutex);

        /*
         * Destroy iff we didn't get queued again.  The state won't change
         * as destroy_dwork can only be queued while locked.
         */
        if (!delayed_work_pending(dwork)) {
                list_del(&l->links);
                pidlist_free(l->list);
                put_pid_ns(l->key.ns);
                tofree = l;
        }

        /* @l is still dereferenced for the unlock, so free it only afterwards */
        mutex_unlock(&l->owner->pidlist_mutex);
        kfree(tofree);
}
3596
/*
 * pidlist_uniq - strip adjacent duplicate entries out of a pid array
 * @list: array to compact in place; equal values must be adjacent, which
 *        holds when the array is sorted
 * @length: number of entries in @list
 *
 * Returns the number of entries kept.  The kept entries occupy the front
 * of @list in their original order.
 */
static int pidlist_uniq(pid_t *list, int length)
{
        int rd, wr = 1;

        /* nothing to compact in an empty list */
        if (length == 0)
                return 0;

        /*
         * Entry 0 is always kept.  @rd scans the remaining entries while
         * @wr marks where the next kept entry goes and doubles as the
         * count of entries kept so far.  An entry is kept iff it differs
         * from its predecessor.
         */
        for (rd = 1; rd < length; rd++) {
                if (list[rd] != list[rd - 1])
                        list[wr++] = list[rd];
        }

        return wr;
}
3626
/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  Pid order differs
 * between namespaces, so every namespace needs its own sorted list, which
 * rules out, for example, a single rbtree of member tasks sorted by task
 * pointer.  Pidlists can be fairly large, so allocating one per open file
 * is dangerous, and cgroup ended up with a shared pool of pidlists keyed
 * by cgroup and namespace.
 *
 * All this extra complexity was caused by the original implementation
 * committing to an entirely unnecessary property.  In the long term, we
 * want to do away with it.  Explicitly scramble sort order if
 * sane_behavior so that no such expectation exists in the new interface.
 *
 * Scrambling swaps the two bits of every aligned bit pair (bit 0 with
 * bit 1, bit 2 with bit 3, ...).  The mapping is one-to-one, is not the
 * identity and undoes itself when applied twice, while disturbing sort
 * order sufficiently.
 */
static pid_t pid_fry(pid_t pid)
{
        unsigned v = pid;

        /* even-numbered bits move up by one, odd-numbered bits move down */
        return ((v & 0x55555555) << 1) | ((v & 0xAAAAAAAA) >> 1);
}
3651
3652 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3653 {
3654         if (cgroup_sane_behavior(cgrp))
3655                 return pid_fry(pid);
3656         else
3657                 return pid;
3658 }
3659
/*
 * sort() comparison callback ordering pid_t values ascending.
 * @a, @b: pointers to the two pid_t entries being compared
 *
 * Returns a negative value, zero or a positive value when *@a is smaller
 * than, equal to or greater than *@b.  The result comes from comparisons
 * rather than from a subtraction so that it can't overflow whatever the
 * operands are, and the const qualifier of the arguments is kept.
 */
static int cmppid(const void *a, const void *b)
{
        pid_t lhs = *(const pid_t *)a;
        pid_t rhs = *(const pid_t *)b;

        return (lhs > rhs) - (lhs < rhs);
}
3664
/*
 * sort() comparison callback ordering pid_t values ascending by their
 * pid_fry() image, i.e. in scrambled order.
 * @a, @b: pointers to the two pid_t entries being compared
 *
 * Returns a negative value, zero or a positive value when the scrambled
 * *@a is smaller than, equal to or greater than the scrambled *@b.  The
 * result comes from comparisons rather than from a subtraction so that it
 * can't overflow whatever the operands are, and the const qualifier of
 * the arguments is kept.
 */
static int fried_cmppid(const void *a, const void *b)
{
        pid_t lhs = pid_fry(*(const pid_t *)a);
        pid_t rhs = pid_fry(*(const pid_t *)b);

        return (lhs > rhs) - (lhs < rhs);
}
3669
3670 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3671                                                   enum cgroup_filetype type)
3672 {
3673         struct cgroup_pidlist *l;
3674         /* don't need task_nsproxy() if we're looking at ourself */
3675         struct pid_namespace *ns = task_active_pid_ns(current);
3676
3677         lockdep_assert_held(&cgrp->pidlist_mutex);
3678
3679         list_for_each_entry(l, &cgrp->pidlists, links)
3680                 if (l->key.type == type && l->key.ns == ns)
3681                         return l;
3682         return NULL;
3683 }
3684
3685 /*
3686  * find the appropriate pidlist for our purpose (given procs vs tasks)
3687  * returns with the lock on that pidlist already held, and takes care
3688  * of the use count, or returns NULL with no locks held if we're out of
3689  * memory.
3690  */
3691 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3692                                                 enum cgroup_filetype type)
3693 {
3694         struct cgroup_pidlist *l;
3695
3696         lockdep_assert_held(&cgrp->pidlist_mutex);
3697
3698         l = cgroup_pidlist_find(cgrp, type);
3699         if (l)
3700                 return l;
3701
3702         /* entry not found; create a new one */
3703         l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3704         if (!l)
3705                 return l;
3706
3707         INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3708         l->key.type = type;
3709         /* don't need task_nsproxy() if we're looking at ourself */
3710         l->key.ns = get_pid_ns(task_active_pid_ns(current));
3711         l->owner = cgrp;
3712         list_add(&l->links, &cgrp->pidlists);
3713         return l;
3714 }
3715
3716 /*
3717  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3718  */
3719 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3720                               struct cgroup_pidlist **lp)
3721 {
3722         pid_t *array;
3723         int length;
3724         int pid, n = 0; /* used for populating the array */
3725         struct css_task_iter it;
3726         struct task_struct *tsk;
3727         struct cgroup_pidlist *l;
3728
3729         lockdep_assert_held(&cgrp->pidlist_mutex);
3730
3731         /*
3732          * If cgroup gets more users after we read count, we won't have
3733          * enough space - tough.  This race is indistinguishable to the
3734          * caller from the case that the additional cgroup users didn't
3735          * show up until sometime later on.
3736          */
3737         length = cgroup_task_count(cgrp);
3738         array = pidlist_allocate(length);
3739         if (!array)
3740                 return -ENOMEM;
3741         /* now, populate the array */
3742         css_task_iter_start(&cgrp->self, &it);
3743         while ((tsk = css_task_iter_next(&it))) {
3744                 if (unlikely(n == length))
3745                         break;
3746                 /* get tgid or pid for procs or tasks file respectively */
3747                 if (type == CGROUP_FILE_PROCS)
3748                         pid = task_tgid_vnr(tsk);
3749                 else
3750                         pid = task_pid_vnr(tsk);
3751                 if (pid > 0) /* make sure to only use valid results */
3752                         array[n++] = pid;
3753         }
3754         css_task_iter_end(&it);
3755         length = n;
3756         /* now sort & (if procs) strip out duplicates */
3757         if (cgroup_sane_behavior(cgrp))
3758                 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3759         else
3760                 sort(array, length, sizeof(pid_t), cmppid, NULL);
3761         if (type == CGROUP_FILE_PROCS)
3762                 length = pidlist_uniq(array, length);
3763
3764         l = cgroup_pidlist_find_create(cgrp, type);
3765         if (!l) {
3766                 mutex_unlock(&cgrp->pidlist_mutex);
3767                 pidlist_free(array);
3768                 return -ENOMEM;
3769         }
3770
3771         /* store array, freeing old if necessary */
3772         pidlist_free(l->list);
3773         l->list = array;
3774         l->length = length;
3775         *lp = l;
3776         return 0;
3777 }
3778
3779 /**
3780  * cgroupstats_build - build and fill cgroupstats
3781  * @stats: cgroupstats to fill information into
3782  * @dentry: A dentry entry belonging to the cgroup for which stats have
3783  * been requested.
3784  *
3785  * Build and fill cgroupstats so that taskstats can export it to user
3786  * space.
3787  */
3788 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3789 {
3790         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3791         struct cgroup *cgrp;
3792         struct css_task_iter it;
3793         struct task_struct *tsk;
3794
3795         /* it should be kernfs_node belonging to cgroupfs and is a directory */
3796         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3797             kernfs_type(kn) != KERNFS_DIR)
3798                 return -EINVAL;
3799
3800         mutex_lock(&cgroup_mutex);
3801
3802         /*
3803          * We aren't being called from kernfs and there's no guarantee on
3804          * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
3805          * @kn->priv is RCU safe.  Let's do the RCU dancing.
3806          */
3807         rcu_read_lock();
3808         cgrp = rcu_dereference(kn->priv);
3809         if (!cgrp || cgroup_is_dead(cgrp)) {
3810                 rcu_read_unlock();
3811                 mutex_unlock(&cgroup_mutex);
3812                 return -ENOENT;
3813         }
3814         rcu_read_unlock();
3815
3816         css_task_iter_start(&cgrp->self, &it);
3817         while ((tsk = css_task_iter_next(&it))) {
3818                 switch (tsk->state) {
3819                 case TASK_RUNNING:
3820                         stats->nr_running++;
3821                         break;
3822                 case TASK_INTERRUPTIBLE:
3823                         stats->nr_sleeping++;
3824                         break;
3825                 case TASK_UNINTERRUPTIBLE:
3826                         stats->nr_uninterruptible++;
3827                         break;
3828                 case TASK_STOPPED:
3829                         stats->nr_stopped++;
3830                         break;
3831                 default:
3832                         if (delayacct_is_task_waiting_on_io(tsk))
3833                                 stats->nr_io_wait++;
3834                         break;
3835                 }
3836         }
3837         css_task_iter_end(&it);
3838
3839         mutex_unlock(&cgroup_mutex);
3840         return 0;
3841 }
3842
3843
3844 /*
3845  * seq_file methods for the tasks/procs files. The seq_file position is the
3846  * next pid to display; the seq_file iterator is a pointer to the pid
3847  * in the cgroup->l->list array.
3848  */
3849
3850 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3851 {
3852         /*
3853          * Initially we receive a position value that corresponds to
3854          * one more than the last pid shown (or 0 on the first call or
3855          * after a seek to the start). Use a binary-search to find the
3856          * next pid to display, if any
3857          */
3858         struct kernfs_open_file *of = s->private;
3859         struct cgroup *cgrp = seq_css(s)->cgroup;
3860         struct cgroup_pidlist *l;
3861         enum cgroup_filetype type = seq_cft(s)->private;
3862         int index = 0, pid = *pos;
3863         int *iter, ret;
3864
3865         mutex_lock(&cgrp->pidlist_mutex);
3866
3867         /*
3868          * !NULL @of->priv indicates that this isn't the first start()
3869          * after open.  If the matching pidlist is around, we can use that.
3870          * Look for it.  Note that @of->priv can't be used directly.  It
3871          * could already have been destroyed.
3872          */
3873         if (of->priv)
3874                 of->priv = cgroup_pidlist_find(cgrp, type);
3875
3876         /*
3877          * Either this is the first start() after open or the matching
3878          * pidlist has been destroyed inbetween.  Create a new one.
3879          */
3880         if (!of->priv) {
3881                 ret = pidlist_array_load(cgrp, type,
3882                                          (struct cgroup_pidlist **)&of->priv);
3883                 if (ret)
3884                         return ERR_PTR(ret);
3885         }
3886         l = of->priv;
3887
3888         if (pid) {
3889                 int end = l->length;
3890
3891                 while (index < end) {
3892                         int mid = (index + end) / 2;
3893                         if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3894                                 index = mid;
3895                                 break;
3896                         } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3897                                 index = mid + 1;
3898                         else
3899                                 end = mid;
3900                 }
3901         }
3902         /* If we're off the end of the array, we're done */
3903         if (index >= l->length)
3904                 return NULL;
3905         /* Update the abstract position to be the actual pid that we found */
3906         iter = l->list + index;
3907         *pos = cgroup_pid_fry(cgrp, *iter);
3908         return iter;
3909 }
3910
/*
 * seq_file ->stop() for the pidlist files.  If this open file has a
 * pidlist attached, (re)arm its delayed destruction to fire
 * CGROUP_PIDLIST_DESTROY_DELAY from now.  Then release the pidlist_mutex
 * that cgroup_pidlist_start() took; the unlock is unconditional.
 */
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
        struct kernfs_open_file *of = s->private;
        struct cgroup_pidlist *l = of->priv;

        if (l)
                mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
                                 CGROUP_PIDLIST_DESTROY_DELAY);
        mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
3921
3922 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3923 {
3924         struct kernfs_open_file *of = s->private;
3925         struct cgroup_pidlist *l = of->priv;
3926         pid_t *p = v;
3927         pid_t *end = l->list + l->length;
3928         /*
3929          * Advance to the next pid in the array. If this goes off the
3930          * end, we're done
3931          */
3932         p++;
3933         if (p >= end) {
3934                 return NULL;
3935         } else {
3936                 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3937                 return p;
3938         }
3939 }
3940
/*
 * seq_file ->show() for the pidlist files.  @v is the iterator handed out
 * by cgroup_pidlist_start()/cgroup_pidlist_next(), i.e. a pointer into the
 * pid array; print the value it points at in decimal followed by a
 * newline.  Returns whatever seq_printf() returns.
 */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
        return seq_printf(s, "%d\n", *(int *)v);
}
3945
/*
 * ->read_u64() handler of the "notify_on_release" file: report what
 * notify_on_release() returns for the cgroup of @css.  @cft is unused.
 */
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
                                         struct cftype *cft)
{
        return notify_on_release(css->cgroup);
}
3951
3952 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3953                                           struct cftype *cft, u64 val)
3954 {
3955         clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3956         if (val)
3957                 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3958         else
3959                 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3960         return 0;
3961 }
3962
/*
 * ->read_u64() handler of the "cgroup.clone_children" file: report whether
 * CGRP_CPUSET_CLONE_CHILDREN is set on the cgroup of @css, as returned by
 * test_bit().  @cft is unused.
 */
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
                                      struct cftype *cft)
{
        return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
3968
3969 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
3970                                        struct cftype *cft, u64 val)
3971 {
3972         if (val)
3973                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3974         else
3975                 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3976         return 0;
3977 }
3978
/*
 * Table of the cgroup core's own control files, terminated by an empty
 * entry.  Each entry names a file and wires it to its read/write/seq_file
 * handlers; ->flags restricts where the file shows up.
 */
static struct cftype cgroup_base_files[] = {
        /* pid listing through the shared pidlist seq_file methods, tgid flavor */
        {
                .name = "cgroup.procs",
                .seq_start = cgroup_pidlist_start,
                .seq_next = cgroup_pidlist_next,
                .seq_stop = cgroup_pidlist_stop,
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_PROCS,
                .write = cgroup_procs_write,
                .mode = S_IRUGO | S_IWUSR,
        },
        /* reads and writes the CGRP_CPUSET_CLONE_CHILDREN flag */
        {
                .name = "cgroup.clone_children",
                .flags = CFTYPE_INSANE,
                .read_u64 = cgroup_clone_children_read,
                .write_u64 = cgroup_clone_children_write,
        },
        {
                .name = "cgroup.sane_behavior",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_sane_behavior_show,
        },
        /*
         * "cgroup.controllers" is listed twice: the two entries carry
         * mutually exclusive root/non-root flags and different show
         * handlers.
         */
        {
                .name = "cgroup.controllers",
                .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_root_controllers_show,
        },
        {
                .name = "cgroup.controllers",
                .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_controllers_show,
        },
        {
                .name = "cgroup.subtree_control",
                .flags = CFTYPE_ONLY_ON_DFL,
                .seq_show = cgroup_subtree_control_show,
                .write = cgroup_subtree_control_write,
        },
        {
                .name = "cgroup.populated",
                .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_populated_show,
        },

        /*
         * Historical crazy stuff.  These don't have "cgroup."  prefix and
         * don't exist if sane_behavior.  If you're depending on these, be
         * prepared to be burned.
         */
        /* pid listing through the shared pidlist seq_file methods, pid flavor */
        {
                .name = "tasks",
                .flags = CFTYPE_INSANE,         /* use "procs" instead */
                .seq_start = cgroup_pidlist_start,
                .seq_next = cgroup_pidlist_next,
                .seq_stop = cgroup_pidlist_stop,
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_TASKS,
                .write = cgroup_tasks_write,
                .mode = S_IRUGO | S_IWUSR,
        },
        /* reads notify_on_release(); writes CGRP_NOTIFY_ON_RELEASE */
        {
                .name = "notify_on_release",
                .flags = CFTYPE_INSANE,
                .read_u64 = cgroup_read_notify_on_release,
                .write_u64 = cgroup_write_notify_on_release,
        },
        /* writes are limited to PATH_MAX - 1 bytes */
        {
                .name = "release_agent",
                .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_release_agent_show,
                .write = cgroup_release_agent_write,
                .max_write_len = PATH_MAX - 1,
        },
        { }     /* terminate */
};
4054
4055 /**
4056  * cgroup_populate_dir - create subsys files in a cgroup directory
4057  * @cgrp: target cgroup
4058  * @subsys_mask: mask of the subsystem ids whose files should be added
4059  *
4060  * On failure, no file is added.
4061  */
4062 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
4063 {
4064         struct cgroup_subsys *ss;
4065         int i, ret = 0;
4066
4067         /* process cftsets of each subsystem */
4068         for_each_subsys(ss, i) {
4069                 struct cftype *cfts;
4070
4071                 if (!(subsys_mask & (1 << i)))
4072                         continue;
4073
4074                 list_for_each_entry(cfts, &ss->cfts, node) {
4075                         ret = cgroup_addrm_files(cgrp, cfts, true);
4076                         if (ret < 0)
4077                                 goto err;
4078                 }
4079         }
4080         return 0;
4081 err:
4082         cgroup_clear_dir(cgrp, subsys_mask);
4083         return ret;
4084 }
4085
4086 /*
4087  * css destruction is four-stage process.
4088  *
4089  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
4090  *    Implemented in kill_css().
4091  *
4092  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4093  *    and thus css_tryget_online() is guaranteed to fail, the css can be
4094  *    offlined by invoking offline_css().  After offlining, the base ref is
4095  *    put.  Implemented in css_killed_work_fn().
4096  *
4097  * 3. When the percpu_ref reaches zero, the only possible remaining
4098  *    accessors are inside RCU read sections.  css_release() schedules the
4099  *    RCU callback.
4100  *
4101  * 4. After the grace period, the css can be freed.  Implemented in
4102  *    css_free_work_fn().
4103  *
4104  * It is actually hairier because both step 2 and 4 require process context
4105  * and thus involve punting to css->destroy_work adding two additional
4106  * steps to the already complex sequence.
4107  */
4108 static void css_free_work_fn(struct work_struct *work)
4109 {
4110         struct cgroup_subsys_state *css =
4111                 container_of(work, struct cgroup_subsys_state, destroy_work);
4112         struct cgroup *cgrp = css->cgroup;
4113
4114         if (css->parent)
4115                 css_put(css->parent);
4116
4117         css->ss->css_free(css);
4118         cgroup_put(cgrp);
4119 }
4120
4121 static void css_free_rcu_fn(struct rcu_head *rcu_head)
4122 {
4123         struct cgroup_subsys_state *css =
4124                 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4125
4126         INIT_WORK(&css->destroy_work, css_free_work_fn);
4127         queue_work(cgroup_destroy_wq, &css->destroy_work);
4128 }
4129
4130 static void css_release(struct percpu_ref *ref)
4131 {
4132         struct cgroup_subsys_state *css =
4133                 container_of(ref, struct cgroup_subsys_state, refcnt);
4134         struct cgroup_subsys *ss = css->ss;
4135
4136         cgroup_idr_remove(&ss->css_idr, css->id);
4137
4138         call_rcu(&css->rcu_head, css_free_rcu_fn);
4139 }
4140
4141 static void init_and_link_css(struct cgroup_subsys_state *css,
4142                               struct cgroup_subsys *ss, struct cgroup *cgrp)
4143 {
4144         cgroup_get(cgrp);
4145
4146         css->cgroup = cgrp;
4147         css->ss = ss;
4148         css->flags = 0;
4149
4150         if (cgrp->parent) {
4151                 css->parent = cgroup_css(cgrp->parent, ss);
4152                 css_get(css->parent);
4153         } else {
4154                 css->flags |= CSS_ROOT;
4155         }
4156
4157         BUG_ON(cgroup_css(cgrp, ss));
4158 }
4159
4160 /* invoke ->css_online() on a new CSS and mark it online if successful */
4161 static int online_css(struct cgroup_subsys_state *css)
4162 {
4163         struct cgroup_subsys *ss = css->ss;
4164         int ret = 0;
4165
4166         lockdep_assert_held(&cgroup_mutex);
4167
4168         if (ss->css_online)
4169                 ret = ss->css_online(css);
4170         if (!ret) {
4171                 css->flags |= CSS_ONLINE;
4172                 css->cgroup->nr_css++;
4173                 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4174         }
4175         return ret;
4176 }
4177
4178 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4179 static void offline_css(struct cgroup_subsys_state *css)
4180 {
4181         struct cgroup_subsys *ss = css->ss;
4182
4183         lockdep_assert_held(&cgroup_mutex);
4184
4185         if (!(css->flags & CSS_ONLINE))
4186                 return;
4187
4188         if (ss->css_offline)
4189                 ss->css_offline(css);
4190
4191         css->flags &= ~CSS_ONLINE;
4192         css->cgroup->nr_css--;
4193         RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4194
4195         wake_up_all(&css->cgroup->offline_waitq);
4196 }
4197
4198 /**
4199  * create_css - create a cgroup_subsys_state
4200  * @cgrp: the cgroup new css will be associated with
4201  * @ss: the subsys of new css
4202  *
4203  * Create a new css associated with @cgrp - @ss pair.  On success, the new
4204  * css is online and installed in @cgrp with all interface files created.
4205  * Returns 0 on success, -errno on failure.
4206  */
4207 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4208 {
4209         struct cgroup *parent = cgrp->parent;
4210         struct cgroup_subsys_state *css;
4211         int err;
4212
4213         lockdep_assert_held(&cgroup_mutex);
4214
4215         css = ss->css_alloc(cgroup_css(parent, ss));
4216         if (IS_ERR(css))
4217                 return PTR_ERR(css);
4218
4219         init_and_link_css(css, ss, cgrp);
4220
4221         err = percpu_ref_init(&css->refcnt, css_release);
4222         if (err)
4223                 goto err_free_css;
4224
4225         err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4226         if (err < 0)
4227                 goto err_free_percpu_ref;
4228         css->id = err;
4229
4230         err = cgroup_populate_dir(cgrp, 1 << ss->id);
4231         if (err)
4232                 goto err_free_id;
4233
4234         /* @css is ready to be brought online now, make it visible */
4235         cgroup_idr_replace(&ss->css_idr, css, css->id);
4236
4237         err = online_css(css);
4238         if (err)
4239                 goto err_clear_dir;
4240
4241         if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4242             parent->parent) {
4243                 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4244                         current->comm, current->pid, ss->name);
4245                 if (!strcmp(ss->name, "memory"))
4246                         pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4247                 ss->warned_broken_hierarchy = true;
4248         }
4249
4250         return 0;
4251
4252 err_clear_dir:
4253         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4254 err_free_id:
4255         cgroup_idr_remove(&ss->css_idr, css->id);
4256 err_free_percpu_ref:
4257         percpu_ref_cancel_init(&css->refcnt);
4258 err_free_css:
4259         call_rcu(&css->rcu_head, css_free_rcu_fn);
4260         return err;
4261 }
4262
4263 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4264                         umode_t mode)
4265 {
4266         struct cgroup *parent, *cgrp;
4267         struct cgroup_root *root;
4268         struct cgroup_subsys *ss;
4269         struct kernfs_node *kn;
4270         int ssid, ret;
4271
4272         parent = cgroup_kn_lock_live(parent_kn);
4273         if (!parent)
4274                 return -ENODEV;
4275         root = parent->root;
4276
4277         /* allocate the cgroup and its ID, 0 is reserved for the root */
4278         cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4279         if (!cgrp) {
4280                 ret = -ENOMEM;
4281                 goto out_unlock;
4282         }
4283
4284         /*
4285          * Temporarily set the pointer to NULL, so idr_find() won't return
4286          * a half-baked cgroup.
4287          */
4288         cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
4289         if (cgrp->id < 0) {
4290                 ret = -ENOMEM;
4291                 goto out_free_cgrp;
4292         }
4293
4294         init_cgroup_housekeeping(cgrp);
4295
4296         cgrp->parent = parent;
4297         cgrp->self.parent = &parent->self;
4298         cgrp->root = root;
4299
4300         if (notify_on_release(parent))
4301                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4302
4303         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4304                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4305
4306         /* create the directory */
4307         kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
4308         if (IS_ERR(kn)) {
4309                 ret = PTR_ERR(kn);
4310                 goto out_free_id;
4311         }
4312         cgrp->kn = kn;
4313
4314         /*
4315          * This extra ref will be put in cgroup_free_fn() and guarantees
4316          * that @cgrp->kn is always accessible.
4317          */
4318         kernfs_get(kn);
4319
4320         cgrp->serial_nr = cgroup_serial_nr_next++;
4321
4322         /* allocation complete, commit to creation */
4323         list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4324         atomic_inc(&root->nr_cgrps);
4325         cgroup_get(parent);
4326
4327         /*
4328          * @cgrp is now fully operational.  If something fails after this
4329          * point, it'll be released via the normal destruction path.
4330          */
4331         cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4332
4333         ret = cgroup_kn_set_ugid(kn);
4334         if (ret)
4335                 goto out_destroy;
4336
4337         ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4338         if (ret)
4339                 goto out_destroy;
4340
4341         /* let's create and online css's */
4342         for_each_subsys(ss, ssid) {
4343                 if (parent->child_subsys_mask & (1 << ssid)) {
4344                         ret = create_css(cgrp, ss);
4345                         if (ret)
4346                                 goto out_destroy;
4347                 }
4348         }
4349
4350         /*
4351          * On the default hierarchy, a child doesn't automatically inherit
4352          * child_subsys_mask from the parent.  Each is configured manually.
4353          */
4354         if (!cgroup_on_dfl(cgrp))
4355                 cgrp->child_subsys_mask = parent->child_subsys_mask;
4356
4357         kernfs_activate(kn);
4358
4359         ret = 0;
4360         goto out_unlock;
4361
4362 out_free_id:
4363         cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4364 out_free_cgrp:
4365         kfree(cgrp);
4366 out_unlock:
4367         cgroup_kn_unlock(parent_kn);
4368         return ret;
4369
4370 out_destroy:
4371         cgroup_destroy_locked(cgrp);
4372         goto out_unlock;
4373 }
4374
4375 /*
4376  * This is called when the refcnt of a css is confirmed to be killed.
4377  * css_tryget_online() is now guaranteed to fail.
4378  */
4379 static void css_killed_work_fn(struct work_struct *work)
4380 {
4381         struct cgroup_subsys_state *css =
4382                 container_of(work, struct cgroup_subsys_state, destroy_work);
4383         struct cgroup *cgrp = css->cgroup;
4384
4385         mutex_lock(&cgroup_mutex);
4386
4387         /*
4388          * css_tryget_online() is guaranteed to fail now.  Tell subsystems
4389          * to initate destruction.
4390          */
4391         offline_css(css);
4392
4393         /*
4394          * If @cgrp is marked dead, it's waiting for refs of all css's to
4395          * be disabled before proceeding to the second phase of cgroup
4396          * destruction.  If we are the last one, kick it off.
4397          */
4398         if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4399                 cgroup_destroy_css_killed(cgrp);
4400
4401         mutex_unlock(&cgroup_mutex);
4402
4403         /*
4404          * Put the css refs from kill_css().  Each css holds an extra
4405          * reference to the cgroup's dentry and cgroup removal proceeds
4406          * regardless of css refs.  On the last put of each css, whenever
4407          * that may be, the extra dentry ref is put so that dentry
4408          * destruction happens only after all css's are released.
4409          */
4410         css_put(css);
4411 }
4412
4413 /* css kill confirmation processing requires process context, bounce */
4414 static void css_killed_ref_fn(struct percpu_ref *ref)
4415 {
4416         struct cgroup_subsys_state *css =
4417                 container_of(ref, struct cgroup_subsys_state, refcnt);
4418
4419         INIT_WORK(&css->destroy_work, css_killed_work_fn);
4420         queue_work(cgroup_destroy_wq, &css->destroy_work);
4421 }
4422
4423 /**
4424  * kill_css - destroy a css
4425  * @css: css to destroy
4426  *
4427  * This function initiates destruction of @css by removing cgroup interface
4428  * files and putting its base reference.  ->css_offline() will be invoked
4429  * asynchronously once css_tryget_online() is guaranteed to fail and when
4430  * the reference count reaches zero, @css will be released.
4431  */
4432 static void kill_css(struct cgroup_subsys_state *css)
4433 {
4434         lockdep_assert_held(&cgroup_mutex);
4435
4436         /*
4437          * This must happen before css is disassociated with its cgroup.
4438          * See seq_css() for details.
4439          */
4440         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4441
4442         /*
4443          * Killing would put the base ref, but we need to keep it alive
4444          * until after ->css_offline().
4445          */
4446         css_get(css);
4447
4448         /*
4449          * cgroup core guarantees that, by the time ->css_offline() is
4450          * invoked, no new css reference will be given out via
4451          * css_tryget_online().  We can't simply call percpu_ref_kill() and
4452          * proceed to offlining css's because percpu_ref_kill() doesn't
4453          * guarantee that the ref is seen as killed on all CPUs on return.
4454          *
4455          * Use percpu_ref_kill_and_confirm() to get notifications as each
4456          * css is confirmed to be seen as killed on all CPUs.
4457          */
4458         percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4459 }
4460
4461 /**
4462  * cgroup_destroy_locked - the first stage of cgroup destruction
4463  * @cgrp: cgroup to be destroyed
4464  *
4465  * css's make use of percpu refcnts whose killing latency shouldn't be
4466  * exposed to userland and are RCU protected.  Also, cgroup core needs to
4467  * guarantee that css_tryget_online() won't succeed by the time
4468  * ->css_offline() is invoked.  To satisfy all the requirements,
4469  * destruction is implemented in the following two steps.
4470  *
4471  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
4472  *     userland visible parts and start killing the percpu refcnts of
4473  *     css's.  Set up so that the next stage will be kicked off once all
4474  *     the percpu refcnts are confirmed to be killed.
4475  *
4476  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4477  *     rest of destruction.  Once all cgroup references are gone, the
4478  *     cgroup is RCU-freed.
4479  *
4480  * This function implements s1.  After this step, @cgrp is gone as far as
4481  * the userland is concerned and a new cgroup with the same name may be
4482  * created.  As cgroup doesn't care about the names internally, this
4483  * doesn't cause any problem.
4484  */
4485 static int cgroup_destroy_locked(struct cgroup *cgrp)
4486         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4487 {
4488         struct cgroup_subsys_state *css;
4489         bool empty;
4490         int ssid;
4491
4492         lockdep_assert_held(&cgroup_mutex);
4493
4494         /*
4495          * css_set_rwsem synchronizes access to ->cset_links and prevents
4496          * @cgrp from being removed while put_css_set() is in progress.
4497          */
4498         down_read(&css_set_rwsem);
4499         empty = list_empty(&cgrp->cset_links);
4500         up_read(&css_set_rwsem);
4501         if (!empty)
4502                 return -EBUSY;
4503
4504         /*
4505          * Make sure there's no live children.  We can't test ->children
4506          * emptiness as dead children linger on it while being destroyed;
4507          * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4508          */
4509         if (cgroup_has_live_children(cgrp))
4510                 return -EBUSY;
4511
4512         /*
4513          * Mark @cgrp dead.  This prevents further task migration and child
4514          * creation by disabling cgroup_lock_live_group().  Note that
4515          * CGRP_DEAD assertion is depended upon by css_next_child() to
4516          * resume iteration after dropping RCU read lock.  See
4517          * css_next_child() for details.
4518          */
4519         set_bit(CGRP_DEAD, &cgrp->flags);
4520
4521         /*
4522          * Initiate massacre of all css's.  cgroup_destroy_css_killed()
4523          * will be invoked to perform the rest of destruction once the
4524          * percpu refs of all css's are confirmed to be killed.
4525          */
4526         for_each_css(css, ssid, cgrp)
4527                 kill_css(css);
4528
4529         /* CGRP_DEAD is set, remove from ->release_list for the last time */
4530         raw_spin_lock(&release_list_lock);
4531         if (!list_empty(&cgrp->release_list))
4532                 list_del_init(&cgrp->release_list);
4533         raw_spin_unlock(&release_list_lock);
4534
4535         /*
4536          * If @cgrp has css's attached, the second stage of cgroup
4537          * destruction is kicked off from css_killed_work_fn() after the
4538          * refs of all attached css's are killed.  If @cgrp doesn't have
4539          * any css, we kick it off here.
4540          */
4541         if (!cgrp->nr_css)
4542                 cgroup_destroy_css_killed(cgrp);
4543
4544         /*
4545          * Remove @cgrp directory along with the base files.  @cgrp has an
4546          * extra ref on its kn.
4547          */
4548         kernfs_remove(cgrp->kn);
4549
4550         set_bit(CGRP_RELEASABLE, &cgrp->parent->flags);
4551         check_for_release(cgrp->parent);
4552
4553         return 0;
4554 };
4555
4556 /**
4557  * cgroup_destroy_css_killed - the second step of cgroup destruction
4558  * @cgrp: the cgroup whose csses have just finished offlining
4559  *
4560  * This function is invoked from a work item for a cgroup which is being
4561  * destroyed after all css's are offlined and performs the rest of
4562  * destruction.  This is the second step of destruction described in the
4563  * comment above cgroup_destroy_locked().
4564  */
4565 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4566 {
4567         lockdep_assert_held(&cgroup_mutex);
4568
4569         cgroup_put(cgrp);
4570 }
4571
4572 static int cgroup_rmdir(struct kernfs_node *kn)
4573 {
4574         struct cgroup *cgrp;
4575         int ret = 0;
4576
4577         cgrp = cgroup_kn_lock_live(kn);
4578         if (!cgrp)
4579                 return 0;
4580         cgroup_get(cgrp);       /* for @kn->priv clearing */
4581
4582         ret = cgroup_destroy_locked(cgrp);
4583
4584         cgroup_kn_unlock(kn);
4585
4586         /*
4587          * There are two control paths which try to determine cgroup from
4588          * dentry without going through kernfs - cgroupstats_build() and
4589          * css_tryget_online_from_dir().  Those are supported by RCU
4590          * protecting clearing of cgrp->kn->priv backpointer, which should
4591          * happen after all files under it have been removed.
4592          */
4593         if (!ret)
4594                 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4595
4596         cgroup_put(cgrp);
4597         return ret;
4598 }
4599
4600 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4601         .remount_fs             = cgroup_remount,
4602         .show_options           = cgroup_show_options,
4603         .mkdir                  = cgroup_mkdir,
4604         .rmdir                  = cgroup_rmdir,
4605         .rename                 = cgroup_rename,
4606 };
4607
4608 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4609 {
4610         struct cgroup_subsys_state *css;
4611
4612         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4613
4614         mutex_lock(&cgroup_mutex);
4615
4616         idr_init(&ss->css_idr);
4617         INIT_LIST_HEAD(&ss->cfts);
4618
4619         /* Create the root cgroup state for this subsystem */
4620         ss->root = &cgrp_dfl_root;
4621         css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4622         /* We don't handle early failures gracefully */
4623         BUG_ON(IS_ERR(css));
4624         init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4625         if (early) {
4626                 /* idr_alloc() can't be called safely during early init */
4627                 css->id = 1;
4628         } else {
4629                 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4630                 BUG_ON(css->id < 0);
4631         }
4632
4633         /* Update the init_css_set to contain a subsys
4634          * pointer to this state - since the subsystem is
4635          * newly registered, all tasks and hence the
4636          * init_css_set is in the subsystem's root cgroup. */
4637         init_css_set.subsys[ss->id] = css;
4638
4639         need_forkexit_callback |= ss->fork || ss->exit;
4640
4641         /* At system boot, before all subsystems have been
4642          * registered, no tasks have been forked, so we don't
4643          * need to invoke fork callbacks here. */
4644         BUG_ON(!list_empty(&init_task.tasks));
4645
4646         BUG_ON(online_css(css));
4647
4648         cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4649
4650         mutex_unlock(&cgroup_mutex);
4651 }
4652
4653 /**
4654  * cgroup_init_early - cgroup initialization at system boot
4655  *
4656  * Initialize cgroups at system boot, and initialize any
4657  * subsystems that request early init.
4658  */
4659 int __init cgroup_init_early(void)
4660 {
4661         static struct cgroup_sb_opts __initdata opts =
4662                 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4663         struct cgroup_subsys *ss;
4664         int i;
4665
4666         init_cgroup_root(&cgrp_dfl_root, &opts);
4667         RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4668
4669         for_each_subsys(ss, i) {
4670                 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4671                      "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4672                      i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4673                      ss->id, ss->name);
4674                 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4675                      "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4676
4677                 ss->id = i;
4678                 ss->name = cgroup_subsys_name[i];
4679
4680                 if (ss->early_init)
4681                         cgroup_init_subsys(ss, true);
4682         }
4683         return 0;
4684 }
4685
4686 /**
4687  * cgroup_init - cgroup initialization
4688  *
4689  * Register cgroup filesystem and /proc file, and initialize
4690  * any subsystems that didn't request early init.
4691  */
4692 int __init cgroup_init(void)
4693 {
4694         struct cgroup_subsys *ss;
4695         unsigned long key;
4696         int ssid, err;
4697
4698         BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4699
4700         mutex_lock(&cgroup_mutex);
4701
4702         /* Add init_css_set to the hash table */
4703         key = css_set_hash(init_css_set.subsys);
4704         hash_add(css_set_table, &init_css_set.hlist, key);
4705
4706         BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4707
4708         mutex_unlock(&cgroup_mutex);
4709
4710         for_each_subsys(ss, ssid) {
4711                 if (ss->early_init) {
4712                         struct cgroup_subsys_state *css =
4713                                 init_css_set.subsys[ss->id];
4714
4715                         css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4716                                                    GFP_KERNEL);
4717                         BUG_ON(css->id < 0);
4718                 } else {
4719                         cgroup_init_subsys(ss, false);
4720                 }
4721
4722                 list_add_tail(&init_css_set.e_cset_node[ssid],
4723                               &cgrp_dfl_root.cgrp.e_csets[ssid]);
4724
4725                 /*
4726                  * cftype registration needs kmalloc and can't be done
4727                  * during early_init.  Register base cftypes separately.
4728                  */
4729                 if (ss->base_cftypes)
4730                         WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4731         }
4732
4733         cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4734         if (!cgroup_kobj)
4735                 return -ENOMEM;
4736
4737         err = register_filesystem(&cgroup_fs_type);
4738         if (err < 0) {
4739                 kobject_put(cgroup_kobj);
4740                 return err;
4741         }
4742
4743         proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4744         return 0;
4745 }
4746
4747 static int __init cgroup_wq_init(void)
4748 {
4749         /*
4750          * There isn't much point in executing destruction path in
4751          * parallel.  Good chunk is serialized with cgroup_mutex anyway.
4752          * Use 1 for @max_active.
4753          *
4754          * We would prefer to do this in cgroup_init() above, but that
4755          * is called before init_workqueues(): so leave this until after.
4756          */
4757         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4758         BUG_ON(!cgroup_destroy_wq);
4759
4760         /*
4761          * Used to destroy pidlists and separate to serve as flush domain.
4762          * Cap @max_active to 1 too.
4763          */
4764         cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4765                                                     0, 1);
4766         BUG_ON(!cgroup_pidlist_destroy_wq);
4767
4768         return 0;
4769 }
4770 core_initcall(cgroup_wq_init);
4771
4772 /*
4773  * proc_cgroup_show()
4774  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
4775  *  - Used for /proc/<pid>/cgroup.
4776  */
4777
4778 /* TODO: Use a proper seq_file iterator */
4779 int proc_cgroup_show(struct seq_file *m, void *v)
4780 {
4781         struct pid *pid;
4782         struct task_struct *tsk;
4783         char *buf, *path;
4784         int retval;
4785         struct cgroup_root *root;
4786
4787         retval = -ENOMEM;
4788         buf = kmalloc(PATH_MAX, GFP_KERNEL);
4789         if (!buf)
4790                 goto out;
4791
4792         retval = -ESRCH;
4793         pid = m->private;
4794         tsk = get_pid_task(pid, PIDTYPE_PID);
4795         if (!tsk)
4796                 goto out_free;
4797
4798         retval = 0;
4799
4800         mutex_lock(&cgroup_mutex);
4801         down_read(&css_set_rwsem);
4802
4803         for_each_root(root) {
4804                 struct cgroup_subsys *ss;
4805                 struct cgroup *cgrp;
4806                 int ssid, count = 0;
4807
4808                 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4809                         continue;
4810
4811                 seq_printf(m, "%d:", root->hierarchy_id);
4812                 for_each_subsys(ss, ssid)
4813                         if (root->subsys_mask & (1 << ssid))
4814                                 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4815                 if (strlen(root->name))
4816                         seq_printf(m, "%sname=%s", count ? "," : "",
4817                                    root->name);
4818                 seq_putc(m, ':');
4819                 cgrp = task_cgroup_from_root(tsk, root);
4820                 path = cgroup_path(cgrp, buf, PATH_MAX);
4821                 if (!path) {
4822                         retval = -ENAMETOOLONG;
4823                         goto out_unlock;
4824                 }
4825                 seq_puts(m, path);
4826                 seq_putc(m, '\n');
4827         }
4828
4829 out_unlock:
4830         up_read(&css_set_rwsem);
4831         mutex_unlock(&cgroup_mutex);
4832         put_task_struct(tsk);
4833 out_free:
4834         kfree(buf);
4835 out:
4836         return retval;
4837 }
4838
4839 /* Display information about each subsystem and each hierarchy */
4840 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4841 {
4842         struct cgroup_subsys *ss;
4843         int i;
4844
4845         seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4846         /*
4847          * ideally we don't want subsystems moving around while we do this.
4848          * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4849          * subsys/hierarchy state.
4850          */
4851         mutex_lock(&cgroup_mutex);
4852
4853         for_each_subsys(ss, i)
4854                 seq_printf(m, "%s\t%d\t%d\t%d\n",
4855                            ss->name, ss->root->hierarchy_id,
4856                            atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4857
4858         mutex_unlock(&cgroup_mutex);
4859         return 0;
4860 }
4861
4862 static int cgroupstats_open(struct inode *inode, struct file *file)
4863 {
4864         return single_open(file, proc_cgroupstats_show, NULL);
4865 }
4866
4867 static const struct file_operations proc_cgroupstats_operations = {
4868         .open = cgroupstats_open,
4869         .read = seq_read,
4870         .llseek = seq_lseek,
4871         .release = single_release,
4872 };
4873
4874 /**
4875  * cgroup_fork - initialize cgroup related fields during copy_process()
4876  * @child: pointer to task_struct of forking parent process.
4877  *
4878  * A task is associated with the init_css_set until cgroup_post_fork()
4879  * attaches it to the parent's css_set.  Empty cg_list indicates that
4880  * @child isn't holding reference to its css_set.
4881  */
4882 void cgroup_fork(struct task_struct *child)
4883 {
4884         RCU_INIT_POINTER(child->cgroups, &init_css_set);
4885         INIT_LIST_HEAD(&child->cg_list);
4886 }
4887
4888 /**
4889  * cgroup_post_fork - called on a new task after adding it to the task list
4890  * @child: the task in question
4891  *
4892  * Adds the task to the list running through its css_set if necessary and
4893  * call the subsystem fork() callbacks.  Has to be after the task is
4894  * visible on the task list in case we race with the first call to
4895  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
4896  * list.
4897  */
4898 void cgroup_post_fork(struct task_struct *child)
4899 {
4900         struct cgroup_subsys *ss;
4901         int i;
4902
4903         /*
4904          * This may race against cgroup_enable_task_cg_links().  As that
4905          * function sets use_task_css_set_links before grabbing
4906          * tasklist_lock and we just went through tasklist_lock to add
4907          * @child, it's guaranteed that either we see the set
4908          * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
4909          * @child during its iteration.
4910          *
4911          * If we won the race, @child is associated with %current's
4912          * css_set.  Grabbing css_set_rwsem guarantees both that the
4913          * association is stable, and, on completion of the parent's
4914          * migration, @child is visible in the source of migration or
4915          * already in the destination cgroup.  This guarantee is necessary
4916          * when implementing operations which need to migrate all tasks of
4917          * a cgroup to another.
4918          *
4919          * Note that if we lose to cgroup_enable_task_cg_links(), @child
4920          * will remain in init_css_set.  This is safe because all tasks are
4921          * in the init_css_set before cg_links is enabled and there's no
4922          * operation which transfers all tasks out of init_css_set.
4923          */
4924         if (use_task_css_set_links) {
4925                 struct css_set *cset;
4926
4927                 down_write(&css_set_rwsem);
4928                 cset = task_css_set(current);
4929                 if (list_empty(&child->cg_list)) {
4930                         rcu_assign_pointer(child->cgroups, cset);
4931                         list_add(&child->cg_list, &cset->tasks);
4932                         get_css_set(cset);
4933                 }
4934                 up_write(&css_set_rwsem);
4935         }
4936
4937         /*
4938          * Call ss->fork().  This must happen after @child is linked on
4939          * css_set; otherwise, @child might change state between ->fork()
4940          * and addition to css_set.
4941          */
4942         if (need_forkexit_callback) {
4943                 for_each_subsys(ss, i)
4944                         if (ss->fork)
4945                                 ss->fork(child);
4946         }
4947 }
4948
4949 /**
4950  * cgroup_exit - detach cgroup from exiting task
4951  * @tsk: pointer to task_struct of exiting process
4952  *
4953  * Description: Detach cgroup from @tsk and release it.
4954  *
4955  * Note that cgroups marked notify_on_release force every task in
4956  * them to take the global cgroup_mutex mutex when exiting.
4957  * This could impact scaling on very large systems.  Be reluctant to
4958  * use notify_on_release cgroups where very high task exit scaling
4959  * is required on large systems.
4960  *
4961  * We set the exiting tasks cgroup to the root cgroup (top_cgroup).  We
4962  * call cgroup_exit() while the task is still competent to handle
4963  * notify_on_release(), then leave the task attached to the root cgroup in
4964  * each hierarchy for the remainder of its exit.  No need to bother with
4965  * init_css_set refcnting.  init_css_set never goes away and we can't race
4966  * with migration path - PF_EXITING is visible to migration path.
4967  */
4968 void cgroup_exit(struct task_struct *tsk)
4969 {
4970         struct cgroup_subsys *ss;
4971         struct css_set *cset;
4972         bool put_cset = false;
4973         int i;
4974
4975         /*
4976          * Unlink from @tsk from its css_set.  As migration path can't race
4977          * with us, we can check cg_list without grabbing css_set_rwsem.
4978          */
4979         if (!list_empty(&tsk->cg_list)) {
4980                 down_write(&css_set_rwsem);
4981                 list_del_init(&tsk->cg_list);
4982                 up_write(&css_set_rwsem);
4983                 put_cset = true;
4984         }
4985
4986         /* Reassign the task to the init_css_set. */
4987         cset = task_css_set(tsk);
4988         RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4989
4990         if (need_forkexit_callback) {
4991                 /* see cgroup_post_fork() for details */
4992                 for_each_subsys(ss, i) {
4993                         if (ss->exit) {
4994                                 struct cgroup_subsys_state *old_css = cset->subsys[i];
4995                                 struct cgroup_subsys_state *css = task_css(tsk, i);
4996
4997                                 ss->exit(css, old_css, tsk);
4998                         }
4999                 }
5000         }
5001
5002         if (put_cset)
5003                 put_css_set(cset, true);
5004 }
5005
5006 static void check_for_release(struct cgroup *cgrp)
5007 {
5008         if (cgroup_is_releasable(cgrp) &&
5009             list_empty(&cgrp->cset_links) && !cgroup_has_live_children(cgrp)) {
5010                 /*
5011                  * Control Group is currently removeable. If it's not
5012                  * already queued for a userspace notification, queue
5013                  * it now
5014                  */
5015                 int need_schedule_work = 0;
5016
5017                 raw_spin_lock(&release_list_lock);
5018                 if (!cgroup_is_dead(cgrp) &&
5019                     list_empty(&cgrp->release_list)) {
5020                         list_add(&cgrp->release_list, &release_list);
5021                         need_schedule_work = 1;
5022                 }
5023                 raw_spin_unlock(&release_list_lock);
5024                 if (need_schedule_work)
5025                         schedule_work(&release_agent_work);
5026         }
5027 }
5028
5029 /*
5030  * Notify userspace when a cgroup is released, by running the
5031  * configured release agent with the name of the cgroup (path
5032  * relative to the root of cgroup file system) as the argument.
5033  *
5034  * Most likely, this user command will try to rmdir this cgroup.
5035  *
5036  * This races with the possibility that some other task will be
5037  * attached to this cgroup before it is removed, or that some other
5038  * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
5039  * The presumed 'rmdir' will fail quietly if this cgroup is no longer
5040  * unused, and this cgroup will be reprieved from its death sentence,
5041  * to continue to serve a useful existence.  Next time it's released,
5042  * we will get notified again, if it still has 'notify_on_release' set.
5043  *
5044  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
5045  * means only wait until the task is successfully execve()'d.  The
5046  * separate release agent task is forked by call_usermodehelper(),
5047  * then control in this thread returns here, without waiting for the
5048  * release agent task.  We don't bother to wait because the caller of
5049  * this routine has no use for the exit status of the release agent
5050  * task, so no sense holding our caller up for that.
5051  */
5052 static void cgroup_release_agent(struct work_struct *work)
5053 {
5054         BUG_ON(work != &release_agent_work);
5055         mutex_lock(&cgroup_mutex);
5056         raw_spin_lock(&release_list_lock);
5057         while (!list_empty(&release_list)) {
5058                 char *argv[3], *envp[3];
5059                 int i;
5060                 char *pathbuf = NULL, *agentbuf = NULL, *path;
5061                 struct cgroup *cgrp = list_entry(release_list.next,
5062                                                     struct cgroup,
5063                                                     release_list);
5064                 list_del_init(&cgrp->release_list);
5065                 raw_spin_unlock(&release_list_lock);
5066                 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5067                 if (!pathbuf)
5068                         goto continue_free;
5069                 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5070                 if (!path)
5071                         goto continue_free;
5072                 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5073                 if (!agentbuf)
5074                         goto continue_free;
5075
5076                 i = 0;
5077                 argv[i++] = agentbuf;
5078                 argv[i++] = path;
5079                 argv[i] = NULL;
5080
5081                 i = 0;
5082                 /* minimal command environment */
5083                 envp[i++] = "HOME=/";
5084                 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5085                 envp[i] = NULL;
5086
5087                 /* Drop the lock while we invoke the usermode helper,
5088                  * since the exec could involve hitting disk and hence
5089                  * be a slow process */
5090                 mutex_unlock(&cgroup_mutex);
5091                 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5092                 mutex_lock(&cgroup_mutex);
5093  continue_free:
5094                 kfree(pathbuf);
5095                 kfree(agentbuf);
5096                 raw_spin_lock(&release_list_lock);
5097         }
5098         raw_spin_unlock(&release_list_lock);
5099         mutex_unlock(&cgroup_mutex);
5100 }
5101
5102 static int __init cgroup_disable(char *str)
5103 {
5104         struct cgroup_subsys *ss;
5105         char *token;
5106         int i;
5107
5108         while ((token = strsep(&str, ",")) != NULL) {
5109                 if (!*token)
5110                         continue;
5111
5112                 for_each_subsys(ss, i) {
5113                         if (!strcmp(token, ss->name)) {
5114                                 ss->disabled = 1;
5115                                 printk(KERN_INFO "Disabling %s control group"
5116                                         " subsystem\n", ss->name);
5117                                 break;
5118                         }
5119                 }
5120         }
5121         return 1;
5122 }
5123 __setup("cgroup_disable=", cgroup_disable);
5124
5125 /**
5126  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5127  * @dentry: directory dentry of interest
5128  * @ss: subsystem of interest
5129  *
5130  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5131  * to get the corresponding css and return it.  If such css doesn't exist
5132  * or can't be pinned, an ERR_PTR value is returned.
5133  */
5134 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
5135                                                        struct cgroup_subsys *ss)
5136 {
5137         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
5138         struct cgroup_subsys_state *css = NULL;
5139         struct cgroup *cgrp;
5140
5141         /* is @dentry a cgroup dir? */
5142         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
5143             kernfs_type(kn) != KERNFS_DIR)
5144                 return ERR_PTR(-EBADF);
5145
5146         rcu_read_lock();
5147
5148         /*
5149          * This path doesn't originate from kernfs and @kn could already
5150          * have been or be removed at any point.  @kn->priv is RCU
5151          * protected for this access.  See cgroup_rmdir() for details.
5152          */
5153         cgrp = rcu_dereference(kn->priv);
5154         if (cgrp)
5155                 css = cgroup_css(cgrp, ss);
5156
5157         if (!css || !css_tryget_online(css))
5158                 css = ERR_PTR(-ENOENT);
5159
5160         rcu_read_unlock();
5161         return css;
5162 }
5163
5164 /**
5165  * css_from_id - lookup css by id
5166  * @id: the cgroup id
5167  * @ss: cgroup subsys to be looked into
5168  *
5169  * Returns the css if there's valid one with @id, otherwise returns NULL.
5170  * Should be called under rcu_read_lock().
5171  */
5172 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5173 {
5174         WARN_ON_ONCE(!rcu_read_lock_held());
5175         return idr_find(&ss->css_idr, id);
5176 }
5177
5178 #ifdef CONFIG_CGROUP_DEBUG
5179 static struct cgroup_subsys_state *
5180 debug_css_alloc(struct cgroup_subsys_state *parent_css)
5181 {
5182         struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5183
5184         if (!css)
5185                 return ERR_PTR(-ENOMEM);
5186
5187         return css;
5188 }
5189
/* css_free callback of the debug subsystem: release @css with kfree(). */
static void debug_css_free(struct cgroup_subsys_state *css)
{
        kfree(css);
}
5194
5195 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5196                                 struct cftype *cft)
5197 {
5198         return cgroup_task_count(css->cgroup);
5199 }
5200
5201 static u64 current_css_set_read(struct cgroup_subsys_state *css,
5202                                 struct cftype *cft)
5203 {
5204         return (u64)(unsigned long)current->cgroups;
5205 }
5206
5207 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5208                                          struct cftype *cft)
5209 {
5210         u64 count;
5211
5212         rcu_read_lock();
5213         count = atomic_read(&task_css_set(current)->refcount);
5214         rcu_read_unlock();
5215         return count;
5216 }
5217
5218 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5219 {
5220         struct cgrp_cset_link *link;
5221         struct css_set *cset;
5222         char *name_buf;
5223
5224         name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
5225         if (!name_buf)
5226                 return -ENOMEM;
5227
5228         down_read(&css_set_rwsem);
5229         rcu_read_lock();
5230         cset = rcu_dereference(current->cgroups);
5231         list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5232                 struct cgroup *c = link->cgrp;
5233
5234                 cgroup_name(c, name_buf, NAME_MAX + 1);
5235                 seq_printf(seq, "Root %d group %s\n",
5236                            c->root->hierarchy_id, name_buf);
5237         }
5238         rcu_read_unlock();
5239         up_read(&css_set_rwsem);
5240         kfree(name_buf);
5241         return 0;
5242 }
5243
5244 #define MAX_TASKS_SHOWN_PER_CSS 25
5245 static int cgroup_css_links_read(struct seq_file *seq, void *v)
5246 {
5247         struct cgroup_subsys_state *css = seq_css(seq);
5248         struct cgrp_cset_link *link;
5249
5250         down_read(&css_set_rwsem);
5251         list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5252                 struct css_set *cset = link->cset;
5253                 struct task_struct *task;
5254                 int count = 0;
5255
5256                 seq_printf(seq, "css_set %p\n", cset);
5257
5258                 list_for_each_entry(task, &cset->tasks, cg_list) {
5259                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5260                                 goto overflow;
5261                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
5262                 }
5263
5264                 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
5265                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5266                                 goto overflow;
5267                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
5268                 }
5269                 continue;
5270         overflow:
5271                 seq_puts(seq, "  ...\n");
5272         }
5273         up_read(&css_set_rwsem);
5274         return 0;
5275 }
5276
5277 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5278 {
5279         return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5280 }
5281
5282 static struct cftype debug_files[] =  {
5283         {
5284                 .name = "taskcount",
5285                 .read_u64 = debug_taskcount_read,
5286         },
5287
5288         {
5289                 .name = "current_css_set",
5290                 .read_u64 = current_css_set_read,
5291         },
5292
5293         {
5294                 .name = "current_css_set_refcount",
5295                 .read_u64 = current_css_set_refcount_read,
5296         },
5297
5298         {
5299                 .name = "current_css_set_cg_links",
5300                 .seq_show = current_css_set_cg_links_read,
5301         },
5302
5303         {
5304                 .name = "cgroup_css_links",
5305                 .seq_show = cgroup_css_links_read,
5306         },
5307
5308         {
5309                 .name = "releasable",
5310                 .read_u64 = releasable_read,
5311         },
5312
5313         { }     /* terminate */
5314 };
5315
5316 struct cgroup_subsys debug_cgrp_subsys = {
5317         .css_alloc = debug_css_alloc,
5318         .css_free = debug_css_free,
5319         .base_cftypes = debug_files,
5320 };
5321 #endif /* CONFIG_CGROUP_DEBUG */