kernel/cgroup.c

   1 /*
   2  *  Generic process-grouping system.
   3  *
   4  *  Based originally on the cpuset system, extracted by Paul Menage
   5  *  Copyright (C) 2006 Google, Inc
   6  *
   7  *  Notifications support
   8  *  Copyright (C) 2009 Nokia Corporation
   9  *  Author: Kirill A. Shutemov
  10  *
  11  *  Copyright notices from the original cpuset code:
  12  *  --------------------------------------------------
  13  *  Copyright (C) 2003 BULL SA.
  14  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  15  *
  16  *  Portions derived from Patrick Mochel's sysfs code.
  17  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  18  *
  19  *  2003-10-10 Written by Simon Derr.
  20  *  2003-10-22 Updates by Stephen Hemminger.
  21  *  2004 May-July Rework by Paul Jackson.
  22  *  ---------------------------------------------------
  23  *
  24  *  This file is subject to the terms and conditions of the GNU General Public
  25  *  License.  See the file COPYING in the main directory of the Linux
  26  *  distribution for more details.
  27  */
  28
/*
 * Message prefix for this file's pr_*() output: the build's module name
 * (KBUILD_MODNAME) followed by ": ".
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  30
  31 #include <linux/cgroup.h>
  32 #include <linux/cred.h>
  33 #include <linux/ctype.h>
  34 #include <linux/errno.h>
  35 #include <linux/init_task.h>
  36 #include <linux/kernel.h>
  37 #include <linux/list.h>
  38 #include <linux/mm.h>
  39 #include <linux/mutex.h>
  40 #include <linux/mount.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/proc_fs.h>
  43 #include <linux/rcupdate.h>
  44 #include <linux/sched.h>
  45 #include <linux/slab.h>
  46 #include <linux/spinlock.h>
  47 #include <linux/rwsem.h>
  48 #include <linux/string.h>
  49 #include <linux/sort.h>
  50 #include <linux/kmod.h>
  51 #include <linux/delayacct.h>
  52 #include <linux/cgroupstats.h>
  53 #include <linux/hashtable.h>
  54 #include <linux/pid_namespace.h>
  55 #include <linux/idr.h>
  56 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
  57 #include <linux/kthread.h>
  58 #include <linux/delay.h>
  59
  60 #include <linux/atomic.h>
  61
/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls.
 * Expiring in the middle is a performance problem, not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY    HZ

/*
 * Upper bound on the length of a cgroup file name: the two name-length
 * limits plus 2.
 * NOTE(review): the extra 2 presumably covers a separator character and
 * the terminating NUL - confirm against the code that builds the names.
 */
#define CGROUP_FILE_NAME_MAX            (MAX_CGROUP_TYPE_NAMELEN +      \
                                         MAX_CFTYPE_NAME + 2)
  72
/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_rwsem protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */
#ifdef CONFIG_PROVE_RCU
/* global and GPL-exported so that code outside this file can name them */
DEFINE_MUTEX(cgroup_mutex);
DECLARE_RWSEM(css_set_rwsem);
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_rwsem);
#else
/* without CONFIG_PROVE_RCU both locks stay private to this file */
static DEFINE_MUTEX(cgroup_mutex);
static DECLARE_RWSEM(css_set_rwsem);
#endif
  92
/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.  The cgroup_idr_*() wrappers in this file take
 * it with spin_lock_bh().
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);
 104
 105 #define cgroup_assert_mutex_or_rcu_locked()                             \
 106         rcu_lockdep_assert(rcu_read_lock_held() ||                      \
 107                            lockdep_is_held(&cgroup_mutex),              \
 108                            "cgroup_mutex or RCU read lock required");
 109
/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq,
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as the flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 123
/*
 * Generate an array of cgroup subsystem pointers.  SUBSYS() is redefined
 * before each inclusion of <linux/cgroup_subsys.h>; here every use places
 * &<name>_cgrp_subsys at index <name>_cgrp_id.
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * Array of cgroup subsystem names, built the same way: every SUBSYS() use
 * places the stringified <name> at index <name>_cgrp_id.
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
 137
/*
 * The default hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
struct cgroup_root cgrp_dfl_root;

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.  This flag records
 * whether it is currently visible.
 */
static bool cgrp_dfl_root_visible;
 150
/* The list of hierarchy roots; entries are chained through ->root_list */

static LIST_HEAD(cgroup_roots);
/* NOTE(review): presumably the number of entries on cgroup_roots - confirm */
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
 158
/*
 * Assign a monotonically increasing serial number to cgroups.  It
 * guarantees cgroups with bigger numbers are newer than those with smaller
 * numbers.  Also, as cgroups are always appended to the parent's
 * ->children list, it guarantees that sibling cgroups are always sorted in
 * the ascending serial number order on the list.  Protected by
 * cgroup_mutex.  Numbering starts at 1.
 */
static u64 cgroup_serial_nr_next = 1;
 168
/*
 * This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call.  This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
static int need_forkexit_callback __read_mostly;
 175
/*
 * Forward declarations.  The definitions are expected further down in this
 * file; they are declared here because code above them refers to them.
 */
static struct cftype cgroup_base_files[];

static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned int ss_mask);
static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 188
 189 /* IDR wrappers which synchronize using cgroup_idr_lock */
 190 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 191                             gfp_t gfp_mask)
 192 {
 193         int ret;
 194
 195         idr_preload(gfp_mask);
 196         spin_lock_bh(&cgroup_idr_lock);
 197         ret = idr_alloc(idr, ptr, start, end, gfp_mask);
 198         spin_unlock_bh(&cgroup_idr_lock);
 199         idr_preload_end();
 200         return ret;
 201 }
 202
 203 static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
 204 {
 205         void *ret;
 206
 207         spin_lock_bh(&cgroup_idr_lock);
 208         ret = idr_replace(idr, ptr, id);
 209         spin_unlock_bh(&cgroup_idr_lock);
 210         return ret;
 211 }
 212
 213 static void cgroup_idr_remove(struct idr *idr, int id)
 214 {
 215         spin_lock_bh(&cgroup_idr_lock);
 216         idr_remove(idr, id);
 217         spin_unlock_bh(&cgroup_idr_lock);
 218 }
 219
 220 /**
 221  * cgroup_css - obtain a cgroup's css for the specified subsystem
 222  * @cgrp: the cgroup of interest
 223  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 224  *
 225  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 226  * function must be called either under cgroup_mutex or rcu_read_lock() and
 227  * the caller is responsible for pinning the returned css if it wants to
 228  * keep accessing it outside the said locks.  This function may return
 229  * %NULL if @cgrp doesn't have @subsys_id enabled.
 230  */
 231 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 232                                               struct cgroup_subsys *ss)
 233 {
 234         if (ss)
 235                 return rcu_dereference_check(cgrp->subsys[ss->id],
 236                                         lockdep_is_held(&cgroup_mutex));
 237         else
 238                 return &cgrp->self;
 239 }
 240
 241 /**
 242  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 243  * @cgrp: the cgroup of interest
 244  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 245  *
 246  * Similar to cgroup_css() but returns the effctive css, which is defined
 247  * as the matching css of the nearest ancestor including self which has @ss
 248  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 249  * function is guaranteed to return non-NULL css.
 250  */
 251 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 252                                                 struct cgroup_subsys *ss)
 253 {
 254         lockdep_assert_held(&cgroup_mutex);
 255
 256         if (!ss)
 257                 return &cgrp->self;
 258
 259         if (!(cgrp->root->subsys_mask & (1 << ss->id)))
 260                 return NULL;
 261
 262         while (cgrp->parent &&
 263                !(cgrp->parent->child_subsys_mask & (1 << ss->id)))
 264                 cgrp = cgrp->parent;
 265
 266         return cgroup_css(cgrp, ss);
 267 }
 268
 269 /* convenient tests for these bits */
 270 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 271 {
 272         return test_bit(CGRP_DEAD, &cgrp->flags);
 273 }
 274
 275 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 276 {
 277         struct cgroup *cgrp = of->kn->parent->priv;
 278         struct cftype *cft = of_cft(of);
 279
 280         /*
 281          * This is open and unprotected implementation of cgroup_css().
 282          * seq_css() is only called from a kernfs file operation which has
 283          * an active reference on the file.  Because all the subsystem
 284          * files are drained before a css is disassociated with a cgroup,
 285          * the matching css from the cgroup's subsys table is guaranteed to
 286          * be and stay valid until the enclosing operation is complete.
 287          */
 288         if (cft->ss)
 289                 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 290         else
 291                 return &cgrp->self;
 292 }
 293 EXPORT_SYMBOL_GPL(of_css);
 294
 295 /**
 296  * cgroup_is_descendant - test ancestry
 297  * @cgrp: the cgroup to be tested
 298  * @ancestor: possible ancestor of @cgrp
 299  *
 300  * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
 301  * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
 302  * and @ancestor are accessible.
 303  */
 304 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 305 {
 306         while (cgrp) {
 307                 if (cgrp == ancestor)
 308                         return true;
 309                 cgrp = cgrp->parent;
 310         }
 311         return false;
 312 }
 313
 314 static int cgroup_is_releasable(const struct cgroup *cgrp)
 315 {
 316         const int bits =
 317                 (1 << CGRP_RELEASABLE) |
 318                 (1 << CGRP_NOTIFY_ON_RELEASE);
 319         return (cgrp->flags & bits) == bits;
 320 }
 321
 322 static int notify_on_release(const struct cgroup *cgrp)
 323 {
 324         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 325 }
 326
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex, which is the lockdep condition
 * passed to rcu_dereference_check() below.
 *
 * Subsystem slots whose css pointer is NULL are skipped: the empty "if"
 * branch swallows them and the caller's loop body binds to the trailing
 * "else".
 */
#define for_each_css(css, ssid, cgrp)                                   \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
                if (!((css) = rcu_dereference_check(                    \
                                (cgrp)->subsys[(ssid)],                 \
                                lockdep_is_held(&cgroup_mutex)))) { }   \
                else
 341
/**
 * for_each_e_css - iterate all effective css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Must be called under cgroup_mutex; cgroup_e_css(), which supplies each
 * css, asserts that it is held.
 *
 * Subsystems for which cgroup_e_css() returns NULL are skipped: the empty
 * "if" branch swallows them and the caller's loop body binds to the
 * trailing "else".
 */
#define for_each_e_css(css, ssid, cgrp)                                 \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
                if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
                        ;                                               \
                else
 355
/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * @ss is loaded from cgroup_subsys[@ssid] as part of the loop condition.
 * The "|| true" keeps that assignment from ending the loop should an
 * array slot hold NULL, so the body runs for every index below
 * CGROUP_SUBSYS_COUNT.
 */
#define for_each_subsys(ss, ssid)                                       \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&                \
             (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 364
/*
 * Iterate across the hierarchies: @root visits every entry on
 * cgroup_roots, linked through ->root_list.
 */
#define for_each_root(root)                                             \
        list_for_each_entry((root), &cgroup_roots, root_list)
 368
/*
 * Iterate over child cgrps of @cgrp; the lock should be held throughout
 * the iteration.  Each step asserts via lockdep that cgroup_mutex is held
 * and skips children for which cgroup_is_dead() is true; the caller's
 * loop body binds to the trailing "else".
 */
#define cgroup_for_each_live_child(child, cgrp)                         \
        list_for_each_entry((child), &(cgrp)->children, sibling)        \
                if (({ lockdep_assert_held(&cgroup_mutex);              \
                       cgroup_is_dead(child); }))                       \
                        ;                                               \
                else
 376
/*
 * The list of cgroups eligible for automatic release.  Protected by
 * release_list_lock.
 */
static LIST_HEAD(release_list);
static DEFINE_RAW_SPINLOCK(release_list_lock);
/* work item whose handler is cgroup_release_agent() */
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
/* forward declaration; called from put_css_set_locked() in this file */
static void check_for_release(struct cgroup *cgrp);
 384
/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 *
 * Each link sits on two lists at once: its cgroup's ->cset_links (via
 * ->cset_link) and its css_set's ->cgrp_links (via ->cgrp_link).
 */
struct cgrp_cset_link {
        /* the cgroup and css_set this link associates */
        struct cgroup           *cgrp;
        struct css_set          *cset;

        /* list of cgrp_cset_links anchored at cgrp->cset_links */
        struct list_head        cset_link;

        /* list of cgrp_cset_links anchored at css_set->cgrp_links */
        struct list_head        cgrp_link;
};
 404
/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 *
 * NOTE(review): the initializer below still seeds ->refcount with 1, so
 * the "not reference-counted" remark may be stale - confirm.
 */
struct css_set init_css_set = {
        .refcount               = ATOMIC_INIT(1),
        /* every embedded list head starts out empty (pointing at itself) */
        .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
};

/* number of css_sets; bumped in find_css_set(), dropped in put_css_set_locked() */
static int css_set_count        = 1;    /* 1 for init_css_set */
 422
 423 /**
 424  * cgroup_update_populated - updated populated count of a cgroup
 425  * @cgrp: the target cgroup
 426  * @populated: inc or dec populated count
 427  *
 428  * @cgrp is either getting the first task (css_set) or losing the last.
 429  * Update @cgrp->populated_cnt accordingly.  The count is propagated
 430  * towards root so that a given cgroup's populated_cnt is zero iff the
 431  * cgroup and all its descendants are empty.
 432  *
 433  * @cgrp's interface file "cgroup.populated" is zero if
 434  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
 435  * changes from or to zero, userland is notified that the content of the
 436  * interface file has changed.  This can be used to detect when @cgrp and
 437  * its descendants become populated or empty.
 438  */
 439 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 440 {
 441         lockdep_assert_held(&css_set_rwsem);
 442
 443         do {
 444                 bool trigger;
 445
 446                 if (populated)
 447                         trigger = !cgrp->populated_cnt++;
 448                 else
 449                         trigger = !--cgrp->populated_cnt;
 450
 451                 if (!trigger)
 452                         break;
 453
 454                 if (cgrp->populated_kn)
 455                         kernfs_notify(cgrp->populated_kn);
 456                 cgrp = cgrp->parent;
 457         } while (cgrp);
 458 }
 459
/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 *
 * Entries are keyed by css_set_hash() and sized by CSS_SET_HASH_BITS.
 */
#define CSS_SET_HASH_BITS       7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 467
 468 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 469 {
 470         unsigned long key = 0UL;
 471         struct cgroup_subsys *ss;
 472         int i;
 473
 474         for_each_subsys(ss, i)
 475                 key += (unsigned long)css[i];
 476         key = (key >> 16) ^ key;
 477
 478         return key;
 479 }
 480
 481 static void put_css_set_locked(struct css_set *cset, bool taskexit)
 482 {
 483         struct cgrp_cset_link *link, *tmp_link;
 484         struct cgroup_subsys *ss;
 485         int ssid;
 486
 487         lockdep_assert_held(&css_set_rwsem);
 488
 489         if (!atomic_dec_and_test(&cset->refcount))
 490                 return;
 491
 492         /* This css_set is dead. unlink it and release cgroup refcounts */
 493         for_each_subsys(ss, ssid)
 494                 list_del(&cset->e_cset_node[ssid]);
 495         hash_del(&cset->hlist);
 496         css_set_count--;
 497
 498         list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 499                 struct cgroup *cgrp = link->cgrp;
 500
 501                 list_del(&link->cset_link);
 502                 list_del(&link->cgrp_link);
 503
 504                 /* @cgrp can't go away while we're holding css_set_rwsem */
 505                 if (list_empty(&cgrp->cset_links)) {
 506                         cgroup_update_populated(cgrp, false);
 507                         if (notify_on_release(cgrp)) {
 508                                 if (taskexit)
 509                                         set_bit(CGRP_RELEASABLE, &cgrp->flags);
 510                                 check_for_release(cgrp);
 511                         }
 512                 }
 513
 514                 kfree(link);
 515         }
 516
 517         kfree_rcu(cset, rcu_head);
 518 }
 519
 520 static void put_css_set(struct css_set *cset, bool taskexit)
 521 {
 522         /*
 523          * Ensure that the refcount doesn't hit zero while any readers
 524          * can see it. Similar to atomic_dec_and_lock(), but for an
 525          * rwlock
 526          */
 527         if (atomic_add_unless(&cset->refcount, -1, 1))
 528                 return;
 529
 530         down_write(&css_set_rwsem);
 531         put_css_set_locked(cset, taskexit);
 532         up_write(&css_set_rwsem);
 533 }
 534
 535 /*
 536  * refcounted get/put for css_set objects
 537  */
 538 static inline void get_css_set(struct css_set *cset)
 539 {
 540         atomic_inc(&cset->refcount);
 541 }
 542
 543 /**
 544  * compare_css_sets - helper function for find_existing_css_set().
 545  * @cset: candidate css_set being tested
 546  * @old_cset: existing css_set for a task
 547  * @new_cgrp: cgroup that's being entered by the task
 548  * @template: desired set of css pointers in css_set (pre-calculated)
 549  *
 550  * Returns true if "cset" matches "old_cset" except for the hierarchy
 551  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 552  */
 553 static bool compare_css_sets(struct css_set *cset,
 554                              struct css_set *old_cset,
 555                              struct cgroup *new_cgrp,
 556                              struct cgroup_subsys_state *template[])
 557 {
 558         struct list_head *l1, *l2;
 559
 560         /*
 561          * On the default hierarchy, there can be csets which are
 562          * associated with the same set of cgroups but different csses.
 563          * Let's first ensure that csses match.
 564          */
 565         if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
 566                 return false;
 567
 568         /*
 569          * Compare cgroup pointers in order to distinguish between
 570          * different cgroups in hierarchies.  As different cgroups may
 571          * share the same effective css, this comparison is always
 572          * necessary.
 573          */
 574         l1 = &cset->cgrp_links;
 575         l2 = &old_cset->cgrp_links;
 576         while (1) {
 577                 struct cgrp_cset_link *link1, *link2;
 578                 struct cgroup *cgrp1, *cgrp2;
 579
 580                 l1 = l1->next;
 581                 l2 = l2->next;
 582                 /* See if we reached the end - both lists are equal length. */
 583                 if (l1 == &cset->cgrp_links) {
 584                         BUG_ON(l2 != &old_cset->cgrp_links);
 585                         break;
 586                 } else {
 587                         BUG_ON(l2 == &old_cset->cgrp_links);
 588                 }
 589                 /* Locate the cgroups associated with these links. */
 590                 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
 591                 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
 592                 cgrp1 = link1->cgrp;
 593                 cgrp2 = link2->cgrp;
 594                 /* Hierarchies should be linked in the same order. */
 595                 BUG_ON(cgrp1->root != cgrp2->root);
 596
 597                 /*
 598                  * If this hierarchy is the hierarchy of the cgroup
 599                  * that's changing, then we need to check that this
 600                  * css_set points to the new cgroup; if it's any other
 601                  * hierarchy, then this css_set should point to the
 602                  * same cgroup as the old css_set.
 603                  */
 604                 if (cgrp1->root == new_cgrp->root) {
 605                         if (cgrp1 != new_cgrp)
 606                                 return false;
 607                 } else {
 608                         if (cgrp1 != cgrp2)
 609                                 return false;
 610                 }
 611         }
 612         return true;
 613 }
 614
 615 /**
 616  * find_existing_css_set - init css array and find the matching css_set
 617  * @old_cset: the css_set that we're using before the cgroup transition
 618  * @cgrp: the cgroup that we're moving into
 619  * @template: out param for the new set of csses, should be clear on entry
 620  */
 621 static struct css_set *find_existing_css_set(struct css_set *old_cset,
 622                                         struct cgroup *cgrp,
 623                                         struct cgroup_subsys_state *template[])
 624 {
 625         struct cgroup_root *root = cgrp->root;
 626         struct cgroup_subsys *ss;
 627         struct css_set *cset;
 628         unsigned long key;
 629         int i;
 630
 631         /*
 632          * Build the set of subsystem state objects that we want to see in the
 633          * new css_set. while subsystems can change globally, the entries here
 634          * won't change, so no need for locking.
 635          */
 636         for_each_subsys(ss, i) {
 637                 if (root->subsys_mask & (1UL << i)) {
 638                         /*
 639                          * @ss is in this hierarchy, so we want the
 640                          * effective css from @cgrp.
 641                          */
 642                         template[i] = cgroup_e_css(cgrp, ss);
 643                 } else {
 644                         /*
 645                          * @ss is not in this hierarchy, so we don't want
 646                          * to change the css.
 647                          */
 648                         template[i] = old_cset->subsys[i];
 649                 }
 650         }
 651
 652         key = css_set_hash(template);
 653         hash_for_each_possible(css_set_table, cset, hlist, key) {
 654                 if (!compare_css_sets(cset, old_cset, cgrp, template))
 655                         continue;
 656
 657                 /* This css_set matches what we need */
 658                 return cset;
 659         }
 660
 661         /* No existing cgroup group matched */
 662         return NULL;
 663 }
 664
 665 static void free_cgrp_cset_links(struct list_head *links_to_free)
 666 {
 667         struct cgrp_cset_link *link, *tmp_link;
 668
 669         list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
 670                 list_del(&link->cset_link);
 671                 kfree(link);
 672         }
 673 }
 674
 675 /**
 676  * allocate_cgrp_cset_links - allocate cgrp_cset_links
 677  * @count: the number of links to allocate
 678  * @tmp_links: list_head the allocated links are put on
 679  *
 680  * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 681  * through ->cset_link.  Returns 0 on success or -errno.
 682  */
 683 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 684 {
 685         struct cgrp_cset_link *link;
 686         int i;
 687
 688         INIT_LIST_HEAD(tmp_links);
 689
 690         for (i = 0; i < count; i++) {
 691                 link = kzalloc(sizeof(*link), GFP_KERNEL);
 692                 if (!link) {
 693                         free_cgrp_cset_links(tmp_links);
 694                         return -ENOMEM;
 695                 }
 696                 list_add(&link->cset_link, tmp_links);
 697         }
 698         return 0;
 699 }
 700
 701 /**
 702  * link_css_set - a helper function to link a css_set to a cgroup
 703  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 704  * @cset: the css_set to be linked
 705  * @cgrp: the destination cgroup
 706  */
 707 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 708                          struct cgroup *cgrp)
 709 {
 710         struct cgrp_cset_link *link;
 711
 712         BUG_ON(list_empty(tmp_links));
 713
 714         if (cgroup_on_dfl(cgrp))
 715                 cset->dfl_cgrp = cgrp;
 716
 717         link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
 718         link->cset = cset;
 719         link->cgrp = cgrp;
 720
 721         if (list_empty(&cgrp->cset_links))
 722                 cgroup_update_populated(cgrp, true);
 723         list_move(&link->cset_link, &cgrp->cset_links);
 724
 725         /*
 726          * Always add links to the tail of the list so that the list
 727          * is sorted by order of hierarchy creation
 728          */
 729         list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 730 }
 731
 732 /**
 733  * find_css_set - return a new css_set with one cgroup updated
 734  * @old_cset: the baseline css_set
 735  * @cgrp: the cgroup to be updated
 736  *
 737  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 738  * substituted into the appropriate hierarchy.
 739  */
 740 static struct css_set *find_css_set(struct css_set *old_cset,
 741                                     struct cgroup *cgrp)
 742 {
 743         struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
 744         struct css_set *cset;
 745         struct list_head tmp_links;
 746         struct cgrp_cset_link *link;
 747         struct cgroup_subsys *ss;
 748         unsigned long key;
 749         int ssid;
 750
 751         lockdep_assert_held(&cgroup_mutex);
 752
 753         /* First see if we already have a cgroup group that matches
 754          * the desired set */
 755         down_read(&css_set_rwsem);
 756         cset = find_existing_css_set(old_cset, cgrp, template);
 757         if (cset)
 758                 get_css_set(cset);
 759         up_read(&css_set_rwsem);
 760
 761         if (cset)
 762                 return cset;
 763
 764         cset = kzalloc(sizeof(*cset), GFP_KERNEL);
 765         if (!cset)
 766                 return NULL;
 767
 768         /* Allocate all the cgrp_cset_link objects that we'll need */
 769         if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
 770                 kfree(cset);
 771                 return NULL;
 772         }
 773
 774         atomic_set(&cset->refcount, 1);
 775         INIT_LIST_HEAD(&cset->cgrp_links);
 776         INIT_LIST_HEAD(&cset->tasks);
 777         INIT_LIST_HEAD(&cset->mg_tasks);
 778         INIT_LIST_HEAD(&cset->mg_preload_node);
 779         INIT_LIST_HEAD(&cset->mg_node);
 780         INIT_HLIST_NODE(&cset->hlist);
 781
 782         /* Copy the set of subsystem state objects generated in
 783          * find_existing_css_set() */
 784         memcpy(cset->subsys, template, sizeof(cset->subsys));
 785
 786         down_write(&css_set_rwsem);
 787         /* Add reference counts and links from the new css_set. */
 788         list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 789                 struct cgroup *c = link->cgrp;
 790
 791                 if (c->root == cgrp->root)
 792                         c = cgrp;
 793                 link_css_set(&tmp_links, cset, c);
 794         }
 795
 796         BUG_ON(!list_empty(&tmp_links));
 797
 798         css_set_count++;
 799
 800         /* Add @cset to the hash table */
 801         key = css_set_hash(cset->subsys);
 802         hash_add(css_set_table, &cset->hlist, key);
 803
 804         for_each_subsys(ss, ssid)
 805                 list_add_tail(&cset->e_cset_node[ssid],
 806                               &cset->subsys[ssid]->cgroup->e_csets[ssid]);
 807
 808         up_write(&css_set_rwsem);
 809
 810         return cset;
 811 }
 812
 813 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 814 {
 815         struct cgroup *root_cgrp = kf_root->kn->priv;
 816
 817         return root_cgrp->root;
 818 }
 819
 820 static int cgroup_init_root_id(struct cgroup_root *root)
 821 {
 822         int id;
 823
 824         lockdep_assert_held(&cgroup_mutex);
 825
 826         id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
 827         if (id < 0)
 828                 return id;
 829
 830         root->hierarchy_id = id;
 831         return 0;
 832 }
 833
 834 static void cgroup_exit_root_id(struct cgroup_root *root)
 835 {
 836         lockdep_assert_held(&cgroup_mutex);
 837
 838         if (root->hierarchy_id) {
 839                 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 840                 root->hierarchy_id = 0;
 841         }
 842 }
 843
 844 static void cgroup_free_root(struct cgroup_root *root)
 845 {
 846         if (root) {
 847                 /* hierarhcy ID shoulid already have been released */
 848                 WARN_ON_ONCE(root->hierarchy_id);
 849
 850                 idr_destroy(&root->cgroup_idr);
 851                 kfree(root);
 852         }
 853 }
 854
 855 static void cgroup_destroy_root(struct cgroup_root *root)
 856 {
 857         struct cgroup *cgrp = &root->cgrp;
 858         struct cgrp_cset_link *link, *tmp_link;
 859
 860         mutex_lock(&cgroup_mutex);
 861
 862         BUG_ON(atomic_read(&root->nr_cgrps));
 863         BUG_ON(!list_empty(&cgrp->children));
 864
 865         /* Rebind all subsystems back to the default hierarchy */
 866         rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
 867
 868         /*
 869          * Release all the links from cset_links to this hierarchy's
 870          * root cgroup
 871          */
 872         down_write(&css_set_rwsem);
 873
 874         list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 875                 list_del(&link->cset_link);
 876                 list_del(&link->cgrp_link);
 877                 kfree(link);
 878         }
 879         up_write(&css_set_rwsem);
 880
 881         if (!list_empty(&root->root_list)) {
 882                 list_del(&root->root_list);
 883                 cgroup_root_count--;
 884         }
 885
 886         cgroup_exit_root_id(root);
 887
 888         mutex_unlock(&cgroup_mutex);
 889
 890         kernfs_destroy_root(root->kf_root);
 891         cgroup_free_root(root);
 892 }
 893
 894 /* look up cgroup associated with given css_set on the specified hierarchy */
 895 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 896                                             struct cgroup_root *root)
 897 {
 898         struct cgroup *res = NULL;
 899
 900         lockdep_assert_held(&cgroup_mutex);
 901         lockdep_assert_held(&css_set_rwsem);
 902
 903         if (cset == &init_css_set) {
 904                 res = &root->cgrp;
 905         } else {
 906                 struct cgrp_cset_link *link;
 907
 908                 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 909                         struct cgroup *c = link->cgrp;
 910
 911                         if (c->root == root) {
 912                                 res = c;
 913                                 break;
 914                         }
 915                 }
 916         }
 917
 918         BUG_ON(!res);
 919         return res;
 920 }
 921
 922 /*
 923  * Return the cgroup for "task" from the given hierarchy. Must be
 924  * called with cgroup_mutex and css_set_rwsem held.
 925  */
 926 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 927                                             struct cgroup_root *root)
 928 {
 929         /*
 930          * No need to lock the task - since we hold cgroup_mutex the
 931          * task can't change groups, so the only thing that can happen
 932          * is that it exits and its css is set back to init_css_set.
 933          */
 934         return cset_cgroup_from_root(task_css_set(task), root);
 935 }
 936
 937 /*
 938  * A task must hold cgroup_mutex to modify cgroups.
 939  *
 940  * Any task can increment and decrement the count field without lock.
 941  * So in general, code holding cgroup_mutex can't rely on the count
 942  * field not changing.  However, if the count goes to zero, then only
 943  * cgroup_attach_task() can increment it again.  Because a count of zero
 944  * means that no tasks are currently attached, therefore there is no
 945  * way a task attached to that cgroup can fork (the other way to
 946  * increment the count).  So code holding cgroup_mutex can safely
 947  * assume that if the count is zero, it will stay zero. Similarly, if
 948  * a task holds cgroup_mutex on a cgroup with zero count, it
 949  * knows that the cgroup won't be removed, as cgroup_rmdir()
 950  * needs that mutex.
 951  *
 952  * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 953  * (usually) take cgroup_mutex.  These are the two most performance
 954  * critical pieces of code here.  The exception occurs on cgroup_exit(),
 955  * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
 956  * is taken, and if the cgroup count is zero, a usermode call made
 957  * to the release agent with the name of the cgroup (path relative to
 958  * the root of cgroup file system) as the argument.
 959  *
 960  * A cgroup can only be deleted if both its 'count' of using tasks
 961  * is zero, and its list of 'children' cgroups is empty.  Since all
 962  * tasks in the system use _some_ cgroup, and since there is always at
 963  * least one task in the system (init, pid == 1), therefore, root cgroup
 964  * always has either children cgroups and/or using tasks.  So we don't
 965  * need a special hack to ensure that root cgroup cannot be deleted.
 966  *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task().
 969  */
 970
 971 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
 972 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 973 static const struct file_operations proc_cgroupstats_operations;
 974
 975 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 976                               char *buf)
 977 {
 978         if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
 979             !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
 980                 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
 981                          cft->ss->name, cft->name);
 982         else
 983                 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 984         return buf;
 985 }
 986
 987 /**
 988  * cgroup_file_mode - deduce file mode of a control file
 989  * @cft: the control file in question
 990  *
 991  * returns cft->mode if ->mode is not 0
 992  * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
 993  * returns S_IRUGO if it has only a read handler
 994  * returns S_IWUSR if it has only a write hander
 995  */
 996 static umode_t cgroup_file_mode(const struct cftype *cft)
 997 {
 998         umode_t mode = 0;
 999
1000         if (cft->mode)
1001                 return cft->mode;
1002
1003         if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1004                 mode |= S_IRUGO;
1005
1006         if (cft->write_u64 || cft->write_s64 || cft->write)
1007                 mode |= S_IWUSR;
1008
1009         return mode;
1010 }
1011
1012 static void cgroup_free_fn(struct work_struct *work)
1013 {
1014         struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
1015
1016         atomic_dec(&cgrp->root->nr_cgrps);
1017         cgroup_pidlist_destroy_all(cgrp);
1018
1019         if (cgrp->parent) {
1020                 /*
1021                  * We get a ref to the parent, and put the ref when this
1022                  * cgroup is being freed, so it's guaranteed that the
1023                  * parent won't be destroyed before its children.
1024                  */
1025                 cgroup_put(cgrp->parent);
1026                 kernfs_put(cgrp->kn);
1027                 kfree(cgrp);
1028         } else {
1029                 /*
1030                  * This is root cgroup's refcnt reaching zero, which
1031                  * indicates that the root should be released.
1032                  */
1033                 cgroup_destroy_root(cgrp->root);
1034         }
1035 }
1036
1037 static void cgroup_free_rcu(struct rcu_head *head)
1038 {
1039         struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
1040
1041         INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
1042         queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
1043 }
1044
1045 static void cgroup_get(struct cgroup *cgrp)
1046 {
1047         WARN_ON_ONCE(cgroup_is_dead(cgrp));
1048         WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
1049         atomic_inc(&cgrp->refcnt);
1050 }
1051
1052 static void cgroup_put(struct cgroup *cgrp)
1053 {
1054         if (!atomic_dec_and_test(&cgrp->refcnt))
1055                 return;
1056         if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
1057                 return;
1058
1059         cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
1060         cgrp->id = -1;
1061
1062         call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
1063 }
1064
1065 /**
1066  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1067  * @kn: the kernfs_node being serviced
1068  *
1069  * This helper undoes cgroup_kn_lock_live() and should be invoked before
1070  * the method finishes if locking succeeded.  Note that once this function
1071  * returns the cgroup returned by cgroup_kn_lock_live() may become
1072  * inaccessible any time.  If the caller intends to continue to access the
1073  * cgroup, it should pin it before invoking this function.
1074  */
1075 static void cgroup_kn_unlock(struct kernfs_node *kn)
1076 {
1077         struct cgroup *cgrp;
1078
1079         if (kernfs_type(kn) == KERNFS_DIR)
1080                 cgrp = kn->priv;
1081         else
1082                 cgrp = kn->parent->priv;
1083
1084         mutex_unlock(&cgroup_mutex);
1085
1086         kernfs_unbreak_active_protection(kn);
1087         cgroup_put(cgrp);
1088 }
1089
1090 /**
1091  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1092  * @kn: the kernfs_node being serviced
1093  *
1094  * This helper is to be used by a cgroup kernfs method currently servicing
1095  * @kn.  It breaks the active protection, performs cgroup locking and
1096  * verifies that the associated cgroup is alive.  Returns the cgroup if
1097  * alive; otherwise, %NULL.  A successful return should be undone by a
1098  * matching cgroup_kn_unlock() invocation.
1099  *
1100  * Any cgroup kernfs method implementation which requires locking the
1101  * associated cgroup should use this helper.  It avoids nesting cgroup
1102  * locking under kernfs active protection and allows all kernfs operations
1103  * including self-removal.
1104  */
1105 static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
1106 {
1107         struct cgroup *cgrp;
1108
1109         if (kernfs_type(kn) == KERNFS_DIR)
1110                 cgrp = kn->priv;
1111         else
1112                 cgrp = kn->parent->priv;
1113
1114         /*
1115          * We're gonna grab cgroup_mutex which nests outside kernfs
1116          * active_ref.  cgroup liveliness check alone provides enough
1117          * protection against removal.  Ensure @cgrp stays accessible and
1118          * break the active_ref protection.
1119          */
1120         cgroup_get(cgrp);
1121         kernfs_break_active_protection(kn);
1122
1123         mutex_lock(&cgroup_mutex);
1124
1125         if (!cgroup_is_dead(cgrp))
1126                 return cgrp;
1127
1128         cgroup_kn_unlock(kn);
1129         return NULL;
1130 }
1131
1132 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1133 {
1134         char name[CGROUP_FILE_NAME_MAX];
1135
1136         lockdep_assert_held(&cgroup_mutex);
1137         kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1138 }
1139
1140 /**
1141  * cgroup_clear_dir - remove subsys files in a cgroup directory
1142  * @cgrp: target cgroup
1143  * @subsys_mask: mask of the subsystem ids whose files should be removed
1144  */
1145 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
1146 {
1147         struct cgroup_subsys *ss;
1148         int i;
1149
1150         for_each_subsys(ss, i) {
1151                 struct cftype *cfts;
1152
1153                 if (!(subsys_mask & (1 << i)))
1154                         continue;
1155                 list_for_each_entry(cfts, &ss->cfts, node)
1156                         cgroup_addrm_files(cgrp, cfts, false);
1157         }
1158 }
1159
1160 static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1161 {
1162         struct cgroup_subsys *ss;
1163         int ssid, i, ret;
1164
1165         lockdep_assert_held(&cgroup_mutex);
1166
1167         for_each_subsys(ss, ssid) {
1168                 if (!(ss_mask & (1 << ssid)))
1169                         continue;
1170
1171                 /* if @ss has non-root csses attached to it, can't move */
1172                 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
1173                         return -EBUSY;
1174
1175                 /* can't move between two non-dummy roots either */
1176                 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1177                         return -EBUSY;
1178         }
1179
1180         ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1181         if (ret) {
1182                 if (dst_root != &cgrp_dfl_root)
1183                         return ret;
1184
1185                 /*
1186                  * Rebinding back to the default root is not allowed to
1187                  * fail.  Using both default and non-default roots should
1188                  * be rare.  Moving subsystems back and forth even more so.
1189                  * Just warn about it and continue.
1190                  */
1191                 if (cgrp_dfl_root_visible) {
1192                         pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1193                                 ret, ss_mask);
1194                         pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1195                 }
1196         }
1197
1198         /*
1199          * Nothing can fail from this point on.  Remove files for the
1200          * removed subsystems and rebind each subsystem.
1201          */
1202         for_each_subsys(ss, ssid)
1203                 if (ss_mask & (1 << ssid))
1204                         cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1205
1206         for_each_subsys(ss, ssid) {
1207                 struct cgroup_root *src_root;
1208                 struct cgroup_subsys_state *css;
1209                 struct css_set *cset;
1210
1211                 if (!(ss_mask & (1 << ssid)))
1212                         continue;
1213
1214                 src_root = ss->root;
1215                 css = cgroup_css(&src_root->cgrp, ss);
1216
1217                 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1218
1219                 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1220                 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1221                 ss->root = dst_root;
1222                 css->cgroup = &dst_root->cgrp;
1223
1224                 down_write(&css_set_rwsem);
1225                 hash_for_each(css_set_table, i, cset, hlist)
1226                         list_move_tail(&cset->e_cset_node[ss->id],
1227                                        &dst_root->cgrp.e_csets[ss->id]);
1228                 up_write(&css_set_rwsem);
1229
1230                 src_root->subsys_mask &= ~(1 << ssid);
1231                 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1232
1233                 /* default hierarchy doesn't enable controllers by default */
1234                 dst_root->subsys_mask |= 1 << ssid;
1235                 if (dst_root != &cgrp_dfl_root)
1236                         dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1237
1238                 if (ss->bind)
1239                         ss->bind(css);
1240         }
1241
1242         kernfs_activate(dst_root->cgrp.kn);
1243         return 0;
1244 }
1245
1246 static int cgroup_show_options(struct seq_file *seq,
1247                                struct kernfs_root *kf_root)
1248 {
1249         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1250         struct cgroup_subsys *ss;
1251         int ssid;
1252
1253         for_each_subsys(ss, ssid)
1254                 if (root->subsys_mask & (1 << ssid))
1255                         seq_printf(seq, ",%s", ss->name);
1256         if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1257                 seq_puts(seq, ",sane_behavior");
1258         if (root->flags & CGRP_ROOT_NOPREFIX)
1259                 seq_puts(seq, ",noprefix");
1260         if (root->flags & CGRP_ROOT_XATTR)
1261                 seq_puts(seq, ",xattr");
1262
1263         spin_lock(&release_agent_path_lock);
1264         if (strlen(root->release_agent_path))
1265                 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1266         spin_unlock(&release_agent_path_lock);
1267
1268         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1269                 seq_puts(seq, ",clone_children");
1270         if (strlen(root->name))
1271                 seq_printf(seq, ",name=%s", root->name);
1272         return 0;
1273 }
1274
/*
 * Parsed cgroupfs mount options, filled in by parse_cgroupfs_options().
 * The two string members are heap copies owned by whoever holds the
 * struct and must be kfree()d when it is no longer needed.
 */
struct cgroup_sb_opts {
	/* bitmask of requested subsystems, bit N for subsystem id N */
	unsigned int subsys_mask;
	/* CGRP_ROOT_* flags (sane_behavior, noprefix, xattr) */
	unsigned int flags;
	/* copy of the "release_agent=" value, NULL if not given */
	char *release_agent;
	/* "clone_children" was specified */
	bool cpuset_clone_children;
	/* copy of the "name=" value, NULL if not given */
	char *name;
	/* User explicitly requested empty subsystem */
	bool none;
};
1284
1285 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1286 {
1287         char *token, *o = data;
1288         bool all_ss = false, one_ss = false;
1289         unsigned int mask = -1U;
1290         struct cgroup_subsys *ss;
1291         int i;
1292
1293 #ifdef CONFIG_CPUSETS
1294         mask = ~(1U << cpuset_cgrp_id);
1295 #endif
1296
1297         memset(opts, 0, sizeof(*opts));
1298
1299         while ((token = strsep(&o, ",")) != NULL) {
1300                 if (!*token)
1301                         return -EINVAL;
1302                 if (!strcmp(token, "none")) {
1303                         /* Explicitly have no subsystems */
1304                         opts->none = true;
1305                         continue;
1306                 }
1307                 if (!strcmp(token, "all")) {
1308                         /* Mutually exclusive option 'all' + subsystem name */
1309                         if (one_ss)
1310                                 return -EINVAL;
1311                         all_ss = true;
1312                         continue;
1313                 }
1314                 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1315                         opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1316                         continue;
1317                 }
1318                 if (!strcmp(token, "noprefix")) {
1319                         opts->flags |= CGRP_ROOT_NOPREFIX;
1320                         continue;
1321                 }
1322                 if (!strcmp(token, "clone_children")) {
1323                         opts->cpuset_clone_children = true;
1324                         continue;
1325                 }
1326                 if (!strcmp(token, "xattr")) {
1327                         opts->flags |= CGRP_ROOT_XATTR;
1328                         continue;
1329                 }
1330                 if (!strncmp(token, "release_agent=", 14)) {
1331                         /* Specifying two release agents is forbidden */
1332                         if (opts->release_agent)
1333                                 return -EINVAL;
1334                         opts->release_agent =
1335                                 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1336                         if (!opts->release_agent)
1337                                 return -ENOMEM;
1338                         continue;
1339                 }
1340                 if (!strncmp(token, "name=", 5)) {
1341                         const char *name = token + 5;
1342                         /* Can't specify an empty name */
1343                         if (!strlen(name))
1344                                 return -EINVAL;
1345                         /* Must match [\w.-]+ */
1346                         for (i = 0; i < strlen(name); i++) {
1347                                 char c = name[i];
1348                                 if (isalnum(c))
1349                                         continue;
1350                                 if ((c == '.') || (c == '-') || (c == '_'))
1351                                         continue;
1352                                 return -EINVAL;
1353                         }
1354                         /* Specifying two names is forbidden */
1355                         if (opts->name)
1356                                 return -EINVAL;
1357                         opts->name = kstrndup(name,
1358                                               MAX_CGROUP_ROOT_NAMELEN - 1,
1359                                               GFP_KERNEL);
1360                         if (!opts->name)
1361                                 return -ENOMEM;
1362
1363                         continue;
1364                 }
1365
1366                 for_each_subsys(ss, i) {
1367                         if (strcmp(token, ss->name))
1368                                 continue;
1369                         if (ss->disabled)
1370                                 continue;
1371
1372                         /* Mutually exclusive option 'all' + subsystem name */
1373                         if (all_ss)
1374                                 return -EINVAL;
1375                         opts->subsys_mask |= (1 << i);
1376                         one_ss = true;
1377
1378                         break;
1379                 }
1380                 if (i == CGROUP_SUBSYS_COUNT)
1381                         return -ENOENT;
1382         }
1383
1384         /* Consistency checks */
1385
1386         if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1387                 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1388
1389                 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1390                     opts->cpuset_clone_children || opts->release_agent ||
1391                     opts->name) {
1392                         pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1393                         return -EINVAL;
1394                 }
1395         } else {
1396                 /*
1397                  * If the 'all' option was specified select all the
1398                  * subsystems, otherwise if 'none', 'name=' and a subsystem
1399                  * name options were not specified, let's default to 'all'
1400                  */
1401                 if (all_ss || (!one_ss && !opts->none && !opts->name))
1402                         for_each_subsys(ss, i)
1403                                 if (!ss->disabled)
1404                                         opts->subsys_mask |= (1 << i);
1405
1406                 /*
1407                  * We either have to specify by name or by subsystems. (So
1408                  * all empty hierarchies must have a name).
1409                  */
1410                 if (!opts->subsys_mask && !opts->name)
1411                         return -EINVAL;
1412         }
1413
1414         /*
1415          * Option noprefix was introduced just for backward compatibility
1416          * with the old cpuset, so we allow noprefix only if mounting just
1417          * the cpuset subsystem.
1418          */
1419         if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1420                 return -EINVAL;
1421
1422
1423         /* Can't specify "none" and some subsystems */
1424         if (opts->subsys_mask && opts->none)
1425                 return -EINVAL;
1426
1427         return 0;
1428 }
1429
1430 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1431 {
1432         int ret = 0;
1433         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1434         struct cgroup_sb_opts opts;
1435         unsigned int added_mask, removed_mask;
1436
1437         if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1438                 pr_err("sane_behavior: remount is not allowed\n");
1439                 return -EINVAL;
1440         }
1441
1442         mutex_lock(&cgroup_mutex);
1443
1444         /* See what subsystems are wanted */
1445         ret = parse_cgroupfs_options(data, &opts);
1446         if (ret)
1447                 goto out_unlock;
1448
1449         if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1450                 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1451                         task_tgid_nr(current), current->comm);
1452
1453         added_mask = opts.subsys_mask & ~root->subsys_mask;
1454         removed_mask = root->subsys_mask & ~opts.subsys_mask;
1455
1456         /* Don't allow flags or name to change at remount */
1457         if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1458             (opts.name && strcmp(opts.name, root->name))) {
1459                 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1460                        opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1461                        root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1462                 ret = -EINVAL;
1463                 goto out_unlock;
1464         }
1465
1466         /* remounting is not allowed for populated hierarchies */
1467         if (!list_empty(&root->cgrp.children)) {
1468                 ret = -EBUSY;
1469                 goto out_unlock;
1470         }
1471
1472         ret = rebind_subsystems(root, added_mask);
1473         if (ret)
1474                 goto out_unlock;
1475
1476         rebind_subsystems(&cgrp_dfl_root, removed_mask);
1477
1478         if (opts.release_agent) {
1479                 spin_lock(&release_agent_path_lock);
1480                 strcpy(root->release_agent_path, opts.release_agent);
1481                 spin_unlock(&release_agent_path_lock);
1482         }
1483  out_unlock:
1484         kfree(opts.release_agent);
1485         kfree(opts.name);
1486         mutex_unlock(&cgroup_mutex);
1487         return ret;
1488 }
1489
1490 /*
1491  * To reduce the fork() overhead for systems that are not actually using
1492  * their cgroups capability, we don't maintain the lists running through
1493  * each css_set to its tasks until we see the list actually used - in other
1494  * words after the first mount.
1495  */
1496 static bool use_task_css_set_links __read_mostly;
1497
1498 static void cgroup_enable_task_cg_lists(void)
1499 {
1500         struct task_struct *p, *g;
1501
1502         down_write(&css_set_rwsem);
1503
1504         if (use_task_css_set_links)
1505                 goto out_unlock;
1506
1507         use_task_css_set_links = true;
1508
1509         /*
1510          * We need tasklist_lock because RCU is not safe against
1511          * while_each_thread(). Besides, a forking task that has passed
1512          * cgroup_post_fork() without seeing use_task_css_set_links = 1
1513          * is not guaranteed to have its child immediately visible in the
1514          * tasklist if we walk through it with RCU.
1515          */
1516         read_lock(&tasklist_lock);
1517         do_each_thread(g, p) {
1518                 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1519                              task_css_set(p) != &init_css_set);
1520
1521                 /*
1522                  * We should check if the process is exiting, otherwise
1523                  * it will race with cgroup_exit() in that the list
1524                  * entry won't be deleted though the process has exited.
1525                  * Do it while holding siglock so that we don't end up
1526                  * racing against cgroup_exit().
1527                  */
1528                 spin_lock_irq(&p->sighand->siglock);
1529                 if (!(p->flags & PF_EXITING)) {
1530                         struct css_set *cset = task_css_set(p);
1531
1532                         list_add(&p->cg_list, &cset->tasks);
1533                         get_css_set(cset);
1534                 }
1535                 spin_unlock_irq(&p->sighand->siglock);
1536         } while_each_thread(g, p);
1537         read_unlock(&tasklist_lock);
1538 out_unlock:
1539         up_write(&css_set_rwsem);
1540 }
1541
1542 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1543 {
1544         struct cgroup_subsys *ss;
1545         int ssid;
1546
1547         atomic_set(&cgrp->refcnt, 1);
1548         INIT_LIST_HEAD(&cgrp->sibling);
1549         INIT_LIST_HEAD(&cgrp->children);
1550         INIT_LIST_HEAD(&cgrp->cset_links);
1551         INIT_LIST_HEAD(&cgrp->release_list);
1552         INIT_LIST_HEAD(&cgrp->pidlists);
1553         mutex_init(&cgrp->pidlist_mutex);
1554         cgrp->self.cgroup = cgrp;
1555
1556         for_each_subsys(ss, ssid)
1557                 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1558
1559         init_waitqueue_head(&cgrp->offline_waitq);
1560 }
1561
1562 static void init_cgroup_root(struct cgroup_root *root,
1563                              struct cgroup_sb_opts *opts)
1564 {
1565         struct cgroup *cgrp = &root->cgrp;
1566
1567         INIT_LIST_HEAD(&root->root_list);
1568         atomic_set(&root->nr_cgrps, 1);
1569         cgrp->root = root;
1570         init_cgroup_housekeeping(cgrp);
1571         idr_init(&root->cgroup_idr);
1572
1573         root->flags = opts->flags;
1574         if (opts->release_agent)
1575                 strcpy(root->release_agent_path, opts->release_agent);
1576         if (opts->name)
1577                 strcpy(root->name, opts->name);
1578         if (opts->cpuset_clone_children)
1579                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1580 }
1581
/*
 * cgroup_setup_root - bring an initialized hierarchy root into service
 * @root: the root to set up
 * @ss_mask: subsystem mask handed to rebind_subsystems()
 *
 * Must be called with cgroup_mutex held.  Allocates an id for the root
 * cgroup, preallocates css_set links, calls cgroup_init_root_id(),
 * creates the kernfs root in deactivated state, adds the base files,
 * rebinds the subsystems, puts @root on cgroup_roots, links the root
 * cgroup into every css_set in css_set_table and finally activates the
 * kernfs nodes.
 *
 * Returns 0 on success or a negative errno.  On failure the kernfs root
 * and the root id set up here are torn down again.
 * NOTE(review): the id taken from root->cgroup_idr is not released here;
 * presumably the caller's cgroup_free_root() takes care of it - confirm.
 */
1582 static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1583 {
1584         LIST_HEAD(tmp_links);
1585         struct cgroup *root_cgrp = &root->cgrp;
1586         struct css_set *cset;
1587         int i, ret;
1588
1589         lockdep_assert_held(&cgroup_mutex);
1590
             /* NOTE(review): the 1, 2 range presumably pins the id to 1 - confirm */
1591         ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
1592         if (ret < 0)
1593                 goto out;
1594         root_cgrp->id = ret;
1595
1596         /*
1597          * We're accessing css_set_count without locking css_set_rwsem here,
1598          * but that's OK - it can only be increased by someone holding
1599          * cgroup_lock, and that's us. The worst that can happen is that we
1600          * have some link structures left over
1601          */
1602         ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1603         if (ret)
1604                 goto out;
1605
1606         ret = cgroup_init_root_id(root);
1607         if (ret)
1608                 goto out;
1609
1610         root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1611                                            KERNFS_ROOT_CREATE_DEACTIVATED,
1612                                            root_cgrp);
1613         if (IS_ERR(root->kf_root)) {
1614                 ret = PTR_ERR(root->kf_root);
1615                 goto exit_root_id;
1616         }
1617         root_cgrp->kn = root->kf_root->kn;
1618
1619         ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1620         if (ret)
1621                 goto destroy_root;
1622
1623         ret = rebind_subsystems(root, ss_mask);
1624         if (ret)
1625                 goto destroy_root;
1626
1627         /*
1628          * There must be no failure case after here, since rebinding takes
1629          * care of subsystems' refcounts, which are explicitly dropped in
1630          * the failure exit path.
1631          */
1632         list_add(&root->root_list, &cgroup_roots);
1633         cgroup_root_count++;
1634
1635         /*
1636          * Link the root cgroup in this hierarchy into all the css_set
1637          * objects.
1638          */
1639         down_write(&css_set_rwsem);
1640         hash_for_each(css_set_table, i, cset, hlist)
1641                 link_css_set(&tmp_links, cset, root_cgrp);
1642         up_write(&css_set_rwsem);
1643
1644         BUG_ON(!list_empty(&root_cgrp->children));
1645         BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1646
1647         kernfs_activate(root_cgrp->kn);
1648         ret = 0;
             /* success also leaves through "out" so leftover links get freed */
1649         goto out;
1650
1651 destroy_root:
1652         kernfs_destroy_root(root->kf_root);
1653         root->kf_root = NULL;
1654 exit_root_id:
1655         cgroup_exit_root_id(root);
1656 out:
1657         free_cgrp_cset_links(&tmp_links);
1658         return ret;
1659 }
1660
/*
 * cgroup_mount - ->mount callback of cgroup_fs_type
 * @fs_type: filesystem type, passed on to kernfs_mount()
 * @flags: mount flags, passed on to kernfs_mount()
 * @unused_dev_name: not used
 * @data: mount option data handed to parse_cgroupfs_options()
 *
 * Selects the hierarchy root to mount: the default root when neither a
 * subsystem mask, opts.none nor a name was given; otherwise an existing
 * root matching the request; otherwise a newly allocated root set up via
 * cgroup_setup_root().  For the default and for an existing root a
 * reference on the root cgroup is taken here.  After kernfs_mount() a
 * reference is put again if the mount failed or no new superblock was
 * created.
 *
 * Returns the dentry produced by kernfs_mount(), or an ERR_PTR() when
 * option parsing, root lookup or root creation failed.
 */
1661 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1662                          int flags, const char *unused_dev_name,
1663                          void *data)
1664 {
1665         struct cgroup_root *root;
1666         struct cgroup_sb_opts opts;
1667         struct dentry *dentry;
1668         int ret;
1669         bool new_sb;
1670
1671         /*
1672          * The first time anyone tries to mount a cgroup, enable the list
1673          * linking each css_set to its tasks and fix up all existing tasks.
1674          */
1675         if (!use_task_css_set_links)
1676                 cgroup_enable_task_cg_lists();
1677
1678         mutex_lock(&cgroup_mutex);
1679
1680         /* First find the desired set of subsystems */
1681         ret = parse_cgroupfs_options(data, &opts);
1682         if (ret)
1683                 goto out_unlock;
1684
1685         /* look for a matching existing root */
1686         if (!opts.subsys_mask && !opts.none && !opts.name) {
1687                 cgrp_dfl_root_visible = true;
1688                 root = &cgrp_dfl_root;
1689                 cgroup_get(&root->cgrp);
1690                 ret = 0;
1691                 goto out_unlock;
1692         }
1693
1694         for_each_root(root) {
1695                 bool name_match = false;
1696
1697                 if (root == &cgrp_dfl_root)
1698                         continue;
1699
1700                 /*
1701                  * If we asked for a name then it must match.  Also, if
1702                  * name matches but subsys_mask doesn't, we should fail.
1703                  * Remember whether name matched.
1704                  */
1705                 if (opts.name) {
1706                         if (strcmp(opts.name, root->name))
1707                                 continue;
1708                         name_match = true;
1709                 }
1710
1711                 /*
1712                  * If we asked for subsystems (or explicitly for no
1713                  * subsystems) then they must match.
1714                  */
1715                 if ((opts.subsys_mask || opts.none) &&
1716                     (opts.subsys_mask != root->subsys_mask)) {
1717                         if (!name_match)
1718                                 continue;
1719                         ret = -EBUSY;
1720                         goto out_unlock;
1721                 }
1722
1723                 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1724                         if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1725                                 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1726                                 ret = -EINVAL;
1727                                 goto out_unlock;
1728                         } else {
1729                                 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1730                         }
1731                 }
1732
1733                 /*
1734                  * A root's lifetime is governed by its root cgroup.  Zero
1735                  * ref indicate that the root is being destroyed.  Wait for
1736                  * destruction to complete so that the subsystems are free.
1737                  * We can use wait_queue for the wait but this path is
1738                  * super cold.  Let's just sleep for a bit and retry.
1739                  */
1740                 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1741                         mutex_unlock(&cgroup_mutex);
1742                         msleep(10);
1743                         ret = restart_syscall();
                             /* cgroup_mutex is already dropped, so skip out_unlock */
1744                         goto out_free;
1745                 }
1746
1747                 ret = 0;
1748                 goto out_unlock;
1749         }
1750
1751         /*
1752          * No such thing, create a new one.  name= matching without subsys
1753          * specification is allowed for already existing hierarchies but we
1754          * can't create new one without subsys specification.
1755          */
1756         if (!opts.subsys_mask && !opts.none) {
1757                 ret = -EINVAL;
1758                 goto out_unlock;
1759         }
1760
1761         root = kzalloc(sizeof(*root), GFP_KERNEL);
1762         if (!root) {
1763                 ret = -ENOMEM;
1764                 goto out_unlock;
1765         }
1766
1767         init_cgroup_root(root, &opts);
1768
1769         ret = cgroup_setup_root(root, opts.subsys_mask);
1770         if (ret)
1771                 cgroup_free_root(root);
1772
1773 out_unlock:
1774         mutex_unlock(&cgroup_mutex);
1775 out_free:
             /* the option strings are no longer needed on any path */
1776         kfree(opts.release_agent);
1777         kfree(opts.name);
1778
1779         if (ret)
1780                 return ERR_PTR(ret);
1781
1782         dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
             /* give the reference back unless a brand new superblock now owns it */
1783         if (IS_ERR(dentry) || !new_sb)
1784                 cgroup_put(&root->cgrp);
1785         return dentry;
1786 }
1787
1788 static void cgroup_kill_sb(struct super_block *sb)
1789 {
1790         struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1791         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1792
1793         cgroup_put(&root->cgrp);
1794         kernfs_kill_sb(sb);
1795 }
1796
/*
 * Filesystem type registered under the name "cgroup": mounting goes
 * through cgroup_mount() and superblock teardown through cgroup_kill_sb().
 */
1797 static struct file_system_type cgroup_fs_type = {
1798         .name = "cgroup",
1799         .mount = cgroup_mount,
1800         .kill_sb = cgroup_kill_sb,
1801 };
1802
/* NOTE(review): not referenced in this part of the file; presumably set up by init code - confirm */
1803 static struct kobject *cgroup_kobj;
1804
1805 /**
1806  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1807  * @task: target task
1808  * @buf: the buffer to write the path into
1809  * @buflen: the length of the buffer
1810  *
1811  * Determine @task's cgroup on the first (the one with the lowest non-zero
1812  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
1813  * function grabs cgroup_mutex and shouldn't be used inside locks used by
1814  * cgroup controller callbacks.
1815  *
1816  * Return value is the same as kernfs_path().
1817  */
1818 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1819 {
1820         struct cgroup_root *root;
1821         struct cgroup *cgrp;
1822         int hierarchy_id = 1;
1823         char *path = NULL;
1824
1825         mutex_lock(&cgroup_mutex);
1826         down_read(&css_set_rwsem);
1827
1828         root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1829
1830         if (root) {
1831                 cgrp = task_cgroup_from_root(task, root);
1832                 path = cgroup_path(cgrp, buf, buflen);
1833         } else {
1834                 /* if no hierarchy exists, everyone is in "/" */
1835                 if (strlcpy(buf, "/", buflen) < buflen)
1836                         path = buf;
1837         }
1838
1839         up_read(&css_set_rwsem);
1840         mutex_unlock(&cgroup_mutex);
1841         return path;
1842 }
1843 EXPORT_SYMBOL_GPL(task_cgroup_path);
1844
1845 /* used to track tasks and other necessary states during migration */
1846 struct cgroup_taskset {
1847         /* the src and dst cset list running through cset->mg_node */
1848         struct list_head        src_csets;
1849         struct list_head        dst_csets;
1850
1851         /*
1852          * Fields for cgroup_taskset_*() iteration.
1853          *
1854          * Before migration is committed, the target migration tasks are on
1855          * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
1856          * the csets on ->dst_csets.  ->csets point to either ->src_csets
1857          * or ->dst_csets depending on whether migration is committed.
1858          *
1859          * ->cur_cset and ->cur_task point to the current task position
1860          * during iteration.
1861          */
1862         struct list_head        *csets;
1863         struct css_set          *cur_cset;
1864         struct task_struct      *cur_task;
1865 };
1866
1867 /**
1868  * cgroup_taskset_first - reset taskset and return the first task
1869  * @tset: taskset of interest
1870  *
1871  * @tset iteration is initialized and the first task is returned.
1872  */
1873 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1874 {
1875         tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1876         tset->cur_task = NULL;
1877
1878         return cgroup_taskset_next(tset);
1879 }
1880
1881 /**
1882  * cgroup_taskset_next - iterate to the next task in taskset
1883  * @tset: taskset of interest
1884  *
1885  * Return the next task in @tset.  Iteration must have been initialized
1886  * with cgroup_taskset_first().
1887  */
1888 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1889 {
1890         struct css_set *cset = tset->cur_cset;
1891         struct task_struct *task = tset->cur_task;
1892
1893         while (&cset->mg_node != tset->csets) {
1894                 if (!task)
1895                         task = list_first_entry(&cset->mg_tasks,
1896                                                 struct task_struct, cg_list);
1897                 else
1898                         task = list_next_entry(task, cg_list);
1899
1900                 if (&task->cg_list != &cset->mg_tasks) {
1901                         tset->cur_cset = cset;
1902                         tset->cur_task = task;
1903                         return task;
1904                 }
1905
1906                 cset = list_next_entry(cset, mg_node);
1907                 task = NULL;
1908         }
1909
1910         return NULL;
1911 }
1912
1913 /**
1914  * cgroup_task_migrate - move a task from one cgroup to another.
1915  * @old_cgrp: the cgroup @tsk is being migrated from
1916  * @tsk: the task being migrated
1917  * @new_cset: the new css_set @tsk is being attached to
1918  *
1919  * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
      *
      * Takes a reference on @new_cset, points @tsk->cgroups at it, moves
      * @tsk to the tail of @new_cset->mg_tasks, marks @old_cgrp with
      * CGRP_RELEASABLE and puts the css_set @tsk was using before.
1920  */
1921 static void cgroup_task_migrate(struct cgroup *old_cgrp,
1922                                 struct task_struct *tsk,
1923                                 struct css_set *new_cset)
1924 {
1925         struct css_set *old_cset;
1926
1927         lockdep_assert_held(&cgroup_mutex);
1928         lockdep_assert_held(&css_set_rwsem);
1929
1930         /*
1931          * We are synchronized through threadgroup_lock() against PF_EXITING
1932          * setting such that we can't race against cgroup_exit() changing the
1933          * css_set to init_css_set and dropping the old one.
1934          */
1935         WARN_ON_ONCE(tsk->flags & PF_EXITING);
1936         old_cset = task_css_set(tsk);
1937
             /* grab the new reference before publishing the pointer */
1938         get_css_set(new_cset);
1939         rcu_assign_pointer(tsk->cgroups, new_cset);
1940
1941         /*
1942          * Use move_tail so that cgroup_taskset_first() still returns the
1943          * leader after migration.  This works because cgroup_migrate()
1944          * ensures that the dst_cset of the leader is the first on the
1945          * tset's dst_csets list.
1946          */
1947         list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1948
1949         /*
1950          * We just gained a reference on old_cset by taking it from the
1951          * task. As trading it for new_cset is protected by cgroup_mutex,
1952          * we're safe to drop it here; it will be freed under RCU.
1953          */
1954         set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1955         put_css_set_locked(old_cset, false);
1956 }
1957
1958 /**
1959  * cgroup_migrate_finish - cleanup after attach
1960  * @preloaded_csets: list of preloaded css_sets
1961  *
1962  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
1963  * those functions for details.
1964  */
1965 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1966 {
1967         struct css_set *cset, *tmp_cset;
1968
1969         lockdep_assert_held(&cgroup_mutex);
1970
1971         down_write(&css_set_rwsem);
1972         list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1973                 cset->mg_src_cgrp = NULL;
1974                 cset->mg_dst_cset = NULL;
1975                 list_del_init(&cset->mg_preload_node);
1976                 put_css_set_locked(cset, false);
1977         }
1978         up_write(&css_set_rwsem);
1979 }
1980
1981 /**
1982  * cgroup_migrate_add_src - add a migration source css_set
1983  * @src_cset: the source css_set to add
1984  * @dst_cgrp: the destination cgroup
1985  * @preloaded_csets: list of preloaded css_sets
1986  *
1987  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
1988  * @src_cset and add it to @preloaded_csets, which should later be cleaned
1989  * up by cgroup_migrate_finish().
1990  *
1991  * This function may be called without holding threadgroup_lock even if the
1992  * target is a process.  Threads may be created and destroyed but as long
1993  * as cgroup_mutex is not dropped, no new css_set can be put into play and
1994  * the preloaded css_sets are guaranteed to cover all migrations.
1995  */
1996 static void cgroup_migrate_add_src(struct css_set *src_cset,
1997                                    struct cgroup *dst_cgrp,
1998                                    struct list_head *preloaded_csets)
1999 {
2000         struct cgroup *src_cgrp;
2001
2002         lockdep_assert_held(&cgroup_mutex);
2003         lockdep_assert_held(&css_set_rwsem);
2004
2005         src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2006
2007         if (!list_empty(&src_cset->mg_preload_node))
2008                 return;
2009
2010         WARN_ON(src_cset->mg_src_cgrp);
2011         WARN_ON(!list_empty(&src_cset->mg_tasks));
2012         WARN_ON(!list_empty(&src_cset->mg_node));
2013
2014         src_cset->mg_src_cgrp = src_cgrp;
2015         get_css_set(src_cset);
2016         list_add(&src_cset->mg_preload_node, preloaded_csets);
2017 }
2018
2019 /**
2020  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2021  * @dst_cgrp: the destination cgroup (may be %NULL)
2022  * @preloaded_csets: list of preloaded source css_sets
2023  *
2024  * Tasks are about to be moved to @dst_cgrp and all the source css_sets
2025  * have been preloaded to @preloaded_csets.  This function looks up and
2026  * pins all destination css_sets, links each to its source, and appends them
2027  * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
2028  * source css_set is assumed to be its cgroup on the default hierarchy.
2029  *
2030  * This function must be called after cgroup_migrate_add_src() has been
2031  * called on each migration source css_set.  After migration is performed
2032  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2033  * @preloaded_csets.
      *
      * Returns 0 on success, -EBUSY if @dst_cgrp is a non-root cgroup on the
      * default hierarchy with a non-zero child_subsys_mask, or -ENOMEM if
      * find_css_set() fails.
2034  */
2035 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2036                                       struct list_head *preloaded_csets)
2037 {
2038         LIST_HEAD(csets);
2039         struct css_set *src_cset, *tmp_cset;
2040
2041         lockdep_assert_held(&cgroup_mutex);
2042
2043         /*
2044          * Except for the root, child_subsys_mask must be zero for a cgroup
2045          * with tasks so that child cgroups don't compete against tasks.
2046          */
2047         if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent &&
2048             dst_cgrp->child_subsys_mask)
2049                 return -EBUSY;
2050
2051         /* look up the dst cset for each src cset and link it to src */
2052         list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
2053                 struct css_set *dst_cset;
2054
2055                 dst_cset = find_css_set(src_cset,
2056                                         dst_cgrp ?: src_cset->dfl_cgrp);
2057                 if (!dst_cset)
2058                         goto err;
2059
2060                 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2061
2062                 /*
2063                  * If src cset equals dst, it's noop.  Drop the src.
2064                  * cgroup_migrate() will skip the cset too.  Note that we
2065                  * can't handle src == dst as some nodes are used by both.
2066                  */
2067                 if (src_cset == dst_cset) {
2068                         src_cset->mg_src_cgrp = NULL;
2069                         list_del_init(&src_cset->mg_preload_node);
                             /* one put for the preload pin, one for the lookup above */
2070                         put_css_set(src_cset, false);
2071                         put_css_set(dst_cset, false);
2072                         continue;
2073                 }
2074
2075                 src_cset->mg_dst_cset = dst_cset;
2076
                     /* keep one reference per dst cset on the list, put the extras */
2077                 if (list_empty(&dst_cset->mg_preload_node))
2078                         list_add(&dst_cset->mg_preload_node, &csets);
2079                 else
2080                         put_css_set(dst_cset, false);
2081         }
2082
2083         list_splice_tail(&csets, preloaded_csets);
2084         return 0;
2085 err:
             /*
              * Only the dst csets gathered on the local list are released
              * here; the source csets stay on @preloaded_csets for the
              * caller's cgroup_migrate_finish().
              */
2086         cgroup_migrate_finish(&csets);
2087         return -ENOMEM;
2088 }
2089
2090 /**
2091  * cgroup_migrate - migrate a process or task to a cgroup
2092  * @cgrp: the destination cgroup
2093  * @leader: the leader of the process or the task to migrate
2094  * @threadgroup: whether @leader points to the whole process or a single task
2095  *
2096  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
2097  * process, the caller must be holding threadgroup_lock of @leader.  The
2098  * caller is also responsible for invoking cgroup_migrate_add_src() and
2099  * cgroup_migrate_prepare_dst() on the targets before invoking this
2100  * function and following up with cgroup_migrate_finish().
2101  *
2102  * As long as a controller's ->can_attach() doesn't fail, this function is
2103  * guaranteed to succeed.  This means that, excluding ->can_attach()
2104  * failure, when migrating multiple targets, the success or failure can be
2105  * decided for all targets by invoking cgroup_migrate_prepare_dst()
2106  * before actually starting migrating.
      *
      * Returns 0 on success (including when no task needed migrating) or the
      * error returned by a failing ->can_attach().
2107  */
2108 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2109                           bool threadgroup)
2110 {
2111         struct cgroup_taskset tset = {
2112                 .src_csets      = LIST_HEAD_INIT(tset.src_csets),
2113                 .dst_csets      = LIST_HEAD_INIT(tset.dst_csets),
2114                 .csets          = &tset.src_csets,
2115         };
2116         struct cgroup_subsys_state *css, *failed_css = NULL;
2117         struct css_set *cset, *tmp_cset;
2118         struct task_struct *task, *tmp_task;
2119         int i, ret;
2120
2121         /*
2122          * Prevent freeing of tasks while we take a snapshot. Tasks that are
2123          * already PF_EXITING could be freed from underneath us unless we
2124          * take an rcu_read_lock.
2125          */
2126         down_write(&css_set_rwsem);
2127         rcu_read_lock();
2128         task = leader;
2129         do {
2130                 /* @task either already exited or can't exit until the end */
2131                 if (task->flags & PF_EXITING)
2132                         goto next;
2133
2134                 /* leave @task alone if post_fork() hasn't linked it yet */
2135                 if (list_empty(&task->cg_list))
2136                         goto next;
2137
                     /* csets without ->mg_src_cgrp were not set up as migration sources */
2138                 cset = task_css_set(task);
2139                 if (!cset->mg_src_cgrp)
2140                         goto next;
2141
2142                 /*
2143                  * cgroup_taskset_first() must always return the leader.
2144                  * Take care to avoid disturbing the ordering.
2145                  */
2146                 list_move_tail(&task->cg_list, &cset->mg_tasks);
2147                 if (list_empty(&cset->mg_node))
2148                         list_add_tail(&cset->mg_node, &tset.src_csets);
2149                 if (list_empty(&cset->mg_dst_cset->mg_node))
2150                         list_move_tail(&cset->mg_dst_cset->mg_node,
2151                                        &tset.dst_csets);
2152         next:
2153                 if (!threadgroup)
2154                         break;
2155         } while_each_thread(leader, task);
2156         rcu_read_unlock();
2157         up_write(&css_set_rwsem);
2158
2159         /* methods shouldn't be called if no task is actually migrating */
2160         if (list_empty(&tset.src_csets))
2161                 return 0;
2162
2163         /* check that we can legitimately attach to the cgroup */
2164         for_each_e_css(css, i, cgrp) {
2165                 if (css->ss->can_attach) {
2166                         ret = css->ss->can_attach(css, &tset);
2167                         if (ret) {
2168                                 failed_css = css;
2169                                 goto out_cancel_attach;
2170                         }
2171                 }
2172         }
2173
2174         /*
2175          * Now that we're guaranteed success, proceed to move all tasks to
2176          * the new cgroup.  There are no failure cases after here, so this
2177          * is the commit point.
2178          */
2179         down_write(&css_set_rwsem);
2180         list_for_each_entry(cset, &tset.src_csets, mg_node) {
2181                 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2182                         cgroup_task_migrate(cset->mg_src_cgrp, task,
2183                                             cset->mg_dst_cset);
2184         }
2185         up_write(&css_set_rwsem);
2186
2187         /*
2188          * Migration is committed, all target tasks are now on dst_csets.
2189          * Nothing is sensitive to fork() after this point.  Notify
2190          * controllers that migration is complete.
2191          */
2192         tset.csets = &tset.dst_csets;
2193
2194         for_each_e_css(css, i, cgrp)
2195                 if (css->ss->attach)
2196                         css->ss->attach(css, &tset);
2197
2198         ret = 0;
2199         goto out_release_tset;
2200
2201 out_cancel_attach:
             /* undo only the csses visited before the one whose ->can_attach() failed */
2202         for_each_e_css(css, i, cgrp) {
2203                 if (css == failed_css)
2204                         break;
2205                 if (css->ss->cancel_attach)
2206                         css->ss->cancel_attach(css, &tset);
2207         }
2208 out_release_tset:
             /*
              * Dissolve the taskset: tasks go from ->mg_tasks back onto their
              * cset's ->tasks list and every cset leaves the src/dst lists.
              */
2209         down_write(&css_set_rwsem);
2210         list_splice_init(&tset.dst_csets, &tset.src_csets);
2211         list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2212                 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2213                 list_del_init(&cset->mg_node);
2214         }
2215         up_write(&css_set_rwsem);
2216         return ret;
2217 }
2218
2219 /**
2220  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2221  * @dst_cgrp: the cgroup to attach to
2222  * @leader: the task or the leader of the threadgroup to be attached
2223  * @threadgroup: attach the whole threadgroup?
2224  *
2225  * Call holding cgroup_mutex and threadgroup_lock of @leader.
2226  */
2227 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2228                               struct task_struct *leader, bool threadgroup)
2229 {
2230         LIST_HEAD(preloaded_csets);
2231         struct task_struct *task;
2232         int ret;
2233
2234         /* look up all src csets */
2235         down_read(&css_set_rwsem);
2236         rcu_read_lock();
2237         task = leader;
2238         do {
2239                 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2240                                        &preloaded_csets);
2241                 if (!threadgroup)
2242                         break;
2243         } while_each_thread(leader, task);
2244         rcu_read_unlock();
2245         up_read(&css_set_rwsem);
2246
2247         /* prepare dst csets and commit */
2248         ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2249         if (!ret)
2250                 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2251
2252         cgroup_migrate_finish(&preloaded_csets);
2253         return ret;
2254 }
2255
2256 /*
2257  * Find the task_struct of the task to attach by vpid and pass it along to the
2258  * function to attach either it or all tasks in its threadgroup. Will lock
2259  * cgroup_mutex and threadgroup.
      *
      * @buf holds the pid as text; a pid of 0 selects the current task.  With
      * @threadgroup set, the group leader of the selected task is attached
      * together with its whole thread group.
      *
      * Returns @nbytes on success.  Errors: -EINVAL for an unparsable or
      * negative pid and for kthreadd / PF_NO_SETAFFINITY tasks, -ENODEV if
      * cgroup_kn_lock_live() fails, -ESRCH if no task has that pid, -EACCES
      * if the euid check against the target fails, or the error returned by
      * cgroup_attach_task().
2260  */
2261 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2262                                     size_t nbytes, loff_t off, bool threadgroup)
2263 {
2264         struct task_struct *tsk;
2265         const struct cred *cred = current_cred(), *tcred;
2266         struct cgroup *cgrp;
2267         pid_t pid;
2268         int ret;
2269
2270         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2271                 return -EINVAL;
2272
2273         cgrp = cgroup_kn_lock_live(of->kn);
2274         if (!cgrp)
2275                 return -ENODEV;
2276
2277 retry_find_task:
2278         rcu_read_lock();
2279         if (pid) {
2280                 tsk = find_task_by_vpid(pid);
2281                 if (!tsk) {
2282                         rcu_read_unlock();
2283                         ret = -ESRCH;
2284                         goto out_unlock_cgroup;
2285                 }
2286                 /*
2287                  * even if we're attaching all tasks in the thread group, we
2288                  * only need to check permissions on one of them.
2289                  */
2290                 tcred = __task_cred(tsk);
2291                 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2292                     !uid_eq(cred->euid, tcred->uid) &&
2293                     !uid_eq(cred->euid, tcred->suid)) {
2294                         rcu_read_unlock();
2295                         ret = -EACCES;
2296                         goto out_unlock_cgroup;
2297                 }
2298         } else
2299                 tsk = current;
2300
2301         if (threadgroup)
2302                 tsk = tsk->group_leader;
2303
2304         /*
2305          * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2306          * trapped in a cpuset, or RT worker may be born in a cgroup
2307          * with no rt_runtime allocated.  Just say no.
2308          */
2309         if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2310                 ret = -EINVAL;
2311                 rcu_read_unlock();
2312                 goto out_unlock_cgroup;
2313         }
2314
             /* pin the task so it stays valid once the RCU read section ends */
2315         get_task_struct(tsk);
2316         rcu_read_unlock();
2317
2318         threadgroup_lock(tsk);
2319         if (threadgroup) {
2320                 if (!thread_group_leader(tsk)) {
2321                         /*
2322                          * a race with de_thread from another thread's exec()
2323                          * may strip us of our leadership, if this happens,
2324                          * there is no choice but to throw this task away and
2325                          * try again; this is
2326                          * "double-double-toil-and-trouble-check locking".
2327                          */
2328                         threadgroup_unlock(tsk);
2329                         put_task_struct(tsk);
2330                         goto retry_find_task;
2331                 }
2332         }
2333
2334         ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2335
2336         threadgroup_unlock(tsk);
2337
2338         put_task_struct(tsk);
2339 out_unlock_cgroup:
2340         cgroup_kn_unlock(of->kn);
             /* a zero ret means success: report the whole write as consumed */
2341         return ret ?: nbytes;
2342 }
2343
2344 /**
2345  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2346  * @from: attach to all cgroups of a given task
2347  * @tsk: the task to be attached
2348  */
2349 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2350 {
2351         struct cgroup_root *root;
2352         int retval = 0;
2353
2354         mutex_lock(&cgroup_mutex);
2355         for_each_root(root) {
2356                 struct cgroup *from_cgrp;
2357
2358                 if (root == &cgrp_dfl_root)
2359                         continue;
2360
2361                 down_read(&css_set_rwsem);
2362                 from_cgrp = task_cgroup_from_root(from, root);
2363                 up_read(&css_set_rwsem);
2364
2365                 retval = cgroup_attach_task(from_cgrp, tsk, false);
2366                 if (retval)
2367                         break;
2368         }
2369         mutex_unlock(&cgroup_mutex);
2370
2371         return retval;
2372 }
2373 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2374
2375 static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2376                                   char *buf, size_t nbytes, loff_t off)
2377 {
2378         return __cgroup_procs_write(of, buf, nbytes, off, false);
2379 }
2380
2381 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2382                                   char *buf, size_t nbytes, loff_t off)
2383 {
2384         return __cgroup_procs_write(of, buf, nbytes, off, true);
2385 }
2386
2387 static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2388                                           char *buf, size_t nbytes, loff_t off)
2389 {
2390         struct cgroup *cgrp;
2391
2392         BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2393
2394         cgrp = cgroup_kn_lock_live(of->kn);
2395         if (!cgrp)
2396                 return -ENODEV;
2397         spin_lock(&release_agent_path_lock);
2398         strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2399                 sizeof(cgrp->root->release_agent_path));
2400         spin_unlock(&release_agent_path_lock);
2401         cgroup_kn_unlock(of->kn);
2402         return nbytes;
2403 }
2404
2405 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2406 {
2407         struct cgroup *cgrp = seq_css(seq)->cgroup;
2408
2409         spin_lock(&release_agent_path_lock);
2410         seq_puts(seq, cgrp->root->release_agent_path);
2411         spin_unlock(&release_agent_path_lock);
2412         seq_putc(seq, '\n');
2413         return 0;
2414 }
2415
2416 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2417 {
2418         struct cgroup *cgrp = seq_css(seq)->cgroup;
2419
2420         seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2421         return 0;
2422 }
2423
2424 static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2425 {
2426         struct cgroup_subsys *ss;
2427         bool printed = false;
2428         int ssid;
2429
2430         for_each_subsys(ss, ssid) {
2431                 if (ss_mask & (1 << ssid)) {
2432                         if (printed)
2433                                 seq_putc(seq, ' ');
2434                         seq_printf(seq, "%s", ss->name);
2435                         printed = true;
2436                 }
2437         }
2438         if (printed)
2439                 seq_putc(seq, '\n');
2440 }
2441
2442 /* show controllers which are currently attached to the default hierarchy */
2443 static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2444 {
2445         struct cgroup *cgrp = seq_css(seq)->cgroup;
2446
2447         cgroup_print_ss_mask(seq, cgrp->root->subsys_mask);
2448         return 0;
2449 }
2450
2451 /* show controllers which are enabled from the parent */
2452 static int cgroup_controllers_show(struct seq_file *seq, void *v)
2453 {
2454         struct cgroup *cgrp = seq_css(seq)->cgroup;
2455
2456         cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask);
2457         return 0;
2458 }
2459
2460 /* show controllers which are enabled for a given cgroup's children */
2461 static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2462 {
2463         struct cgroup *cgrp = seq_css(seq)->cgroup;
2464
2465         cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
2466         return 0;
2467 }
2468
2469 /**
2470  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2471  * @cgrp: root of the subtree to update csses for
2472  *
2473  * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2474  * css associations need to be updated accordingly.  This function looks up
2475  * all css_sets which are attached to the subtree, creates the matching
2476  * updated css_sets and migrates the tasks to the new ones.
2477  */
2478 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2479 {
2480         LIST_HEAD(preloaded_csets);
2481         struct cgroup_subsys_state *css;
2482         struct css_set *src_cset;
2483         int ret;
2484
2485         lockdep_assert_held(&cgroup_mutex);
2486
2487         /* look up all csses currently attached to @cgrp's subtree */
2488         down_read(&css_set_rwsem);
2489         css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2490                 struct cgrp_cset_link *link;
2491
2492                 /* self is not affected by child_subsys_mask change */
2493                 if (css->cgroup == cgrp)
2494                         continue;
2495
2496                 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2497                         cgroup_migrate_add_src(link->cset, cgrp,
2498                                                &preloaded_csets);
2499         }
2500         up_read(&css_set_rwsem);
2501
2502         /* NULL dst indicates self on default hierarchy */
2503         ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2504         if (ret)
2505                 goto out_finish;
2506
2507         list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2508                 struct task_struct *last_task = NULL, *task;
2509
2510                 /* src_csets precede dst_csets, break on the first dst_cset */
2511                 if (!src_cset->mg_src_cgrp)
2512                         break;
2513
2514                 /*
2515                  * All tasks in src_cset need to be migrated to the
2516                  * matching dst_cset.  Empty it process by process.  We
2517                  * walk tasks but migrate processes.  The leader might even
2518                  * belong to a different cset but such src_cset would also
2519                  * be among the target src_csets because the default
2520                  * hierarchy enforces per-process membership.
2521                  */
2522                 while (true) {
2523                         down_read(&css_set_rwsem);
2524                         task = list_first_entry_or_null(&src_cset->tasks,
2525                                                 struct task_struct, cg_list);
2526                         if (task) {
2527                                 task = task->group_leader;
2528                                 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2529                                 get_task_struct(task);
2530                         }
2531                         up_read(&css_set_rwsem);
2532
2533                         if (!task)
2534                                 break;
2535
2536                         /* guard against possible infinite loop */
2537                         if (WARN(last_task == task,
2538                                  "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2539                                 goto out_finish;
2540                         last_task = task;
2541
2542                         threadgroup_lock(task);
2543                         /* raced against de_thread() from another thread? */
2544                         if (!thread_group_leader(task)) {
2545                                 threadgroup_unlock(task);
2546                                 put_task_struct(task);
2547                                 continue;
2548                         }
2549
2550                         ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2551
2552                         threadgroup_unlock(task);
2553                         put_task_struct(task);
2554
2555                         if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2556                                 goto out_finish;
2557                 }
2558         }
2559
2560 out_finish:
2561         cgroup_migrate_finish(&preloaded_csets);
2562         return ret;
2563 }
2564
2565 /* change the enabled child controllers for a cgroup in the default hierarchy */
2566 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2567                                             char *buf, size_t nbytes,
2568                                             loff_t off)
2569 {
2570         unsigned int enable = 0, disable = 0;
2571         struct cgroup *cgrp, *child;
2572         struct cgroup_subsys *ss;
2573         char *tok;
2574         int ssid, ret;
2575
2576         /*
2577          * Parse input - space separated list of subsystem names prefixed
2578          * with either + or -.
2579          */
2580         buf = strstrip(buf);
2581         while ((tok = strsep(&buf, " "))) {
2582                 if (tok[0] == '\0')
2583                         continue;
2584                 for_each_subsys(ss, ssid) {
2585                         if (ss->disabled || strcmp(tok + 1, ss->name))
2586                                 continue;
2587
2588                         if (*tok == '+') {
2589                                 enable |= 1 << ssid;
2590                                 disable &= ~(1 << ssid);
2591                         } else if (*tok == '-') {
2592                                 disable |= 1 << ssid;
2593                                 enable &= ~(1 << ssid);
2594                         } else {
2595                                 return -EINVAL;
2596                         }
2597                         break;
2598                 }
2599                 if (ssid == CGROUP_SUBSYS_COUNT)
2600                         return -EINVAL;
2601         }
2602
2603         cgrp = cgroup_kn_lock_live(of->kn);
2604         if (!cgrp)
2605                 return -ENODEV;
2606
2607         for_each_subsys(ss, ssid) {
2608                 if (enable & (1 << ssid)) {
2609                         if (cgrp->child_subsys_mask & (1 << ssid)) {
2610                                 enable &= ~(1 << ssid);
2611                                 continue;
2612                         }
2613
2614                         /*
2615                          * Because css offlining is asynchronous, userland
2616                          * might try to re-enable the same controller while
2617                          * the previous instance is still around.  In such
2618                          * cases, wait till it's gone using offline_waitq.
2619                          */
2620                         cgroup_for_each_live_child(child, cgrp) {
2621                                 DEFINE_WAIT(wait);
2622
2623                                 if (!cgroup_css(child, ss))
2624                                         continue;
2625
2626                                 cgroup_get(child);
2627                                 prepare_to_wait(&child->offline_waitq, &wait,
2628                                                 TASK_UNINTERRUPTIBLE);
2629                                 cgroup_kn_unlock(of->kn);
2630                                 schedule();
2631                                 finish_wait(&child->offline_waitq, &wait);
2632                                 cgroup_put(child);
2633
2634                                 return restart_syscall();
2635                         }
2636
2637                         /* unavailable or not enabled on the parent? */
2638                         if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2639                             (cgrp->parent &&
2640                              !(cgrp->parent->child_subsys_mask & (1 << ssid)))) {
2641                                 ret = -ENOENT;
2642                                 goto out_unlock;
2643                         }
2644                 } else if (disable & (1 << ssid)) {
2645                         if (!(cgrp->child_subsys_mask & (1 << ssid))) {
2646                                 disable &= ~(1 << ssid);
2647                                 continue;
2648                         }
2649
2650                         /* a child has it enabled? */
2651                         cgroup_for_each_live_child(child, cgrp) {
2652                                 if (child->child_subsys_mask & (1 << ssid)) {
2653                                         ret = -EBUSY;
2654                                         goto out_unlock;
2655                                 }
2656                         }
2657                 }
2658         }
2659
2660         if (!enable && !disable) {
2661                 ret = 0;
2662                 goto out_unlock;
2663         }
2664
2665         /*
2666          * Except for the root, child_subsys_mask must be zero for a cgroup
2667          * with tasks so that child cgroups don't compete against tasks.
2668          */
2669         if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) {
2670                 ret = -EBUSY;
2671                 goto out_unlock;
2672         }
2673
2674         /*
2675          * Create csses for enables and update child_subsys_mask.  This
2676          * changes cgroup_e_css() results which in turn makes the
2677          * subsequent cgroup_update_dfl_csses() associate all tasks in the
2678          * subtree to the updated csses.
2679          */
2680         for_each_subsys(ss, ssid) {
2681                 if (!(enable & (1 << ssid)))
2682                         continue;
2683
2684                 cgroup_for_each_live_child(child, cgrp) {
2685                         ret = create_css(child, ss);
2686                         if (ret)
2687                                 goto err_undo_css;
2688                 }
2689         }
2690
2691         cgrp->child_subsys_mask |= enable;
2692         cgrp->child_subsys_mask &= ~disable;
2693
2694         ret = cgroup_update_dfl_csses(cgrp);
2695         if (ret)
2696                 goto err_undo_css;
2697
2698         /* all tasks are now migrated away from the old csses, kill them */
2699         for_each_subsys(ss, ssid) {
2700                 if (!(disable & (1 << ssid)))
2701                         continue;
2702
2703                 cgroup_for_each_live_child(child, cgrp)
2704                         kill_css(cgroup_css(child, ss));
2705         }
2706
2707         kernfs_activate(cgrp->kn);
2708         ret = 0;
2709 out_unlock:
2710         cgroup_kn_unlock(of->kn);
2711         return ret ?: nbytes;
2712
2713 err_undo_css:
2714         cgrp->child_subsys_mask &= ~enable;
2715         cgrp->child_subsys_mask |= disable;
2716
2717         for_each_subsys(ss, ssid) {
2718                 if (!(enable & (1 << ssid)))
2719                         continue;
2720
2721                 cgroup_for_each_live_child(child, cgrp) {
2722                         struct cgroup_subsys_state *css = cgroup_css(child, ss);
2723                         if (css)
2724                                 kill_css(css);
2725                 }
2726         }
2727         goto out_unlock;
2728 }
2729
2730 static int cgroup_populated_show(struct seq_file *seq, void *v)
2731 {
2732         seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2733         return 0;
2734 }
2735
2736 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2737                                  size_t nbytes, loff_t off)
2738 {
2739         struct cgroup *cgrp = of->kn->parent->priv;
2740         struct cftype *cft = of->kn->priv;
2741         struct cgroup_subsys_state *css;
2742         int ret;
2743
2744         if (cft->write)
2745                 return cft->write(of, buf, nbytes, off);
2746
2747         /*
2748          * kernfs guarantees that a file isn't deleted with operations in
2749          * flight, which means that the matching css is and stays alive and
2750          * doesn't need to be pinned.  The RCU locking is not necessary
2751          * either.  It's just for the convenience of using cgroup_css().
2752          */
2753         rcu_read_lock();
2754         css = cgroup_css(cgrp, cft->ss);
2755         rcu_read_unlock();
2756
2757         if (cft->write_u64) {
2758                 unsigned long long v;
2759                 ret = kstrtoull(buf, 0, &v);
2760                 if (!ret)
2761                         ret = cft->write_u64(css, cft, v);
2762         } else if (cft->write_s64) {
2763                 long long v;
2764                 ret = kstrtoll(buf, 0, &v);
2765                 if (!ret)
2766                         ret = cft->write_s64(css, cft, v);
2767         } else {
2768                 ret = -EINVAL;
2769         }
2770
2771         return ret ?: nbytes;
2772 }
2773
2774 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2775 {
2776         return seq_cft(seq)->seq_start(seq, ppos);
2777 }
2778
2779 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2780 {
2781         return seq_cft(seq)->seq_next(seq, v, ppos);
2782 }
2783
2784 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2785 {
2786         seq_cft(seq)->seq_stop(seq, v);
2787 }
2788
2789 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2790 {
2791         struct cftype *cft = seq_cft(m);
2792         struct cgroup_subsys_state *css = seq_css(m);
2793
2794         if (cft->seq_show)
2795                 return cft->seq_show(m, arg);
2796
2797         if (cft->read_u64)
2798                 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2799         else if (cft->read_s64)
2800                 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2801         else
2802                 return -EINVAL;
2803         return 0;
2804 }
2805
2806 static struct kernfs_ops cgroup_kf_single_ops = {
2807         .atomic_write_len       = PAGE_SIZE,
2808         .write                  = cgroup_file_write,
2809         .seq_show               = cgroup_seqfile_show,
2810 };
2811
2812 static struct kernfs_ops cgroup_kf_ops = {
2813         .atomic_write_len       = PAGE_SIZE,
2814         .write                  = cgroup_file_write,
2815         .seq_start              = cgroup_seqfile_start,
2816         .seq_next               = cgroup_seqfile_next,
2817         .seq_stop               = cgroup_seqfile_stop,
2818         .seq_show               = cgroup_seqfile_show,
2819 };
2820
2821 /*
2822  * cgroup_rename - Only allow simple rename of directories in place.
2823  */
2824 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2825                          const char *new_name_str)
2826 {
2827         struct cgroup *cgrp = kn->priv;
2828         int ret;
2829
2830         if (kernfs_type(kn) != KERNFS_DIR)
2831                 return -ENOTDIR;
2832         if (kn->parent != new_parent)
2833                 return -EIO;
2834
2835         /*
2836          * This isn't a proper migration and its usefulness is very
2837          * limited.  Disallow if sane_behavior.
2838          */
2839         if (cgroup_sane_behavior(cgrp))
2840                 return -EPERM;
2841
2842         /*
2843          * We're gonna grab cgroup_mutex which nests outside kernfs
2844          * active_ref.  kernfs_rename() doesn't require active_ref
2845          * protection.  Break them before grabbing cgroup_mutex.
2846          */
2847         kernfs_break_active_protection(new_parent);
2848         kernfs_break_active_protection(kn);
2849
2850         mutex_lock(&cgroup_mutex);
2851
2852         ret = kernfs_rename(kn, new_parent, new_name_str);
2853
2854         mutex_unlock(&cgroup_mutex);
2855
2856         kernfs_unbreak_active_protection(kn);
2857         kernfs_unbreak_active_protection(new_parent);
2858         return ret;
2859 }
2860
2861 /* set uid and gid of cgroup dirs and files to that of the creator */
2862 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
2863 {
2864         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
2865                                .ia_uid = current_fsuid(),
2866                                .ia_gid = current_fsgid(), };
2867
2868         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
2869             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
2870                 return 0;
2871
2872         return kernfs_setattr(kn, &iattr);
2873 }
2874
2875 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2876 {
2877         char name[CGROUP_FILE_NAME_MAX];
2878         struct kernfs_node *kn;
2879         struct lock_class_key *key = NULL;
2880         int ret;
2881
2882 #ifdef CONFIG_DEBUG_LOCK_ALLOC
2883         key = &cft->lockdep_key;
2884 #endif
2885         kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2886                                   cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2887                                   NULL, false, key);
2888         if (IS_ERR(kn))
2889                 return PTR_ERR(kn);
2890
2891         ret = cgroup_kn_set_ugid(kn);
2892         if (ret) {
2893                 kernfs_remove(kn);
2894                 return ret;
2895         }
2896
2897         if (cft->seq_show == cgroup_populated_show)
2898                 cgrp->populated_kn = kn;
2899         return 0;
2900 }
2901
2902 /**
2903  * cgroup_addrm_files - add or remove files to a cgroup directory
2904  * @cgrp: the target cgroup
2905  * @cfts: array of cftypes to be added
2906  * @is_add: whether to add or remove
2907  *
2908  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2909  * For removals, this function never fails.  If addition fails, this
2910  * function doesn't remove files already added.  The caller is responsible
2911  * for cleaning up.
2912  */
2913 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2914                               bool is_add)
2915 {
2916         struct cftype *cft;
2917         int ret;
2918
2919         lockdep_assert_held(&cgroup_mutex);
2920
2921         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2922                 /* does cft->flags tell us to skip this file on @cgrp? */
2923                 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2924                         continue;
2925                 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2926                         continue;
2927                 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2928                         continue;
2929                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2930                         continue;
2931
2932                 if (is_add) {
2933                         ret = cgroup_add_file(cgrp, cft);
2934                         if (ret) {
2935                                 pr_warn("%s: failed to add %s, err=%d\n",
2936                                         __func__, cft->name, ret);
2937                                 return ret;
2938                         }
2939                 } else {
2940                         cgroup_rm_file(cgrp, cft);
2941                 }
2942         }
2943         return 0;
2944 }
2945
2946 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2947 {
2948         LIST_HEAD(pending);
2949         struct cgroup_subsys *ss = cfts[0].ss;
2950         struct cgroup *root = &ss->root->cgrp;
2951         struct cgroup_subsys_state *css;
2952         int ret = 0;
2953
2954         lockdep_assert_held(&cgroup_mutex);
2955
2956         /* add/rm files for all cgroups created before */
2957         css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2958                 struct cgroup *cgrp = css->cgroup;
2959
2960                 if (cgroup_is_dead(cgrp))
2961                         continue;
2962
2963                 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2964                 if (ret)
2965                         break;
2966         }
2967
2968         if (is_add && !ret)
2969                 kernfs_activate(root->kn);
2970         return ret;
2971 }
2972
2973 static void cgroup_exit_cftypes(struct cftype *cfts)
2974 {
2975         struct cftype *cft;
2976
2977         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2978                 /* free copy for custom atomic_write_len, see init_cftypes() */
2979                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2980                         kfree(cft->kf_ops);
2981                 cft->kf_ops = NULL;
2982                 cft->ss = NULL;
2983         }
2984 }
2985
2986 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2987 {
2988         struct cftype *cft;
2989
2990         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2991                 struct kernfs_ops *kf_ops;
2992
2993                 WARN_ON(cft->ss || cft->kf_ops);
2994
2995                 if (cft->seq_start)
2996                         kf_ops = &cgroup_kf_ops;
2997                 else
2998                         kf_ops = &cgroup_kf_single_ops;
2999
3000                 /*
3001                  * Ugh... if @cft wants a custom max_write_len, we need to
3002                  * make a copy of kf_ops to set its atomic_write_len.
3003                  */
3004                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
3005                         kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
3006                         if (!kf_ops) {
3007                                 cgroup_exit_cftypes(cfts);
3008                                 return -ENOMEM;
3009                         }
3010                         kf_ops->atomic_write_len = cft->max_write_len;
3011                 }
3012
3013                 cft->kf_ops = kf_ops;
3014                 cft->ss = ss;
3015         }
3016
3017         return 0;
3018 }
3019
3020 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
3021 {
3022         lockdep_assert_held(&cgroup_mutex);
3023
3024         if (!cfts || !cfts[0].ss)
3025                 return -ENOENT;
3026
3027         list_del(&cfts->node);
3028         cgroup_apply_cftypes(cfts, false);
3029         cgroup_exit_cftypes(cfts);
3030         return 0;
3031 }
3032
3033 /**
3034  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
3035  * @cfts: zero-length name terminated array of cftypes
3036  *
3037  * Unregister @cfts.  Files described by @cfts are removed from all
3038  * existing cgroups and all future cgroups won't have them either.  This
3039  * function can be called anytime whether @cfts' subsys is attached or not.
3040  *
3041  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
3042  * registered.
3043  */
3044 int cgroup_rm_cftypes(struct cftype *cfts)
3045 {
3046         int ret;
3047
3048         mutex_lock(&cgroup_mutex);
3049         ret = cgroup_rm_cftypes_locked(cfts);
3050         mutex_unlock(&cgroup_mutex);
3051         return ret;
3052 }
3053
3054 /**
3055  * cgroup_add_cftypes - add an array of cftypes to a subsystem
3056  * @ss: target cgroup subsystem
3057  * @cfts: zero-length name terminated array of cftypes
3058  *
3059  * Register @cfts to @ss.  Files described by @cfts are created for all
3060  * existing cgroups to which @ss is attached and all future cgroups will
3061  * have them too.  This function can be called anytime whether @ss is
3062  * attached or not.
3063  *
3064  * Returns 0 on successful registration, -errno on failure.  Note that this
3065  * function currently returns 0 as long as @cfts registration is successful
3066  * even if some file creation attempts on existing cgroups fail.
3067  */
3068 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3069 {
3070         int ret;
3071
3072         if (!cfts || cfts[0].name[0] == '\0')
3073                 return 0;
3074
3075         ret = cgroup_init_cftypes(ss, cfts);
3076         if (ret)
3077                 return ret;
3078
3079         mutex_lock(&cgroup_mutex);
3080
3081         list_add_tail(&cfts->node, &ss->cfts);
3082         ret = cgroup_apply_cftypes(cfts, true);
3083         if (ret)
3084                 cgroup_rm_cftypes_locked(cfts);
3085
3086         mutex_unlock(&cgroup_mutex);
3087         return ret;
3088 }
3089
3090 /**
3091  * cgroup_task_count - count the number of tasks in a cgroup.
3092  * @cgrp: the cgroup in question
3093  *
3094  * Return the number of tasks in the cgroup.
3095  */
3096 static int cgroup_task_count(const struct cgroup *cgrp)
3097 {
3098         int count = 0;
3099         struct cgrp_cset_link *link;
3100
3101         down_read(&css_set_rwsem);
3102         list_for_each_entry(link, &cgrp->cset_links, cset_link)
3103                 count += atomic_read(&link->cset->refcount);
3104         up_read(&css_set_rwsem);
3105         return count;
3106 }
3107
3108 /**
3109  * css_next_child - find the next child of a given css
3110  * @pos_css: the current position (%NULL to initiate traversal)
3111  * @parent_css: css whose children to walk
3112  *
3113  * This function returns the next child of @parent_css and should be called
3114  * under either cgroup_mutex or RCU read lock.  The only requirement is
3115  * that @parent_css and @pos_css are accessible.  The next sibling is
3116  * guaranteed to be returned regardless of their states.
3117  */
3118 struct cgroup_subsys_state *
3119 css_next_child(struct cgroup_subsys_state *pos_css,
3120                struct cgroup_subsys_state *parent_css)
3121 {
3122         struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3123         struct cgroup *cgrp = parent_css->cgroup;
3124         struct cgroup *next;
3125
3126         cgroup_assert_mutex_or_rcu_locked();
3127
3128         /*
3129          * @pos could already have been removed.  Once a cgroup is removed,
3130          * its ->sibling.next is no longer updated when its next sibling
3131          * changes.  As CGRP_DEAD assertion is serialized and happens
3132          * before the cgroup is taken off the ->sibling list, if we see it
3133          * unasserted, it's guaranteed that the next sibling hasn't
3134          * finished its grace period even if it's already removed, and thus
3135          * safe to dereference from this RCU critical section.  If
3136          * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3137          * to be visible as %true here.
3138          *
3139          * If @pos is dead, its next pointer can't be dereferenced;
3140          * however, as each cgroup is given a monotonically increasing
3141          * unique serial number and always appended to the sibling list,
3142          * the next one can be found by walking the parent's children until
3143          * we see a cgroup with higher serial number than @pos's.  While
3144          * this path can be slower, it's taken only when either the current
3145          * cgroup is removed or iteration and removal race.
3146          */
3147         if (!pos) {
3148                 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3149         } else if (likely(!cgroup_is_dead(pos))) {
3150                 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3151         } else {
3152                 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3153                         if (next->serial_nr > pos->serial_nr)
3154                                 break;
3155         }
3156
3157         /*
3158          * @next, if not pointing to the head, can be dereferenced and is
3159          * the next sibling; however, it might have @ss disabled.  If so,
3160          * fast-forward to the next enabled one.
3161          */
3162         while (&next->sibling != &cgrp->children) {
3163                 struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss);
3164
3165                 if (next_css)
3166                         return next_css;
3167                 next = list_entry_rcu(next->sibling.next, struct cgroup, sibling);
3168         }
3169         return NULL;
3170 }
3171
3172 /**
3173  * css_next_descendant_pre - find the next descendant for pre-order walk
3174  * @pos: the current position (%NULL to initiate traversal)
3175  * @root: css whose descendants to walk
3176  *
3177  * To be used by css_for_each_descendant_pre().  Find the next descendant
3178  * to visit for pre-order traversal of @root's descendants.  @root is
3179  * included in the iteration and the first node to be visited.
3180  *
3181  * While this function requires cgroup_mutex or RCU read locking, it
3182  * doesn't require the whole traversal to be contained in a single critical
3183  * section.  This function will return the correct next descendant as long
3184  * as both @pos and @root are accessible and @pos is a descendant of @root.
3185  */
3186 struct cgroup_subsys_state *
3187 css_next_descendant_pre(struct cgroup_subsys_state *pos,
3188                         struct cgroup_subsys_state *root)
3189 {
3190         struct cgroup_subsys_state *next;
3191
3192         cgroup_assert_mutex_or_rcu_locked();
3193
3194         /* if first iteration, visit @root */
3195         if (!pos)
3196                 return root;
3197
3198         /* visit the first child if exists */
3199         next = css_next_child(NULL, pos);
3200         if (next)
3201                 return next;
3202
3203         /* no child, visit my or the closest ancestor's next sibling */
3204         while (pos != root) {
3205                 next = css_next_child(pos, css_parent(pos));
3206                 if (next)
3207                         return next;
3208                 pos = css_parent(pos);
3209         }
3210
3211         return NULL;
3212 }
3213
3214 /**
3215  * css_rightmost_descendant - return the rightmost descendant of a css
3216  * @pos: css of interest
3217  *
3218  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
3219  * is returned.  This can be used during pre-order traversal to skip
3220  * subtree of @pos.
3221  *
3222  * While this function requires cgroup_mutex or RCU read locking, it
3223  * doesn't require the whole traversal to be contained in a single critical
3224  * section.  This function will return the correct rightmost descendant as
3225  * long as @pos is accessible.
3226  */
3227 struct cgroup_subsys_state *
3228 css_rightmost_descendant(struct cgroup_subsys_state *pos)
3229 {
3230         struct cgroup_subsys_state *last, *tmp;
3231
3232         cgroup_assert_mutex_or_rcu_locked();
3233
3234         do {
3235                 last = pos;
3236                 /* ->prev isn't RCU safe, walk ->next till the end */
3237                 pos = NULL;
3238                 css_for_each_child(tmp, last)
3239                         pos = tmp;
3240         } while (pos);
3241
3242         return last;
3243 }
3244
3245 static struct cgroup_subsys_state *
3246 css_leftmost_descendant(struct cgroup_subsys_state *pos)
3247 {
3248         struct cgroup_subsys_state *last;
3249
3250         do {
3251                 last = pos;
3252                 pos = css_next_child(NULL, pos);
3253         } while (pos);
3254
3255         return last;
3256 }
3257
3258 /**
3259  * css_next_descendant_post - find the next descendant for post-order walk
3260  * @pos: the current position (%NULL to initiate traversal)
3261  * @root: css whose descendants to walk
3262  *
3263  * To be used by css_for_each_descendant_post().  Find the next descendant
3264  * to visit for post-order traversal of @root's descendants.  @root is
3265  * included in the iteration and the last node to be visited.
3266  *
3267  * While this function requires cgroup_mutex or RCU read locking, it
3268  * doesn't require the whole traversal to be contained in a single critical
3269  * section.  This function will return the correct next descendant as long
3270  * as both @pos and @cgroup are accessible and @pos is a descendant of
3271  * @cgroup.
3272  */
3273 struct cgroup_subsys_state *
3274 css_next_descendant_post(struct cgroup_subsys_state *pos,
3275                          struct cgroup_subsys_state *root)
3276 {
3277         struct cgroup_subsys_state *next;
3278
3279         cgroup_assert_mutex_or_rcu_locked();
3280
3281         /* if first iteration, visit leftmost descendant which may be @root */
3282         if (!pos)
3283                 return css_leftmost_descendant(root);
3284
3285         /* if we visited @root, we're done */
3286         if (pos == root)
3287                 return NULL;
3288
3289         /* if there's an unvisited sibling, visit its leftmost descendant */
3290         next = css_next_child(pos, css_parent(pos));
3291         if (next)
3292                 return css_leftmost_descendant(next);
3293
3294         /* no sibling left, visit parent */
3295         return css_parent(pos);
3296 }
3297
3298 /**
3299  * css_advance_task_iter - advance a task itererator to the next css_set
3300  * @it: the iterator to advance
3301  *
3302  * Advance @it to the next css_set to walk.
3303  */
3304 static void css_advance_task_iter(struct css_task_iter *it)
3305 {
3306         struct list_head *l = it->cset_pos;
3307         struct cgrp_cset_link *link;
3308         struct css_set *cset;
3309
3310         /* Advance to the next non-empty css_set */
3311         do {
3312                 l = l->next;
3313                 if (l == it->cset_head) {
3314                         it->cset_pos = NULL;
3315                         return;
3316                 }
3317
3318                 if (it->ss) {
3319                         cset = container_of(l, struct css_set,
3320                                             e_cset_node[it->ss->id]);
3321                 } else {
3322                         link = list_entry(l, struct cgrp_cset_link, cset_link);
3323                         cset = link->cset;
3324                 }
3325         } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
3326
3327         it->cset_pos = l;
3328
3329         if (!list_empty(&cset->tasks))
3330                 it->task_pos = cset->tasks.next;
3331         else
3332                 it->task_pos = cset->mg_tasks.next;
3333
3334         it->tasks_head = &cset->tasks;
3335         it->mg_tasks_head = &cset->mg_tasks;
3336 }
3337
3338 /**
3339  * css_task_iter_start - initiate task iteration
3340  * @css: the css to walk tasks of
3341  * @it: the task iterator to use
3342  *
3343  * Initiate iteration through the tasks of @css.  The caller can call
3344  * css_task_iter_next() to walk through the tasks until the function
3345  * returns NULL.  On completion of iteration, css_task_iter_end() must be
3346  * called.
3347  *
3348  * Note that this function acquires a lock which is released when the
3349  * iteration finishes.  The caller can't sleep while iteration is in
3350  * progress.
3351  */
3352 void css_task_iter_start(struct cgroup_subsys_state *css,
3353                          struct css_task_iter *it)
3354         __acquires(css_set_rwsem)
3355 {
3356         /* no one should try to iterate before mounting cgroups */
3357         WARN_ON_ONCE(!use_task_css_set_links);
3358
3359         down_read(&css_set_rwsem);
3360
3361         it->ss = css->ss;
3362
3363         if (it->ss)
3364                 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3365         else
3366                 it->cset_pos = &css->cgroup->cset_links;
3367
3368         it->cset_head = it->cset_pos;
3369
3370         css_advance_task_iter(it);
3371 }
3372
3373 /**
3374  * css_task_iter_next - return the next task for the iterator
3375  * @it: the task iterator being iterated
3376  *
3377  * The "next" function for task iteration.  @it should have been
3378  * initialized via css_task_iter_start().  Returns NULL when the iteration
3379  * reaches the end.
3380  */
3381 struct task_struct *css_task_iter_next(struct css_task_iter *it)
3382 {
3383         struct task_struct *res;
3384         struct list_head *l = it->task_pos;
3385
3386         /* If the iterator cg is NULL, we have no tasks */
3387         if (!it->cset_pos)
3388                 return NULL;
3389         res = list_entry(l, struct task_struct, cg_list);
3390
3391         /*
3392          * Advance iterator to find next entry.  cset->tasks is consumed
3393          * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
3394          * next cset.
3395          */
3396         l = l->next;
3397
3398         if (l == it->tasks_head)
3399                 l = it->mg_tasks_head->next;
3400
3401         if (l == it->mg_tasks_head)
3402                 css_advance_task_iter(it);
3403         else
3404                 it->task_pos = l;
3405
3406         return res;
3407 }
3408
/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start() by releasing the
 * read side of css_set_rwsem.  @it itself is not accessed.
 */
void css_task_iter_end(struct css_task_iter *it)
        __releases(css_set_rwsem)
{
        /* pairs with the down_read() in css_task_iter_start() */
        up_read(&css_set_rwsem);
}
3420
3421 /**
3422  * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
3423  * @to: cgroup to which the tasks will be moved
3424  * @from: cgroup in which the tasks currently reside
3425  *
3426  * Locking rules between cgroup_post_fork() and the migration path
3427  * guarantee that, if a task is forking while being migrated, the new child
3428  * is guaranteed to be either visible in the source cgroup after the
3429  * parent's migration is complete or put into the target cgroup.  No task
3430  * can slip out of migration through forking.
3431  */
3432 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3433 {
3434         LIST_HEAD(preloaded_csets);
3435         struct cgrp_cset_link *link;
3436         struct css_task_iter it;
3437         struct task_struct *task;
3438         int ret;
3439
3440         mutex_lock(&cgroup_mutex);
3441
3442         /* all tasks in @from are being moved, all csets are source */
3443         down_read(&css_set_rwsem);
3444         list_for_each_entry(link, &from->cset_links, cset_link)
3445                 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3446         up_read(&css_set_rwsem);
3447
3448         ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3449         if (ret)
3450                 goto out_err;
3451
3452         /*
3453          * Migrate tasks one-by-one until @form is empty.  This fails iff
3454          * ->can_attach() fails.
3455          */
3456         do {
3457                 css_task_iter_start(&from->self, &it);
3458                 task = css_task_iter_next(&it);
3459                 if (task)
3460                         get_task_struct(task);
3461                 css_task_iter_end(&it);
3462
3463                 if (task) {
3464                         ret = cgroup_migrate(to, task, false);
3465                         put_task_struct(task);
3466                 }
3467         } while (task && !ret);
3468 out_err:
3469         cgroup_migrate_finish(&preloaded_csets);
3470         mutex_unlock(&cgroup_mutex);
3471         return ret;
3472 }
3473
3474 /*
3475  * Stuff for reading the 'tasks'/'procs' files.
3476  *
3477  * Reading this file can return large amounts of data if a cgroup has
3478  * *lots* of attached tasks. So it may need several calls to read(),
3479  * but we cannot guarantee that the information we produce is correct
3480  * unless we produce it entirely atomically.
3481  *
3482  */
3483
/*
 * Which pidlist file are we talking about?  The value is stored in the
 * cftype's ->private and selects what pidlist_array_load() collects:
 * thread-group ids for CGROUP_FILE_PROCS, individual task pids for
 * CGROUP_FILE_TASKS.
 */
enum cgroup_filetype {
        CGROUP_FILE_PROCS,
        CGROUP_FILE_TASKS,
};
3489
/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
        /*
         * used to find which pidlist is wanted. doesn't change as long as
         * this particular list stays in the list.  @ns carries a reference
         * taken with get_pid_ns() at creation and dropped with
         * put_pid_ns() when the pidlist is destroyed.
         */
        struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
        /*
         * sorted array of pids (tasks) or tgids (procs); obtained from
         * pidlist_allocate() and released with pidlist_free()
         */
        pid_t *list;
        /* how many elements the above list has */
        int length;
        /* node on the owning cgroup's ->pidlists list */
        struct list_head links;
        /* pointer to the cgroup we belong to, for list removal purposes */
        struct cgroup *owner;
        /* for delayed destruction, see cgroup_pidlist_destroy_work_fn() */
        struct delayed_work destroy_dwork;
};
3513
3514 /*
3515  * The following two functions "fix" the issue where there are more pids
3516  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3517  * TODO: replace with a kernel-wide solution to this problem
3518  */
3519 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3520 static void *pidlist_allocate(int count)
3521 {
3522         if (PIDLIST_TOO_LARGE(count))
3523                 return vmalloc(count * sizeof(pid_t));
3524         else
3525                 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3526 }
3527
/*
 * Release an array obtained from pidlist_allocate().  The address itself
 * tells which allocator it came from, so no size needs to be remembered.
 */
static void pidlist_free(void *p)
{
        if (!is_vmalloc_addr(p)) {
                kfree(p);
                return;
        }

        vfree(p);
}
3535
3536 /*
3537  * Used to destroy all pidlists lingering waiting for destroy timer.  None
3538  * should be left afterwards.
3539  */
3540 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3541 {
3542         struct cgroup_pidlist *l, *tmp_l;
3543
3544         mutex_lock(&cgrp->pidlist_mutex);
3545         list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3546                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3547         mutex_unlock(&cgrp->pidlist_mutex);
3548
3549         flush_workqueue(cgroup_pidlist_destroy_wq);
3550         BUG_ON(!list_empty(&cgrp->pidlists));
3551 }
3552
3553 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3554 {
3555         struct delayed_work *dwork = to_delayed_work(work);
3556         struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3557                                                 destroy_dwork);
3558         struct cgroup_pidlist *tofree = NULL;
3559
3560         mutex_lock(&l->owner->pidlist_mutex);
3561
3562         /*
3563          * Destroy iff we didn't get queued again.  The state won't change
3564          * as destroy_dwork can only be queued while locked.
3565          */
3566         if (!delayed_work_pending(dwork)) {
3567                 list_del(&l->links);
3568                 pidlist_free(l->list);
3569                 put_pid_ns(l->key.ns);
3570                 tofree = l;
3571         }
3572
3573         mutex_unlock(&l->owner->pidlist_mutex);
3574         kfree(tofree);
3575 }
3576
/*
 * pidlist_uniq - strip duplicate entries out of a pid array, in place
 * @list: array of pids in which equal values are adjacent (e.g. sorted)
 * @length: number of elements in @list
 *
 * Only neighbouring elements are compared, so duplicates are removed only
 * when they sit next to each other.  The surviving elements are packed at
 * the front of @list in their original order.
 *
 * Returns the number of unique elements.
 */
static int pidlist_uniq(pid_t *list, int length)
{
        int rd, wr;

        /* trivial edge cases first; no work needs to be done for either */
        if (length == 0 || length == 1)
                return length;

        /*
         * Element 0 is unique by definition, so both cursors start at 1.
         * @rd scans the input, @wr is where the next unique value goes
         * and therefore also counts the unique values seen so far.
         */
        wr = 1;
        for (rd = 1; rd < length; rd++) {
                /* same as the previous element: a duplicate, skip it */
                if (list[rd] == list[rd - 1])
                        continue;
                list[wr++] = list[rd];
        }

        return wr;
}
3606
3607 /*
 * The two pid files - tasks and cgroup.procs - guaranteed that the result
3609  * is sorted, which forced this whole pidlist fiasco.  As pid order is
3610  * different per namespace, each namespace needs differently sorted list,
3611  * making it impossible to use, for example, single rbtree of member tasks
3612  * sorted by task pointer.  As pidlists can be fairly large, allocating one
3613  * per open file is dangerous, so cgroup had to implement shared pool of
3614  * pidlists keyed by cgroup and namespace.
3615  *
3616  * All this extra complexity was caused by the original implementation
3617  * committing to an entirely unnecessary property.  In the long term, we
3618  * want to do away with it.  Explicitly scramble sort order if
3619  * sane_behavior so that no such expectation exists in the new interface.
3620  *
3621  * Scrambling is done by swapping every two consecutive bits, which is
3622  * non-identity one-to-one mapping which disturbs sort order sufficiently.
3623  */
/*
 * Swap every pair of neighbouring bits of @pid: bits 0/1, 2/3, 4/5, ...
 * exchange places.  Applying the function twice yields the input again.
 */
static pid_t pid_fry(pid_t pid)
{
        unsigned bits = pid;

        /* 0x5555... selects the even bit positions, 0xAAAA... the odd ones */
        return ((bits & 0x55555555) << 1) | ((bits & 0xAAAAAAAA) >> 1);
}
3631
3632 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3633 {
3634         if (cgroup_sane_behavior(cgrp))
3635                 return pid_fry(pid);
3636         else
3637                 return pid;
3638 }
3639
/*
 * sort() comparison callback ordering pid_t values ascending.  The result
 * is the plain difference of the two pids.
 */
static int cmppid(const void *a, const void *b)
{
        const pid_t *lhs = a;
        const pid_t *rhs = b;

        return *lhs - *rhs;
}
3644
/*
 * sort() comparison callback like cmppid(), except that both pids are run
 * through pid_fry() first, so the array ends up ordered by scrambled value.
 */
static int fried_cmppid(const void *a, const void *b)
{
        const pid_t *lhs = a;
        const pid_t *rhs = b;

        return pid_fry(*lhs) - pid_fry(*rhs);
}
3649
3650 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3651                                                   enum cgroup_filetype type)
3652 {
3653         struct cgroup_pidlist *l;
3654         /* don't need task_nsproxy() if we're looking at ourself */
3655         struct pid_namespace *ns = task_active_pid_ns(current);
3656
3657         lockdep_assert_held(&cgrp->pidlist_mutex);
3658
3659         list_for_each_entry(l, &cgrp->pidlists, links)
3660                 if (l->key.type == type && l->key.ns == ns)
3661                         return l;
3662         return NULL;
3663 }
3664
3665 /*
3666  * find the appropriate pidlist for our purpose (given procs vs tasks)
3667  * returns with the lock on that pidlist already held, and takes care
3668  * of the use count, or returns NULL with no locks held if we're out of
3669  * memory.
3670  */
3671 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3672                                                 enum cgroup_filetype type)
3673 {
3674         struct cgroup_pidlist *l;
3675
3676         lockdep_assert_held(&cgrp->pidlist_mutex);
3677
3678         l = cgroup_pidlist_find(cgrp, type);
3679         if (l)
3680                 return l;
3681
3682         /* entry not found; create a new one */
3683         l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3684         if (!l)
3685                 return l;
3686
3687         INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3688         l->key.type = type;
3689         /* don't need task_nsproxy() if we're looking at ourself */
3690         l->key.ns = get_pid_ns(task_active_pid_ns(current));
3691         l->owner = cgrp;
3692         list_add(&l->links, &cgrp->pidlists);
3693         return l;
3694 }
3695
3696 /*
3697  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3698  */
3699 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3700                               struct cgroup_pidlist **lp)
3701 {
3702         pid_t *array;
3703         int length;
3704         int pid, n = 0; /* used for populating the array */
3705         struct css_task_iter it;
3706         struct task_struct *tsk;
3707         struct cgroup_pidlist *l;
3708
3709         lockdep_assert_held(&cgrp->pidlist_mutex);
3710
3711         /*
3712          * If cgroup gets more users after we read count, we won't have
3713          * enough space - tough.  This race is indistinguishable to the
3714          * caller from the case that the additional cgroup users didn't
3715          * show up until sometime later on.
3716          */
3717         length = cgroup_task_count(cgrp);
3718         array = pidlist_allocate(length);
3719         if (!array)
3720                 return -ENOMEM;
3721         /* now, populate the array */
3722         css_task_iter_start(&cgrp->self, &it);
3723         while ((tsk = css_task_iter_next(&it))) {
3724                 if (unlikely(n == length))
3725                         break;
3726                 /* get tgid or pid for procs or tasks file respectively */
3727                 if (type == CGROUP_FILE_PROCS)
3728                         pid = task_tgid_vnr(tsk);
3729                 else
3730                         pid = task_pid_vnr(tsk);
3731                 if (pid > 0) /* make sure to only use valid results */
3732                         array[n++] = pid;
3733         }
3734         css_task_iter_end(&it);
3735         length = n;
3736         /* now sort & (if procs) strip out duplicates */
3737         if (cgroup_sane_behavior(cgrp))
3738                 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3739         else
3740                 sort(array, length, sizeof(pid_t), cmppid, NULL);
3741         if (type == CGROUP_FILE_PROCS)
3742                 length = pidlist_uniq(array, length);
3743
3744         l = cgroup_pidlist_find_create(cgrp, type);
3745         if (!l) {
3746                 mutex_unlock(&cgrp->pidlist_mutex);
3747                 pidlist_free(array);
3748                 return -ENOMEM;
3749         }
3750
3751         /* store array, freeing old if necessary */
3752         pidlist_free(l->list);
3753         l->list = array;
3754         l->length = length;
3755         *lp = l;
3756         return 0;
3757 }
3758
3759 /**
3760  * cgroupstats_build - build and fill cgroupstats
3761  * @stats: cgroupstats to fill information into
3762  * @dentry: A dentry entry belonging to the cgroup for which stats have
3763  * been requested.
3764  *
3765  * Build and fill cgroupstats so that taskstats can export it to user
3766  * space.
3767  */
3768 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3769 {
3770         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3771         struct cgroup *cgrp;
3772         struct css_task_iter it;
3773         struct task_struct *tsk;
3774
3775         /* it should be kernfs_node belonging to cgroupfs and is a directory */
3776         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3777             kernfs_type(kn) != KERNFS_DIR)
3778                 return -EINVAL;
3779
3780         mutex_lock(&cgroup_mutex);
3781
3782         /*
3783          * We aren't being called from kernfs and there's no guarantee on
3784          * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
3785          * @kn->priv is RCU safe.  Let's do the RCU dancing.
3786          */
3787         rcu_read_lock();
3788         cgrp = rcu_dereference(kn->priv);
3789         if (!cgrp || cgroup_is_dead(cgrp)) {
3790                 rcu_read_unlock();
3791                 mutex_unlock(&cgroup_mutex);
3792                 return -ENOENT;
3793         }
3794         rcu_read_unlock();
3795
3796         css_task_iter_start(&cgrp->self, &it);
3797         while ((tsk = css_task_iter_next(&it))) {
3798                 switch (tsk->state) {
3799                 case TASK_RUNNING:
3800                         stats->nr_running++;
3801                         break;
3802                 case TASK_INTERRUPTIBLE:
3803                         stats->nr_sleeping++;
3804                         break;
3805                 case TASK_UNINTERRUPTIBLE:
3806                         stats->nr_uninterruptible++;
3807                         break;
3808                 case TASK_STOPPED:
3809                         stats->nr_stopped++;
3810                         break;
3811                 default:
3812                         if (delayacct_is_task_waiting_on_io(tsk))
3813                                 stats->nr_io_wait++;
3814                         break;
3815                 }
3816         }
3817         css_task_iter_end(&it);
3818
3819         mutex_unlock(&cgroup_mutex);
3820         return 0;
3821 }
3822
3823
3824 /*
3825  * seq_file methods for the tasks/procs files. The seq_file position is the
3826  * next pid to display; the seq_file iterator is a pointer to the pid
3827  * in the cgroup->l->list array.
3828  */
3829
3830 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3831 {
3832         /*
3833          * Initially we receive a position value that corresponds to
3834          * one more than the last pid shown (or 0 on the first call or
3835          * after a seek to the start). Use a binary-search to find the
3836          * next pid to display, if any
3837          */
3838         struct kernfs_open_file *of = s->private;
3839         struct cgroup *cgrp = seq_css(s)->cgroup;
3840         struct cgroup_pidlist *l;
3841         enum cgroup_filetype type = seq_cft(s)->private;
3842         int index = 0, pid = *pos;
3843         int *iter, ret;
3844
3845         mutex_lock(&cgrp->pidlist_mutex);
3846
3847         /*
3848          * !NULL @of->priv indicates that this isn't the first start()
3849          * after open.  If the matching pidlist is around, we can use that.
3850          * Look for it.  Note that @of->priv can't be used directly.  It
3851          * could already have been destroyed.
3852          */
3853         if (of->priv)
3854                 of->priv = cgroup_pidlist_find(cgrp, type);
3855
3856         /*
3857          * Either this is the first start() after open or the matching
3858          * pidlist has been destroyed inbetween.  Create a new one.
3859          */
3860         if (!of->priv) {
3861                 ret = pidlist_array_load(cgrp, type,
3862                                          (struct cgroup_pidlist **)&of->priv);
3863                 if (ret)
3864                         return ERR_PTR(ret);
3865         }
3866         l = of->priv;
3867
3868         if (pid) {
3869                 int end = l->length;
3870
3871                 while (index < end) {
3872                         int mid = (index + end) / 2;
3873                         if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3874                                 index = mid;
3875                                 break;
3876                         } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3877                                 index = mid + 1;
3878                         else
3879                                 end = mid;
3880                 }
3881         }
3882         /* If we're off the end of the array, we're done */
3883         if (index >= l->length)
3884                 return NULL;
3885         /* Update the abstract position to be the actual pid that we found */
3886         iter = l->list + index;
3887         *pos = cgroup_pid_fry(cgrp, *iter);
3888         return iter;
3889 }
3890
3891 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3892 {
3893         struct kernfs_open_file *of = s->private;
3894         struct cgroup_pidlist *l = of->priv;
3895
3896         if (l)
3897                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3898                                  CGROUP_PIDLIST_DESTROY_DELAY);
3899         mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3900 }
3901
3902 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3903 {
3904         struct kernfs_open_file *of = s->private;
3905         struct cgroup_pidlist *l = of->priv;
3906         pid_t *p = v;
3907         pid_t *end = l->list + l->length;
3908         /*
3909          * Advance to the next pid in the array. If this goes off the
3910          * end, we're done
3911          */
3912         p++;
3913         if (p >= end) {
3914                 return NULL;
3915         } else {
3916                 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3917                 return p;
3918         }
3919 }
3920
/*
 * seq_file ->show() for the tasks/procs files: print the pid that the
 * iterator @v points at, followed by a newline.
 */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
        int *pid = v;

        return seq_printf(s, "%d\n", *pid);
}
3925
/*
 * ->read_u64() handler of the "notify_on_release" file: report the value
 * of notify_on_release() for the css's cgroup.  @cft is unused.
 */
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
                                         struct cftype *cft)
{
        return notify_on_release(css->cgroup);
}
3931
3932 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3933                                           struct cftype *cft, u64 val)
3934 {
3935         clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3936         if (val)
3937                 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3938         else
3939                 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3940         return 0;
3941 }
3942
/*
 * ->read_u64() handler of the "cgroup.clone_children" file: report whether
 * CGRP_CPUSET_CLONE_CHILDREN is set in the cgroup's flags.  @cft is unused.
 */
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
                                      struct cftype *cft)
{
        return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
3948
3949 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
3950                                        struct cftype *cft, u64 val)
3951 {
3952         if (val)
3953                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3954         else
3955                 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3956         return 0;
3957 }
3958
/*
 * Table of the cgroup core's own control files, terminated by an empty
 * entry.  Each entry names a file and wires up its handlers; ->flags
 * restrict where the file shows up (NOTE(review): exact flag semantics are
 * defined outside this section - confirm against the CFTYPE_* definitions).
 */
static struct cftype cgroup_base_files[] = {
        {
                /* pid listing through the pidlist seq_file methods, procs flavour */
                .name = "cgroup.procs",
                .seq_start = cgroup_pidlist_start,
                .seq_next = cgroup_pidlist_next,
                .seq_stop = cgroup_pidlist_stop,
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_PROCS,
                .write = cgroup_procs_write,
                .mode = S_IRUGO | S_IWUSR,
        },
        {
                /* reads/writes the CGRP_CPUSET_CLONE_CHILDREN flag */
                .name = "cgroup.clone_children",
                .flags = CFTYPE_INSANE,
                .read_u64 = cgroup_clone_children_read,
                .write_u64 = cgroup_clone_children_write,
        },
        {
                .name = "cgroup.sane_behavior",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_sane_behavior_show,
        },
        {
                /* "cgroup.controllers" has separate root and non-root entries */
                .name = "cgroup.controllers",
                .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_root_controllers_show,
        },
        {
                .name = "cgroup.controllers",
                .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_controllers_show,
        },
        {
                .name = "cgroup.subtree_control",
                .flags = CFTYPE_ONLY_ON_DFL,
                .seq_show = cgroup_subtree_control_show,
                .write = cgroup_subtree_control_write,
        },
        {
                .name = "cgroup.populated",
                .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_populated_show,
        },

        /*
         * Historical crazy stuff.  These don't have "cgroup."  prefix and
         * don't exist if sane_behavior.  If you're depending on these, be
         * prepared to be burned.
         */
        {
                /* same pidlist seq_file methods as "cgroup.procs", tasks flavour */
                .name = "tasks",
                .flags = CFTYPE_INSANE,         /* use "procs" instead */
                .seq_start = cgroup_pidlist_start,
                .seq_next = cgroup_pidlist_next,
                .seq_stop = cgroup_pidlist_stop,
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_TASKS,
                .write = cgroup_tasks_write,
                .mode = S_IRUGO | S_IWUSR,
        },
        {
                .name = "notify_on_release",
                .flags = CFTYPE_INSANE,
                .read_u64 = cgroup_read_notify_on_release,
                .write_u64 = cgroup_write_notify_on_release,
        },
        {
                /* writes are capped at PATH_MAX - 1 bytes */
                .name = "release_agent",
                .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
                .seq_show = cgroup_release_agent_show,
                .write = cgroup_release_agent_write,
                .max_write_len = PATH_MAX - 1,
        },
        { }     /* terminate */
};
4034
4035 /**
4036  * cgroup_populate_dir - create subsys files in a cgroup directory
4037  * @cgrp: target cgroup
4038  * @subsys_mask: mask of the subsystem ids whose files should be added
4039  *
4040  * On failure, no file is added.
4041  */
4042 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
4043 {
4044         struct cgroup_subsys *ss;
4045         int i, ret = 0;
4046
4047         /* process cftsets of each subsystem */
4048         for_each_subsys(ss, i) {
4049                 struct cftype *cfts;
4050
4051                 if (!(subsys_mask & (1 << i)))
4052                         continue;
4053
4054                 list_for_each_entry(cfts, &ss->cfts, node) {
4055                         ret = cgroup_addrm_files(cgrp, cfts, true);
4056                         if (ret < 0)
4057                                 goto err;
4058                 }
4059         }
4060         return 0;
4061 err:
4062         cgroup_clear_dir(cgrp, subsys_mask);
4063         return ret;
4064 }
4065
4066 /*
4067  * css destruction is four-stage process.
4068  *
4069  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
4070  *    Implemented in kill_css().
4071  *
4072  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4073  *    and thus css_tryget_online() is guaranteed to fail, the css can be
4074  *    offlined by invoking offline_css().  After offlining, the base ref is
4075  *    put.  Implemented in css_killed_work_fn().
4076  *
4077  * 3. When the percpu_ref reaches zero, the only possible remaining
4078  *    accessors are inside RCU read sections.  css_release() schedules the
4079  *    RCU callback.
4080  *
4081  * 4. After the grace period, the css can be freed.  Implemented in
4082  *    css_free_work_fn().
4083  *
4084  * It is actually hairier because both step 2 and 4 require process context
4085  * and thus involve punting to css->destroy_work adding two additional
4086  * steps to the already complex sequence.
4087  */
4088 static void css_free_work_fn(struct work_struct *work)
4089 {
4090         struct cgroup_subsys_state *css =
4091                 container_of(work, struct cgroup_subsys_state, destroy_work);
4092         struct cgroup *cgrp = css->cgroup;
4093
4094         if (css->parent)
4095                 css_put(css->parent);
4096
4097         css->ss->css_free(css);
4098         cgroup_put(cgrp);
4099 }
4100
4101 static void css_free_rcu_fn(struct rcu_head *rcu_head)
4102 {
4103         struct cgroup_subsys_state *css =
4104                 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4105
4106         INIT_WORK(&css->destroy_work, css_free_work_fn);
4107         queue_work(cgroup_destroy_wq, &css->destroy_work);
4108 }
4109
4110 static void css_release(struct percpu_ref *ref)
4111 {
4112         struct cgroup_subsys_state *css =
4113                 container_of(ref, struct cgroup_subsys_state, refcnt);
4114         struct cgroup_subsys *ss = css->ss;
4115
4116         cgroup_idr_remove(&ss->css_idr, css->id);
4117
4118         call_rcu(&css->rcu_head, css_free_rcu_fn);
4119 }
4120
4121 static void init_and_link_css(struct cgroup_subsys_state *css,
4122                               struct cgroup_subsys *ss, struct cgroup *cgrp)
4123 {
4124         cgroup_get(cgrp);
4125
4126         css->cgroup = cgrp;
4127         css->ss = ss;
4128         css->flags = 0;
4129
4130         if (cgrp->parent) {
4131                 css->parent = cgroup_css(cgrp->parent, ss);
4132                 css_get(css->parent);
4133         } else {
4134                 css->flags |= CSS_ROOT;
4135         }
4136
4137         BUG_ON(cgroup_css(cgrp, ss));
4138 }
4139
4140 /* invoke ->css_online() on a new CSS and mark it online if successful */
4141 static int online_css(struct cgroup_subsys_state *css)
4142 {
4143         struct cgroup_subsys *ss = css->ss;
4144         int ret = 0;
4145
4146         lockdep_assert_held(&cgroup_mutex);
4147
4148         if (ss->css_online)
4149                 ret = ss->css_online(css);
4150         if (!ret) {
4151                 css->flags |= CSS_ONLINE;
4152                 css->cgroup->nr_css++;
4153                 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4154         }
4155         return ret;
4156 }
4157
4158 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4159 static void offline_css(struct cgroup_subsys_state *css)
4160 {
4161         struct cgroup_subsys *ss = css->ss;
4162
4163         lockdep_assert_held(&cgroup_mutex);
4164
4165         if (!(css->flags & CSS_ONLINE))
4166                 return;
4167
4168         if (ss->css_offline)
4169                 ss->css_offline(css);
4170
4171         css->flags &= ~CSS_ONLINE;
4172         css->cgroup->nr_css--;
4173         RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
4174
4175         wake_up_all(&css->cgroup->offline_waitq);
4176 }
4177
4178 /**
4179  * create_css - create a cgroup_subsys_state
4180  * @cgrp: the cgroup new css will be associated with
4181  * @ss: the subsys of new css
4182  *
4183  * Create a new css associated with @cgrp - @ss pair.  On success, the new
4184  * css is online and installed in @cgrp with all interface files created.
4185  * Returns 0 on success, -errno on failure.
4186  */
4187 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4188 {
4189         struct cgroup *parent = cgrp->parent;
4190         struct cgroup_subsys_state *css;
4191         int err;
4192
4193         lockdep_assert_held(&cgroup_mutex);
4194
4195         css = ss->css_alloc(cgroup_css(parent, ss));
4196         if (IS_ERR(css))
4197                 return PTR_ERR(css);
4198
4199         init_and_link_css(css, ss, cgrp);
4200
4201         err = percpu_ref_init(&css->refcnt, css_release);
4202         if (err)
4203                 goto err_free_css;
4204
4205         err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4206         if (err < 0)
4207                 goto err_free_percpu_ref;
4208         css->id = err;
4209
4210         err = cgroup_populate_dir(cgrp, 1 << ss->id);
4211         if (err)
4212                 goto err_free_id;
4213
4214         /* @css is ready to be brought online now, make it visible */
4215         cgroup_idr_replace(&ss->css_idr, css, css->id);
4216
4217         err = online_css(css);
4218         if (err)
4219                 goto err_clear_dir;
4220
4221         if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4222             parent->parent) {
4223                 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
4224                         current->comm, current->pid, ss->name);
4225                 if (!strcmp(ss->name, "memory"))
4226                         pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
4227                 ss->warned_broken_hierarchy = true;
4228         }
4229
4230         return 0;
4231
4232 err_clear_dir:
4233         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4234 err_free_id:
4235         cgroup_idr_remove(&ss->css_idr, css->id);
4236 err_free_percpu_ref:
4237         percpu_ref_cancel_init(&css->refcnt);
4238 err_free_css:
4239         call_rcu(&css->rcu_head, css_free_rcu_fn);
4240         return err;
4241 }
4242
4243 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4244                         umode_t mode)
4245 {
4246         struct cgroup *parent, *cgrp;
4247         struct cgroup_root *root;
4248         struct cgroup_subsys *ss;
4249         struct kernfs_node *kn;
4250         int ssid, ret;
4251
4252         parent = cgroup_kn_lock_live(parent_kn);
4253         if (!parent)
4254                 return -ENODEV;
4255         root = parent->root;
4256
4257         /* allocate the cgroup and its ID, 0 is reserved for the root */
4258         cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4259         if (!cgrp) {
4260                 ret = -ENOMEM;
4261                 goto out_unlock;
4262         }
4263
4264         /*
4265          * Temporarily set the pointer to NULL, so idr_find() won't return
4266          * a half-baked cgroup.
4267          */
4268         cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
4269         if (cgrp->id < 0) {
4270                 ret = -ENOMEM;
4271                 goto out_free_cgrp;
4272         }
4273
4274         init_cgroup_housekeeping(cgrp);
4275
4276         cgrp->parent = parent;
4277         cgrp->self.parent = &parent->self;
4278         cgrp->root = root;
4279
4280         if (notify_on_release(parent))
4281                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4282
4283         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4284                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4285
4286         /* create the directory */
4287         kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
4288         if (IS_ERR(kn)) {
4289                 ret = PTR_ERR(kn);
4290                 goto out_free_id;
4291         }
4292         cgrp->kn = kn;
4293
4294         /*
4295          * This extra ref will be put in cgroup_free_fn() and guarantees
4296          * that @cgrp->kn is always accessible.
4297          */
4298         kernfs_get(kn);
4299
4300         cgrp->serial_nr = cgroup_serial_nr_next++;
4301
4302         /* allocation complete, commit to creation */
4303         list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4304         atomic_inc(&root->nr_cgrps);
4305         cgroup_get(parent);
4306
4307         /*
4308          * @cgrp is now fully operational.  If something fails after this
4309          * point, it'll be released via the normal destruction path.
4310          */
4311         cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4312
4313         ret = cgroup_kn_set_ugid(kn);
4314         if (ret)
4315                 goto out_destroy;
4316
4317         ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4318         if (ret)
4319                 goto out_destroy;
4320
4321         /* let's create and online css's */
4322         for_each_subsys(ss, ssid) {
4323                 if (parent->child_subsys_mask & (1 << ssid)) {
4324                         ret = create_css(cgrp, ss);
4325                         if (ret)
4326                                 goto out_destroy;
4327                 }
4328         }
4329
4330         /*
4331          * On the default hierarchy, a child doesn't automatically inherit
4332          * child_subsys_mask from the parent.  Each is configured manually.
4333          */
4334         if (!cgroup_on_dfl(cgrp))
4335                 cgrp->child_subsys_mask = parent->child_subsys_mask;
4336
4337         kernfs_activate(kn);
4338
4339         ret = 0;
4340         goto out_unlock;
4341
4342 out_free_id:
4343         cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4344 out_free_cgrp:
4345         kfree(cgrp);
4346 out_unlock:
4347         cgroup_kn_unlock(parent_kn);
4348         return ret;
4349
4350 out_destroy:
4351         cgroup_destroy_locked(cgrp);
4352         goto out_unlock;
4353 }
4354
4355 /*
4356  * This is called when the refcnt of a css is confirmed to be killed.
4357  * css_tryget_online() is now guaranteed to fail.
4358  */
4359 static void css_killed_work_fn(struct work_struct *work)
4360 {
4361         struct cgroup_subsys_state *css =
4362                 container_of(work, struct cgroup_subsys_state, destroy_work);
4363         struct cgroup *cgrp = css->cgroup;
4364
4365         mutex_lock(&cgroup_mutex);
4366
4367         /*
4368          * css_tryget_online() is guaranteed to fail now.  Tell subsystems
4369          * to initate destruction.
4370          */
4371         offline_css(css);
4372
4373         /*
4374          * If @cgrp is marked dead, it's waiting for refs of all css's to
4375          * be disabled before proceeding to the second phase of cgroup
4376          * destruction.  If we are the last one, kick it off.
4377          */
4378         if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4379                 cgroup_destroy_css_killed(cgrp);
4380
4381         mutex_unlock(&cgroup_mutex);
4382
4383         /*
4384          * Put the css refs from kill_css().  Each css holds an extra
4385          * reference to the cgroup's dentry and cgroup removal proceeds
4386          * regardless of css refs.  On the last put of each css, whenever
4387          * that may be, the extra dentry ref is put so that dentry
4388          * destruction happens only after all css's are released.
4389          */
4390         css_put(css);
4391 }
4392
4393 /* css kill confirmation processing requires process context, bounce */
4394 static void css_killed_ref_fn(struct percpu_ref *ref)
4395 {
4396         struct cgroup_subsys_state *css =
4397                 container_of(ref, struct cgroup_subsys_state, refcnt);
4398
4399         INIT_WORK(&css->destroy_work, css_killed_work_fn);
4400         queue_work(cgroup_destroy_wq, &css->destroy_work);
4401 }
4402
4403 /**
4404  * kill_css - destroy a css
4405  * @css: css to destroy
4406  *
4407  * This function initiates destruction of @css by removing cgroup interface
4408  * files and putting its base reference.  ->css_offline() will be invoked
4409  * asynchronously once css_tryget_online() is guaranteed to fail and when
4410  * the reference count reaches zero, @css will be released.
4411  */
4412 static void kill_css(struct cgroup_subsys_state *css)
4413 {
4414         lockdep_assert_held(&cgroup_mutex);
4415
4416         /*
4417          * This must happen before css is disassociated with its cgroup.
4418          * See seq_css() for details.
4419          */
4420         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4421
4422         /*
4423          * Killing would put the base ref, but we need to keep it alive
4424          * until after ->css_offline().
4425          */
4426         css_get(css);
4427
4428         /*
4429          * cgroup core guarantees that, by the time ->css_offline() is
4430          * invoked, no new css reference will be given out via
4431          * css_tryget_online().  We can't simply call percpu_ref_kill() and
4432          * proceed to offlining css's because percpu_ref_kill() doesn't
4433          * guarantee that the ref is seen as killed on all CPUs on return.
4434          *
4435          * Use percpu_ref_kill_and_confirm() to get notifications as each
4436          * css is confirmed to be seen as killed on all CPUs.
4437          */
4438         percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4439 }
4440
4441 /**
4442  * cgroup_destroy_locked - the first stage of cgroup destruction
4443  * @cgrp: cgroup to be destroyed
4444  *
4445  * css's make use of percpu refcnts whose killing latency shouldn't be
4446  * exposed to userland and are RCU protected.  Also, cgroup core needs to
4447  * guarantee that css_tryget_online() won't succeed by the time
4448  * ->css_offline() is invoked.  To satisfy all the requirements,
4449  * destruction is implemented in the following two steps.
4450  *
4451  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
4452  *     userland visible parts and start killing the percpu refcnts of
4453  *     css's.  Set up so that the next stage will be kicked off once all
4454  *     the percpu refcnts are confirmed to be killed.
4455  *
4456  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4457  *     rest of destruction.  Once all cgroup references are gone, the
4458  *     cgroup is RCU-freed.
4459  *
4460  * This function implements s1.  After this step, @cgrp is gone as far as
4461  * the userland is concerned and a new cgroup with the same name may be
4462  * created.  As cgroup doesn't care about the names internally, this
4463  * doesn't cause any problem.
4464  */
4465 static int cgroup_destroy_locked(struct cgroup *cgrp)
4466         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4467 {
4468         struct cgroup *child;
4469         struct cgroup_subsys_state *css;
4470         bool empty;
4471         int ssid;
4472
4473         lockdep_assert_held(&cgroup_mutex);
4474
4475         /*
4476          * css_set_rwsem synchronizes access to ->cset_links and prevents
4477          * @cgrp from being removed while put_css_set() is in progress.
4478          */
4479         down_read(&css_set_rwsem);
4480         empty = list_empty(&cgrp->cset_links);
4481         up_read(&css_set_rwsem);
4482         if (!empty)
4483                 return -EBUSY;
4484
4485         /*
4486          * Make sure there's no live children.  We can't test ->children
4487          * emptiness as dead children linger on it while being destroyed;
4488          * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4489          */
4490         empty = true;
4491         rcu_read_lock();
4492         list_for_each_entry_rcu(child, &cgrp->children, sibling) {
4493                 empty = cgroup_is_dead(child);
4494                 if (!empty)
4495                         break;
4496         }
4497         rcu_read_unlock();
4498         if (!empty)
4499                 return -EBUSY;
4500
4501         /*
4502          * Mark @cgrp dead.  This prevents further task migration and child
4503          * creation by disabling cgroup_lock_live_group().  Note that
4504          * CGRP_DEAD assertion is depended upon by css_next_child() to
4505          * resume iteration after dropping RCU read lock.  See
4506          * css_next_child() for details.
4507          */
4508         set_bit(CGRP_DEAD, &cgrp->flags);
4509
4510         /*
4511          * Initiate massacre of all css's.  cgroup_destroy_css_killed()
4512          * will be invoked to perform the rest of destruction once the
4513          * percpu refs of all css's are confirmed to be killed.
4514          */
4515         for_each_css(css, ssid, cgrp)
4516                 kill_css(css);
4517
4518         /* CGRP_DEAD is set, remove from ->release_list for the last time */
4519         raw_spin_lock(&release_list_lock);
4520         if (!list_empty(&cgrp->release_list))
4521                 list_del_init(&cgrp->release_list);
4522         raw_spin_unlock(&release_list_lock);
4523
4524         /*
4525          * If @cgrp has css's attached, the second stage of cgroup
4526          * destruction is kicked off from css_killed_work_fn() after the
4527          * refs of all attached css's are killed.  If @cgrp doesn't have
4528          * any css, we kick it off here.
4529          */
4530         if (!cgrp->nr_css)
4531                 cgroup_destroy_css_killed(cgrp);
4532
4533         /*
4534          * Remove @cgrp directory along with the base files.  @cgrp has an
4535          * extra ref on its kn.
4536          */
4537         kernfs_remove(cgrp->kn);
4538
4539         return 0;
4540 };
4541
4542 /**
4543  * cgroup_destroy_css_killed - the second step of cgroup destruction
4544  * @cgrp: the cgroup whose csses have just finished offlining
4545  *
4546  * This function is invoked from a work item for a cgroup which is being
4547  * destroyed after all css's are offlined and performs the rest of
4548  * destruction.  This is the second step of destruction described in the
4549  * comment above cgroup_destroy_locked().
4550  */
4551 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4552 {
4553         struct cgroup *parent = cgrp->parent;
4554
4555         lockdep_assert_held(&cgroup_mutex);
4556
4557         /* delete this cgroup from parent->children */
4558         list_del_rcu(&cgrp->sibling);
4559
4560         cgroup_put(cgrp);
4561
4562         set_bit(CGRP_RELEASABLE, &parent->flags);
4563         check_for_release(parent);
4564 }
4565
4566 static int cgroup_rmdir(struct kernfs_node *kn)
4567 {
4568         struct cgroup *cgrp;
4569         int ret = 0;
4570
4571         cgrp = cgroup_kn_lock_live(kn);
4572         if (!cgrp)
4573                 return 0;
4574         cgroup_get(cgrp);       /* for @kn->priv clearing */
4575
4576         ret = cgroup_destroy_locked(cgrp);
4577
4578         cgroup_kn_unlock(kn);
4579
4580         /*
4581          * There are two control paths which try to determine cgroup from
4582          * dentry without going through kernfs - cgroupstats_build() and
4583          * css_tryget_online_from_dir().  Those are supported by RCU
4584          * protecting clearing of cgrp->kn->priv backpointer, which should
4585          * happen after all files under it have been removed.
4586          */
4587         if (!ret)
4588                 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4589
4590         cgroup_put(cgrp);
4591         return ret;
4592 }
4593
4594 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4595         .remount_fs             = cgroup_remount,
4596         .show_options           = cgroup_show_options,
4597         .mkdir                  = cgroup_mkdir,
4598         .rmdir                  = cgroup_rmdir,
4599         .rename                 = cgroup_rename,
4600 };
4601
4602 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4603 {
4604         struct cgroup_subsys_state *css;
4605
4606         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4607
4608         mutex_lock(&cgroup_mutex);
4609
4610         idr_init(&ss->css_idr);
4611         INIT_LIST_HEAD(&ss->cfts);
4612
4613         /* Create the root cgroup state for this subsystem */
4614         ss->root = &cgrp_dfl_root;
4615         css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4616         /* We don't handle early failures gracefully */
4617         BUG_ON(IS_ERR(css));
4618         init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4619         if (early) {
4620                 /* idr_alloc() can't be called safely during early init */
4621                 css->id = 1;
4622         } else {
4623                 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4624                 BUG_ON(css->id < 0);
4625         }
4626
4627         /* Update the init_css_set to contain a subsys
4628          * pointer to this state - since the subsystem is
4629          * newly registered, all tasks and hence the
4630          * init_css_set is in the subsystem's root cgroup. */
4631         init_css_set.subsys[ss->id] = css;
4632
4633         need_forkexit_callback |= ss->fork || ss->exit;
4634
4635         /* At system boot, before all subsystems have been
4636          * registered, no tasks have been forked, so we don't
4637          * need to invoke fork callbacks here. */
4638         BUG_ON(!list_empty(&init_task.tasks));
4639
4640         BUG_ON(online_css(css));
4641
4642         cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4643
4644         mutex_unlock(&cgroup_mutex);
4645 }
4646
4647 /**
4648  * cgroup_init_early - cgroup initialization at system boot
4649  *
4650  * Initialize cgroups at system boot, and initialize any
4651  * subsystems that request early init.
4652  */
4653 int __init cgroup_init_early(void)
4654 {
4655         static struct cgroup_sb_opts __initdata opts =
4656                 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4657         struct cgroup_subsys *ss;
4658         int i;
4659
4660         init_cgroup_root(&cgrp_dfl_root, &opts);
4661         RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4662
4663         for_each_subsys(ss, i) {
4664                 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4665                      "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4666                      i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4667                      ss->id, ss->name);
4668                 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4669                      "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4670
4671                 ss->id = i;
4672                 ss->name = cgroup_subsys_name[i];
4673
4674                 if (ss->early_init)
4675                         cgroup_init_subsys(ss, true);
4676         }
4677         return 0;
4678 }
4679
4680 /**
4681  * cgroup_init - cgroup initialization
4682  *
4683  * Register cgroup filesystem and /proc file, and initialize
4684  * any subsystems that didn't request early init.
4685  */
4686 int __init cgroup_init(void)
4687 {
4688         struct cgroup_subsys *ss;
4689         unsigned long key;
4690         int ssid, err;
4691
4692         BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4693
4694         mutex_lock(&cgroup_mutex);
4695
4696         /* Add init_css_set to the hash table */
4697         key = css_set_hash(init_css_set.subsys);
4698         hash_add(css_set_table, &init_css_set.hlist, key);
4699
4700         BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4701
4702         mutex_unlock(&cgroup_mutex);
4703
4704         for_each_subsys(ss, ssid) {
4705                 if (ss->early_init) {
4706                         struct cgroup_subsys_state *css =
4707                                 init_css_set.subsys[ss->id];
4708
4709                         css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4710                                                    GFP_KERNEL);
4711                         BUG_ON(css->id < 0);
4712                 } else {
4713                         cgroup_init_subsys(ss, false);
4714                 }
4715
4716                 list_add_tail(&init_css_set.e_cset_node[ssid],
4717                               &cgrp_dfl_root.cgrp.e_csets[ssid]);
4718
4719                 /*
4720                  * cftype registration needs kmalloc and can't be done
4721                  * during early_init.  Register base cftypes separately.
4722                  */
4723                 if (ss->base_cftypes)
4724                         WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4725         }
4726
4727         cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4728         if (!cgroup_kobj)
4729                 return -ENOMEM;
4730
4731         err = register_filesystem(&cgroup_fs_type);
4732         if (err < 0) {
4733                 kobject_put(cgroup_kobj);
4734                 return err;
4735         }
4736
4737         proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4738         return 0;
4739 }
4740
4741 static int __init cgroup_wq_init(void)
4742 {
4743         /*
4744          * There isn't much point in executing destruction path in
4745          * parallel.  Good chunk is serialized with cgroup_mutex anyway.
4746          * Use 1 for @max_active.
4747          *
4748          * We would prefer to do this in cgroup_init() above, but that
4749          * is called before init_workqueues(): so leave this until after.
4750          */
4751         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4752         BUG_ON(!cgroup_destroy_wq);
4753
4754         /*
4755          * Used to destroy pidlists and separate to serve as flush domain.
4756          * Cap @max_active to 1 too.
4757          */
4758         cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4759                                                     0, 1);
4760         BUG_ON(!cgroup_pidlist_destroy_wq);
4761
4762         return 0;
4763 }
4764 core_initcall(cgroup_wq_init);
4765
4766 /*
4767  * proc_cgroup_show()
4768  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
4769  *  - Used for /proc/<pid>/cgroup.
4770  */
4771
4772 /* TODO: Use a proper seq_file iterator */
4773 int proc_cgroup_show(struct seq_file *m, void *v)
4774 {
4775         struct pid *pid;
4776         struct task_struct *tsk;
4777         char *buf, *path;
4778         int retval;
4779         struct cgroup_root *root;
4780
4781         retval = -ENOMEM;
4782         buf = kmalloc(PATH_MAX, GFP_KERNEL);
4783         if (!buf)
4784                 goto out;
4785
4786         retval = -ESRCH;
4787         pid = m->private;
4788         tsk = get_pid_task(pid, PIDTYPE_PID);
4789         if (!tsk)
4790                 goto out_free;
4791
4792         retval = 0;
4793
4794         mutex_lock(&cgroup_mutex);
4795         down_read(&css_set_rwsem);
4796
4797         for_each_root(root) {
4798                 struct cgroup_subsys *ss;
4799                 struct cgroup *cgrp;
4800                 int ssid, count = 0;
4801
4802                 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4803                         continue;
4804
4805                 seq_printf(m, "%d:", root->hierarchy_id);
4806                 for_each_subsys(ss, ssid)
4807                         if (root->subsys_mask & (1 << ssid))
4808                                 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4809                 if (strlen(root->name))
4810                         seq_printf(m, "%sname=%s", count ? "," : "",
4811                                    root->name);
4812                 seq_putc(m, ':');
4813                 cgrp = task_cgroup_from_root(tsk, root);
4814                 path = cgroup_path(cgrp, buf, PATH_MAX);
4815                 if (!path) {
4816                         retval = -ENAMETOOLONG;
4817                         goto out_unlock;
4818                 }
4819                 seq_puts(m, path);
4820                 seq_putc(m, '\n');
4821         }
4822
4823 out_unlock:
4824         up_read(&css_set_rwsem);
4825         mutex_unlock(&cgroup_mutex);
4826         put_task_struct(tsk);
4827 out_free:
4828         kfree(buf);
4829 out:
4830         return retval;
4831 }
4832
4833 /* Display information about each subsystem and each hierarchy */
4834 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4835 {
4836         struct cgroup_subsys *ss;
4837         int i;
4838
4839         seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4840         /*
4841          * ideally we don't want subsystems moving around while we do this.
4842          * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4843          * subsys/hierarchy state.
4844          */
4845         mutex_lock(&cgroup_mutex);
4846
4847         for_each_subsys(ss, i)
4848                 seq_printf(m, "%s\t%d\t%d\t%d\n",
4849                            ss->name, ss->root->hierarchy_id,
4850                            atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4851
4852         mutex_unlock(&cgroup_mutex);
4853         return 0;
4854 }
4855
4856 static int cgroupstats_open(struct inode *inode, struct file *file)
4857 {
4858         return single_open(file, proc_cgroupstats_show, NULL);
4859 }
4860
4861 static const struct file_operations proc_cgroupstats_operations = {
4862         .open = cgroupstats_open,
4863         .read = seq_read,
4864         .llseek = seq_lseek,
4865         .release = single_release,
4866 };
4867
4868 /**
4869  * cgroup_fork - initialize cgroup related fields during copy_process()
4870  * @child: pointer to task_struct of forking parent process.
4871  *
4872  * A task is associated with the init_css_set until cgroup_post_fork()
4873  * attaches it to the parent's css_set.  Empty cg_list indicates that
4874  * @child isn't holding reference to its css_set.
4875  */
4876 void cgroup_fork(struct task_struct *child)
4877 {
4878         RCU_INIT_POINTER(child->cgroups, &init_css_set);
4879         INIT_LIST_HEAD(&child->cg_list);
4880 }
4881
4882 /**
4883  * cgroup_post_fork - called on a new task after adding it to the task list
4884  * @child: the task in question
4885  *
4886  * Adds the task to the list running through its css_set if necessary and
4887  * call the subsystem fork() callbacks.  Has to be after the task is
4888  * visible on the task list in case we race with the first call to
4889  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
4890  * list.
4891  */
4892 void cgroup_post_fork(struct task_struct *child)
4893 {
4894         struct cgroup_subsys *ss;
4895         int i;
4896
4897         /*
4898          * This may race against cgroup_enable_task_cg_links().  As that
4899          * function sets use_task_css_set_links before grabbing
4900          * tasklist_lock and we just went through tasklist_lock to add
4901          * @child, it's guaranteed that either we see the set
4902          * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
4903          * @child during its iteration.
4904          *
4905          * If we won the race, @child is associated with %current's
4906          * css_set.  Grabbing css_set_rwsem guarantees both that the
4907          * association is stable, and, on completion of the parent's
4908          * migration, @child is visible in the source of migration or
4909          * already in the destination cgroup.  This guarantee is necessary
4910          * when implementing operations which need to migrate all tasks of
4911          * a cgroup to another.
4912          *
4913          * Note that if we lose to cgroup_enable_task_cg_links(), @child
4914          * will remain in init_css_set.  This is safe because all tasks are
4915          * in the init_css_set before cg_links is enabled and there's no
4916          * operation which transfers all tasks out of init_css_set.
4917          */
4918         if (use_task_css_set_links) {
4919                 struct css_set *cset;
4920
4921                 down_write(&css_set_rwsem);
4922                 cset = task_css_set(current);
4923                 if (list_empty(&child->cg_list)) {
4924                         rcu_assign_pointer(child->cgroups, cset);
4925                         list_add(&child->cg_list, &cset->tasks);
4926                         get_css_set(cset);
4927                 }
4928                 up_write(&css_set_rwsem);
4929         }
4930
4931         /*
4932          * Call ss->fork().  This must happen after @child is linked on
4933          * css_set; otherwise, @child might change state between ->fork()
4934          * and addition to css_set.
4935          */
4936         if (need_forkexit_callback) {
4937                 for_each_subsys(ss, i)
4938                         if (ss->fork)
4939                                 ss->fork(child);
4940         }
4941 }
4942
4943 /**
4944  * cgroup_exit - detach cgroup from exiting task
4945  * @tsk: pointer to task_struct of exiting process
4946  *
4947  * Description: Detach cgroup from @tsk and release it.
4948  *
4949  * Note that cgroups marked notify_on_release force every task in
4950  * them to take the global cgroup_mutex mutex when exiting.
4951  * This could impact scaling on very large systems.  Be reluctant to
4952  * use notify_on_release cgroups where very high task exit scaling
4953  * is required on large systems.
4954  *
4955  * We set the exiting tasks cgroup to the root cgroup (top_cgroup).  We
4956  * call cgroup_exit() while the task is still competent to handle
4957  * notify_on_release(), then leave the task attached to the root cgroup in
4958  * each hierarchy for the remainder of its exit.  No need to bother with
4959  * init_css_set refcnting.  init_css_set never goes away and we can't race
4960  * with migration path - PF_EXITING is visible to migration path.
4961  */
4962 void cgroup_exit(struct task_struct *tsk)
4963 {
4964         struct cgroup_subsys *ss;
4965         struct css_set *cset;
4966         bool put_cset = false;
4967         int i;
4968
4969         /*
4970          * Unlink from @tsk from its css_set.  As migration path can't race
4971          * with us, we can check cg_list without grabbing css_set_rwsem.
4972          */
4973         if (!list_empty(&tsk->cg_list)) {
4974                 down_write(&css_set_rwsem);
4975                 list_del_init(&tsk->cg_list);
4976                 up_write(&css_set_rwsem);
4977                 put_cset = true;
4978         }
4979
4980         /* Reassign the task to the init_css_set. */
4981         cset = task_css_set(tsk);
4982         RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4983
4984         if (need_forkexit_callback) {
4985                 /* see cgroup_post_fork() for details */
4986                 for_each_subsys(ss, i) {
4987                         if (ss->exit) {
4988                                 struct cgroup_subsys_state *old_css = cset->subsys[i];
4989                                 struct cgroup_subsys_state *css = task_css(tsk, i);
4990
4991                                 ss->exit(css, old_css, tsk);
4992                         }
4993                 }
4994         }
4995
4996         if (put_cset)
4997                 put_css_set(cset, true);
4998 }
4999
5000 static void check_for_release(struct cgroup *cgrp)
5001 {
5002         if (cgroup_is_releasable(cgrp) &&
5003             list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
5004                 /*
5005                  * Control Group is currently removeable. If it's not
5006                  * already queued for a userspace notification, queue
5007                  * it now
5008                  */
5009                 int need_schedule_work = 0;
5010
5011                 raw_spin_lock(&release_list_lock);
5012                 if (!cgroup_is_dead(cgrp) &&
5013                     list_empty(&cgrp->release_list)) {
5014                         list_add(&cgrp->release_list, &release_list);
5015                         need_schedule_work = 1;
5016                 }
5017                 raw_spin_unlock(&release_list_lock);
5018                 if (need_schedule_work)
5019                         schedule_work(&release_agent_work);
5020         }
5021 }
5022
5023 /*
5024  * Notify userspace when a cgroup is released, by running the
5025  * configured release agent with the name of the cgroup (path
5026  * relative to the root of cgroup file system) as the argument.
5027  *
5028  * Most likely, this user command will try to rmdir this cgroup.
5029  *
5030  * This races with the possibility that some other task will be
5031  * attached to this cgroup before it is removed, or that some other
5032  * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
5033  * The presumed 'rmdir' will fail quietly if this cgroup is no longer
5034  * unused, and this cgroup will be reprieved from its death sentence,
5035  * to continue to serve a useful existence.  Next time it's released,
5036  * we will get notified again, if it still has 'notify_on_release' set.
5037  *
5038  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
5039  * means only wait until the task is successfully execve()'d.  The
5040  * separate release agent task is forked by call_usermodehelper(),
5041  * then control in this thread returns here, without waiting for the
5042  * release agent task.  We don't bother to wait because the caller of
5043  * this routine has no use for the exit status of the release agent
5044  * task, so no sense holding our caller up for that.
5045  */
5046 static void cgroup_release_agent(struct work_struct *work)
5047 {
5048         BUG_ON(work != &release_agent_work);
5049         mutex_lock(&cgroup_mutex);
5050         raw_spin_lock(&release_list_lock);
5051         while (!list_empty(&release_list)) {
5052                 char *argv[3], *envp[3];
5053                 int i;
5054                 char *pathbuf = NULL, *agentbuf = NULL, *path;
5055                 struct cgroup *cgrp = list_entry(release_list.next,
5056                                                     struct cgroup,
5057                                                     release_list);
5058                 list_del_init(&cgrp->release_list);
5059                 raw_spin_unlock(&release_list_lock);
5060                 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5061                 if (!pathbuf)
5062                         goto continue_free;
5063                 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5064                 if (!path)
5065                         goto continue_free;
5066                 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5067                 if (!agentbuf)
5068                         goto continue_free;
5069
5070                 i = 0;
5071                 argv[i++] = agentbuf;
5072                 argv[i++] = path;
5073                 argv[i] = NULL;
5074
5075                 i = 0;
5076                 /* minimal command environment */
5077                 envp[i++] = "HOME=/";
5078                 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5079                 envp[i] = NULL;
5080
5081                 /* Drop the lock while we invoke the usermode helper,
5082                  * since the exec could involve hitting disk and hence
5083                  * be a slow process */
5084                 mutex_unlock(&cgroup_mutex);
5085                 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5086                 mutex_lock(&cgroup_mutex);
5087  continue_free:
5088                 kfree(pathbuf);
5089                 kfree(agentbuf);
5090                 raw_spin_lock(&release_list_lock);
5091         }
5092         raw_spin_unlock(&release_list_lock);
5093         mutex_unlock(&cgroup_mutex);
5094 }
5095
5096 static int __init cgroup_disable(char *str)
5097 {
5098         struct cgroup_subsys *ss;
5099         char *token;
5100         int i;
5101
5102         while ((token = strsep(&str, ",")) != NULL) {
5103                 if (!*token)
5104                         continue;
5105
5106                 for_each_subsys(ss, i) {
5107                         if (!strcmp(token, ss->name)) {
5108                                 ss->disabled = 1;
5109                                 printk(KERN_INFO "Disabling %s control group"
5110                                         " subsystem\n", ss->name);
5111                                 break;
5112                         }
5113                 }
5114         }
5115         return 1;
5116 }
5117 __setup("cgroup_disable=", cgroup_disable);
5118
5119 /**
5120  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5121  * @dentry: directory dentry of interest
5122  * @ss: subsystem of interest
5123  *
5124  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5125  * to get the corresponding css and return it.  If such css doesn't exist
5126  * or can't be pinned, an ERR_PTR value is returned.
5127  */
5128 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
5129                                                        struct cgroup_subsys *ss)
5130 {
5131         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
5132         struct cgroup_subsys_state *css = NULL;
5133         struct cgroup *cgrp;
5134
5135         /* is @dentry a cgroup dir? */
5136         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
5137             kernfs_type(kn) != KERNFS_DIR)
5138                 return ERR_PTR(-EBADF);
5139
5140         rcu_read_lock();
5141
5142         /*
5143          * This path doesn't originate from kernfs and @kn could already
5144          * have been or be removed at any point.  @kn->priv is RCU
5145          * protected for this access.  See cgroup_rmdir() for details.
5146          */
5147         cgrp = rcu_dereference(kn->priv);
5148         if (cgrp)
5149                 css = cgroup_css(cgrp, ss);
5150
5151         if (!css || !css_tryget_online(css))
5152                 css = ERR_PTR(-ENOENT);
5153
5154         rcu_read_unlock();
5155         return css;
5156 }
5157
5158 /**
5159  * css_from_id - lookup css by id
5160  * @id: the cgroup id
5161  * @ss: cgroup subsys to be looked into
5162  *
5163  * Returns the css if there's valid one with @id, otherwise returns NULL.
5164  * Should be called under rcu_read_lock().
5165  */
5166 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5167 {
5168         WARN_ON_ONCE(!rcu_read_lock_held());
5169         return idr_find(&ss->css_idr, id);
5170 }
5171
5172 #ifdef CONFIG_CGROUP_DEBUG
5173 static struct cgroup_subsys_state *
5174 debug_css_alloc(struct cgroup_subsys_state *parent_css)
5175 {
5176         struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5177
5178         if (!css)
5179                 return ERR_PTR(-ENOMEM);
5180
5181         return css;
5182 }
5183
/* css_free callback of the debug subsystem: releases @css with kfree(). */
static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
5188
5189 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5190                                 struct cftype *cft)
5191 {
5192         return cgroup_task_count(css->cgroup);
5193 }
5194
5195 static u64 current_css_set_read(struct cgroup_subsys_state *css,
5196                                 struct cftype *cft)
5197 {
5198         return (u64)(unsigned long)current->cgroups;
5199 }
5200
5201 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5202                                          struct cftype *cft)
5203 {
5204         u64 count;
5205
5206         rcu_read_lock();
5207         count = atomic_read(&task_css_set(current)->refcount);
5208         rcu_read_unlock();
5209         return count;
5210 }
5211
5212 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5213 {
5214         struct cgrp_cset_link *link;
5215         struct css_set *cset;
5216         char *name_buf;
5217
5218         name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
5219         if (!name_buf)
5220                 return -ENOMEM;
5221
5222         down_read(&css_set_rwsem);
5223         rcu_read_lock();
5224         cset = rcu_dereference(current->cgroups);
5225         list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5226                 struct cgroup *c = link->cgrp;
5227
5228                 cgroup_name(c, name_buf, NAME_MAX + 1);
5229                 seq_printf(seq, "Root %d group %s\n",
5230                            c->root->hierarchy_id, name_buf);
5231         }
5232         rcu_read_unlock();
5233         up_read(&css_set_rwsem);
5234         kfree(name_buf);
5235         return 0;
5236 }
5237
5238 #define MAX_TASKS_SHOWN_PER_CSS 25
5239 static int cgroup_css_links_read(struct seq_file *seq, void *v)
5240 {
5241         struct cgroup_subsys_state *css = seq_css(seq);
5242         struct cgrp_cset_link *link;
5243
5244         down_read(&css_set_rwsem);
5245         list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5246                 struct css_set *cset = link->cset;
5247                 struct task_struct *task;
5248                 int count = 0;
5249
5250                 seq_printf(seq, "css_set %p\n", cset);
5251
5252                 list_for_each_entry(task, &cset->tasks, cg_list) {
5253                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5254                                 goto overflow;
5255                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
5256                 }
5257
5258                 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
5259                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5260                                 goto overflow;
5261                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
5262                 }
5263                 continue;
5264         overflow:
5265                 seq_puts(seq, "  ...\n");
5266         }
5267         up_read(&css_set_rwsem);
5268         return 0;
5269 }
5270
5271 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5272 {
5273         return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5274 }
5275
5276 static struct cftype debug_files[] =  {
5277         {
5278                 .name = "taskcount",
5279                 .read_u64 = debug_taskcount_read,
5280         },
5281
5282         {
5283                 .name = "current_css_set",
5284                 .read_u64 = current_css_set_read,
5285         },
5286
5287         {
5288                 .name = "current_css_set_refcount",
5289                 .read_u64 = current_css_set_refcount_read,
5290         },
5291
5292         {
5293                 .name = "current_css_set_cg_links",
5294                 .seq_show = current_css_set_cg_links_read,
5295         },
5296
5297         {
5298                 .name = "cgroup_css_links",
5299                 .seq_show = cgroup_css_links_read,
5300         },
5301
5302         {
5303                 .name = "releasable",
5304                 .read_u64 = releasable_read,
5305         },
5306
5307         { }     /* terminate */
5308 };
5309
5310 struct cgroup_subsys debug_cgrp_subsys = {
5311         .css_alloc = debug_css_alloc,
5312         .css_free = debug_css_free,
5313         .base_cftypes = debug_files,
5314 };
5315 #endif /* CONFIG_CGROUP_DEBUG */