git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge remote-tracking branch 'cgroup/for-next'
author Stephen Rothwell <sfr@canb.auug.org.au>
Thu, 5 Nov 2015 04:03:01 +0000 (15:03 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Thu, 5 Nov 2015 04:03:04 +0000 (15:03 +1100)
12 files changed:
Documentation/cgroups/unified-hierarchy.txt
block/blk-cgroup.c
include/linux/backing-dev.h
include/linux/init_task.h
include/linux/jump_label.h
include/linux/memcontrol.h
include/linux/sched.h
kernel/events/core.c
kernel/fork.c
kernel/sched/core.c
mm/memcontrol.c
mm/vmscan.c

index 5c07337b51c1b345483f4ba2699e6ce8b7ea2616,0cd27a4e0055fb94950a9bb89299a05d315278f3..781b1d475bcfc9666d0d3dad80832c74adb48b22
@@@ -107,12 -107,6 +107,6 @@@ root of unified hierarchy can be bound 
  allows mixing unified hierarchy with the traditional multiple
  hierarchies in a fully backward compatible way.
  
- For development purposes, the following boot parameter makes all
- controllers appear on the unified hierarchy whether supported or
- not.
-  cgroup__DEVEL__legacy_files_on_dfl
  A controller can be moved across hierarchies only after the controller
  is no longer referenced in its current hierarchy.  Because per-cgroup
  controller states are destroyed asynchronously and controllers may
@@@ -341,11 -335,11 +335,11 @@@ is riddled with issues
    unnecessarily complicated and probably done this way because event
    delivery itself was expensive.
  
- Unified hierarchy implements an interface file "cgroup.populated"
- which can be used to monitor whether the cgroup's subhierarchy has
- tasks in it or not.  Its value is 0 if there is no task in the cgroup
- and its descendants; otherwise, 1.  poll and [id]notify events are
- triggered when the value changes.
+ Unified hierarchy implements a "populated" field in the
+ "cgroup.events" interface file which can be used to monitor whether
+ the cgroup's subhierarchy has tasks in it or not.  Its value is 0 if
+ there is no task in the cgroup and its descendants; otherwise, 1.
+ poll and [id]notify events are triggered when the value changes.
  
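As a rough, hypothetical illustration (not part of this patch), a monitor could open "cgroup.events", poll() for the priority notification the kernel raises on changes, and re-read the file; the mount point and cgroup name below are made up:

  /* Sketch: watch the "populated" field of an assumed cgroup's cgroup.events. */
  #include <fcntl.h>
  #include <poll.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          const char *path = "/sys/fs/cgroup/mygroup/cgroup.events"; /* assumed path */
          char buf[256];
          struct pollfd pfd = { .events = POLLPRI };
          ssize_t n;

          pfd.fd = open(path, O_RDONLY);
          if (pfd.fd < 0)
                  return 1;

          for (;;) {
                  lseek(pfd.fd, 0, SEEK_SET);
                  n = read(pfd.fd, buf, sizeof(buf) - 1);
                  if (n < 0)
                          return 1;
                  buf[n] = '\0';
                  printf("%s", buf);        /* e.g. "populated 1" */
                  poll(&pfd, 1, -1);        /* block until the value changes */
          }
  }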
  This is significantly lighter and simpler and trivially allows
  delegating management of subhierarchy - subhierarchy monitoring can
@@@ -374,6 -368,10 +368,10 @@@ supported and the interface files "rele
  
  - The "cgroup.clone_children" file is removed.
  
+ - /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
+   to before exiting.  If the cgroup is removed before the zombie is
+   reaped, " (deleted)" is appended to the path.
  
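For example, with a zombie whose cgroup has since been removed, a read of /proc/<pid>/cgroup might show a line like the following (hierarchy number and path are illustrative only):

  0::/parent/child (deleted)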
  5-3. Controller File Conventions
  
@@@ -435,6 -433,11 +433,11 @@@ may be specified in any order and not a
    the first entry in the file.  Specific entries can use "default" as
    its value to indicate inheritance of the default value.
  
+ - For events which are not very high frequency, an interface file
+   "events" should be created which lists event key-value pairs.
+   Whenever a notifiable event happens, a file modified event should
+   be generated on the file.
  
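A minimal kernel-side sketch of this convention, with made-up "foo" names throughout; the cgroup_file handle and cgroup_file_notify() plumbing mirror what the memory controller's events_file support does later in this diff (registration of foo_files via the cgroup core is omitted):

  #include <linux/cgroup.h>
  #include <linux/seq_file.h>

  /* illustrative controller state */
  struct foo_cgroup {
          struct cgroup_subsys_state css;
          unsigned long nr_max_events;
          struct cgroup_file events_file;         /* handle for "foo.events" */
  };

  static int foo_events_show(struct seq_file *sf, void *v)
  {
          struct foo_cgroup *foo = container_of(seq_css(sf),
                                                struct foo_cgroup, css);

          /* key value pairs, one per line */
          seq_printf(sf, "max %lu\n", foo->nr_max_events);
          return 0;
  }

  static struct cftype foo_files[] = {
          {
                  .name = "events",
                  .file_offset = offsetof(struct foo_cgroup, events_file),
                  .seq_show = foo_events_show,
          },
          { }     /* terminate */
  };

  /* wherever the notifiable event is raised: */
  static void foo_note_max_event(struct foo_cgroup *foo)
  {
          foo->nr_max_events++;
          cgroup_file_notify(&foo->events_file);  /* file modified event */
  }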
  5-4. Per-Controller Changes
  
        ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
        read/write IOs per second.  "max" indicates no limit.  Writing
        to the file follows the same format but the individual
 -      settings may be ommitted or specified in any order.
 +      settings may be omitted or specified in any order.
  
        This file is available only on non-root cgroups.
  
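Though this hunk elides its surrounding context, the settings it describes are written as per-device key=value pairs.  A hedged userspace sketch follows; the file name ("io.max"), the device numbers and the rbps/wbps key names are assumptions, not taken from the hunk:

  /* Sketch: cap reads on device 8:16 to 2 MB/s, leave writes unlimited. */
  #include <stdio.h>

  int main(void)
  {
          FILE *f = fopen("/sys/fs/cgroup/mygroup/io.max", "w");  /* assumed path */

          if (!f)
                  return 1;
          /* keys may be omitted or given in any order; "max" means no limit */
          fprintf(f, "8:16 rbps=2097152 wbps=max\n");
          return fclose(f) ? 1 : 0;
  }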
diff --combined block/blk-cgroup.c
index 55512dd626336eae49b758def08d601bc3515b74,4fa54161d423097da1441a41004ce416bf547280..5bcdfc10c23a6340367c4b9781496a49b5c81efe
@@@ -370,9 -370,6 +370,9 @@@ static void blkg_destroy_all(struct req
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }
 +
 +      q->root_blkg = NULL;
 +      q->root_rl.blkg = NULL;
  }
  
  /*
@@@ -899,6 -896,7 +899,7 @@@ static int blkcg_print_stat(struct seq_
  struct cftype blkcg_files[] = {
        {
                .name = "stat",
+               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = blkcg_print_stat,
        },
        { }     /* terminate */
index c85f74946a8bab65ff3f16cddea6a4446b0a4799,08d9a8eac42c35fb103747ff105091f4af0f27bd..c82794f20110420582d496ae478bc600f9400233
  #include <linux/sched.h>
  #include <linux/blkdev.h>
  #include <linux/writeback.h>
- #include <linux/memcontrol.h>
  #include <linux/blk-cgroup.h>
  #include <linux/backing-dev-defs.h>
  #include <linux/slab.h>
  
  int __must_check bdi_init(struct backing_dev_info *bdi);
 -void bdi_destroy(struct backing_dev_info *bdi);
 +void bdi_exit(struct backing_dev_info *bdi);
  
  __printf(3, 4)
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...);
  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 +void bdi_unregister(struct backing_dev_info *bdi);
 +
  int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 +void bdi_destroy(struct backing_dev_info *bdi);
 +
  void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
                        bool range_cyclic, enum wb_reason reason);
  void wb_start_background_writeback(struct bdi_writeback *wb);
@@@ -267,8 -262,8 +266,8 @@@ static inline bool inode_cgwb_enabled(s
  {
        struct backing_dev_info *bdi = inode_to_bdi(inode);
  
-       return cgroup_on_dfl(mem_cgroup_root_css->cgroup) &&
-               cgroup_on_dfl(blkcg_root_css->cgroup) &&
+       return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+               cgroup_subsys_on_dfl(io_cgrp_subsys) &&
                bdi_cap_account_dirty(bdi) &&
                (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
                (inode->i_sb->s_iflags & SB_I_CGROUPWB);
@@@ -412,6 -407,61 +411,6 @@@ static inline void unlocked_inode_to_wb
        rcu_read_unlock();
  }
  
 -struct wb_iter {
 -      int                     start_memcg_id;
 -      struct radix_tree_iter  tree_iter;
 -      void                    **slot;
 -};
 -
 -static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 -                                                 struct backing_dev_info *bdi)
 -{
 -      struct radix_tree_iter *titer = &iter->tree_iter;
 -
 -      WARN_ON_ONCE(!rcu_read_lock_held());
 -
 -      if (iter->start_memcg_id >= 0) {
 -              iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
 -              iter->start_memcg_id = -1;
 -      } else {
 -              iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
 -      }
 -
 -      if (!iter->slot)
 -              iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
 -      if (iter->slot)
 -              return *iter->slot;
 -      return NULL;
 -}
 -
 -static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
 -                                                 struct backing_dev_info *bdi,
 -                                                 int start_memcg_id)
 -{
 -      iter->start_memcg_id = start_memcg_id;
 -
 -      if (start_memcg_id)
 -              return __wb_iter_next(iter, bdi);
 -      else
 -              return &bdi->wb;
 -}
 -
 -/**
 - * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
 - * @wb_cur: cursor struct bdi_writeback pointer
 - * @bdi: bdi to walk wb's of
 - * @iter: pointer to struct wb_iter to be used as iteration buffer
 - * @start_memcg_id: memcg ID to start iteration from
 - *
 - * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
 - * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
 - * to be used as temp storage during iteration.  rcu_read_lock() must be
 - * held throughout iteration.
 - */
 -#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)            \
 -      for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);      \
 -           (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
 -
  #else /* CONFIG_CGROUP_WRITEBACK */
  
  static inline bool inode_cgwb_enabled(struct inode *inode)
@@@ -471,6 -521,14 +470,6 @@@ static inline void wb_blkcg_offline(str
  {
  }
  
 -struct wb_iter {
 -      int             next_id;
 -};
 -
 -#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)            \
 -      for ((iter)->next_id = (start_blkcg_id);                        \
 -           ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
 -
  static inline int inode_congested(struct inode *inode, int cong_bits)
  {
        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
index 810a34f604247065d78c2820ed34b0bf8d3aabf6,d0b380ee7d67abbd421bf69fdd63ff10b2aa88b1..1c1ff7e4faa4bf158166b789605107f6a65baf44
  extern struct files_struct init_files;
  extern struct fs_struct init_fs;
  
- #ifdef CONFIG_CGROUPS
- #define INIT_GROUP_RWSEM(sig)                                         \
-       .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
- #else
- #define INIT_GROUP_RWSEM(sig)
- #endif
  #ifdef CONFIG_CPUSETS
  #define INIT_CPUSET_SEQ(tsk)                                                  \
        .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
        .rlim           = INIT_RLIMITS,                                 \
        .cputimer       = {                                             \
                .cputime_atomic = INIT_CPUTIME_ATOMIC,                  \
 -              .running        = 0,                                    \
 +              .running        = false,                                \
 +              .checking_timer = false,                                \
        },                                                              \
        INIT_PREV_CPUTIME(sig)                                          \
        .cred_guard_mutex =                                             \
                 __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
-       INIT_GROUP_RWSEM(sig)                                           \
  }
  
  extern struct nsproxy init_nsproxy;
index f1094238ab2a0f0fddeb40e3c7aadde7c2a89015,c9ca050de8466afdd28189f42598620912c4b53d..8dde55974f186bca7c1488866aaacdd805c347b3
@@@ -21,8 -21,8 +21,8 @@@
   *
   * DEFINE_STATIC_KEY_TRUE(key);
   * DEFINE_STATIC_KEY_FALSE(key);
 - * static_key_likely()
 - * statick_key_unlikely()
 + * static_branch_likely()
 + * static_branch_unlikely()
   *
   * Jump labels provide an interface to generate dynamic branches using
   * self-modifying code. Assuming toolchain and architecture support, if we
   * statement, setting the key to true requires us to patch in a jump
   * to the out-of-line of true branch.
   *
 - * In addtion to static_branch_{enable,disable}, we can also reference count
 + * In addition to static_branch_{enable,disable}, we can also reference count
   * the key or branch direction via static_branch_{inc,dec}. Thus,
   * static_branch_inc() can be thought of as a 'make more true' and
 - * static_branch_dec() as a 'make more false'. The inc()/dec()
 - * interface is meant to be used exclusively from the inc()/dec() for a given
 - * key.
 + * static_branch_dec() as a 'make more false'.
   *
   * Since this relies on modifying code, the branch modifying functions
   * must be considered absolute slow paths (machine wide synchronization etc.).
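A brief usage sketch of the renamed helpers described above (kernel-module style, all names invented for illustration):

  #include <linux/jump_label.h>

  /* defaults to false, so the fast path is a fall-through */
  static DEFINE_STATIC_KEY_FALSE(my_feature_key);

  static void my_feature_slow_path(void)
  {
          /* whatever the optional feature does */
  }

  static void hot_path(void)
  {
          if (static_branch_unlikely(&my_feature_key))
                  my_feature_slow_path();   /* patched in only when enabled */
  }

  /* slow-path control, e.g. from a sysctl or module parameter handler */
  static void my_feature_set(bool on)
  {
          if (on)
                  static_branch_enable(&my_feature_key);
          else
                  static_branch_disable(&my_feature_key);
  }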
@@@ -214,11 -216,6 +214,6 @@@ static inline int jump_label_apply_nops
  #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
  #define jump_label_enabled static_key_enabled
  
- static inline bool static_key_enabled(struct static_key *key)
- {
-       return static_key_count(key) > 0;
- }
  static inline void static_key_enable(struct static_key *key)
  {
        int count = static_key_count(key);
@@@ -265,6 -262,17 +260,17 @@@ struct static_key_false 
  #define DEFINE_STATIC_KEY_FALSE(name) \
        struct static_key_false name = STATIC_KEY_FALSE_INIT
  
+ extern bool ____wrong_branch_error(void);
+ #define static_key_enabled(x)                                                 \
+ ({                                                                            \
+       if (!__builtin_types_compatible_p(typeof(*x), struct static_key) &&     \
+           !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
+           !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
+               ____wrong_branch_error();                                       \
+       static_key_count((struct static_key *)x) > 0;                           \
+ })
  #ifdef HAVE_JUMP_LABEL
  
  /*
   * See jump_label_type() / jump_label_init_type().
   */
  
- extern bool ____wrong_branch_error(void);
  #define static_branch_likely(x)                                                       \
  ({                                                                            \
        bool branch;                                                            \
index 3e3318ddfc0e3e09a0e15825f78eb6052d628d78,c83c699a6605b982ca8fbec56f9510d44e2c9cbd..27251ed428f7db8adaf54c58b7f9e41deda9048d
@@@ -213,6 -213,9 +213,9 @@@ struct mem_cgroup 
        /* OOM-Killer disable */
        int             oom_kill_disable;
  
+       /* handle for "memory.events" */
+       struct cgroup_file events_file;
        /* protect arrays of thresholds */
        struct mutex thresholds_lock;
  
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
 -      spinlock_t pcp_counter_lock;
  
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
        struct cg_proto tcp_mem;
@@@ -285,6 -289,7 +288,7 @@@ static inline void mem_cgroup_events(st
                       unsigned int nr)
  {
        this_cpu_add(memcg->stat->events[idx], nr);
+       cgroup_file_notify(&memcg->events_file);
  }
  
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
@@@ -346,9 -351,7 +350,7 @@@ ino_t page_cgroup_ino(struct page *page
  
  static inline bool mem_cgroup_disabled(void)
  {
-       if (memory_cgrp_subsys.disabled)
-               return true;
-       return false;
+       return !cgroup_subsys_enabled(memory_cgrp_subsys);
  }
  
  /*
@@@ -676,9 -679,8 +678,9 @@@ enum 
  
  struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback);
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback);
  
  #else /* CONFIG_CGROUP_WRITEBACK */
  
@@@ -688,8 -690,7 +690,8 @@@ static inline struct wb_domain *mem_cgr
  }
  
  static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 -                                     unsigned long *pavail,
 +                                     unsigned long *pfilepages,
 +                                     unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
  {
diff --combined include/linux/sched.h
index c115d617739d8e6f1f388c13069ad32cfca1bb3b,a4ab9daa387c0bbcaca1923620ceb2ed74bfd84e..4effb1025fbb1555bc9c3ce6f80d98db004271a2
@@@ -599,42 -599,33 +599,42 @@@ struct task_cputime_atomic 
                .sum_exec_runtime = ATOMIC64_INIT(0),           \
        }
  
 -#ifdef CONFIG_PREEMPT_COUNT
 -#define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
 -#else
 -#define PREEMPT_DISABLED      PREEMPT_ENABLED
 -#endif
 +#define PREEMPT_DISABLED      (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 +
 +/*
 + * Disable preemption until the scheduler is running -- use an unconditional
 + * value so that it also works on !PREEMPT_COUNT kernels.
 + *
 + * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 + */
 +#define INIT_PREEMPT_COUNT    PREEMPT_OFFSET
  
  /*
 - * Disable preemption until the scheduler is running.
 - * Reset by start_kernel()->sched_init()->init_idle().
 + * Initial preempt_count value; reflects the preempt_count schedule invariant
 + * which states that during context switches:
   *
 - * We include PREEMPT_ACTIVE to avoid cond_resched() from working
 - * before the scheduler is active -- see should_resched().
 + *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 + *
 + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 + * Note: See finish_task_switch().
   */
 -#define INIT_PREEMPT_COUNT    (PREEMPT_DISABLED + PREEMPT_ACTIVE)
 +#define FORK_PREEMPT_COUNT    (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
   * @cputime_atomic:   atomic thread group interval timers.
 - * @running:          non-zero when there are timers running and
 - *                    @cputime receives updates.
 + * @running:          true when there are timers running and
 + *                    @cputime_atomic receives updates.
 + * @checking_timer:   true when a thread in the group is in the
 + *                    process of checking for thread group timers.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
 -      int running;
 +      bool running;
 +      bool checking_timer;
  };
  
  #include <linux/rwsem.h>
@@@ -771,18 -762,6 +771,6 @@@ struct signal_struct 
        unsigned audit_tty_log_passwd;
        struct tty_audit_buf *tty_audit_buf;
  #endif
- #ifdef CONFIG_CGROUPS
-       /*
-        * group_rwsem prevents new tasks from entering the threadgroup and
-        * member tasks from exiting, more specifically, setting of
-        * PF_EXITING.  fork and exit paths are protected with this rwsem
-        * using threadgroup_change_begin/end().  Users which require
-        * threadgroup to remain stable should use threadgroup_[un]lock()
-        * which also takes care of exec path.  Currently, cgroup is the
-        * only user.
-        */
-       struct rw_semaphore group_rwsem;
- #endif
  
        oom_flags_t oom_flags;
        short oom_score_adj;            /* OOM kill score adjustment */
@@@ -849,7 -828,7 +837,7 @@@ struct user_struct 
        struct hlist_node uidhash_node;
        kuid_t uid;
  
 -#ifdef CONFIG_PERF_EVENTS
 +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
        atomic_long_t locked_vm;
  #endif
  };
@@@ -1148,6 -1127,8 +1136,6 @@@ struct sched_domain_topology_level 
  #endif
  };
  
 -extern struct sched_domain_topology_level *sched_domain_topology;
 -
  extern void set_sched_topology(struct sched_domain_topology_level *tl);
  extern void wake_up_if_idle(int cpu);
  
@@@ -1196,10 -1177,10 +1184,10 @@@ struct load_weight 
  
  /*
   * The load_avg/util_avg accumulates an infinite geometric series.
 - * 1) load_avg factors the amount of time that a sched_entity is
 - * runnable on a rq into its weight. For cfs_rq, it is the aggregated
 - * such weights of all runnable and blocked sched_entities.
 - * 2) util_avg factors frequency scaling into the amount of time
 + * 1) load_avg factors frequency scaling into the amount of time that a
 + * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
 + * aggregated such weights of all runnable and blocked sched_entities.
 + * 2) util_avg factors frequency and cpu scaling into the amount of time
   * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
   * For cfs_rq, it is the aggregated such times of all runnable and
   * blocked sched_entities.
@@@ -1349,12 -1330,10 +1337,12 @@@ struct sched_dl_entity 
  
  union rcu_special {
        struct {
 -              bool blocked;
 -              bool need_qs;
 -      } b;
 -      short s;
 +              u8 blocked;
 +              u8 need_qs;
 +              u8 exp_need_qs;
 +              u8 pad; /* Otherwise the compiler can store garbage here. */
 +      } b; /* Bits. */
 +      u32 s; /* Set of bits. */
  };
  struct rcu_node;
  
diff --combined kernel/events/core.c
index 39db20c6248e47c940bd8721c41ade530e1eb5c9,e9874949c78734d2c662845aedb69d79ccd5b839..1a734e0adfa78259dac6cae57239a2d5f992a9ac
@@@ -196,7 -196,7 +196,7 @@@ static int perf_sample_period_ns __read
  static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
  
 -void update_perf_cpu_limits(void)
 +static void update_perf_cpu_limits(void)
  {
        u64 tmp = perf_sample_period_ns;
  
@@@ -472,7 -472,7 +472,7 @@@ perf_cgroup_set_timestamp(struct task_s
   * mode SWOUT : schedule out everything
   * mode SWIN : schedule in based on cgroup for next
   */
 -void perf_cgroup_switch(struct task_struct *task, int mode)
 +static void perf_cgroup_switch(struct task_struct *task, int mode)
  {
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
@@@ -1243,7 -1243,11 +1243,7 @@@ static inline void perf_event__state_in
                                              PERF_EVENT_STATE_INACTIVE;
  }
  
 -/*
 - * Called at perf_event creation and when events are attached/detached from a
 - * group.
 - */
 -static void perf_event__read_size(struct perf_event *event)
 +static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
  {
        int entry = sizeof(u64); /* value */
        int size = 0;
                entry += sizeof(u64);
  
        if (event->attr.read_format & PERF_FORMAT_GROUP) {
 -              nr += event->group_leader->nr_siblings;
 +              nr += nr_siblings;
                size += sizeof(u64);
        }
  
        event->read_size = size;
  }
  
 -static void perf_event__header_size(struct perf_event *event)
 +static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
  {
        struct perf_sample_data *data;
 -      u64 sample_type = event->attr.sample_type;
        u16 size = 0;
  
 -      perf_event__read_size(event);
 -
        if (sample_type & PERF_SAMPLE_IP)
                size += sizeof(data->ip);
  
        event->header_size = size;
  }
  
 +/*
 + * Called at perf_event creation and when events are attached/detached from a
 + * group.
 + */
 +static void perf_event__header_size(struct perf_event *event)
 +{
 +      __perf_event_read_size(event,
 +                             event->group_leader->nr_siblings);
 +      __perf_event_header_size(event, event->attr.sample_type);
 +}
 +
  static void perf_event__id_header_size(struct perf_event *event)
  {
        struct perf_sample_data *data;
        event->id_header_size = size;
  }
  
 +static bool perf_event_validate_size(struct perf_event *event)
 +{
 +      /*
 +       * The values computed here will be over-written when we actually
 +       * attach the event.
 +       */
 +      __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
 +      __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
 +      perf_event__id_header_size(event);
 +
 +      /*
 +       * Sum the lot; should not exceed the 64k limit we have on records.
 +       * Conservative limit to allow for callchains and other variable fields.
 +       */
 +      if (event->read_size + event->header_size +
 +          event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
 +              return false;
 +
 +      return true;
 +}
 +
  static void perf_group_attach(struct perf_event *event)
  {
        struct perf_event *group_leader = event->group_leader, *pos;
@@@ -1939,7 -1914,7 +1939,7 @@@ group_sched_in(struct perf_event *group
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
  
 -      pmu->start_txn(pmu);
 +      pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
  
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
@@@ -3209,22 -3184,14 +3209,22 @@@ void perf_event_exec(void
        rcu_read_unlock();
  }
  
 +struct perf_read_data {
 +      struct perf_event *event;
 +      bool group;
 +      int ret;
 +};
 +
  /*
   * Cross CPU call to read the hardware event
   */
  static void __perf_event_read(void *info)
  {
 -      struct perf_event *event = info;
 +      struct perf_read_data *data = info;
 +      struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 +      struct pmu *pmu = event->pmu;
  
        /*
         * If this is a task context, we need to check whether it is
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }
 +
        update_event_times(event);
 -      if (event->state == PERF_EVENT_STATE_ACTIVE)
 -              event->pmu->read(event);
 +      if (event->state != PERF_EVENT_STATE_ACTIVE)
 +              goto unlock;
 +
 +      if (!data->group) {
 +              pmu->read(event);
 +              data->ret = 0;
 +              goto unlock;
 +      }
 +
 +      pmu->start_txn(pmu, PERF_PMU_TXN_READ);
 +
 +      pmu->read(event);
 +
 +      list_for_each_entry(sub, &event->sibling_list, group_entry) {
 +              update_event_times(sub);
 +              if (sub->state == PERF_EVENT_STATE_ACTIVE) {
 +                      /*
 +                       * Use sibling's PMU rather than @event's since
 +                       * sibling could be on different (eg: software) PMU.
 +                       */
 +                      sub->pmu->read(sub);
 +              }
 +      }
 +
 +      data->ret = pmu->commit_txn(pmu);
 +
 +unlock:
        raw_spin_unlock(&ctx->lock);
  }
  
@@@ -3334,23 -3275,15 +3334,23 @@@ u64 perf_event_read_local(struct perf_e
        return val;
  }
  
 -static u64 perf_event_read(struct perf_event *event)
 +static int perf_event_read(struct perf_event *event, bool group)
  {
 +      int ret = 0;
 +
        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE) {
 +              struct perf_read_data data = {
 +                      .event = event,
 +                      .group = group,
 +                      .ret = 0,
 +              };
                smp_call_function_single(event->oncpu,
 -                                       __perf_event_read, event, 1);
 +                                       __perf_event_read, &data, 1);
 +              ret = data.ret;
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }
 -              update_event_times(event);
 +              if (group)
 +                      update_group_times(event);
 +              else
 +                      update_event_times(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
  
 -      return perf_event_count(event);
 +      return ret;
  }
  
  /*
@@@ -3814,7 -3744,7 +3814,7 @@@ static void put_event(struct perf_even
         *     see the comment there.
         *
         *  2) there is a lock-inversion with mmap_sem through
 -       *     perf_event_read_group(), which takes faults while
 +       *     perf_read_group(), which takes faults while
         *     holding ctx->mutex, however this is called after
         *     the last filedesc died, so there is no possibility
         *     to trigger the AB-BA case.
@@@ -3888,18 -3818,14 +3888,18 @@@ u64 perf_event_read_value(struct perf_e
        *running = 0;
  
        mutex_lock(&event->child_mutex);
 -      total += perf_event_read(event);
 +
 +      (void)perf_event_read(event, false);
 +      total += perf_event_count(event);
 +
        *enabled += event->total_time_enabled +
                        atomic64_read(&event->child_total_time_enabled);
        *running += event->total_time_running +
                        atomic64_read(&event->child_total_time_running);
  
        list_for_each_entry(child, &event->child_list, child_list) {
 -              total += perf_event_read(child);
 +              (void)perf_event_read(child, false);
 +              total += perf_event_count(child);
                *enabled += child->total_time_enabled;
                *running += child->total_time_running;
        }
  }
  EXPORT_SYMBOL_GPL(perf_event_read_value);
  
 -static int perf_event_read_group(struct perf_event *event,
 -                                 u64 read_format, char __user *buf)
 +static int __perf_read_group_add(struct perf_event *leader,
 +                                      u64 read_format, u64 *values)
  {
 -      struct perf_event *leader = event->group_leader, *sub;
 -      struct perf_event_context *ctx = leader->ctx;
 -      int n = 0, size = 0, ret;
 -      u64 count, enabled, running;
 -      u64 values[5];
 +      struct perf_event *sub;
 +      int n = 1; /* skip @nr */
 +      int ret;
  
 -      lockdep_assert_held(&ctx->mutex);
 +      ret = perf_event_read(leader, true);
 +      if (ret)
 +              return ret;
  
 -      count = perf_event_read_value(leader, &enabled, &running);
 +      /*
 +       * Since we co-schedule groups, {enabled,running} times of siblings
 +       * will be identical to those of the leader, so we only publish one
 +       * set.
 +       */
 +      if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
 +              values[n++] += leader->total_time_enabled +
 +                      atomic64_read(&leader->child_total_time_enabled);
 +      }
  
 -      values[n++] = 1 + leader->nr_siblings;
 -      if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 -              values[n++] = enabled;
 -      if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 -              values[n++] = running;
 -      values[n++] = count;
 +      if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
 +              values[n++] += leader->total_time_running +
 +                      atomic64_read(&leader->child_total_time_running);
 +      }
 +
 +      /*
 +       * Write {count,id} tuples for every sibling.
 +       */
 +      values[n++] += perf_event_count(leader);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
 -      size = n * sizeof(u64);
 +      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +              values[n++] += perf_event_count(sub);
 +              if (read_format & PERF_FORMAT_ID)
 +                      values[n++] = primary_event_id(sub);
 +      }
  
 -      if (copy_to_user(buf, values, size))
 -              return -EFAULT;
 +      return 0;
 +}
  
 -      ret = size;
 +static int perf_read_group(struct perf_event *event,
 +                                 u64 read_format, char __user *buf)
 +{
 +      struct perf_event *leader = event->group_leader, *child;
 +      struct perf_event_context *ctx = leader->ctx;
 +      int ret;
 +      u64 *values;
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 -              n = 0;
 +      lockdep_assert_held(&ctx->mutex);
  
 -              values[n++] = perf_event_read_value(sub, &enabled, &running);
 -              if (read_format & PERF_FORMAT_ID)
 -                      values[n++] = primary_event_id(sub);
 +      values = kzalloc(event->read_size, GFP_KERNEL);
 +      if (!values)
 +              return -ENOMEM;
  
 -              size = n * sizeof(u64);
 +      values[0] = 1 + leader->nr_siblings;
  
 -              if (copy_to_user(buf + ret, values, size)) {
 -                      return -EFAULT;
 -              }
 +      /*
 +       * By locking the child_mutex of the leader we effectively
 +       * lock the child list of all siblings.. XXX explain how.
 +       */
 +      mutex_lock(&leader->child_mutex);
  
 -              ret += size;
 +      ret = __perf_read_group_add(leader, read_format, values);
 +      if (ret)
 +              goto unlock;
 +
 +      list_for_each_entry(child, &leader->child_list, child_list) {
 +              ret = __perf_read_group_add(child, read_format, values);
 +              if (ret)
 +                      goto unlock;
        }
  
 +      mutex_unlock(&leader->child_mutex);
 +
 +      ret = event->read_size;
 +      if (copy_to_user(buf, values, event->read_size))
 +              ret = -EFAULT;
 +      goto out;
 +
 +unlock:
 +      mutex_unlock(&leader->child_mutex);
 +out:
 +      kfree(values);
        return ret;
  }
  
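For orientation, the buffer perf_read_group() fills follows the perf_event_open(2) group read layout; a sketch of that layout, assuming PERF_FORMAT_TOTAL_TIME_{ENABLED,RUNNING} and PERF_FORMAT_ID are all requested:

  /* What userspace sees after read() on a PERF_FORMAT_GROUP leader. */
  #include <linux/types.h>

  struct group_read_format {
          __u64 nr;               /* 1 + nr_siblings                      */
          __u64 time_enabled;     /* PERF_FORMAT_TOTAL_TIME_ENABLED       */
          __u64 time_running;     /* PERF_FORMAT_TOTAL_TIME_RUNNING       */
          struct {
                  __u64 value;    /* count, summed over child events      */
                  __u64 id;       /* PERF_FORMAT_ID                       */
          } values[];             /* leader first, then each sibling      */
  };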
 -static int perf_event_read_one(struct perf_event *event,
 +static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
  {
        u64 enabled, running;
@@@ -4035,7 -3921,7 +4035,7 @@@ static bool is_event_hup(struct perf_ev
   * Read the performance event - simple non blocking version for now
   */
  static ssize_t
 -perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
 +__perf_read(struct perf_event *event, char __user *buf, size_t count)
  {
        u64 read_format = event->attr.read_format;
        int ret;
  
        WARN_ON_ONCE(event->ctx->parent_ctx);
        if (read_format & PERF_FORMAT_GROUP)
 -              ret = perf_event_read_group(event, read_format, buf);
 +              ret = perf_read_group(event, read_format, buf);
        else
 -              ret = perf_event_read_one(event, read_format, buf);
 +              ret = perf_read_one(event, read_format, buf);
  
        return ret;
  }
@@@ -4068,7 -3954,7 +4068,7 @@@ perf_read(struct file *file, char __use
        int ret;
  
        ctx = perf_event_ctx_lock(event);
 -      ret = perf_read_hw(event, buf, count);
 +      ret = __perf_read(event, buf, count);
        perf_event_ctx_unlock(event, ctx);
  
        return ret;
@@@ -4099,7 -3985,7 +4099,7 @@@ static unsigned int perf_poll(struct fi
  
  static void _perf_event_reset(struct perf_event *event)
  {
 -      (void)perf_event_read(event);
 +      (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
  }
@@@ -5375,15 -5261,9 +5375,15 @@@ void perf_output_sample(struct perf_out
  
        if (sample_type & PERF_SAMPLE_RAW) {
                if (data->raw) {
 -                      perf_output_put(handle, data->raw->size);
 -                      __output_copy(handle, data->raw->data,
 -                                         data->raw->size);
 +                      u32 raw_size = data->raw->size;
 +                      u32 real_size = round_up(raw_size + sizeof(u32),
 +                                               sizeof(u64)) - sizeof(u32);
 +                      u64 zero = 0;
 +
 +                      perf_output_put(handle, real_size);
 +                      __output_copy(handle, data->raw->data, raw_size);
 +                      if (real_size - raw_size)
 +                              __output_copy(handle, &zero, real_size - raw_size);
                } else {
                        struct {
                                u32     size;
@@@ -5515,7 -5395,8 +5515,7 @@@ void perf_prepare_sample(struct perf_ev
                else
                        size += sizeof(u32);
  
 -              WARN_ON_ONCE(size & (sizeof(u64)-1));
 -              header->size += size;
 +              header->size += round_up(size, sizeof(u64));
        }
  
        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@@ -7386,49 -7267,24 +7386,49 @@@ static void perf_pmu_nop_void(struct pm
  {
  }
  
 +static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
 +{
 +}
 +
  static int perf_pmu_nop_int(struct pmu *pmu)
  {
        return 0;
  }
  
 -static void perf_pmu_start_txn(struct pmu *pmu)
 +static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
 +
 +static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
  {
 +      __this_cpu_write(nop_txn_flags, flags);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return;
 +
        perf_pmu_disable(pmu);
  }
  
  static int perf_pmu_commit_txn(struct pmu *pmu)
  {
 +      unsigned int flags = __this_cpu_read(nop_txn_flags);
 +
 +      __this_cpu_write(nop_txn_flags, 0);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return 0;
 +
        perf_pmu_enable(pmu);
        return 0;
  }
  
  static void perf_pmu_cancel_txn(struct pmu *pmu)
  {
 +      unsigned int flags =  __this_cpu_read(nop_txn_flags);
 +
 +      __this_cpu_write(nop_txn_flags, 0);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return;
 +
        perf_pmu_enable(pmu);
  }
  
@@@ -7667,7 -7523,7 +7667,7 @@@ got_cpu_context
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
 -                      pmu->start_txn  = perf_pmu_nop_void;
 +                      pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
@@@ -7755,7 -7611,7 +7755,7 @@@ static int perf_try_init_event(struct p
        return ret;
  }
  
 -struct pmu *perf_init_event(struct perf_event *event)
 +static struct pmu *perf_init_event(struct perf_event *event)
  {
        struct pmu *pmu = NULL;
        int idx;
@@@ -8441,35 -8297,13 +8441,35 @@@ SYSCALL_DEFINE5(perf_event_open
  
        if (move_group) {
                gctx = group_leader->ctx;
 +              mutex_lock_double(&gctx->mutex, &ctx->mutex);
 +      } else {
 +              mutex_lock(&ctx->mutex);
 +      }
 +
 +      if (!perf_event_validate_size(event)) {
 +              err = -E2BIG;
 +              goto err_locked;
 +      }
 +
 +      /*
 +       * Must be under the same ctx::mutex as perf_install_in_context(),
 +       * because we need to serialize with concurrent event creation.
 +       */
 +      if (!exclusive_event_installable(event, ctx)) {
 +              /* exclusive and group stuff are assumed mutually exclusive */
 +              WARN_ON_ONCE(move_group);
 +
 +              err = -EBUSY;
 +              goto err_locked;
 +      }
 +
 +      WARN_ON_ONCE(ctx->parent_ctx);
  
 +      if (move_group) {
                /*
                 * See perf_event_ctx_lock() for comments on the details
                 * of swizzling perf_event::ctx.
                 */
 -              mutex_lock_double(&gctx->mutex, &ctx->mutex);
 -
                perf_remove_from_context(group_leader, false);
  
                list_for_each_entry(sibling, &group_leader->sibling_list,
                        perf_remove_from_context(sibling, false);
                        put_ctx(gctx);
                }
 -      } else {
 -              mutex_lock(&ctx->mutex);
 -      }
 -
 -      WARN_ON_ONCE(ctx->parent_ctx);
  
 -      if (move_group) {
                /*
                 * Wait for everybody to stop referencing the events through
                 * the old lists, before installing it on new lists.
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
                get_ctx(ctx);
 -      }
  
 -      if (!exclusive_event_installable(event, ctx)) {
 -              err = -EBUSY;
 -              mutex_unlock(&ctx->mutex);
 -              fput(event_file);
 -              goto err_context;
 +              /*
 +               * Now that all events are installed in @ctx, nothing
 +               * references @gctx anymore, so drop the last reference we have
 +               * on it.
 +               */
 +              put_ctx(gctx);
        }
  
 +      /*
 +       * Precalculate sample_data sizes; do while holding ctx::mutex such
 +       * that we're serialized against further additions and before
 +       * perf_install_in_context() which is the point the event is active and
 +       * can use these values.
 +       */
 +      perf_event__header_size(event);
 +      perf_event__id_header_size(event);
 +
        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
  
 -      if (move_group) {
 +      if (move_group)
                mutex_unlock(&gctx->mutex);
 -              put_ctx(gctx);
 -      }
        mutex_unlock(&ctx->mutex);
  
        put_online_cpus();
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);
  
 -      /*
 -       * Precalculate sample_data sizes
 -       */
 -      perf_event__header_size(event);
 -      perf_event__id_header_size(event);
 -
        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
        fd_install(event_fd, event_file);
        return event_fd;
  
 +err_locked:
 +      if (move_group)
 +              mutex_unlock(&gctx->mutex);
 +      mutex_unlock(&ctx->mutex);
 +/* err_file: */
 +      fput(event_file);
  err_context:
        perf_unpin_context(ctx);
        put_ctx(ctx);
@@@ -9460,17 -9293,9 +9460,9 @@@ static void perf_cgroup_attach(struct c
                task_function_call(task, __perf_cgroup_move, task);
  }
  
- static void perf_cgroup_exit(struct cgroup_subsys_state *css,
-                            struct cgroup_subsys_state *old_css,
-                            struct task_struct *task)
- {
-       task_function_call(task, __perf_cgroup_move, task);
- }
  struct cgroup_subsys perf_event_cgrp_subsys = {
        .css_alloc      = perf_cgroup_css_alloc,
        .css_free       = perf_cgroup_css_free,
-       .exit           = perf_cgroup_exit,
        .attach         = perf_cgroup_attach,
  };
  #endif /* CONFIG_CGROUP_PERF */
diff --combined kernel/fork.c
index 6ac894244d3978fb800f7a1a02912bb2901e5e84,118743bb596498edb919f09b92cbca956f479595..825ecc32454d23f4e60216bedfb2de31fe504699
@@@ -251,6 -251,7 +251,7 @@@ void __put_task_struct(struct task_stru
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);
  
+       cgroup_free(tsk);
        task_numa_free(tsk);
        security_task_free(tsk);
        exit_creds(tsk);
@@@ -1101,7 -1102,7 +1102,7 @@@ static void posix_cpu_timers_init_group
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
        if (cpu_limit != RLIM_INFINITY) {
                sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
 -              sig->cputimer.running = 1;
 +              sig->cputimer.running = true;
        }
  
        /* The timer lists. */
@@@ -1149,10 -1150,6 +1150,6 @@@ static int copy_signal(unsigned long cl
        tty_audit_fork(sig);
        sched_autogroup_fork(sig);
  
- #ifdef CONFIG_CGROUPS
-       init_rwsem(&sig->group_rwsem);
- #endif
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  
diff --combined kernel/sched/core.c
index aa5973220ad213a960092012bf4493f296dab90b,2cad9ba9103682a14fc596f8ee813b52a70b928c..4d568ac9319eaf04c9d00673483678bc5e14f22e
@@@ -621,21 -621,18 +621,21 @@@ int get_nohz_timer_target(void
        int i, cpu = smp_processor_id();
        struct sched_domain *sd;
  
 -      if (!idle_cpu(cpu))
 +      if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
                return cpu;
  
        rcu_read_lock();
        for_each_domain(cpu, sd) {
                for_each_cpu(i, sched_domain_span(sd)) {
 -                      if (!idle_cpu(i)) {
 +                      if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
                                cpu = i;
                                goto unlock;
                        }
                }
        }
 +
 +      if (!is_housekeeping_cpu(cpu))
 +              cpu = housekeeping_any_cpu();
  unlock:
        rcu_read_unlock();
        return cpu;
@@@ -817,7 -814,7 +817,7 @@@ static void set_load_weight(struct task
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
 -      if (p->policy == SCHED_IDLE) {
 +      if (idle_policy(p->policy)) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
        load->inv_weight = prio_to_wmult[prio];
  }
  
 -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
 -      sched_info_queued(rq, p);
 +      if (!(flags & ENQUEUE_RESTORE))
 +              sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
  }
  
 -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
 -      sched_info_dequeued(rq, p);
 +      if (!(flags & DEQUEUE_SAVE))
 +              sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
  }
  
@@@ -1180,7 -1175,7 +1180,7 @@@ void do_set_cpus_allowed(struct task_st
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        }
        if (running)
                put_prev_task(rq, p);
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
 -              enqueue_task(rq, p, 0);
 +              enqueue_task(rq, p, ENQUEUE_RESTORE);
  }
  
  /*
@@@ -1294,7 -1289,7 +1294,7 @@@ void set_task_cpu(struct task_struct *p
  
        if (task_cpu(p) != new_cpu) {
                if (p->sched_class->migrate_task_rq)
 -                      p->sched_class->migrate_task_rq(p, new_cpu);
 +                      p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
        }
@@@ -1335,16 -1330,12 +1335,16 @@@ static int migrate_swap_stop(void *data
        struct rq *src_rq, *dst_rq;
        int ret = -EAGAIN;
  
 +      if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
 +              return -EAGAIN;
 +
        src_rq = cpu_rq(arg->src_cpu);
        dst_rq = cpu_rq(arg->dst_cpu);
  
        double_raw_lock(&arg->src_task->pi_lock,
                        &arg->dst_task->pi_lock);
        double_rq_lock(src_rq, dst_rq);
 +
        if (task_cpu(arg->dst_task) != arg->dst_cpu)
                goto unlock;
  
@@@ -1580,15 -1571,13 +1580,15 @@@ static int select_fallback_rq(int cpu, 
                        goto out;
                }
  
 +              /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
 -                      /* No more Mr. Nice Guy. */
 -                      cpuset_cpus_allowed_fallback(p);
 -                      state = possible;
 -                      break;
 -
 +                      if (IS_ENABLED(CONFIG_CPUSETS)) {
 +                              cpuset_cpus_allowed_fallback(p);
 +                              state = possible;
 +                              break;
 +                      }
 +                      /* fall-through */
                case possible:
                        do_set_cpus_allowed(p, cpu_possible_mask);
                        state = fail;
@@@ -1700,7 -1689,7 +1700,7 @@@ ttwu_stat(struct task_struct *p, int cp
  #endif /* CONFIG_SCHEDSTATS */
  }
  
 -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 +static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
  {
        activate_task(rq, p, en_flags);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@@ -2122,17 -2111,23 +2122,17 @@@ static void __sched_fork(unsigned long 
  #endif /* CONFIG_NUMA_BALANCING */
  }
  
 +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
 +
  #ifdef CONFIG_NUMA_BALANCING
 -#ifdef CONFIG_SCHED_DEBUG
 +
  void set_numabalancing_state(bool enabled)
  {
        if (enabled)
 -              sched_feat_set("NUMA");
 +              static_branch_enable(&sched_numa_balancing);
        else
 -              sched_feat_set("NO_NUMA");
 +              static_branch_disable(&sched_numa_balancing);
  }
 -#else
 -__read_mostly bool numabalancing_enabled;
 -
 -void set_numabalancing_state(bool enabled)
 -{
 -      numabalancing_enabled = enabled;
 -}
 -#endif /* CONFIG_SCHED_DEBUG */
  
  #ifdef CONFIG_PROC_SYSCTL
  int sysctl_numa_balancing(struct ctl_table *table, int write,
  {
        struct ctl_table t;
        int err;
 -      int state = numabalancing_enabled;
 +      int state = static_branch_likely(&sched_numa_balancing);
  
        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;
@@@ -2351,8 -2346,6 +2351,8 @@@ void wake_up_new_task(struct task_struc
        struct rq *rq;
  
        raw_spin_lock_irqsave(&p->pi_lock, flags);
 +      /* Initialize new task's runnable average */
 +      init_entity_runnable_average(&p->se);
  #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
  
 -      /* Initialize new task's runnable average */
 -      init_entity_runnable_average(&p->se);
        rq = __task_rq_lock(p);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
 -      if (p->sched_class->task_woken)
 +      if (p->sched_class->task_woken) {
 +              /*
 +               * Nothing relies on rq->lock after this, so its fine to
 +               * drop it.
 +               */
 +              lockdep_unpin_lock(&rq->lock);
                p->sched_class->task_woken(rq, p);
 +              lockdep_pin_lock(&rq->lock);
 +      }
  #endif
        task_rq_unlock(rq, p, &flags);
  }
@@@ -2485,6 -2473,7 +2485,6 @@@ static inline voi
  prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
  {
 -      trace_sched_switch(prev, next);
        sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
@@@ -2518,22 -2507,6 +2518,22 @@@ static struct rq *finish_task_switch(st
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
  
 +      /*
 +       * The previous task will have left us with a preempt_count of 2
 +       * because it left us after:
 +       *
 +       *      schedule()
 +       *        preempt_disable();                    // 1
 +       *        __schedule()
 +       *          raw_spin_lock_irq(&rq->lock)        // 2
 +       *
 +       * Also, see FORK_PREEMPT_COUNT.
 +       */
 +      if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
 +                    "corrupted preempt_count: %s/%d/0x%x\n",
 +                    current->comm, current->pid, preempt_count()))
 +              preempt_count_set(FORK_PREEMPT_COUNT);
 +
        rq->prev_mm = NULL;
  
        /*
         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
         * schedule one last time. The schedule call will never return, and
         * the scheduled task must drop that reference.
 -       * The test for TASK_DEAD must occur while the runqueue locks are
 -       * still held, otherwise prev could be scheduled on another cpu, die
 -       * there before we look at prev->state, and then the reference would
 -       * be dropped twice.
 -       *              Manfred Spraul <manfred@colorfullife.com>
 +       *
 +       * We must observe prev->state before clearing prev->on_cpu (in
 +       * finish_lock_switch), otherwise a concurrent wakeup can get prev
 +       * running on another CPU and we could race with its RUNNING -> DEAD
 +       * transition, resulting in a double drop.
         */
        prev_state = prev->state;
        vtime_task_switch(prev);
@@@ -2618,15 -2591,8 +2618,15 @@@ asmlinkage __visible void schedule_tail
  {
        struct rq *rq;
  
 -      /* finish_task_switch() drops rq->lock and enables preemtion */
 -      preempt_disable();
 +      /*
 +       * New tasks start with FORK_PREEMPT_COUNT, see there and
 +       * finish_task_switch() for details.
 +       *
 +       * finish_task_switch() will drop rq->lock() and lower preempt_count
 +       * and the preempt_enable() will end up enabling preemption (on
 +       * PREEMPT_COUNT kernels).
 +       */
 +
        rq = finish_task_switch(prev);
        balance_callback(rq);
        preempt_enable();
@@@ -2700,20 -2666,13 +2700,20 @@@ unsigned long nr_running(void
  
  /*
   * Check if only the current task is running on the cpu.
 + *
 + * Caution: this function does not check that the caller has disabled
 + * preemption, thus the result might have a time-of-check-to-time-of-use
 + * race.  The caller is responsible to use it correctly, for example:
 + *
 + * - from a non-preemptable section (of course)
 + *
 + * - from a thread that is bound to a single CPU
 + *
 + * - in a loop with very short iterations (e.g. a polling loop)
   */
  bool single_task_running(void)
  {
 -      if (cpu_rq(smp_processor_id())->nr_running == 1)
 -              return true;
 -      else
 -              return false;
 +      return raw_rq()->nr_running == 1;
  }
  EXPORT_SYMBOL(single_task_running);
  
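A hedged sketch of the kind of caller the comment above has in mind, a short preemption-disabled polling check; done() and poll_hardware_once() are made-up placeholders:

  /* Illustrative: keep polling only while nobody else wants this CPU. */
  static void my_poll_loop(void)
  {
          while (!done()) {
                  poll_hardware_once();

                  preempt_disable();
                  if (!single_task_running()) {
                          preempt_enable();
                          cond_resched();
                          continue;
                  }
                  preempt_enable();
          }
  }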
@@@ -2984,13 -2943,15 +2984,13 @@@ static noinline void __schedule_bug(str
  static inline void schedule_debug(struct task_struct *prev)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
 -      BUG_ON(unlikely(task_stack_end_corrupted(prev)));
 +      BUG_ON(task_stack_end_corrupted(prev));
  #endif
 -      /*
 -       * Test if we are atomic. Since do_exit() needs to call into
 -       * schedule() atomically, we ignore that path. Otherwise whine
 -       * if we are scheduling when we should not.
 -       */
 -      if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
 +
 +      if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
 +              preempt_count_set(PREEMPT_DISABLED);
 +      }
        rcu_sleep_check();
  
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@@ -3076,7 -3037,7 +3076,7 @@@ again
   *
   * WARNING: must be called with preemption disabled!
   */
 -static void __sched __schedule(void)
 +static void __sched notrace __schedule(bool preempt)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        rcu_note_context_switch();
        prev = rq->curr;
  
 +      /*
 +       * do_exit() calls schedule() with preemption disabled as an exception;
 +       * however we must fix that up, otherwise the next task will see an
 +       * inconsistent (higher) preempt count.
 +       *
 +       * It also avoids the below schedule_debug() test from complaining
 +       * about this.
 +       */
 +      if (unlikely(prev->state == TASK_DEAD))
 +              preempt_enable_no_resched_notrace();
 +
        schedule_debug(prev);
  
        if (sched_feat(HRTICK))
        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  
        switch_count = &prev->nivcsw;
 -      if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 +      if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
                rq->curr = next;
                ++*switch_count;
  
 +              trace_sched_switch(preempt, prev, next);
                rq = context_switch(rq, prev, next); /* unlocks the rq */
                cpu = cpu_of(rq);
        } else {
@@@ -3182,7 -3131,7 +3182,7 @@@ asmlinkage __visible void __sched sched
        sched_submit_work(tsk);
        do {
                preempt_disable();
 -              __schedule();
 +              __schedule(false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
  }
@@@ -3222,9 -3171,9 +3222,9 @@@ void __sched schedule_preempt_disabled(
  static void __sched notrace preempt_schedule_common(void)
  {
        do {
 -              preempt_active_enter();
 -              __schedule();
 -              preempt_active_exit();
 +              preempt_disable_notrace();
 +              __schedule(true);
 +              preempt_enable_no_resched_notrace();
  
                /*
                 * Check again in case we missed a preemption opportunity
@@@ -3275,17 -3224,24 +3275,17 @@@ asmlinkage __visible void __sched notra
                return;
  
        do {
 -              /*
 -               * Use raw __prempt_count() ops that don't call function.
 -               * We can't call functions before disabling preemption which
 -               * disarm preemption tracing recursions.
 -               */
 -              __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 -              barrier();
 +              preempt_disable_notrace();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
                 * an infinite recursion.
                 */
                prev_ctx = exception_enter();
 -              __schedule();
 +              __schedule(true);
                exception_exit(prev_ctx);
  
 -              barrier();
 -              __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 +              preempt_enable_no_resched_notrace();
        } while (need_resched());
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@@ -3308,11 -3264,11 +3308,11 @@@ asmlinkage __visible void __sched preem
        prev_state = exception_enter();
  
        do {
 -              preempt_active_enter();
 +              preempt_disable();
                local_irq_enable();
 -              __schedule();
 +              __schedule(true);
                local_irq_disable();
 -              preempt_active_exit();
 +              sched_preempt_enable_no_resched();
        } while (need_resched());
  
        exception_exit(prev_state);
@@@ -3340,7 -3296,7 +3340,7 @@@ EXPORT_SYMBOL(default_wake_function)
   */
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
 -      int oldprio, queued, running, enqueue_flag = 0;
 +      int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
        struct rq *rq;
        const struct sched_class *prev_class;
  
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
 -                      enqueue_flag = ENQUEUE_REPLENISH;
 +                      enqueue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
 -                      enqueue_flag = ENQUEUE_HEAD;
 +                      enqueue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@@ -3450,7 -3406,7 +3450,7 @@@ void set_user_nice(struct task_struct *
        }
        queued = task_on_rq_queued(p);
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
  
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
        delta = p->prio - old_prio;
  
        if (queued) {
 -              enqueue_task(rq, p, 0);
 +              enqueue_task(rq, p, ENQUEUE_RESTORE);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@@ -3780,7 -3736,10 +3780,7 @@@ recheck
        } else {
                reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
  
 -              if (policy != SCHED_DEADLINE &&
 -                              policy != SCHED_FIFO && policy != SCHED_RR &&
 -                              policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 -                              policy != SCHED_IDLE)
 +              if (!valid_policy(policy))
                        return -EINVAL;
        }
  
                 * Treat SCHED_IDLE as nice 20. Only allow a switch to
                 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
 -              if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
 +              if (idle_policy(p->policy) && !idle_policy(policy)) {
                        if (!can_nice(p, task_nice(p)))
                                return -EPERM;
                }
@@@ -3961,7 -3920,7 +3961,7 @@@ change
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
 +              int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
 -              enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
 +              if (oldprio <= p->prio)
 +                      enqueue_flags |= ENQUEUE_HEAD;
 +
 +              enqueue_task(rq, p, enqueue_flags);
        }
  
        check_class_changed(rq, p, prev_class, oldprio);
@@@ -4057,7 -4012,6 +4057,7 @@@ int sched_setscheduler_nocheck(struct t
  {
        return _sched_setscheduler(p, policy, param, false);
  }
 +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
  
  static int
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@@ -4970,15 -4924,7 +4970,15 @@@ void init_idle(struct task_struct *idle
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
  
 -      do_set_cpus_allowed(idle, cpumask_of(cpu));
 +#ifdef CONFIG_SMP
 +      /*
 +       * It's possible that init_idle() gets called multiple times on a task;
 +       * in that case do_set_cpus_allowed() will not do the right thing.
 +       *
 +       * And since this is boot we can forgo the serialization.
 +       */
 +      set_cpus_allowed_common(idle, cpumask_of(cpu));
 +#endif
        /*
         * We're having a chicken and egg problem, even though we are
         * holding rq->lock, the cpu isn't yet set to this cpu so the
  
        rq->curr = rq->idle = idle;
        idle->on_rq = TASK_ON_RQ_QUEUED;
 -#if defined(CONFIG_SMP)
 +#ifdef CONFIG_SMP
        idle->on_cpu = 1;
  #endif
        raw_spin_unlock(&rq->lock);
        idle->sched_class = &idle_sched_class;
        ftrace_graph_init_idle_task(idle, cpu);
        vtime_init_idle(idle, cpu);
 -#if defined(CONFIG_SMP)
 +#ifdef CONFIG_SMP
        sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
  #endif
  }
@@@ -5129,7 -5075,7 +5129,7 @@@ void sched_setnuma(struct task_struct *
        running = task_current(rq, p);
  
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
 -              enqueue_task(rq, p, 0);
 +              enqueue_task(rq, p, ENQUEUE_RESTORE);
        task_rq_unlock(rq, p, &flags);
  }
  #endif /* CONFIG_NUMA_BALANCING */
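
Throughout this file the series replaces dequeue_task(rq, p, 0) / enqueue_task(rq, p, 0) pairs with DEQUEUE_SAVE / ENQUEUE_RESTORE so that a scheduling class can tell an attribute change (priority, nice, NUMA node, cgroup) apart from a genuine sleep or wakeup.  A standalone sketch of the idiom follows; the flag values and all toy_* names are invented for illustration.

    /* dequeue_save.c - sketch of the DEQUEUE_SAVE/ENQUEUE_RESTORE idiom. */
    #include <stdio.h>

    enum {
        DEQUEUE_SAVE    = 0x1,      /* values arbitrary for this sketch */
        ENQUEUE_RESTORE = 0x1,
    };

    struct toy_rq   { int nr_running; };
    struct toy_task { int prio; int queued; };

    static void dequeue_task(struct toy_rq *rq, struct toy_task *p, int flags)
    {
        p->queued = 0;
        rq->nr_running--;
        if (flags & DEQUEUE_SAVE)
            printf("dequeue: attribute change, not a real sleep\n");
    }

    static void enqueue_task(struct toy_rq *rq, struct toy_task *p, int flags)
    {
        p->queued = 1;
        rq->nr_running++;
        if (flags & ENQUEUE_RESTORE)
            printf("enqueue: restoring after a change, not a wakeup\n");
    }

    /* The shape shared by rt_mutex_setprio(), set_user_nice(), etc. */
    static void change_prio(struct toy_rq *rq, struct toy_task *p, int prio)
    {
        int queued = p->queued;

        if (queued)
            dequeue_task(rq, p, DEQUEUE_SAVE);
        p->prio = prio;
        if (queued)
            enqueue_task(rq, p, ENQUEUE_RESTORE);
    }

    int main(void)
    {
        struct toy_rq rq = { 1 };
        struct toy_task t = { 120, 1 };

        change_prio(&rq, &t, 100);
        printf("prio=%d queued=%d nr_running=%d\n",
               t.prio, t.queued, rq.nr_running);
        return 0;
    }
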
@@@ -5232,47 -5178,24 +5232,47 @@@ static void migrate_tasks(struct rq *de
                        break;
  
                /*
 -               * Ensure rq->lock covers the entire task selection
 -               * until the migration.
 +               * pick_next_task assumes pinned rq->lock.
                 */
                lockdep_pin_lock(&rq->lock);
                next = pick_next_task(rq, &fake_task);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
  
 +              /*
 +               * The rules for changing task_struct::cpus_allowed are to hold
 +               * both pi_lock and rq->lock, such that holding either
 +               * stabilizes the mask.
 +               *
 +               * Dropping rq->lock is not quite as disastrous as it usually is
 +               * because !cpu_active at this point, which means load-balance
 +               * will not interfere. Also, stop-machine.
 +               */
 +              lockdep_unpin_lock(&rq->lock);
 +              raw_spin_unlock(&rq->lock);
 +              raw_spin_lock(&next->pi_lock);
 +              raw_spin_lock(&rq->lock);
 +
 +              /*
 +               * Since we're inside stop-machine, _nothing_ should have
 +               * changed the task; WARN if weird stuff happened, because in
 +               * that case the above rq->lock drop is a failure too.
 +               */
 +              if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
 +                      raw_spin_unlock(&next->pi_lock);
 +                      continue;
 +              }
 +
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
  
 -              lockdep_unpin_lock(&rq->lock);
                rq = __migrate_task(rq, next, dest_cpu);
                if (rq != dead_rq) {
                        raw_spin_unlock(&rq->lock);
                        rq = dead_rq;
                        raw_spin_lock(&rq->lock);
                }
 +              raw_spin_unlock(&next->pi_lock);
        }
  
        rq->stop = stop;
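
The migrate_tasks() hunk above adds a locking dance: drop rq->lock, take the task's pi_lock, retake rq->lock, then re-check that the task is still queued on this runqueue before migrating it, because anything could have happened while the lock was dropped.  Here is a standalone pthread sketch of that drop / retake-in-order / revalidate pattern; the structures are invented, and pi_lock simply plays the role of the outer lock (compile with -pthread).

    /* relock.c - sketch of "drop, retake in order, revalidate". */
    #include <pthread.h>
    #include <stdio.h>

    struct toy_rq   { pthread_mutex_t lock; int id; };
    struct toy_task { pthread_mutex_t pi_lock; struct toy_rq *rq; int queued; };

    /*
     * Called with rq->lock held; pi_lock nests outside rq->lock, so rq->lock
     * must be dropped before pi_lock can be taken.  Returns 1 with both locks
     * held and the task revalidated, 0 with only rq->lock held.
     */
    static int lock_task_and_rq(struct toy_task *t, struct toy_rq *rq)
    {
        pthread_mutex_unlock(&rq->lock);
        pthread_mutex_lock(&t->pi_lock);
        pthread_mutex_lock(&rq->lock);

        /* The task may have moved or been dequeued while rq->lock was free. */
        if (t->rq != rq || !t->queued) {
            pthread_mutex_unlock(&t->pi_lock);
            return 0;
        }
        return 1;
    }

    int main(void)
    {
        struct toy_rq rq = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct toy_task t = { PTHREAD_MUTEX_INITIALIZER, &rq, 1 };

        pthread_mutex_lock(&rq.lock);
        if (lock_task_and_rq(&t, &rq)) {
            printf("both locks held, task still on rq %d\n", t.rq->id);
            pthread_mutex_unlock(&t.pi_lock);
        }
        pthread_mutex_unlock(&rq.lock);
        return 0;
    }
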
@@@ -5559,27 -5482,21 +5559,27 @@@ static void set_cpu_rq_start_time(void
  static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
  {
 +      int cpu = (long)hcpu;
 +
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_STARTING:
                set_cpu_rq_start_time();
                return NOTIFY_OK;
 +
        case CPU_ONLINE:
                /*
                 * At this point a starting CPU has marked itself as online via
                 * set_cpu_online(). But it might not yet have marked itself
                 * as active, which is essential from here on.
 -               *
 -               * Thus, fall-through and help the starting CPU along.
                 */
 +              set_cpu_active(cpu, true);
 +              stop_machine_unpark(cpu);
 +              return NOTIFY_OK;
 +
        case CPU_DOWN_FAILED:
 -              set_cpu_active((long)hcpu, true);
 +              set_cpu_active(cpu, true);
                return NOTIFY_OK;
 +
        default:
                return NOTIFY_DONE;
        }
@@@ -6511,8 -6428,7 +6511,8 @@@ static struct sched_domain_topology_lev
        { NULL, },
  };
  
 -struct sched_domain_topology_level *sched_domain_topology = default_topology;
 +static struct sched_domain_topology_level *sched_domain_topology =
 +      default_topology;
  
  #define for_each_sd_topology(tl)                      \
        for (tl = sched_domain_topology; tl->mask; tl++)
@@@ -7281,6 -7197,9 +7281,6 @@@ void __init sched_init_smp(void
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  
 -      /* nohz_full won't take effect without isolating the cpus. */
 -      tick_nohz_full_add_cpus_to(cpu_isolated_map);
 -
        sched_init_numa();
  
        /*
@@@ -7513,7 -7432,7 +7513,7 @@@ void __init sched_init(void
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  static inline int preempt_count_equals(int preempt_offset)
  {
 -      int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
 +      int nested = preempt_count() + rcu_preempt_depth();
  
        return (nested == preempt_offset);
  }
@@@ -7760,7 -7679,7 +7760,7 @@@ void sched_move_task(struct task_struc
        queued = task_on_rq_queued(tsk);
  
        if (queued)
 -              dequeue_task(rq, tsk, 0);
 +              dequeue_task(rq, tsk, DEQUEUE_SAVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
  
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
 -              tsk->sched_class->task_move_group(tsk, queued);
 +              tsk->sched_class->task_move_group(tsk);
        else
  #endif
                set_task_rq(tsk, task_cpu(tsk));
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
 -              enqueue_task(rq, tsk, 0);
 +              enqueue_task(rq, tsk, ENQUEUE_RESTORE);
  
        task_rq_unlock(rq, tsk, &flags);
  }
@@@ -8244,13 -8163,6 +8244,6 @@@ static void cpu_cgroup_attach(struct cg
                sched_move_task(task);
  }
  
- static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
-                           struct cgroup_subsys_state *old_css,
-                           struct task_struct *task)
- {
-       sched_move_task(task);
- }
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                struct cftype *cftype, u64 shareval)
@@@ -8582,7 -8494,6 +8575,6 @@@ struct cgroup_subsys cpu_cgrp_subsys = 
        .fork           = cpu_cgroup_fork,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
-       .exit           = cpu_cgroup_exit,
        .legacy_cftypes = cpu_files,
        .early_init     = 1,
  };
diff --combined mm/memcontrol.c
index c57c4423c68837d14816c5ff230435e1567e7c20,0ddd0ff2b52ecb0811f87dff872d9db8b4ce3579..b732edfddb767025185f27c8879903591c2b0c82
@@@ -434,7 -434,7 +434,7 @@@ struct cgroup_subsys_state *mem_cgroup_
  
        memcg = page->mem_cgroup;
  
-       if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+       if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
                memcg = root_mem_cgroup;
  
        rcu_read_unlock();
@@@ -644,14 -644,12 +644,14 @@@ mem_cgroup_largest_soft_limit_node(stru
  }
  
  /*
 + * Return page count for a single (non-recursive) @memcg.
 + *
   * Implementation Note: reading percpu statistics for memcg.
   *
   * Both vmstat[] and percpu_counter have thresholds and do periodic
   * synchronization to implement a "quick" read. There is a trade-off between
   * reading cost and precision of the value. Then, we may have a chance to implement
 - * a periodic synchronizion of counter in memcg's counter.
 + * a periodic synchronization of counter in memcg's counter.
   *
   * But this _read() function is used for user interface now. The user accounts
   * memory usage by memory cgroup and he _always_ requires exact value because
   *
   * If there are kernel-internal actions which can make use of some not-exact
   * value, and reading all cpu values can be a performance bottleneck in some
 - * common workload, threashold and synchonization as vmstat[] should be
 + * common workload, threshold and synchronization as vmstat[] should be
   * implemented.
   */
 -static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 -                               enum mem_cgroup_stat_index idx)
 +static unsigned long
 +mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
  {
        long val = 0;
        int cpu;
  
 +      /* Per-cpu values can be negative, use a signed accumulator */
        for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->count[idx], cpu);
 +      /*
 +       * Summing races with updates, so val may be negative.  Avoid exposing
 +       * transient negative values.
 +       */
 +      if (val < 0)
 +              val = 0;
        return val;
  }
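
mem_cgroup_read_stat() above switches to an unsigned return value: it sums the signed per-cpu counters and clamps a transiently negative total to zero, so callers such as tree_stat() no longer need their own "val < 0" check.  A standalone sketch of that pattern, with the per-cpu array simulated:

    /* percpu_sum.c - signed accumulation of per-cpu deltas, clamped on read. */
    #include <stdio.h>

    #define NR_CPUS 4

    /*
     * A reader racing with updates can see the decrement recorded on one CPU
     * before the matching increment on another, so the raw sum may be
     * transiently negative even though the logical count never is.
     */
    static long percpu_count[NR_CPUS] = { 5, -3, -4, 1 };  /* raw sum: -1 */

    static unsigned long read_stat(void)
    {
        long val = 0;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
            val += percpu_count[cpu];

        /* Avoid exposing a transient negative value to the caller. */
        if (val < 0)
            val = 0;
        return val;
    }

    int main(void)
    {
        printf("exported value: %lu\n", read_stat());
        return 0;
    }
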
  
@@@ -1263,7 -1254,7 +1263,7 @@@ void mem_cgroup_print_oom_info(struct m
                for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                        if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                                continue;
 -                      pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
 +                      pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
                                K(mem_cgroup_read_stat(iter, i)));
                }
  
@@@ -2828,11 -2819,14 +2828,11 @@@ static unsigned long tree_stat(struct m
                               enum mem_cgroup_stat_index idx)
  {
        struct mem_cgroup *iter;
 -      long val = 0;
 +      unsigned long val = 0;
  
 -      /* Per-cpu values can be negative, use a signed accumulator */
        for_each_mem_cgroup_tree(iter, memcg)
                val += mem_cgroup_read_stat(iter, idx);
  
 -      if (val < 0) /* race ? */
 -              val = 0;
        return val;
  }
  
@@@ -2926,7 -2920,7 +2926,7 @@@ static int memcg_activate_kmem(struct m
         * of course permitted.
         */
        mutex_lock(&memcg_create_mutex);
-       if (cgroup_has_tasks(memcg->css.cgroup) ||
+       if (cgroup_is_populated(memcg->css.cgroup) ||
            (memcg->use_hierarchy && memcg_has_children(memcg)))
                err = -EBUSY;
        mutex_unlock(&memcg_create_mutex);
@@@ -3175,7 -3169,7 +3175,7 @@@ static int memcg_stat_show(struct seq_f
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                        continue;
 -              seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
 +              seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
                           mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
        }
  
                           (u64)memsw * PAGE_SIZE);
  
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 -              long long val = 0;
 +              unsigned long long val = 0;
  
                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                        continue;
                for_each_mem_cgroup_tree(mi, memcg)
                        val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
 -              seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
 +              seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
        }
  
        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
@@@ -3387,7 -3381,6 +3387,7 @@@ static int __mem_cgroup_usage_register_
        ret = page_counter_memparse(args, "-1", &threshold);
        if (ret)
                return ret;
 +      threshold <<= PAGE_SHIFT;
  
        mutex_lock(&memcg->thresholds_lock);
  
@@@ -3741,43 -3734,44 +3741,43 @@@ struct wb_domain *mem_cgroup_wb_domain(
  /**
   * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
   * @wb: bdi_writeback in question
 - * @pavail: out parameter for number of available pages
 + * @pfilepages: out parameter for number of file pages
 + * @pheadroom: out parameter for number of allocatable pages according to memcg
   * @pdirty: out parameter for number of dirty pages
   * @pwriteback: out parameter for number of pages under writeback
   *
 - * Determine the numbers of available, dirty, and writeback pages in @wb's
 - * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
 - * more involved.
 + * Determine the numbers of file, headroom, dirty, and writeback pages in
 + * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 + * is a bit more involved.
   *
 - * A memcg's headroom is "min(max, high) - used".  The available memory is
 - * calculated as the lowest headroom of itself and the ancestors plus the
 - * number of pages already being used for file pages.  Note that this
 - * doesn't consider the actual amount of available memory in the system.
 - * The caller should further cap *@pavail accordingly.
 + * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 + * headroom is calculated as the lowest headroom of itself and the
 + * ancestors.  Note that this doesn't consider the actual amount of
 + * available memory in the system.  The caller should further cap
 + * *@pheadroom accordingly.
   */
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback)
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback)
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
 -      unsigned long head_room = PAGE_COUNTER_MAX;
 -      unsigned long file_pages;
  
        *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
  
        /* this should eventually include NR_UNSTABLE_NFS */
        *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
 +      *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 +                                                   (1 << LRU_ACTIVE_FILE));
 +      *pheadroom = PAGE_COUNTER_MAX;
  
 -      file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 -                                                  (1 << LRU_ACTIVE_FILE));
        while ((parent = parent_mem_cgroup(memcg))) {
                unsigned long ceiling = min(memcg->memory.limit, memcg->high);
                unsigned long used = page_counter_read(&memcg->memory);
  
 -              head_room = min(head_room, ceiling - min(ceiling, used));
 +              *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
                memcg = parent;
        }
 -
 -      *pavail = file_pages + head_room;
  }
  
  #else /* CONFIG_CGROUP_WRITEBACK */
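
The rewritten mem_cgroup_wb_stats() above reports file pages and headroom separately instead of a single "available" figure; headroom is the smallest "min(limit, high) - used" along the memcg's ancestry, clamped at zero.  A small worked sketch of that walk, with the memcg chain reduced to an array and all names invented:

    /* headroom.c - hierarchical writeback headroom, as a worked example. */
    #include <stdio.h>

    #define COUNTER_MAX (~0UL)

    struct toy_memcg { unsigned long limit, high, used; };

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    static unsigned long headroom(const struct toy_memcg *chain, int levels)
    {
        unsigned long room = COUNTER_MAX;
        int i;

        for (i = 0; i < levels; i++) {
            unsigned long ceiling = min_ul(chain[i].limit, chain[i].high);
            unsigned long used = chain[i].used;

            /* "ceiling - min(ceiling, used)" clamps at 0 if used > ceiling. */
            room = min_ul(room, ceiling - min_ul(ceiling, used));
        }
        return room;
    }

    int main(void)
    {
        struct toy_memcg chain[] = {
            { .limit = 1024, .high = COUNTER_MAX, .used = 512  },  /* leaf   */
            { .limit = 4096, .high = 2048,        .used = 1920 },  /* parent */
        };

        /* leaf has 512 pages of room, parent only 128, so headroom is 128 */
        printf("headroom: %lu pages\n", headroom(chain, 2));
        return 0;
    }
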
@@@ -4066,8 -4060,7 +4066,7 @@@ static struct cftype mem_cgroup_legacy_
        {
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
-               .flags = CFTYPE_NO_PREFIX,
-               .mode = S_IWUGO,
+               .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
        },
        {
                .name = "swappiness",
@@@ -4185,6 -4178,7 +4184,6 @@@ static struct mem_cgroup *mem_cgroup_al
        if (memcg_wb_domain_init(memcg, GFP_KERNEL))
                goto out_free_stat;
  
 -      spin_lock_init(&memcg->pcp_counter_lock);
        return memcg;
  
  out_free_stat:
@@@ -4834,7 -4828,7 +4833,7 @@@ static int mem_cgroup_can_attach(struc
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *from;
-       struct task_struct *p;
+       struct task_struct *leader, *p;
        struct mm_struct *mm;
        unsigned long move_flags;
        int ret = 0;
        if (!move_flags)
                return 0;
  
-       p = cgroup_taskset_first(tset);
+       /*
+        * Multi-process migrations only happen on the default hierarchy
+        * where charge immigration is not used.  Perform charge
+        * immigration if @tset contains a leader and whine if there are
+        * multiple.
+        */
+       p = NULL;
+       cgroup_taskset_for_each_leader(leader, tset) {
+               WARN_ON_ONCE(p);
+               p = leader;
+       }
+       if (!p)
+               return 0;
        from = mem_cgroup_from_task(p);
  
        VM_BUG_ON(from == memcg);
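
mem_cgroup_can_attach() above now drives charge immigration from the thread-group leader found in the migrating task set, warning if more than one leader shows up and doing nothing if none does.  A standalone sketch of that selection; the task-set representation here is invented.

    /* leader_pick.c - pick the single expected leader out of a migrating set. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct toy_task { int pid; bool group_leader; };

    static const struct toy_task *pick_leader(const struct toy_task *set, int n)
    {
        const struct toy_task *leader = NULL;
        int i;

        for (i = 0; i < n; i++) {
            if (!set[i].group_leader)
                continue;
            /* Two leaders would mean per-process work runs twice; complain. */
            if (leader)
                fprintf(stderr, "warning: multiple leaders in task set\n");
            leader = &set[i];
        }
        return leader;  /* NULL: nothing to do for this migration */
    }

    int main(void)
    {
        struct toy_task set[] = {
            { 100, true  },     /* thread-group leader */
            { 101, false },     /* other threads       */
            { 102, false },
        };
        const struct toy_task *leader = pick_leader(set, 3);

        if (leader)
            printf("charge moving driven by pid %d\n", leader->pid);
        return 0;
    }
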
@@@ -5064,7 -5071,7 +5076,7 @@@ static void mem_cgroup_bind(struct cgro
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
-       if (cgroup_on_dfl(root_css->cgroup))
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                root_mem_cgroup->use_hierarchy = true;
        else
                root_mem_cgroup->use_hierarchy = false;
@@@ -5208,6 -5215,7 +5220,7 @@@ static struct cftype memory_files[] = 
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
+               .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        { }     /* terminate */
diff --combined mm/vmscan.c
index 7f63a9381f71ebbb0c1f9bdda94a913c930280f0,2d978b28a410b25df1acde351630dee387efbbe5..e7057af54b6e267558a99749fac80dc77dd7855f
@@@ -175,7 -175,7 +175,7 @@@ static bool sane_reclaim(struct scan_co
        if (!memcg)
                return true;
  #ifdef CONFIG_CGROUP_WRITEBACK
-       if (cgroup_on_dfl(memcg->css.cgroup))
 -      if (memcg->css.cgroup)
++      if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return true;
  #endif
        return false;