git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge remote-tracking branch 'cgroup/for-next'
author Stephen Rothwell <sfr@canb.auug.org.au>
Thu, 5 Nov 2015 04:03:01 +0000 (15:03 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Thu, 5 Nov 2015 04:03:04 +0000 (15:03 +1100)
12 files changed:
Documentation/cgroups/unified-hierarchy.txt
block/blk-cgroup.c
include/linux/backing-dev.h
include/linux/init_task.h
include/linux/jump_label.h
include/linux/memcontrol.h
include/linux/sched.h
kernel/events/core.c
kernel/fork.c
kernel/sched/core.c
mm/memcontrol.c
mm/vmscan.c

index 5c07337b51c1b345483f4ba2699e6ce8b7ea2616,0cd27a4e0055fb94950a9bb89299a05d315278f3..781b1d475bcfc9666d0d3dad80832c74adb48b22
@@@ -107,12 -107,6 +107,6 @@@ root of unified hierarchy can be bound 
  allows mixing unified hierarchy with the traditional multiple
  hierarchies in a fully backward compatible way.
  
- For development purposes, the following boot parameter makes all
- controllers appear on the unified hierarchy whether supported or
- not.
-  cgroup__DEVEL__legacy_files_on_dfl
  A controller can be moved across hierarchies only after the controller
  is no longer referenced in its current hierarchy.  Because per-cgroup
  controller states are destroyed asynchronously and controllers may
@@@ -341,11 -335,11 +335,11 @@@ is riddled with issues
    unnecessarily complicated and probably done this way because event
    delivery itself was expensive.
  
- Unified hierarchy implements an interface file "cgroup.populated"
- which can be used to monitor whether the cgroup's subhierarchy has
- tasks in it or not.  Its value is 0 if there is no task in the cgroup
- and its descendants; otherwise, 1.  poll and [id]notify events are
- triggered when the value changes.
+ Unified hierarchy implements a "populated" field in the
+ "cgroup.events" interface file which can be used to monitor whether
+ the cgroup's subhierarchy has tasks in it or not.  Its value is 0 if
+ there is no task in the cgroup and its descendants; otherwise, 1.
+ poll and [id]notify events are triggered when the value changes.
  
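As a rough, hypothetical illustration (not part of this patch), a monitor could open "cgroup.events", poll() for the priority notification the kernel raises on changes, and re-read the file; the mount point and cgroup name below are made up:

  /* Sketch: watch the "populated" field of an assumed cgroup's cgroup.events. */
  #include <fcntl.h>
  #include <poll.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          const char *path = "/sys/fs/cgroup/mygroup/cgroup.events"; /* assumed path */
          char buf[256];
          struct pollfd pfd = { .events = POLLPRI };
          ssize_t n;

          pfd.fd = open(path, O_RDONLY);
          if (pfd.fd < 0)
                  return 1;

          for (;;) {
                  lseek(pfd.fd, 0, SEEK_SET);
                  n = read(pfd.fd, buf, sizeof(buf) - 1);
                  if (n < 0)
                          return 1;
                  buf[n] = '\0';
                  printf("%s", buf);        /* e.g. "populated 1" */
                  poll(&pfd, 1, -1);        /* block until the value changes */
          }
  }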
  This is significantly lighter and simpler and trivially allows
  delegating management of subhierarchy - subhierarchy monitoring can
@@@ -374,6 -368,10 +368,10 @@@ supported and the interface files "rele
  
  - The "cgroup.clone_children" file is removed.
  
+ - /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
+   to before exiting.  If the cgroup is removed before the zombie is
+   reaped, " (deleted)" is appended to the path.
  
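For example, with a zombie whose cgroup has since been removed, a read of /proc/<pid>/cgroup might show a line like the following (hierarchy number and path are illustrative only):

  0::/parent/child (deleted)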
  5-3. Controller File Conventions
  
@@@ -435,6 -433,11 +433,11 @@@ may be specified in any order and not a
    the first entry in the file.  Specific entries can use "default" as
    its value to indicate inheritance of the default value.
  
+ - For events which are not very high frequency, an interface file
+   "events" should be created which lists event key-value pairs.
+   Whenever a notifiable event happens, a file modified event should
+   be generated on the file.
  
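A minimal kernel-side sketch of this convention, with made-up "foo" names throughout; the cgroup_file handle and cgroup_file_notify() plumbing mirror what the memory controller's events_file support does later in this diff (registration of foo_files via the cgroup core is omitted):

  #include <linux/cgroup.h>
  #include <linux/seq_file.h>

  /* illustrative controller state */
  struct foo_cgroup {
          struct cgroup_subsys_state css;
          unsigned long nr_max_events;
          struct cgroup_file events_file;         /* handle for "foo.events" */
  };

  static int foo_events_show(struct seq_file *sf, void *v)
  {
          struct foo_cgroup *foo = container_of(seq_css(sf),
                                                struct foo_cgroup, css);

          /* key value pairs, one per line */
          seq_printf(sf, "max %lu\n", foo->nr_max_events);
          return 0;
  }

  static struct cftype foo_files[] = {
          {
                  .name = "events",
                  .file_offset = offsetof(struct foo_cgroup, events_file),
                  .seq_show = foo_events_show,
          },
          { }     /* terminate */
  };

  /* wherever the notifiable event is raised: */
  static void foo_note_max_event(struct foo_cgroup *foo)
  {
          foo->nr_max_events++;
          cgroup_file_notify(&foo->events_file);  /* file modified event */
  }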
  5-4. Per-Controller Changes
  
        ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
        read/write IOs per second.  "max" indicates no limit.  Writing
        to the file follows the same format but the individual
 -      settings may be ommitted or specified in any order.
 +      settings may be omitted or specified in any order.
  
        This file is available only on non-root cgroups.
  
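Though this hunk elides its surrounding context, the settings it describes are written as per-device key=value pairs.  A hedged userspace sketch follows; the file name ("io.max"), the device numbers and the rbps/wbps key names are assumptions, not taken from the hunk:

  /* Sketch: cap reads on device 8:16 to 2 MB/s, leave writes unlimited. */
  #include <stdio.h>

  int main(void)
  {
          FILE *f = fopen("/sys/fs/cgroup/mygroup/io.max", "w");  /* assumed path */

          if (!f)
                  return 1;
          /* keys may be omitted or given in any order; "max" means no limit */
          fprintf(f, "8:16 rbps=2097152 wbps=max\n");
          return fclose(f) ? 1 : 0;
  }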
diff --combined block/blk-cgroup.c
index 55512dd626336eae49b758def08d601bc3515b74,4fa54161d423097da1441a41004ce416bf547280..5bcdfc10c23a6340367c4b9781496a49b5c81efe
@@@ -370,9 -370,6 +370,9 @@@ static void blkg_destroy_all(struct req
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }
 +
 +      q->root_blkg = NULL;
 +      q->root_rl.blkg = NULL;
  }
  
  /*
@@@ -899,6 -896,7 +899,7 @@@ static int blkcg_print_stat(struct seq_
  struct cftype blkcg_files[] = {
        {
                .name = "stat",
+               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = blkcg_print_stat,
        },
        { }     /* terminate */
index c85f74946a8bab65ff3f16cddea6a4446b0a4799,08d9a8eac42c35fb103747ff105091f4af0f27bd..c82794f20110420582d496ae478bc600f9400233
  #include <linux/sched.h>
  #include <linux/blkdev.h>
  #include <linux/writeback.h>
- #include <linux/memcontrol.h>
  #include <linux/blk-cgroup.h>
  #include <linux/backing-dev-defs.h>
  #include <linux/slab.h>
  
  int __must_check bdi_init(struct backing_dev_info *bdi);
 -void bdi_destroy(struct backing_dev_info *bdi);
 +void bdi_exit(struct backing_dev_info *bdi);
  
  __printf(3, 4)
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...);
  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 +void bdi_unregister(struct backing_dev_info *bdi);
 +
  int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 +void bdi_destroy(struct backing_dev_info *bdi);
 +
  void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
                        bool range_cyclic, enum wb_reason reason);
  void wb_start_background_writeback(struct bdi_writeback *wb);
@@@ -267,8 -262,8 +266,8 @@@ static inline bool inode_cgwb_enabled(s
  {
        struct backing_dev_info *bdi = inode_to_bdi(inode);
  
-       return cgroup_on_dfl(mem_cgroup_root_css->cgroup) &&
-               cgroup_on_dfl(blkcg_root_css->cgroup) &&
+       return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+               cgroup_subsys_on_dfl(io_cgrp_subsys) &&
                bdi_cap_account_dirty(bdi) &&
                (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
                (inode->i_sb->s_iflags & SB_I_CGROUPWB);
@@@ -412,6 -407,61 +411,6 @@@ static inline void unlocked_inode_to_wb
        rcu_read_unlock();
  }
  
 -struct wb_iter {
 -      int                     start_memcg_id;
 -      struct radix_tree_iter  tree_iter;
 -      void                    **slot;
 -};
 -
 -static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 -                                                 struct backing_dev_info *bdi)
 -{
 -      struct radix_tree_iter *titer = &iter->tree_iter;
 -
 -      WARN_ON_ONCE(!rcu_read_lock_held());
 -
 -      if (iter->start_memcg_id >= 0) {
 -              iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
 -              iter->start_memcg_id = -1;
 -      } else {
 -              iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
 -      }
 -
 -      if (!iter->slot)
 -              iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
 -      if (iter->slot)
 -              return *iter->slot;
 -      return NULL;
 -}
 -
 -static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
 -                                                 struct backing_dev_info *bdi,
 -                                                 int start_memcg_id)
 -{
 -      iter->start_memcg_id = start_memcg_id;
 -
 -      if (start_memcg_id)
 -              return __wb_iter_next(iter, bdi);
 -      else
 -              return &bdi->wb;
 -}
 -
 -/**
 - * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
 - * @wb_cur: cursor struct bdi_writeback pointer
 - * @bdi: bdi to walk wb's of
 - * @iter: pointer to struct wb_iter to be used as iteration buffer
 - * @start_memcg_id: memcg ID to start iteration from
 - *
 - * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
 - * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
 - * to be used as temp storage during iteration.  rcu_read_lock() must be
 - * held throughout iteration.
 - */
 -#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)            \
 -      for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);      \
 -           (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
 -
  #else /* CONFIG_CGROUP_WRITEBACK */
  
  static inline bool inode_cgwb_enabled(struct inode *inode)
@@@ -471,6 -521,14 +470,6 @@@ static inline void wb_blkcg_offline(str
  {
  }
  
 -struct wb_iter {
 -      int             next_id;
 -};
 -
 -#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)            \
 -      for ((iter)->next_id = (start_blkcg_id);                        \
 -           ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
 -
  static inline int inode_congested(struct inode *inode, int cong_bits)
  {
        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
index 810a34f604247065d78c2820ed34b0bf8d3aabf6,d0b380ee7d67abbd421bf69fdd63ff10b2aa88b1..1c1ff7e4faa4bf158166b789605107f6a65baf44
  extern struct files_struct init_files;
  extern struct fs_struct init_fs;
  
- #ifdef CONFIG_CGROUPS
- #define INIT_GROUP_RWSEM(sig)                                         \
-       .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
- #else
- #define INIT_GROUP_RWSEM(sig)
- #endif
  #ifdef CONFIG_CPUSETS
  #define INIT_CPUSET_SEQ(tsk)                                                  \
        .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
        .rlim           = INIT_RLIMITS,                                 \
        .cputimer       = {                                             \
                .cputime_atomic = INIT_CPUTIME_ATOMIC,                  \
 -              .running        = 0,                                    \
 +              .running        = false,                                \
 +              .checking_timer = false,                                \
        },                                                              \
        INIT_PREV_CPUTIME(sig)                                          \
        .cred_guard_mutex =                                             \
                 __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
-       INIT_GROUP_RWSEM(sig)                                           \
  }
  
  extern struct nsproxy init_nsproxy;
index f1094238ab2a0f0fddeb40e3c7aadde7c2a89015,c9ca050de8466afdd28189f42598620912c4b53d..8dde55974f186bca7c1488866aaacdd805c347b3
@@@ -21,8 -21,8 +21,8 @@@
   *
   * DEFINE_STATIC_KEY_TRUE(key);
   * DEFINE_STATIC_KEY_FALSE(key);
 - * static_key_likely()
 - * statick_key_unlikely()
 + * static_branch_likely()
 + * static_branch_unlikely()
   *
   * Jump labels provide an interface to generate dynamic branches using
   * self-modifying code. Assuming toolchain and architecture support, if we
   * statement, setting the key to true requires us to patch in a jump
   * to the out-of-line of true branch.
   *
 - * In addtion to static_branch_{enable,disable}, we can also reference count
 + * In addition to static_branch_{enable,disable}, we can also reference count
   * the key or branch direction via static_branch_{inc,dec}. Thus,
   * static_branch_inc() can be thought of as a 'make more true' and
 - * static_branch_dec() as a 'make more false'. The inc()/dec()
 - * interface is meant to be used exclusively from the inc()/dec() for a given
 - * key.
 + * static_branch_dec() as a 'make more false'.
   *
   * Since this relies on modifying code, the branch modifying functions
   * must be considered absolute slow paths (machine wide synchronization etc.).
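A brief usage sketch of the renamed helpers described above (kernel-module style, all names invented for illustration):

  #include <linux/jump_label.h>

  /* defaults to false, so the fast path is a fall-through */
  static DEFINE_STATIC_KEY_FALSE(my_feature_key);

  static void my_feature_slow_path(void)
  {
          /* whatever the optional feature does */
  }

  static void hot_path(void)
  {
          if (static_branch_unlikely(&my_feature_key))
                  my_feature_slow_path();   /* patched in only when enabled */
  }

  /* slow-path control, e.g. from a sysctl or module parameter handler */
  static void my_feature_set(bool on)
  {
          if (on)
                  static_branch_enable(&my_feature_key);
          else
                  static_branch_disable(&my_feature_key);
  }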
@@@ -214,11 -216,6 +214,6 @@@ static inline int jump_label_apply_nops
  #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
  #define jump_label_enabled static_key_enabled
  
- static inline bool static_key_enabled(struct static_key *key)
- {
-       return static_key_count(key) > 0;
- }
  static inline void static_key_enable(struct static_key *key)
  {
        int count = static_key_count(key);
@@@ -265,6 -262,17 +260,17 @@@ struct static_key_false 
  #define DEFINE_STATIC_KEY_FALSE(name) \
        struct static_key_false name = STATIC_KEY_FALSE_INIT
  
+ extern bool ____wrong_branch_error(void);
+ #define static_key_enabled(x)                                                 \
+ ({                                                                            \
+       if (!__builtin_types_compatible_p(typeof(*x), struct static_key) &&     \
+           !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
+           !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
+               ____wrong_branch_error();                                       \
+       static_key_count((struct static_key *)x) > 0;                           \
+ })
  #ifdef HAVE_JUMP_LABEL
  
  /*
   * See jump_label_type() / jump_label_init_type().
   */
  
- extern bool ____wrong_branch_error(void);
  #define static_branch_likely(x)                                                       \
  ({                                                                            \
        bool branch;                                                            \
index 3e3318ddfc0e3e09a0e15825f78eb6052d628d78,c83c699a6605b982ca8fbec56f9510d44e2c9cbd..27251ed428f7db8adaf54c58b7f9e41deda9048d
@@@ -213,6 -213,9 +213,9 @@@ struct mem_cgroup 
        /* OOM-Killer disable */
        int             oom_kill_disable;
  
+       /* handle for "memory.events" */
+       struct cgroup_file events_file;
        /* protect arrays of thresholds */
        struct mutex thresholds_lock;
  
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
 -      spinlock_t pcp_counter_lock;
  
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
        struct cg_proto tcp_mem;
@@@ -285,6 -289,7 +288,7 @@@ static inline void mem_cgroup_events(st
                       unsigned int nr)
  {
        this_cpu_add(memcg->stat->events[idx], nr);
+       cgroup_file_notify(&memcg->events_file);
  }
  
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
@@@ -346,9 -351,7 +350,7 @@@ ino_t page_cgroup_ino(struct page *page
  
  static inline bool mem_cgroup_disabled(void)
  {
-       if (memory_cgrp_subsys.disabled)
-               return true;
-       return false;
+       return !cgroup_subsys_enabled(memory_cgrp_subsys);
  }
  
  /*
@@@ -676,9 -679,8 +678,9 @@@ enum 
  
  struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback);
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback);
  
  #else /* CONFIG_CGROUP_WRITEBACK */
  
@@@ -688,8 -690,7 +690,8 @@@ static inline struct wb_domain *mem_cgr
  }
  
  static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 -                                     unsigned long *pavail,
 +                                     unsigned long *pfilepages,
 +                                     unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
  {
diff --combined include/linux/sched.h
index c115d617739d8e6f1f388c13069ad32cfca1bb3b,a4ab9daa387c0bbcaca1923620ceb2ed74bfd84e..4effb1025fbb1555bc9c3ce6f80d98db004271a2
@@@ -599,42 -599,33 +599,42 @@@ struct task_cputime_atomic 
                .sum_exec_runtime = ATOMIC64_INIT(0),           \
        }
  
 -#ifdef CONFIG_PREEMPT_COUNT
 -#define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
 -#else
 -#define PREEMPT_DISABLED      PREEMPT_ENABLED
 -#endif
 +#define PREEMPT_DISABLED      (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 +
 +/*
 + * Disable preemption until the scheduler is running -- use an unconditional
 + * value so that it also works on !PREEMPT_COUNT kernels.
 + *
 + * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 + */
 +#define INIT_PREEMPT_COUNT    PREEMPT_OFFSET
  
  /*
 - * Disable preemption until the scheduler is running.
 - * Reset by start_kernel()->sched_init()->init_idle().
 + * Initial preempt_count value; reflects the preempt_count schedule invariant
 + * which states that during context switches:
   *
 - * We include PREEMPT_ACTIVE to avoid cond_resched() from working
 - * before the scheduler is active -- see should_resched().
 + *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 + *
 + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 + * Note: See finish_task_switch().
   */
 -#define INIT_PREEMPT_COUNT    (PREEMPT_DISABLED + PREEMPT_ACTIVE)
 +#define FORK_PREEMPT_COUNT    (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
   * @cputime_atomic:   atomic thread group interval timers.
 - * @running:          non-zero when there are timers running and
 - *                    @cputime receives updates.
 + * @running:          true when there are timers running and
 + *                    @cputime_atomic receives updates.
 + * @checking_timer:   true when a thread in the group is in the
 + *                    process of checking for thread group timers.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
 -      int running;
 +      bool running;
 +      bool checking_timer;
  };
  
  #include <linux/rwsem.h>
@@@ -771,18 -762,6 +771,6 @@@ struct signal_struct 
        unsigned audit_tty_log_passwd;
        struct tty_audit_buf *tty_audit_buf;
  #endif
- #ifdef CONFIG_CGROUPS
-       /*
-        * group_rwsem prevents new tasks from entering the threadgroup and
-        * member tasks from exiting, more specifically, setting of
-        * PF_EXITING.  fork and exit paths are protected with this rwsem
-        * using threadgroup_change_begin/end().  Users which require
-        * threadgroup to remain stable should use threadgroup_[un]lock()
-        * which also takes care of exec path.  Currently, cgroup is the
-        * only user.
-        */
-       struct rw_semaphore group_rwsem;
- #endif
  
        oom_flags_t oom_flags;
        short oom_score_adj;            /* OOM kill score adjustment */
@@@ -849,7 -828,7 +837,7 @@@ struct user_struct 
        struct hlist_node uidhash_node;
        kuid_t uid;
  
 -#ifdef CONFIG_PERF_EVENTS
 +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
        atomic_long_t locked_vm;
  #endif
  };
@@@ -1148,6 -1127,8 +1136,6 @@@ struct sched_domain_topology_level 
  #endif
  };
  
 -extern struct sched_domain_topology_level *sched_domain_topology;
 -
  extern void set_sched_topology(struct sched_domain_topology_level *tl);
  extern void wake_up_if_idle(int cpu);
  
@@@ -1196,10 -1177,10 +1184,10 @@@ struct load_weight 
  
  /*
   * The load_avg/util_avg accumulates an infinite geometric series.
 - * 1) load_avg factors the amount of time that a sched_entity is
 - * runnable on a rq into its weight. For cfs_rq, it is the aggregated
 - * such weights of all runnable and blocked sched_entities.
 - * 2) util_avg factors frequency scaling into the amount of time
 + * 1) load_avg factors frequency scaling into the amount of time that a
 + * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
 + * aggregated such weights of all runnable and blocked sched_entities.
 + * 2) util_avg factors frequency and cpu scaling into the amount of time
   * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
   * For cfs_rq, it is the aggregated such times of all runnable and
   * blocked sched_entities.
@@@ -1349,12 -1330,10 +1337,12 @@@ struct sched_dl_entity 
  
  union rcu_special {
        struct {
 -              bool blocked;
 -              bool need_qs;
 -      } b;
 -      short s;
 +              u8 blocked;
 +              u8 need_qs;
 +              u8 exp_need_qs;
 +              u8 pad; /* Otherwise the compiler can store garbage here. */
 +      } b; /* Bits. */
 +      u32 s; /* Set of bits. */
  };
  struct rcu_node;
  
diff --combined kernel/events/core.c
index 39db20c6248e47c940bd8721c41ade530e1eb5c9,e9874949c78734d2c662845aedb69d79ccd5b839..1a734e0adfa78259dac6cae57239a2d5f992a9ac
@@@ -196,7 -196,7 +196,7 @@@ static int perf_sample_period_ns __read
  static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
  
 -void update_perf_cpu_limits(void)
 +static void update_perf_cpu_limits(void)
  {
        u64 tmp = perf_sample_period_ns;
  
@@@ -472,7 -472,7 +472,7 @@@ perf_cgroup_set_timestamp(struct task_s
   * mode SWOUT : schedule out everything
   * mode SWIN : schedule in based on cgroup for next
   */
 -void perf_cgroup_switch(struct task_struct *task, int mode)
 +static void perf_cgroup_switch(struct task_struct *task, int mode)
  {
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
@@@ -1243,7 -1243,11 +1243,7 @@@ static inline void perf_event__state_in
                                              PERF_EVENT_STATE_INACTIVE;
  }
  
 -/*
 - * Called at perf_event creation and when events are attached/detached from a
 - * group.
 - */
 -static void perf_event__read_size(struct perf_event *event)
 +static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
  {
        int entry = sizeof(u64); /* value */
        int size = 0;
                entry += sizeof(u64);
  
        if (event->attr.read_format & PERF_FORMAT_GROUP) {
 -              nr += event->group_leader->nr_siblings;
 +              nr += nr_siblings;
                size += sizeof(u64);
        }
  
        event->read_size = size;
  }
  
 -static void perf_event__header_size(struct perf_event *event)
 +static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
  {
        struct perf_sample_data *data;
 -      u64 sample_type = event->attr.sample_type;
        u16 size = 0;
  
 -      perf_event__read_size(event);
 -
        if (sample_type & PERF_SAMPLE_IP)
                size += sizeof(data->ip);
  
        event->header_size = size;
  }
  
 +/*
 + * Called at perf_event creation and when events are attached/detached from a
 + * group.
 + */
 +static void perf_event__header_size(struct perf_event *event)
 +{
 +      __perf_event_read_size(event,
 +                             event->group_leader->nr_siblings);
 +      __perf_event_header_size(event, event->attr.sample_type);
 +}
 +
  static void perf_event__id_header_size(struct perf_event *event)
  {
        struct perf_sample_data *data;
        event->id_header_size = size;
  }
  
 +static bool perf_event_validate_size(struct perf_event *event)
 +{
 +      /*
 +       * The values computed here will be over-written when we actually
 +       * attach the event.
 +       */
 +      __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
 +      __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
 +      perf_event__id_header_size(event);
 +
 +      /*
 +       * Sum the lot; should not exceed the 64k limit we have on records.
 +       * Conservative limit to allow for callchains and other variable fields.
 +       */
 +      if (event->read_size + event->header_size +
 +          event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
 +              return false;
 +
 +      return true;
 +}
 +
  static void perf_group_attach(struct perf_event *event)
  {
        struct perf_event *group_leader = event->group_leader, *pos;
@@@ -1939,7 -1914,7 +1939,7 @@@ group_sched_in(struct perf_event *group
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
  
 -      pmu->start_txn(pmu);
 +      pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
  
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
@@@ -3209,22 -3184,14 +3209,22 @@@ void perf_event_exec(void
        rcu_read_unlock();
  }
  
 +struct perf_read_data {
 +      struct perf_event *event;
 +      bool group;
 +      int ret;
 +};
 +
  /*
   * Cross CPU call to read the hardware event
   */
  static void __perf_event_read(void *info)
  {
 -      struct perf_event *event = info;
 +      struct perf_read_data *data = info;
 +      struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 +      struct pmu *pmu = event->pmu;
  
        /*
         * If this is a task context, we need to check whether it is
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }
 +
        update_event_times(event);
 -      if (event->state == PERF_EVENT_STATE_ACTIVE)
 -              event->pmu->read(event);
 +      if (event->state != PERF_EVENT_STATE_ACTIVE)
 +              goto unlock;
 +
 +      if (!data->group) {
 +              pmu->read(event);
 +              data->ret = 0;
 +              goto unlock;
 +      }
 +
 +      pmu->start_txn(pmu, PERF_PMU_TXN_READ);
 +
 +      pmu->read(event);
 +
 +      list_for_each_entry(sub, &event->sibling_list, group_entry) {
 +              update_event_times(sub);
 +              if (sub->state == PERF_EVENT_STATE_ACTIVE) {
 +                      /*
 +                       * Use sibling's PMU rather than @event's since
 +                       * sibling could be on different (eg: software) PMU.
 +                       */
 +                      sub->pmu->read(sub);
 +              }
 +      }
 +
 +      data->ret = pmu->commit_txn(pmu);
 +
 +unlock:
        raw_spin_unlock(&ctx->lock);
  }
  
@@@ -3334,23 -3275,15 +3334,23 @@@ u64 perf_event_read_local(struct perf_e
        return val;
  }
  
 -static u64 perf_event_read(struct perf_event *event)
 +static int perf_event_read(struct perf_event *event, bool group)
  {
 +      int ret = 0;
 +
        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE) {
 +              struct perf_read_data data = {
 +                      .event = event,
 +                      .group = group,
 +                      .ret = 0,
 +              };
                smp_call_function_single(event->oncpu,
 -                                       __perf_event_read, event, 1);
 +                                       __perf_event_read, &data, 1);
 +              ret = data.ret;
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }
 -              update_event_times(event);
 +              if (group)
 +                      update_group_times(event);
 +              else
 +                      update_event_times(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
  
 -      return perf_event_count(event);
 +      return ret;
  }
  
  /*
@@@ -3814,7 -3744,7 +3814,7 @@@ static void put_event(struct perf_even
         *     see the comment there.
         *
         *  2) there is a lock-inversion with mmap_sem through
 -       *     perf_event_read_group(), which takes faults while
 +       *     perf_read_group(), which takes faults while
         *     holding ctx->mutex, however this is called after
         *     the last filedesc died, so there is no possibility
         *     to trigger the AB-BA case.
@@@ -3888,18 -3818,14 +3888,18 @@@ u64 perf_event_read_value(struct perf_e
        *running = 0;
  
        mutex_lock(&event->child_mutex);
 -      total += perf_event_read(event);
 +
 +      (void)perf_event_read(event, false);
 +      total += perf_event_count(event);
 +
        *enabled += event->total_time_enabled +
                        atomic64_read(&event->child_total_time_enabled);
        *running += event->total_time_running +
                        atomic64_read(&event->child_total_time_running);
  
        list_for_each_entry(child, &event->child_list, child_list) {
 -              total += perf_event_read(child);
 +              (void)perf_event_read(child, false);
 +              total += perf_event_count(child);
                *enabled += child->total_time_enabled;
                *running += child->total_time_running;
        }
  }
  EXPORT_SYMBOL_GPL(perf_event_read_value);
  
 -static int perf_event_read_group(struct perf_event *event,
 -                                 u64 read_format, char __user *buf)
 +static int __perf_read_group_add(struct perf_event *leader,
 +                                      u64 read_format, u64 *values)
  {
 -      struct perf_event *leader = event->group_leader, *sub;
 -      struct perf_event_context *ctx = leader->ctx;
 -      int n = 0, size = 0, ret;
 -      u64 count, enabled, running;
 -      u64 values[5];
 +      struct perf_event *sub;
 +      int n = 1; /* skip @nr */
 +      int ret;
  
 -      lockdep_assert_held(&ctx->mutex);
 +      ret = perf_event_read(leader, true);
 +      if (ret)
 +              return ret;
  
 -      count = perf_event_read_value(leader, &enabled, &running);
 +      /*
 +       * Since we co-schedule groups, {enabled,running} times of siblings
 +       * will be identical to those of the leader, so we only publish one
 +       * set.
 +       */
 +      if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
 +              values[n++] += leader->total_time_enabled +
 +                      atomic64_read(&leader->child_total_time_enabled);
 +      }
  
 -      values[n++] = 1 + leader->nr_siblings;
 -      if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 -              values[n++] = enabled;
 -      if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 -              values[n++] = running;
 -      values[n++] = count;
 +      if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
 +              values[n++] += leader->total_time_running +
 +                      atomic64_read(&leader->child_total_time_running);
 +      }
 +
 +      /*
 +       * Write {count,id} tuples for every sibling.
 +       */
 +      values[n++] += perf_event_count(leader);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
 -      size = n * sizeof(u64);
 +      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +              values[n++] += perf_event_count(sub);
 +              if (read_format & PERF_FORMAT_ID)
 +                      values[n++] = primary_event_id(sub);
 +      }
  
 -      if (copy_to_user(buf, values, size))
 -              return -EFAULT;
 +      return 0;
 +}
  
 -      ret = size;
 +static int perf_read_group(struct perf_event *event,
 +                                 u64 read_format, char __user *buf)
 +{
 +      struct perf_event *leader = event->group_leader, *child;
 +      struct perf_event_context *ctx = leader->ctx;
 +      int ret;
 +      u64 *values;
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 -              n = 0;
 +      lockdep_assert_held(&ctx->mutex);
  
 -              values[n++] = perf_event_read_value(sub, &enabled, &running);
 -              if (read_format & PERF_FORMAT_ID)
 -                      values[n++] = primary_event_id(sub);
 +      values = kzalloc(event->read_size, GFP_KERNEL);
 +      if (!values)
 +              return -ENOMEM;
  
 -              size = n * sizeof(u64);
 +      values[0] = 1 + leader->nr_siblings;
  
 -              if (copy_to_user(buf + ret, values, size)) {
 -                      return -EFAULT;
 -              }
 +      /*
 +       * By locking the child_mutex of the leader we effectively
 +       * lock the child list of all siblings.. XXX explain how.
 +       */
 +      mutex_lock(&leader->child_mutex);
  
 -              ret += size;
 +      ret = __perf_read_group_add(leader, read_format, values);
 +      if (ret)
 +              goto unlock;
 +
 +      list_for_each_entry(child, &leader->child_list, child_list) {
 +              ret = __perf_read_group_add(child, read_format, values);
 +              if (ret)
 +                      goto unlock;
        }
  
 +      mutex_unlock(&leader->child_mutex);
 +
 +      ret = event->read_size;
 +      if (copy_to_user(buf, values, event->read_size))
 +              ret = -EFAULT;
 +      goto out;
 +
 +unlock:
 +      mutex_unlock(&leader->child_mutex);
 +out:
 +      kfree(values);
        return ret;
  }
  
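For orientation, the buffer perf_read_group() fills follows the perf_event_open(2) group read layout; a sketch of that layout, assuming PERF_FORMAT_TOTAL_TIME_{ENABLED,RUNNING} and PERF_FORMAT_ID are all requested:

  /* What userspace sees after read() on a PERF_FORMAT_GROUP leader. */
  #include <linux/types.h>

  struct group_read_format {
          __u64 nr;               /* 1 + nr_siblings                      */
          __u64 time_enabled;     /* PERF_FORMAT_TOTAL_TIME_ENABLED       */
          __u64 time_running;     /* PERF_FORMAT_TOTAL_TIME_RUNNING       */
          struct {
                  __u64 value;    /* count, summed over child events      */
                  __u64 id;       /* PERF_FORMAT_ID                       */
          } values[];             /* leader first, then each sibling      */
  };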
 -static int perf_event_read_one(struct perf_event *event,
 +static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
  {
        u64 enabled, running;
@@@ -4035,7 -3921,7 +4035,7 @@@ static bool is_event_hup(struct perf_ev
   * Read the performance event - simple non blocking version for now
   */
  static ssize_t
 -perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
 +__perf_read(struct perf_event *event, char __user *buf, size_t count)
  {
        u64 read_format = event->attr.read_format;
        int ret;
  
        WARN_ON_ONCE(event->ctx->parent_ctx);
        if (read_format & PERF_FORMAT_GROUP)
 -              ret = perf_event_read_group(event, read_format, buf);
 +              ret = perf_read_group(event, read_format, buf);
        else
 -              ret = perf_event_read_one(event, read_format, buf);
 +              ret = perf_read_one(event, read_format, buf);
  
        return ret;
  }
@@@ -4068,7 -3954,7 +4068,7 @@@ perf_read(struct file *file, char __use
        int ret;
  
        ctx = perf_event_ctx_lock(event);
 -      ret = perf_read_hw(event, buf, count);
 +      ret = __perf_read(event, buf, count);
        perf_event_ctx_unlock(event, ctx);
  
        return ret;
@@@ -4099,7 -3985,7 +4099,7 @@@ static unsigned int perf_poll(struct fi
  
  static void _perf_event_reset(struct perf_event *event)
  {
 -      (void)perf_event_read(event);
 +      (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
  }
@@@ -5375,15 -5261,9 +5375,15 @@@ void perf_output_sample(struct perf_out
  
        if (sample_type & PERF_SAMPLE_RAW) {
                if (data->raw) {
 -                      perf_output_put(handle, data->raw->size);
 -                      __output_copy(handle, data->raw->data,
 -                                         data->raw->size);
 +                      u32 raw_size = data->raw->size;
 +                      u32 real_size = round_up(raw_size + sizeof(u32),
 +                                               sizeof(u64)) - sizeof(u32);
 +                      u64 zero = 0;
 +
 +                      perf_output_put(handle, real_size);
 +                      __output_copy(handle, data->raw->data, raw_size);
 +                      if (real_size - raw_size)
 +                              __output_copy(handle, &zero, real_size - raw_size);
                } else {
                        struct {
                                u32     size;
@@@ -5515,7 -5395,8 +5515,7 @@@ void perf_prepare_sample(struct perf_ev
                else
                        size += sizeof(u32);
  
 -              WARN_ON_ONCE(size & (sizeof(u64)-1));
 -              header->size += size;
 +              header->size += round_up(size, sizeof(u64));
        }
  
        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@@ -7386,49 -7267,24 +7386,49 @@@ static void perf_pmu_nop_void(struct pm
  {
  }
  
 +static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
 +{
 +}
 +
  static int perf_pmu_nop_int(struct pmu *pmu)
  {
        return 0;
  }
  
 -static void perf_pmu_start_txn(struct pmu *pmu)
 +static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
 +
 +static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
  {
 +      __this_cpu_write(nop_txn_flags, flags);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return;
 +
        perf_pmu_disable(pmu);
  }
  
  static int perf_pmu_commit_txn(struct pmu *pmu)
  {
 +      unsigned int flags = __this_cpu_read(nop_txn_flags);
 +
 +      __this_cpu_write(nop_txn_flags, 0);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return 0;
 +
        perf_pmu_enable(pmu);
        return 0;
  }
  
  static void perf_pmu_cancel_txn(struct pmu *pmu)
  {
 +      unsigned int flags =  __this_cpu_read(nop_txn_flags);
 +
 +      __this_cpu_write(nop_txn_flags, 0);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return;
 +
        perf_pmu_enable(pmu);
  }
  
@@@ -7667,7 -7523,7 +7667,7 @@@ got_cpu_context
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
 -                      pmu->start_txn  = perf_pmu_nop_void;
 +                      pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
@@@ -7755,7 -7611,7 +7755,7 @@@ static int perf_try_init_event(struct p
        return ret;
  }
  
 -struct pmu *perf_init_event(struct perf_event *event)
 +static struct pmu *perf_init_event(struct perf_event *event)
  {
        struct pmu *pmu = NULL;
        int idx;
@@@ -8441,35 -8297,13 +8441,35 @@@ SYSCALL_DEFINE5(perf_event_open
  
        if (move_group) {
                gctx = group_leader->ctx;
 +              mutex_lock_double(&gctx->mutex, &ctx->mutex);
 +      } else {
 +              mutex_lock(&ctx->mutex);
 +      }
 +
 +      if (!perf_event_validate_size(event)) {
 +              err = -E2BIG;
 +              goto err_locked;
 +      }
 +
 +      /*
 +       * Must be under the same ctx::mutex as perf_install_in_context(),
 +       * because we need to serialize with concurrent event creation.
 +       */
 +      if (!exclusive_event_installable(event, ctx)) {
 +              /* exclusive and group stuff are assumed mutually exclusive */
 +              WARN_ON_ONCE(move_group);
 +
 +              err = -EBUSY;
 +              goto err_locked;
 +      }
 +
 +      WARN_ON_ONCE(ctx->parent_ctx);
  
 +      if (move_group) {
                /*
                 * See perf_event_ctx_lock() for comments on the details
                 * of swizzling perf_event::ctx.
                 */
 -              mutex_lock_double(&gctx->mutex, &ctx->mutex);
 -
                perf_remove_from_context(group_leader, false);
  
                list_for_each_entry(sibling, &group_leader->sibling_list,
                        perf_remove_from_context(sibling, false);
                        put_ctx(gctx);
                }
 -      } else {
 -              mutex_lock(&ctx->mutex);
 -      }
 -
 -      WARN_ON_ONCE(ctx->parent_ctx);
  
 -      if (move_group) {
                /*
                 * Wait for everybody to stop referencing the events through
                 * the old lists, before installing it on new lists.
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
                get_ctx(ctx);
 -      }
  
 -      if (!exclusive_event_installable(event, ctx)) {
 -              err = -EBUSY;
 -              mutex_unlock(&ctx->mutex);
 -              fput(event_file);
 -              goto err_context;
 +              /*
 +               * Now that all events are installed in @ctx, nothing
 +               * references @gctx anymore, so drop the last reference we have
 +               * on it.
 +               */
 +              put_ctx(gctx);
        }
  
 +      /*
 +       * Precalculate sample_data sizes; do while holding ctx::mutex such
 +       * that we're serialized against further additions and before
 +       * perf_install_in_context() which is the point the event is active and
 +       * can use these values.
 +       */
 +      perf_event__header_size(event);
 +      perf_event__id_header_size(event);
 +
        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
  
 -      if (move_group) {
 +      if (move_group)
                mutex_unlock(&gctx->mutex);
 -              put_ctx(gctx);
 -      }
        mutex_unlock(&ctx->mutex);
  
        put_online_cpus();
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);
  
 -      /*
 -       * Precalculate sample_data sizes
 -       */
 -      perf_event__header_size(event);
 -      perf_event__id_header_size(event);
 -
        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
        fd_install(event_fd, event_file);
        return event_fd;
  
 +err_locked:
 +      if (move_group)
 +              mutex_unlock(&gctx->mutex);
 +      mutex_unlock(&ctx->mutex);
 +/* err_file: */
 +      fput(event_file);
  err_context:
        perf_unpin_context(ctx);
        put_ctx(ctx);
@@@ -9460,17 -9293,9 +9460,9 @@@ static void perf_cgroup_attach(struct c
                task_function_call(task, __perf_cgroup_move, task);
  }
  
- static void perf_cgroup_exit(struct cgroup_subsys_state *css,
-                            struct cgroup_subsys_state *old_css,
-                            struct task_struct *task)
- {
-       task_function_call(task, __perf_cgroup_move, task);
- }
  struct cgroup_subsys perf_event_cgrp_subsys = {
        .css_alloc      = perf_cgroup_css_alloc,
        .css_free       = perf_cgroup_css_free,
-       .exit           = perf_cgroup_exit,
        .attach         = perf_cgroup_attach,
  };
  #endif /* CONFIG_CGROUP_PERF */
diff --combined kernel/fork.c
index 6ac894244d3978fb800f7a1a02912bb2901e5e84,118743bb596498edb919f09b92cbca956f479595..825ecc32454d23f4e60216bedfb2de31fe504699
@@@ -251,6 -251,7 +251,7 @@@ void __put_task_struct(struct task_stru
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);
  
+       cgroup_free(tsk);
        task_numa_free(tsk);
        security_task_free(tsk);
        exit_creds(tsk);
@@@ -1101,7 -1102,7 +1102,7 @@@ static void posix_cpu_timers_init_group
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
        if (cpu_limit != RLIM_INFINITY) {
                sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
 -              sig->cputimer.running = 1;
 +              sig->cputimer.running = true;
        }
  
        /* The timer lists. */
@@@ -1149,10 -1150,6 +1150,6 @@@ static int copy_signal(unsigned long cl
        tty_audit_fork(sig);
        sched_autogroup_fork(sig);
  
- #ifdef CONFIG_CGROUPS
-       init_rwsem(&sig->group_rwsem);
- #endif
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  
diff --combined kernel/sched/core.c
index aa5973220ad213a960092012bf4493f296dab90b,2cad9ba9103682a14fc596f8ee813b52a70b928c..4d568ac9319eaf04c9d00673483678bc5e14f22e
@@@ -621,21 -621,18 +621,21 @@@ int get_nohz_timer_target(void
        int i, cpu = smp_processor_id();
        struct sched_domain *sd;
  
 -      if (!idle_cpu(cpu))
 +      if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
                return cpu;
  
        rcu_read_lock();
        for_each_domain(cpu, sd) {
                for_each_cpu(i, sched_domain_span(sd)) {
 -                      if (!idle_cpu(i)) {
 +                      if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
                                cpu = i;
                                goto unlock;
                        }
                }
        }
 +
 +      if (!is_housekeeping_cpu(cpu))
 +              cpu = housekeeping_any_cpu();
  unlock:
        rcu_read_unlock();
        return cpu;
@@@ -817,7 -814,7 +817,7 @@@ static void set_load_weight(struct task
        /*
         * SCHED_IDLE tasks get minimal weight:
         */
 -      if (p->policy == SCHED_IDLE) {
 +      if (idle_policy(p->policy)) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
        load->inv_weight = prio_to_wmult[prio];
  }
  
 -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
 -      sched_info_queued(rq, p);
 +      if (!(flags & ENQUEUE_RESTORE))
 +              sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
  }
  
 -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        update_rq_clock(rq);
 -      sched_info_dequeued(rq, p);
 +      if (!(flags & DEQUEUE_SAVE))
 +              sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
  }
  
@@@ -1180,7 -1175,7 +1180,7 @@@ void do_set_cpus_allowed(struct task_st
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        }
        if (running)
                put_prev_task(rq, p);
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
 -              enqueue_task(rq, p, 0);
 +              enqueue_task(rq, p, ENQUEUE_RESTORE);
  }
  
  /*
@@@ -1294,7 -1289,7 +1294,7 @@@ void set_task_cpu(struct task_struct *p
  
        if (task_cpu(p) != new_cpu) {
                if (p->sched_class->migrate_task_rq)
 -                      p->sched_class->migrate_task_rq(p, new_cpu);
 +                      p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
        }
@@@ -1335,16 -1330,12 +1335,16 @@@ static int migrate_swap_stop(void *data
        struct rq *src_rq, *dst_rq;
        int ret = -EAGAIN;
  
 +      if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
 +              return -EAGAIN;
 +
        src_rq = cpu_rq(arg->src_cpu);
        dst_rq = cpu_rq(arg->dst_cpu);
  
        double_raw_lock(&arg->src_task->pi_lock,
                        &arg->dst_task->pi_lock);
        double_rq_lock(src_rq, dst_rq);
 +
        if (task_cpu(arg->dst_task) != arg->dst_cpu)
                goto unlock;
  
@@@ -1580,15 -1571,13 +1580,15 @@@ static int select_fallback_rq(int cpu, 
                        goto out;
                }
  
 +              /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
 -                      /* No more Mr. Nice Guy. */
 -                      cpuset_cpus_allowed_fallback(p);
 -                      state = possible;
 -                      break;
 -
 +                      if (IS_ENABLED(CONFIG_CPUSETS)) {
 +                              cpuset_cpus_allowed_fallback(p);
 +                              state = possible;
 +                              break;
 +                      }
 +                      /* fall-through */
                case possible:
                        do_set_cpus_allowed(p, cpu_possible_mask);
                        state = fail;
@@@ -1700,7 -1689,7 +1700,7 @@@ ttwu_stat(struct task_struct *p, int cp
  #endif /* CONFIG_SCHEDSTATS */
  }
  
 -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 +static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
  {
        activate_task(rq, p, en_flags);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@@ -2122,17 -2111,23 +2122,17 @@@ static void __sched_fork(unsigned long 
  #endif /* CONFIG_NUMA_BALANCING */
  }
  
 +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
 +
  #ifdef CONFIG_NUMA_BALANCING
 -#ifdef CONFIG_SCHED_DEBUG
 +
  void set_numabalancing_state(bool enabled)
  {
        if (enabled)
 -              sched_feat_set("NUMA");
 +              static_branch_enable(&sched_numa_balancing);
        else
 -              sched_feat_set("NO_NUMA");
 +              static_branch_disable(&sched_numa_balancing);
  }
 -#else
 -__read_mostly bool numabalancing_enabled;
 -
 -void set_numabalancing_state(bool enabled)
 -{
 -      numabalancing_enabled = enabled;
 -}
 -#endif /* CONFIG_SCHED_DEBUG */
  
  #ifdef CONFIG_PROC_SYSCTL
  int sysctl_numa_balancing(struct ctl_table *table, int write,
  {
        struct ctl_table t;
        int err;
 -      int state = numabalancing_enabled;
 +      int state = static_branch_likely(&sched_numa_balancing);
  
        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;
@@@ -2351,8 -2346,6 +2351,8 @@@ void wake_up_new_task(struct task_struc
        struct rq *rq;
  
        raw_spin_lock_irqsave(&p->pi_lock, flags);
 +      /* Initialize new task's runnable average */
 +      init_entity_runnable_average(&p->se);
  #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
        set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
  
 -      /* Initialize new task's runnable average */
 -      init_entity_runnable_average(&p->se);
        rq = __task_rq_lock(p);
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
 -      if (p->sched_class->task_woken)
 +      if (p->sched_class->task_woken) {
 +              /*
 +               * Nothing relies on rq->lock after this, so its fine to
 +               * drop it.
 +               */
 +              lockdep_unpin_lock(&rq->lock);
                p->sched_class->task_woken(rq, p);
 +              lockdep_pin_lock(&rq->lock);
 +      }
  #endif
        task_rq_unlock(rq, p, &flags);
  }
@@@ -2485,6 -2473,7 +2485,6 @@@ static inline voi
  prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
  {
 -      trace_sched_switch(prev, next);
        sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
@@@ -2518,22 -2507,6 +2518,22 @@@ static struct rq *finish_task_switch(st
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
  
 +      /*
 +       * The previous task will have left us with a preempt_count of 2
 +       * because it left us after:
 +       *
 +       *      schedule()
 +       *        preempt_disable();                    // 1
 +       *        __schedule()
 +       *          raw_spin_lock_irq(&rq->lock)        // 2
 +       *
 +       * Also, see FORK_PREEMPT_COUNT.
 +       */
 +      if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
 +                    "corrupted preempt_count: %s/%d/0x%x\n",
 +                    current->comm, current->pid, preempt_count()))
 +              preempt_count_set(FORK_PREEMPT_COUNT);
 +
        rq->prev_mm = NULL;
  
        /*
         * If a task dies, then it sets TASK_DEAD in tsk->state and calls
         * schedule one last time. The schedule call will never return, and
         * the scheduled task must drop that reference.
 -       * The test for TASK_DEAD must occur while the runqueue locks are
 -       * still held, otherwise prev could be scheduled on another cpu, die
 -       * there before we look at prev->state, and then the reference would
 -       * be dropped twice.
 -       *              Manfred Spraul <manfred@colorfullife.com>
 +       *
 +       * We must observe prev->state before clearing prev->on_cpu (in
 +       * finish_lock_switch), otherwise a concurrent wakeup can get prev
 +       * running on another CPU and we could race with its RUNNING -> DEAD
 +       * transition, resulting in a double drop.
         */
        prev_state = prev->state;
        vtime_task_switch(prev);
@@@ -2618,15 -2591,8 +2618,15 @@@ asmlinkage __visible void schedule_tail
  {
        struct rq *rq;
  
 -      /* finish_task_switch() drops rq->lock and enables preemtion */
 -      preempt_disable();
 +      /*
 +       * New tasks start with FORK_PREEMPT_COUNT, see there and
 +       * finish_task_switch() for details.
 +       *
 +       * finish_task_switch() will drop rq->lock() and lower preempt_count
 +       * and the preempt_enable() will end up enabling preemption (on
 +       * PREEMPT_COUNT kernels).
 +       */
 +
        rq = finish_task_switch(prev);
        balance_callback(rq);
        preempt_enable();
@@@ -2700,20 -2666,13 +2700,20 @@@ unsigned long nr_running(void
  
  /*
   * Check if only the current task is running on the cpu.
 + *
 + * Caution: this function does not check that the caller has disabled
 + * preemption, thus the result might have a time-of-check-to-time-of-use
 + * race.  The caller is responsible to use it correctly, for example:
 + *
 + * - from a non-preemptable section (of course)
 + *
 + * - from a thread that is bound to a single CPU
 + *
 + * - in a loop with very short iterations (e.g. a polling loop)
   */
  bool single_task_running(void)
  {
 -      if (cpu_rq(smp_processor_id())->nr_running == 1)
 -              return true;
 -      else
 -              return false;
 +      return raw_rq()->nr_running == 1;
  }
  EXPORT_SYMBOL(single_task_running);
  
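A hedged sketch of the kind of caller the comment above has in mind, a short preemption-disabled polling check; done() and poll_hardware_once() are made-up placeholders:

  /* Illustrative: keep polling only while nobody else wants this CPU. */
  static void my_poll_loop(void)
  {
          while (!done()) {
                  poll_hardware_once();

                  preempt_disable();
                  if (!single_task_running()) {
                          preempt_enable();
                          cond_resched();
                          continue;
                  }
                  preempt_enable();
          }
  }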
@@@ -2984,13 -2943,15 +2984,13 @@@ static noinline void __schedule_bug(str
  static inline void schedule_debug(struct task_struct *prev)
  {
  #ifdef CONFIG_SCHED_STACK_END_CHECK
 -      BUG_ON(unlikely(task_stack_end_corrupted(prev)));
 +      BUG_ON(task_stack_end_corrupted(prev));
  #endif
 -      /*
 -       * Test if we are atomic. Since do_exit() needs to call into
 -       * schedule() atomically, we ignore that path. Otherwise whine
 -       * if we are scheduling when we should not.
 -       */
 -      if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
 +
 +      if (unlikely(in_atomic_preempt_off())) {
                __schedule_bug(prev);
 +              preempt_count_set(PREEMPT_DISABLED);
 +      }
        rcu_sleep_check();
  
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@@ -3076,7 -3037,7 +3076,7 @@@ again
   *
   * WARNING: must be called with preemption disabled!
   */
 -static void __sched __schedule(void)
 +static void __sched notrace __schedule(bool preempt)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        rcu_note_context_switch();
        prev = rq->curr;
  
 +      /*
 +       * do_exit() calls schedule() with preemption disabled as an exception;
 +       * however we must fix that up, otherwise the next task will see an
 +       * inconsistent (higher) preempt count.
 +       *
 +       * It also avoids the below schedule_debug() test from complaining
 +       * about this.
 +       */
 +      if (unlikely(prev->state == TASK_DEAD))
 +              preempt_enable_no_resched_notrace();
 +
        schedule_debug(prev);
  
        if (sched_feat(HRTICK))
        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  
        switch_count = &prev->nivcsw;
 -      if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 +      if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
                rq->curr = next;
                ++*switch_count;
  
 +              trace_sched_switch(preempt, prev, next);
                rq = context_switch(rq, prev, next); /* unlocks the rq */
                cpu = cpu_of(rq);
        } else {
@@@ -3182,7 -3131,7 +3182,7 @@@ asmlinkage __visible void __sched sched
        sched_submit_work(tsk);
        do {
                preempt_disable();
 -              __schedule();
 +              __schedule(false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
  }
@@@ -3222,9 -3171,9 +3222,9 @@@ void __sched schedule_preempt_disabled(
  static void __sched notrace preempt_schedule_common(void)
  {
        do {
 -              preempt_active_enter();
 -              __schedule();
 -              preempt_active_exit();
 +              preempt_disable_notrace();
 +              __schedule(true);
 +              preempt_enable_no_resched_notrace();
  
                /*
                 * Check again in case we missed a preemption opportunity
@@@ -3275,17 -3224,24 +3275,17 @@@ asmlinkage __visible void __sched notra
                return;
  
        do {
 -              /*
 -               * Use raw __prempt_count() ops that don't call function.
 -               * We can't call functions before disabling preemption which
 -               * disarm preemption tracing recursions.
 -               */
 -              __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 -              barrier();
 +              preempt_disable_notrace();
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
                 * an infinite recursion.
                 */
                prev_ctx = exception_enter();
 -              __schedule();
 +              __schedule(true);
                exception_exit(prev_ctx);
  
 -              barrier();
 -              __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 +              preempt_enable_no_resched_notrace();
        } while (need_resched());
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@@ -3308,11 -3264,11 +3308,11 @@@ asmlinkage __visible void __sched preem
        prev_state = exception_enter();
  
        do {
 -              preempt_active_enter();
 +              preempt_disable();
                local_irq_enable();
 -              __schedule();
 +              __schedule(true);
                local_irq_disable();
 -              preempt_active_exit();
 +              sched_preempt_enable_no_resched();
        } while (need_resched());
  
        exception_exit(prev_state);
@@@ -3340,7 -3296,7 +3340,7 @@@ EXPORT_SYMBOL(default_wake_function)
   */
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
 -      int oldprio, queued, running, enqueue_flag = 0;
 +      int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
        struct rq *rq;
        const struct sched_class *prev_class;
  
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
 -                      enqueue_flag = ENQUEUE_REPLENISH;
 +                      enqueue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
 -                      enqueue_flag = ENQUEUE_HEAD;
 +                      enqueue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@@ -3450,7 -3406,7 +3450,7 @@@ void set_user_nice(struct task_struct *
        }
        queued = task_on_rq_queued(p);
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
  
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
        delta = p->prio - old_prio;
  
        if (queued) {
 -              enqueue_task(rq, p, 0);
 +              enqueue_task(rq, p, ENQUEUE_RESTORE);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@@ -3780,7 -3736,10 +3780,7 @@@ recheck
        } else {
                reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
  
 -              if (policy != SCHED_DEADLINE &&
 -                              policy != SCHED_FIFO && policy != SCHED_RR &&
 -                              policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 -                              policy != SCHED_IDLE)
 +              if (!valid_policy(policy))
                        return -EINVAL;
        }
  
                 * Treat SCHED_IDLE as nice 20. Only allow a switch to
                 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                 */
 -              if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
 +              if (idle_policy(p->policy) && !idle_policy(policy)) {
                        if (!can_nice(p, task_nice(p)))
                                return -EPERM;
                }
@@@ -3961,7 -3920,7 +3961,7 @@@ change
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
 +              int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
 -              enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
 +              if (oldprio <= p->prio)
 +                      enqueue_flags |= ENQUEUE_HEAD;
 +
 +              enqueue_task(rq, p, enqueue_flags);
        }
  
        check_class_changed(rq, p, prev_class, oldprio);
@@@ -4057,7 -4012,6 +4057,7 @@@ int sched_setscheduler_nocheck(struct t
  {
        return _sched_setscheduler(p, policy, param, false);
  }
 +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
  
  static int
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@@ -4970,15 -4924,7 +4970,15 @@@ void init_idle(struct task_struct *idle
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
  
 -      do_set_cpus_allowed(idle, cpumask_of(cpu));
 +#ifdef CONFIG_SMP
 +      /*
 +       * It's possible that init_idle() gets called multiple times on a task;
 +       * in that case do_set_cpus_allowed() will not do the right thing.
 +       *
 +       * And since this is boot we can forgo the serialization.
 +       */
 +      set_cpus_allowed_common(idle, cpumask_of(cpu));
 +#endif
        /*
         * We're having a chicken and egg problem, even though we are
         * holding rq->lock, the cpu isn't yet set to this cpu so the
  
        rq->curr = rq->idle = idle;
        idle->on_rq = TASK_ON_RQ_QUEUED;
 -#if defined(CONFIG_SMP)
 +#ifdef CONFIG_SMP
        idle->on_cpu = 1;
  #endif
        raw_spin_unlock(&rq->lock);
        idle->sched_class = &idle_sched_class;
        ftrace_graph_init_idle_task(idle, cpu);
        vtime_init_idle(idle, cpu);
 -#if defined(CONFIG_SMP)
 +#ifdef CONFIG_SMP
        sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
  #endif
  }
@@@ -5129,7 -5075,7 +5129,7 @@@ void sched_setnuma(struct task_struct *
        running = task_current(rq, p);
  
        if (queued)
 -              dequeue_task(rq, p, 0);
 +              dequeue_task(rq, p, DEQUEUE_SAVE);
        if (running)
                put_prev_task(rq, p);
  
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
 -              enqueue_task(rq, p, 0);
 +              enqueue_task(rq, p, ENQUEUE_RESTORE);
        task_rq_unlock(rq, p, &flags);
  }
  #endif /* CONFIG_NUMA_BALANCING */
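
Throughout this file the series replaces dequeue_task(rq, p, 0) / enqueue_task(rq, p, 0) pairs with DEQUEUE_SAVE / ENQUEUE_RESTORE so that a scheduling class can tell an attribute change (priority, nice, NUMA node, cgroup) apart from a genuine sleep or wakeup.  A standalone sketch of the idiom follows; the flag values and all toy_* names are invented for illustration.

    /* dequeue_save.c - sketch of the DEQUEUE_SAVE/ENQUEUE_RESTORE idiom. */
    #include <stdio.h>

    enum {
        DEQUEUE_SAVE    = 0x1,      /* values arbitrary for this sketch */
        ENQUEUE_RESTORE = 0x1,
    };

    struct toy_rq   { int nr_running; };
    struct toy_task { int prio; int queued; };

    static void dequeue_task(struct toy_rq *rq, struct toy_task *p, int flags)
    {
        p->queued = 0;
        rq->nr_running--;
        if (flags & DEQUEUE_SAVE)
            printf("dequeue: attribute change, not a real sleep\n");
    }

    static void enqueue_task(struct toy_rq *rq, struct toy_task *p, int flags)
    {
        p->queued = 1;
        rq->nr_running++;
        if (flags & ENQUEUE_RESTORE)
            printf("enqueue: restoring after a change, not a wakeup\n");
    }

    /* The shape shared by rt_mutex_setprio(), set_user_nice(), etc. */
    static void change_prio(struct toy_rq *rq, struct toy_task *p, int prio)
    {
        int queued = p->queued;

        if (queued)
            dequeue_task(rq, p, DEQUEUE_SAVE);
        p->prio = prio;
        if (queued)
            enqueue_task(rq, p, ENQUEUE_RESTORE);
    }

    int main(void)
    {
        struct toy_rq rq = { 1 };
        struct toy_task t = { 120, 1 };

        change_prio(&rq, &t, 100);
        printf("prio=%d queued=%d nr_running=%d\n",
               t.prio, t.queued, rq.nr_running);
        return 0;
    }
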
@@@ -5232,47 -5178,24 +5232,47 @@@ static void migrate_tasks(struct rq *de
                        break;
  
                /*
 -               * Ensure rq->lock covers the entire task selection
 -               * until the migration.
 +               * pick_next_task assumes pinned rq->lock.
                 */
                lockdep_pin_lock(&rq->lock);
                next = pick_next_task(rq, &fake_task);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
  
 +              /*
 +               * The rules for changing task_struct::cpus_allowed are to hold
 +               * both pi_lock and rq->lock, such that holding either
 +               * stabilizes the mask.
 +               *
 +               * Dropping rq->lock is not quite as disastrous as it usually is
 +               * because !cpu_active at this point, which means load-balance
 +               * will not interfere. Also, stop-machine.
 +               */
 +              lockdep_unpin_lock(&rq->lock);
 +              raw_spin_unlock(&rq->lock);
 +              raw_spin_lock(&next->pi_lock);
 +              raw_spin_lock(&rq->lock);
 +
 +              /*
 +               * Since we're inside stop-machine, _nothing_ should have
 +               * changed the task; WARN if weird stuff happened, because in
 +               * that case the above rq->lock drop is a failure too.
 +               */
 +              if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
 +                      raw_spin_unlock(&next->pi_lock);
 +                      continue;
 +              }
 +
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
  
 -              lockdep_unpin_lock(&rq->lock);
                rq = __migrate_task(rq, next, dest_cpu);
                if (rq != dead_rq) {
                        raw_spin_unlock(&rq->lock);
                        rq = dead_rq;
                        raw_spin_lock(&rq->lock);
                }
 +              raw_spin_unlock(&next->pi_lock);
        }
  
        rq->stop = stop;
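
The migrate_tasks() hunk above adds a locking dance: drop rq->lock, take the task's pi_lock, retake rq->lock, then re-check that the task is still queued on this runqueue before migrating it, because anything could have happened while the lock was dropped.  Here is a standalone pthread sketch of that drop / retake-in-order / revalidate pattern; the structures are invented, and pi_lock simply plays the role of the outer lock (compile with -pthread).

    /* relock.c - sketch of "drop, retake in order, revalidate". */
    #include <pthread.h>
    #include <stdio.h>

    struct toy_rq   { pthread_mutex_t lock; int id; };
    struct toy_task { pthread_mutex_t pi_lock; struct toy_rq *rq; int queued; };

    /*
     * Called with rq->lock held; pi_lock nests outside rq->lock, so rq->lock
     * must be dropped before pi_lock can be taken.  Returns 1 with both locks
     * held and the task revalidated, 0 with only rq->lock held.
     */
    static int lock_task_and_rq(struct toy_task *t, struct toy_rq *rq)
    {
        pthread_mutex_unlock(&rq->lock);
        pthread_mutex_lock(&t->pi_lock);
        pthread_mutex_lock(&rq->lock);

        /* The task may have moved or been dequeued while rq->lock was free. */
        if (t->rq != rq || !t->queued) {
            pthread_mutex_unlock(&t->pi_lock);
            return 0;
        }
        return 1;
    }

    int main(void)
    {
        struct toy_rq rq = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct toy_task t = { PTHREAD_MUTEX_INITIALIZER, &rq, 1 };

        pthread_mutex_lock(&rq.lock);
        if (lock_task_and_rq(&t, &rq)) {
            printf("both locks held, task still on rq %d\n", t.rq->id);
            pthread_mutex_unlock(&t.pi_lock);
        }
        pthread_mutex_unlock(&rq.lock);
        return 0;
    }
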
@@@ -5559,27 -5482,21 +5559,27 @@@ static void set_cpu_rq_start_time(void
  static int sched_cpu_active(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
  {
 +      int cpu = (long)hcpu;
 +
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_STARTING:
                set_cpu_rq_start_time();
                return NOTIFY_OK;
 +
        case CPU_ONLINE:
                /*
                 * At this point a starting CPU has marked itself as online via
                 * set_cpu_online(). But it might not yet have marked itself
                 * as active, which is essential from here on.
 -               *
 -               * Thus, fall-through and help the starting CPU along.
                 */
 +              set_cpu_active(cpu, true);
 +              stop_machine_unpark(cpu);
 +              return NOTIFY_OK;
 +
        case CPU_DOWN_FAILED:
 -              set_cpu_active((long)hcpu, true);
 +              set_cpu_active(cpu, true);
                return NOTIFY_OK;
 +
        default:
                return NOTIFY_DONE;
        }
@@@ -6511,8 -6428,7 +6511,8 @@@ static struct sched_domain_topology_lev
        { NULL, },
  };
  
 -struct sched_domain_topology_level *sched_domain_topology = default_topology;
 +static struct sched_domain_topology_level *sched_domain_topology =
 +      default_topology;
  
  #define for_each_sd_topology(tl)                      \
        for (tl = sched_domain_topology; tl->mask; tl++)
@@@ -7281,6 -7197,9 +7281,6 @@@ void __init sched_init_smp(void
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  
 -      /* nohz_full won't take effect without isolating the cpus. */
 -      tick_nohz_full_add_cpus_to(cpu_isolated_map);
 -
        sched_init_numa();
  
        /*
@@@ -7513,7 -7432,7 +7513,7 @@@ void __init sched_init(void
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  static inline int preempt_count_equals(int preempt_offset)
  {
 -      int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
 +      int nested = preempt_count() + rcu_preempt_depth();
  
        return (nested == preempt_offset);
  }
@@@ -7760,7 -7679,7 +7760,7 @@@ void sched_move_task(struct task_struc
        queued = task_on_rq_queued(tsk);
  
        if (queued)
 -              dequeue_task(rq, tsk, 0);
 +              dequeue_task(rq, tsk, DEQUEUE_SAVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
  
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
 -              tsk->sched_class->task_move_group(tsk, queued);
 +              tsk->sched_class->task_move_group(tsk);
        else
  #endif
                set_task_rq(tsk, task_cpu(tsk));
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
 -              enqueue_task(rq, tsk, 0);
 +              enqueue_task(rq, tsk, ENQUEUE_RESTORE);
  
        task_rq_unlock(rq, tsk, &flags);
  }
@@@ -8244,13 -8163,6 +8244,6 @@@ static void cpu_cgroup_attach(struct cg
                sched_move_task(task);
  }
  
- static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
-                           struct cgroup_subsys_state *old_css,
-                           struct task_struct *task)
- {
-       sched_move_task(task);
- }
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                struct cftype *cftype, u64 shareval)
@@@ -8582,7 -8494,6 +8575,6 @@@ struct cgroup_subsys cpu_cgrp_subsys = 
        .fork           = cpu_cgroup_fork,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
-       .exit           = cpu_cgroup_exit,
        .legacy_cftypes = cpu_files,
        .early_init     = 1,
  };
diff --combined mm/memcontrol.c
index c57c4423c68837d14816c5ff230435e1567e7c20,0ddd0ff2b52ecb0811f87dff872d9db8b4ce3579..b732edfddb767025185f27c8879903591c2b0c82
@@@ -434,7 -434,7 +434,7 @@@ struct cgroup_subsys_state *mem_cgroup_
  
        memcg = page->mem_cgroup;
  
-       if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+       if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
                memcg = root_mem_cgroup;
  
        rcu_read_unlock();
@@@ -644,14 -644,12 +644,14 @@@ mem_cgroup_largest_soft_limit_node(stru
  }
  
  /*
 + * Return page count for a single (non-recursive) @memcg.
 + *
   * Implementation Note: reading percpu statistics for memcg.
   *
   * Both vmstat[] and percpu_counter have thresholds and do periodic
   * synchronization to implement a "quick" read. There is a trade-off between
   * reading cost and precision of the value. Then, we may have a chance to implement
 - * a periodic synchronizion of counter in memcg's counter.
 + * a periodic synchronization of counter in memcg's counter.
   *
   * But this _read() function is used for user interface now. The user accounts
   * memory usage by memory cgroup and he _always_ requires exact value because
   *
   * If there are kernel-internal actions which can make use of some not-exact
   * value, and reading all cpu values can be a performance bottleneck in some
 - * common workload, threashold and synchonization as vmstat[] should be
 + * common workload, threshold and synchronization as vmstat[] should be
   * implemented.
   */
 -static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 -                               enum mem_cgroup_stat_index idx)
 +static unsigned long
 +mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
  {
        long val = 0;
        int cpu;
  
 +      /* Per-cpu values can be negative, use a signed accumulator */
        for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->count[idx], cpu);
 +      /*
 +       * Summing races with updates, so val may be negative.  Avoid exposing
 +       * transient negative values.
 +       */
 +      if (val < 0)
 +              val = 0;
        return val;
  }
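
mem_cgroup_read_stat() above switches to an unsigned return value: it sums the signed per-cpu counters and clamps a transiently negative total to zero, so callers such as tree_stat() no longer need their own "val < 0" check.  A standalone sketch of that pattern, with the per-cpu array simulated:

    /* percpu_sum.c - signed accumulation of per-cpu deltas, clamped on read. */
    #include <stdio.h>

    #define NR_CPUS 4

    /*
     * A reader racing with updates can see the decrement recorded on one CPU
     * before the matching increment on another, so the raw sum may be
     * transiently negative even though the logical count never is.
     */
    static long percpu_count[NR_CPUS] = { 5, -3, -4, 1 };  /* raw sum: -1 */

    static unsigned long read_stat(void)
    {
        long val = 0;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
            val += percpu_count[cpu];

        /* Avoid exposing a transient negative value to the caller. */
        if (val < 0)
            val = 0;
        return val;
    }

    int main(void)
    {
        printf("exported value: %lu\n", read_stat());
        return 0;
    }
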
  
@@@ -1263,7 -1254,7 +1263,7 @@@ void mem_cgroup_print_oom_info(struct m
                for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                        if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                                continue;
 -                      pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
 +                      pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
                                K(mem_cgroup_read_stat(iter, i)));
                }
  
@@@ -2828,11 -2819,14 +2828,11 @@@ static unsigned long tree_stat(struct m
                               enum mem_cgroup_stat_index idx)
  {
        struct mem_cgroup *iter;
 -      long val = 0;
 +      unsigned long val = 0;
  
 -      /* Per-cpu values can be negative, use a signed accumulator */
        for_each_mem_cgroup_tree(iter, memcg)
                val += mem_cgroup_read_stat(iter, idx);
  
 -      if (val < 0) /* race ? */
 -              val = 0;
        return val;
  }
  
@@@ -2926,7 -2920,7 +2926,7 @@@ static int memcg_activate_kmem(struct m
         * of course permitted.
         */
        mutex_lock(&memcg_create_mutex);
-       if (cgroup_has_tasks(memcg->css.cgroup) ||
+       if (cgroup_is_populated(memcg->css.cgroup) ||
            (memcg->use_hierarchy && memcg_has_children(memcg)))
                err = -EBUSY;
        mutex_unlock(&memcg_create_mutex);
@@@ -3175,7 -3169,7 +3175,7 @@@ static int memcg_stat_show(struct seq_f
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                        continue;
 -              seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
 +              seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
                           mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
        }
  
                           (u64)memsw * PAGE_SIZE);
  
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 -              long long val = 0;
 +              unsigned long long val = 0;
  
                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                        continue;
                for_each_mem_cgroup_tree(mi, memcg)
                        val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
 -              seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
 +              seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
        }
  
        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
@@@ -3387,7 -3381,6 +3387,7 @@@ static int __mem_cgroup_usage_register_
        ret = page_counter_memparse(args, "-1", &threshold);
        if (ret)
                return ret;
 +      threshold <<= PAGE_SHIFT;
  
        mutex_lock(&memcg->thresholds_lock);
  
@@@ -3741,43 -3734,44 +3741,43 @@@ struct wb_domain *mem_cgroup_wb_domain(
  /**
   * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
   * @wb: bdi_writeback in question
 - * @pavail: out parameter for number of available pages
 + * @pfilepages: out parameter for number of file pages
 + * @pheadroom: out parameter for number of allocatable pages according to memcg
   * @pdirty: out parameter for number of dirty pages
   * @pwriteback: out parameter for number of pages under writeback
   *
 - * Determine the numbers of available, dirty, and writeback pages in @wb's
 - * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
 - * more involved.
 + * Determine the numbers of file, headroom, dirty, and writeback pages in
 + * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 + * is a bit more involved.
   *
 - * A memcg's headroom is "min(max, high) - used".  The available memory is
 - * calculated as the lowest headroom of itself and the ancestors plus the
 - * number of pages already being used for file pages.  Note that this
 - * doesn't consider the actual amount of available memory in the system.
 - * The caller should further cap *@pavail accordingly.
 + * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 + * headroom is calculated as the lowest headroom of itself and the
 + * ancestors.  Note that this doesn't consider the actual amount of
 + * available memory in the system.  The caller should further cap
 + * *@pheadroom accordingly.
   */
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback)
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback)
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
 -      unsigned long head_room = PAGE_COUNTER_MAX;
 -      unsigned long file_pages;
  
        *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
  
        /* this should eventually include NR_UNSTABLE_NFS */
        *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
 +      *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 +                                                   (1 << LRU_ACTIVE_FILE));
 +      *pheadroom = PAGE_COUNTER_MAX;
  
 -      file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 -                                                  (1 << LRU_ACTIVE_FILE));
        while ((parent = parent_mem_cgroup(memcg))) {
                unsigned long ceiling = min(memcg->memory.limit, memcg->high);
                unsigned long used = page_counter_read(&memcg->memory);
  
 -              head_room = min(head_room, ceiling - min(ceiling, used));
 +              *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
                memcg = parent;
        }
 -
 -      *pavail = file_pages + head_room;
  }
  
  #else /* CONFIG_CGROUP_WRITEBACK */
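
The rewritten mem_cgroup_wb_stats() above reports file pages and headroom separately instead of a single "available" figure; headroom is the smallest "min(limit, high) - used" along the memcg's ancestry, clamped at zero.  A small worked sketch of that walk, with the memcg chain reduced to an array and all names invented:

    /* headroom.c - hierarchical writeback headroom, as a worked example. */
    #include <stdio.h>

    #define COUNTER_MAX (~0UL)

    struct toy_memcg { unsigned long limit, high, used; };

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    static unsigned long headroom(const struct toy_memcg *chain, int levels)
    {
        unsigned long room = COUNTER_MAX;
        int i;

        for (i = 0; i < levels; i++) {
            unsigned long ceiling = min_ul(chain[i].limit, chain[i].high);
            unsigned long used = chain[i].used;

            /* "ceiling - min(ceiling, used)" clamps at 0 if used > ceiling. */
            room = min_ul(room, ceiling - min_ul(ceiling, used));
        }
        return room;
    }

    int main(void)
    {
        struct toy_memcg chain[] = {
            { .limit = 1024, .high = COUNTER_MAX, .used = 512  },  /* leaf   */
            { .limit = 4096, .high = 2048,        .used = 1920 },  /* parent */
        };

        /* leaf has 512 pages of room, parent only 128, so headroom is 128 */
        printf("headroom: %lu pages\n", headroom(chain, 2));
        return 0;
    }
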
@@@ -4066,8 -4060,7 +4066,7 @@@ static struct cftype mem_cgroup_legacy_
        {
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
-               .flags = CFTYPE_NO_PREFIX,
-               .mode = S_IWUGO,
+               .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
        },
        {
                .name = "swappiness",
@@@ -4185,6 -4178,7 +4184,6 @@@ static struct mem_cgroup *mem_cgroup_al
        if (memcg_wb_domain_init(memcg, GFP_KERNEL))
                goto out_free_stat;
  
 -      spin_lock_init(&memcg->pcp_counter_lock);
        return memcg;
  
  out_free_stat:
@@@ -4834,7 -4828,7 +4833,7 @@@ static int mem_cgroup_can_attach(struc
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *from;
-       struct task_struct *p;
+       struct task_struct *leader, *p;
        struct mm_struct *mm;
        unsigned long move_flags;
        int ret = 0;
        if (!move_flags)
                return 0;
  
-       p = cgroup_taskset_first(tset);
+       /*
+        * Multi-process migrations only happen on the default hierarchy
+        * where charge immigration is not used.  Perform charge
+        * immigration if @tset contains a leader and whine if there are
+        * multiple.
+        */
+       p = NULL;
+       cgroup_taskset_for_each_leader(leader, tset) {
+               WARN_ON_ONCE(p);
+               p = leader;
+       }
+       if (!p)
+               return 0;
        from = mem_cgroup_from_task(p);
  
        VM_BUG_ON(from == memcg);
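
mem_cgroup_can_attach() above now drives charge immigration from the thread-group leader found in the migrating task set, warning if more than one leader shows up and doing nothing if none does.  A standalone sketch of that selection; the task-set representation here is invented.

    /* leader_pick.c - pick the single expected leader out of a migrating set. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct toy_task { int pid; bool group_leader; };

    static const struct toy_task *pick_leader(const struct toy_task *set, int n)
    {
        const struct toy_task *leader = NULL;
        int i;

        for (i = 0; i < n; i++) {
            if (!set[i].group_leader)
                continue;
            /* Two leaders would mean per-process work runs twice; complain. */
            if (leader)
                fprintf(stderr, "warning: multiple leaders in task set\n");
            leader = &set[i];
        }
        return leader;  /* NULL: nothing to do for this migration */
    }

    int main(void)
    {
        struct toy_task set[] = {
            { 100, true  },     /* thread-group leader */
            { 101, false },     /* other threads       */
            { 102, false },
        };
        const struct toy_task *leader = pick_leader(set, 3);

        if (leader)
            printf("charge moving driven by pid %d\n", leader->pid);
        return 0;
    }
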
@@@ -5064,7 -5071,7 +5076,7 @@@ static void mem_cgroup_bind(struct cgro
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
-       if (cgroup_on_dfl(root_css->cgroup))
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                root_mem_cgroup->use_hierarchy = true;
        else
                root_mem_cgroup->use_hierarchy = false;
@@@ -5208,6 -5215,7 +5220,7 @@@ static struct cftype memory_files[] = 
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
+               .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        { }     /* terminate */
diff --combined mm/vmscan.c
index 7f63a9381f71ebbb0c1f9bdda94a913c930280f0,2d978b28a410b25df1acde351630dee387efbbe5..e7057af54b6e267558a99749fac80dc77dd7855f
@@@ -175,7 -175,7 +175,7 @@@ static bool sane_reclaim(struct scan_co
        if (!memcg)
                return true;
  #ifdef CONFIG_CGROUP_WRITEBACK
-       if (cgroup_on_dfl(memcg->css.cgroup))
 -      if (memcg->css.cgroup)
++      if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return true;
  #endif
        return false;