Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 6 Jan 2011 18:23:33 +0000 (10:23 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 6 Jan 2011 18:23:33 +0000 (10:23 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Jan 2011 18:23:33 +0000 (10:23 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Jan 2011 18:23:33 +0000 (10:23 -0800)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index 992cda68fa63b8a2b3a289e67c1d91700f5ddc06..d6496fde618065d987701c7d43ef4178ff2bf7b6 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1614,6 +1614,8 @@ and is between 256 and 4096 characters. It is defined in the file
         noapic          [SMP,APIC] Tells the kernel to not make use of any
                         IOAPICs that may be present in the system.
  
+       noautogroup     Disable scheduler automatic task group creation.
+
         nobats          [PPC] Do not use BATs for mapping kernel lowmem
                         on "Classic" PPC cores.
  
diff --git a/arch/Kconfig b/arch/Kconfig

index 8bf0fa652eb63c57dec1ebfec1a93a407be4ed32..f78c2be4242b437ced3308795952102bf1359763 100644 (file)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI
  config HAVE_ARCH_JUMP_LABEL
         bool
  
+config HAVE_ARCH_MUTEX_CPU_RELAX
+       bool
+
  source "kernel/gcov/Kconfig"
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig

index e0b98e71ff4797e5807f9e5d99f674aeb90456b8..6c6d7b339aae4f49fac84b7b3edc069f4ccf3adf 100644 (file)
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -99,6 +99,7 @@ config S390
         select HAVE_KERNEL_LZMA
         select HAVE_KERNEL_LZO
         select HAVE_GET_USER_PAGES_FAST
+       select HAVE_ARCH_MUTEX_CPU_RELAX
         select ARCH_INLINE_SPIN_TRYLOCK
         select ARCH_INLINE_SPIN_TRYLOCK_BH
         select ARCH_INLINE_SPIN_LOCK
diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h

index 458c1f7fbc1808d48982aa0c5fe89bfe3df2098c..688271f5f2e452b9951599550f33ed0ddcfe0a7c 100644 (file)
--- a/arch/s390/include/asm/mutex.h
+++ b/arch/s390/include/asm/mutex.h
@@ -7,3 +7,5 @@
   */
  
  #include <asm-generic/mutex-dec.h>
+
+#define arch_mutex_cpu_relax() barrier()
diff --git a/fs/proc/base.c b/fs/proc/base.c

index 182845147fe45bde8f5607a799f23cc1e2818117..08cba2c3b61240e085b9861967af5bd0adb4d227 100644 (file)
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
  
  #endif
  
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+       struct inode *inode = m->private;
+       struct task_struct *p;
+
+       p = get_proc_task(inode);
+       if (!p)
+               return -ESRCH;
+       proc_sched_autogroup_show_task(p, m);
+
+       put_task_struct(p);
+
+       return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+           size_t count, loff_t *offset)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct task_struct *p;
+       char buffer[PROC_NUMBUF];
+       long nice;
+       int err;
+
+       memset(buffer, 0, sizeof(buffer));
+       if (count > sizeof(buffer) - 1)
+               count = sizeof(buffer) - 1;
+       if (copy_from_user(buffer, buf, count))
+               return -EFAULT;
+
+       err = strict_strtol(strstrip(buffer), 0, &nice);
+       if (err)
+               return -EINVAL;
+
+       p = get_proc_task(inode);
+       if (!p)
+               return -ESRCH;
+
+       err = nice;
+       err = proc_sched_autogroup_set_nice(p, &err);
+       if (err)
+               count = err;
+
+       put_task_struct(p);
+
+       return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+       int ret;
+
+       ret = single_open(filp, sched_autogroup_show, NULL);
+       if (!ret) {
+               struct seq_file *m = filp->private_data;
+
+               m->private = inode;
+       }
+       return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+       .open           = sched_autogroup_open,
+       .read           = seq_read,
+       .write          = sched_autogroup_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
+
  static ssize_t comm_write(struct file *file, const char __user *buf,
                                 size_t count, loff_t *offset)
  {
@@ -2732,6 +2808,9 @@ static const struct pid_entry tgid_base_stuff[] = {
         INF("limits",     S_IRUGO, proc_pid_limits),
  #ifdef CONFIG_SCHED_DEBUG
         REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+       REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
  #endif
         REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
diff --git a/include/linux/completion.h b/include/linux/completion.h

index 36d57f74cd01c6c126ee2f7c2ea2c98f66868b2f..51494e6b55487f30496c8870165dd75f8ba4c7b1 100644 (file)
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x);
  extern int wait_for_completion_killable(struct completion *x);
  extern unsigned long wait_for_completion_timeout(struct completion *x,
                                                    unsigned long timeout);
-extern unsigned long wait_for_completion_interruptible_timeout(
-                       struct completion *x, unsigned long timeout);
-extern unsigned long wait_for_completion_killable_timeout(
-                       struct completion *x, unsigned long timeout);
+extern long wait_for_completion_interruptible_timeout(
+       struct completion *x, unsigned long timeout);
+extern long wait_for_completion_killable_timeout(
+       struct completion *x, unsigned long timeout);
  extern bool try_wait_for_completion(struct completion *x);
  extern bool completion_done(struct completion *x);
  
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index 6b281fae114a8d28db406e6886e74dafc5ddfaf0..caa151fbebb74c661289a69ffb52762435178d53 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -12,6 +12,13 @@
  #include <linux/securebits.h>
  #include <net/net_namespace.h>
  
+#ifdef CONFIG_SMP
+# define INIT_PUSHABLE_TASKS(tsk)                                      \
+       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
+#else
+# define INIT_PUSHABLE_TASKS(tsk)
+#endif
+
  extern struct files_struct init_files;
  extern struct fs_struct init_fs;
  
@@ -144,7 +151,7 @@ extern struct cred init_cred;
                 .nr_cpus_allowed = NR_CPUS,                             \
         },                                                              \
         .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
-       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
+       INIT_PUSHABLE_TASKS(tsk)                                        \
         .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
         .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
         .real_parent    = &tsk,                                         \
diff --git a/include/linux/mutex.h b/include/linux/mutex.h

index f363bc8fdc74c821c99aa59d5bfcb9554c012c9a..94b48bd40dd735f77963fcd31797d32bb68b3379 100644 (file)
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock);
  extern void mutex_unlock(struct mutex *lock);
  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
  
+#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
+#define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
  #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 48c409c279d3cda58bb9d0ca936c8cf1799cf056..777cd01e240ee0fca7a8b6a76c74137d0dcfaaa9 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -513,6 +513,8 @@ struct thread_group_cputimer {
         spinlock_t lock;
  };
  
+struct autogroup;
+
  /*
   * NOTE! "signal_struct" does not have it's own
   * locking, because a shared signal_struct always
@@ -580,6 +582,9 @@ struct signal_struct {
  
         struct tty_struct *tty; /* NULL if no tty */
  
+#ifdef CONFIG_SCHED_AUTOGROUP
+       struct autogroup *autogroup;
+#endif
         /*
          * Cumulative resource counters for dead threads in the group,
          * and for reaped dead child processes forked by this group.
@@ -1242,7 +1247,9 @@ struct task_struct {
  #endif
  
         struct list_head tasks;
+#ifdef CONFIG_SMP
         struct plist_node pushable_tasks;
+#endif
  
         struct mm_struct *mm, *active_mm;
  #if defined(SPLIT_RSS_COUNTING)
@@ -1883,14 +1890,11 @@ extern void sched_clock_idle_sleep_event(void);
  extern void sched_clock_idle_wakeup_event(u64 delta_ns);
  
  #ifdef CONFIG_HOTPLUG_CPU
-extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
  extern void idle_task_exit(void);
  #else
  static inline void idle_task_exit(void) {}
  #endif
  
-extern void sched_idle_next(void);
-
  #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
  extern void wake_up_idle_cpu(int cpu);
  #else
@@ -1900,8 +1904,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
  extern unsigned int sysctl_sched_latency;
  extern unsigned int sysctl_sched_min_granularity;
  extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_shares_ratelimit;
-extern unsigned int sysctl_sched_shares_thresh;
  extern unsigned int sysctl_sched_child_runs_first;
  
  enum sched_tunable_scaling {
@@ -1917,6 +1919,7 @@ extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
  extern unsigned int sysctl_sched_time_avg;
  extern unsigned int sysctl_timer_migration;
+extern unsigned int sysctl_sched_shares_window;
  
  int sched_proc_update_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *length,
@@ -1942,6 +1945,24 @@ int sched_rt_handler(struct ctl_table *table, int write,
  
  extern unsigned int sysctl_sched_compat_yield;
  
+#ifdef CONFIG_SCHED_AUTOGROUP
+extern unsigned int sysctl_sched_autogroup_enabled;
+
+extern void sched_autogroup_create_attach(struct task_struct *p);
+extern void sched_autogroup_detach(struct task_struct *p);
+extern void sched_autogroup_fork(struct signal_struct *sig);
+extern void sched_autogroup_exit(struct signal_struct *sig);
+#ifdef CONFIG_PROC_FS
+extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
+extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice);
+#endif
+#else
+static inline void sched_autogroup_create_attach(struct task_struct *p) { }
+static inline void sched_autogroup_detach(struct task_struct *p) { }
+static inline void sched_autogroup_fork(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+#endif
+
  #ifdef CONFIG_RT_MUTEXES
  extern int rt_mutex_getprio(struct task_struct *p);
  extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1960,9 +1981,10 @@ extern int task_nice(const struct task_struct *p);
  extern int can_nice(const struct task_struct *p, const int nice);
  extern int task_curr(const struct task_struct *p);
  extern int idle_cpu(int cpu);
-extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
+extern int sched_setscheduler(struct task_struct *, int,
+                             const struct sched_param *);
  extern int sched_setscheduler_nocheck(struct task_struct *, int,
-                                     struct sched_param *);
+                                     const struct sched_param *);
  extern struct task_struct *idle_task(int cpu);
  extern struct task_struct *curr_task(int cpu);
  extern void set_curr_task(int cpu, struct task_struct *p);
diff --git a/init/Kconfig b/init/Kconfig

index 526ec1c7456a901d896cc3dca7edec19ff242d70..8dfd094e68753dd919dd50299c82fa7385d6621e 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -794,6 +794,19 @@ config NET_NS
  
  endif # NAMESPACES
  
+config SCHED_AUTOGROUP
+       bool "Automatic process group scheduling"
+       select EVENTFD
+       select CGROUPS
+       select CGROUP_SCHED
+       select FAIR_GROUP_SCHED
+       help
+         This option optimizes the scheduler for common desktop workloads by
+         automatically creating and populating task groups.  This separation
+         of workloads isolates aggressive CPU burners (like build jobs) from
+         desktop applications.  Task group autogeneration is currently based
+         upon task session.
+
  config MM_OWNER
         bool
  
diff --git a/kernel/cpu.c b/kernel/cpu.c

index f6e726f184916029e2d1cfdbcd4acb2b26f14e69..cb7a1efa9c2b51ac2d97f02a82887638ffa2bf67 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
  }
  
  struct take_cpu_down_param {
-       struct task_struct *caller;
         unsigned long mod;
         void *hcpu;
  };
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
  static int __ref take_cpu_down(void *_param)
  {
         struct take_cpu_down_param *param = _param;
-       unsigned int cpu = (unsigned long)param->hcpu;
         int err;
  
         /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
  
         cpu_notify(CPU_DYING | param->mod, param->hcpu);
  
-       if (task_cpu(param->caller) == cpu)
-               move_task_off_dead_cpu(cpu, param->caller);
-       /* Force idle task to run as soon as we yield: it should
-          immediately notice cpu is offline and die quickly. */
-       sched_idle_next();
         return 0;
  }
  
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
         void *hcpu = (void *)(long)cpu;
         unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
         struct take_cpu_down_param tcd_param = {
-               .caller = current,
                 .mod = mod,
                 .hcpu = hcpu,
         };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
         }
         BUG_ON(cpu_online(cpu));
  
-       /* Wait for it to sleep (leaving idle task). */
+       /*
+        * The migration_call() CPU_DYING callback will have removed all
+        * runnable tasks from the cpu, there's only the idle task left now
+        * that the migration thread is done doing the stop_machine thing.
+        *
+        * Wait for the stop thread to go away.
+        */
         while (!idle_cpu(cpu))
-               yield();
+               cpu_relax();
  
         /* This actually kills the CPU. */
         __cpu_die(cpu);
diff --git a/kernel/fork.c b/kernel/fork.c

index 5447dc7defa95b8f0e13acb80b45487df7dc2e73..7d164e25b0f0ea42498d748f389824773e2a0c78 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
  
  static inline void put_signal_struct(struct signal_struct *sig)
  {
-       if (atomic_dec_and_test(&sig->sigcnt))
+       if (atomic_dec_and_test(&sig->sigcnt)) {
+               sched_autogroup_exit(sig);
                 free_signal_struct(sig);
+       }
  }
  
  void __put_task_struct(struct task_struct *tsk)
@@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
         posix_cpu_timers_init_group(sig);
  
         tty_audit_fork(sig);
+       sched_autogroup_fork(sig);
  
         sig->oom_adj = current->signal->oom_adj;
         sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
         }
  bad_fork_cleanup_signal:
         if (!(clone_flags & CLONE_THREAD))
-               free_signal_struct(p->signal);
+               put_signal_struct(p->signal);
  bad_fork_cleanup_sighand:
         __cleanup_sighand(p->sighand);
  bad_fork_cleanup_fs:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c

index 5f92acc5f952e0afb0489017c265a943a4a7d464..91a5fa25054e1d14d62339749f3229fae49f3766 100644 (file)
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
   */
  static int irq_thread(void *data)
  {
-       struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+       static struct sched_param param = {
+               .sched_priority = MAX_USER_RT_PRIO/2,
+       };
         struct irqaction *action = data;
         struct irq_desc *desc = irq_to_desc(action->irq);
         int wake, oneshot = desc->status & IRQ_ONESHOT;
diff --git a/kernel/kthread.c b/kernel/kthread.c

index ca61bbdd44b2e11acad4d866ab31224a5a5cab16..5355cfd44a3fd21cd767c13d053410ec338f2ede 100644 (file)
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
         wait_for_completion(&create.done);
  
         if (!IS_ERR(create.result)) {
-               struct sched_param param = { .sched_priority = 0 };
+               static struct sched_param param = { .sched_priority = 0 };
                 va_list args;
  
                 va_start(args, namefmt);
diff --git a/kernel/mutex.c b/kernel/mutex.c

index 200407c1502f509ee3f9d8a665bc4d3b78a27f74..a5889fb28ecff33eaf5fae64c9d2a50ca03cb2f7 100644 (file)
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                  * memory barriers as we'll eventually observe the right
                  * values at the cost of a few extra spins.
                  */
-               cpu_relax();
+               arch_mutex_cpu_relax();
         }
  #endif
         spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/printk.c b/kernel/printk.c

index a23315dc4498844c113cecc9792eabd063e1d87b..ab3ffc5b3b64613507134573dbb94af132c4adff 100644 (file)
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
  
  void printk_tick(void)
  {
-       if (__get_cpu_var(printk_pending)) {
-               __get_cpu_var(printk_pending) = 0;
+       if (__this_cpu_read(printk_pending)) {
+               __this_cpu_write(printk_pending, 0);
                 wake_up_interruptible(&log_wait);
         }
  }
  
  int printk_needs_cpu(int cpu)
  {
-       if (unlikely(cpu_is_offline(cpu)))
+       if (cpu_is_offline(cpu))
                 printk_tick();
-       return per_cpu(printk_pending, cpu);
+       return __this_cpu_read(printk_pending);
  }
  
  void wake_up_klogd(void)
diff --git a/kernel/sched.c b/kernel/sched.c

index 260132961a9930406a0a2d689163675011309636..04949089e7601ccd2a9b82f0f30c5905cbc9777b 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
+#include <asm/mutex.h>
  
  #include "sched_cpupri.h"
  #include "workqueue_sched.h"
+#include "sched_autogroup.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
         /* runqueue "owned" by this group on each cpu */
         struct cfs_rq **cfs_rq;
         unsigned long shares;
+
+       atomic_t load_weight;
  #endif
  
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
         struct task_group *parent;
         struct list_head siblings;
         struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+       struct autogroup *autogroup;
+#endif
  };
  
  #define root_task_group init_task_group
  
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
-       return list_empty(&root_task_group.children);
-}
-#endif
-
  # define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
  
  /*
@@ -342,6 +341,7 @@ struct cfs_rq {
          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
          * list is used during load balance.
          */
+       int on_list;
         struct list_head leaf_cfs_rq_list;
         struct task_group *tg;  /* group that "owns" this runqueue */
  
@@ -360,14 +360,17 @@ struct cfs_rq {
         unsigned long h_load;
  
         /*
-        * this cpu's part of tg->shares
+        * Maintaining per-cpu shares distribution for group scheduling
+        *
+        * load_stamp is the last time we updated the load average
+        * load_last is the last time we updated the load average and saw load
+        * load_unacc_exec_time is currently unaccounted execution time
          */
-       unsigned long shares;
+       u64 load_avg;
+       u64 load_period;
+       u64 load_stamp, load_last, load_unacc_exec_time;
  
-       /*
-        * load.weight at the time we set shares
-        */
-       unsigned long rq_weight;
+       unsigned long load_contribution;
  #endif
  #endif
  };
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
   */
  static inline struct task_group *task_group(struct task_struct *p)
  {
+       struct task_group *tg;
         struct cgroup_subsys_state *css;
  
         css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
                         lockdep_is_held(&task_rq(p)->lock));
-       return container_of(css, struct task_group, css);
+       tg = container_of(css, struct task_group, css);
+
+       return autogroup_task_group(p, tg);
  }
  
  /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -792,20 +798,6 @@ late_initcall(sched_init_debug);
   */
  const_debug unsigned int sysctl_sched_nr_migrate = 32;
  
-/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
  /*
   * period over which we average the RT time consumption, measured
   * in ms.
@@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
         lw->inv_weight = 0;
  }
  
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+       lw->weight = w;
+       lw->inv_weight = 0;
+}
+
  /*
   * To aid in avoiding the subversion of "niceness" due to uneven distribution
   * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-                                   unsigned long sd_shares,
-                                   unsigned long sd_rq_weight,
-                                   unsigned long *usd_rq_weight)
-{
-       unsigned long shares, rq_weight;
-       int boost = 0;
-
-       rq_weight = usd_rq_weight[cpu];
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-
-       /*
-        *             \Sum_j shares_j * rq_weight_i
-        * shares_i =  -----------------------------
-        *                  \Sum_j rq_weight_j
-        */
-       shares = (sd_shares * rq_weight) / sd_rq_weight;
-       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-       if (abs(shares - tg->se[cpu]->load.weight) >
-                       sysctl_sched_shares_thresh) {
-               struct rq *rq = cpu_rq(cpu);
-               unsigned long flags;
-
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               __set_se_shares(tg->se[cpu], shares);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-       }
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
-       unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-       unsigned long *usd_rq_weight;
-       struct sched_domain *sd = data;
-       unsigned long flags;
-       int i;
-
-       if (!tg->se[0])
-               return 0;
-
-       local_irq_save(flags);
-       usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-       for_each_cpu(i, sched_domain_span(sd)) {
-               weight = tg->cfs_rq[i]->load.weight;
-               usd_rq_weight[i] = weight;
-
-               rq_weight += weight;
-               /*
-                * If there are currently no tasks on the cpu pretend there
-                * is one of average load so that when a new task gets to
-                * run here it will not get delayed by group starvation.
-                */
-               if (!weight)
-                       weight = NICE_0_LOAD;
-
-               sum_weight += weight;
-               shares += tg->cfs_rq[i]->shares;
-       }
-
-       if (!rq_weight)
-               rq_weight = sum_weight;
-
-       if ((!shares && rq_weight) || shares > tg->shares)
-               shares = tg->shares;
-
-       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-               shares = tg->shares;
-
-       for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
-       local_irq_restore(flags);
-
-       return 0;
-}
-
  /*
   * Compute the cpu's hierarchical load factor for each task group.
   * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
                 load = cpu_rq(cpu)->load.weight;
         } else {
                 load = tg->parent->cfs_rq[cpu]->h_load;
-               load *= tg->cfs_rq[cpu]->shares;
+               load *= tg->se[cpu]->load.weight;
                 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
         }
  
@@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
         return 0;
  }
  
-static void update_shares(struct sched_domain *sd)
-{
-       s64 elapsed;
-       u64 now;
-
-       if (root_task_group_empty())
-               return;
-
-       now = local_clock();
-       elapsed = now - sd->last_update;
-
-       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-               sd->last_update = now;
-               walk_tg_tree(tg_nop, tg_shares_up, sd);
-       }
-}
-
  static void update_h_load(long cpu)
  {
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
  #endif
  
  #ifdef CONFIG_PREEMPT
@@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
  
  #endif
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-       cfs_rq->shares = shares;
-#endif
-}
-#endif
-
  static void calc_load_account_idle(struct rq *this_rq);
  static void update_sysctl(void);
  static int get_update_sysctl_factor(void);
@@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
  #include "sched_idletask.c"
  #include "sched_fair.c"
  #include "sched_rt.c"
+#include "sched_autogroup.c"
  #include "sched_stoptask.c"
  #ifdef CONFIG_SCHED_DEBUG
  # include "sched_debug.c"
@@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
   * The task's runqueue lock must be held.
   * Returns true if you have to wait for migration thread.
   */
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool migrate_task(struct task_struct *p, struct rq *rq)
  {
-       struct rq *rq = task_rq(p);
-
         /*
          * If the task is not on a runqueue (and not running), then
          * the next wake-up will properly place the task.
@@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                 return dest_cpu;
  
         /* No more Mr. Nice Guy. */
-       if (unlikely(dest_cpu >= nr_cpu_ids)) {
-               dest_cpu = cpuset_cpus_allowed_fallback(p);
-               /*
-                * Don't tell them about moving exiting tasks or
-                * kernel threads (both mm NULL), since they never
-                * leave kernel.
-                */
-               if (p->mm && printk_ratelimit()) {
-                       printk(KERN_INFO "process %d (%s) no "
-                              "longer affine to cpu%d\n",
-                              task_pid_nr(p), p->comm, cpu);
-               }
+       dest_cpu = cpuset_cpus_allowed_fallback(p);
+       /*
+        * Don't tell them about moving exiting tasks or
+        * kernel threads (both mm NULL), since they never
+        * leave kernel.
+        */
+       if (p->mm && printk_ratelimit()) {
+               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+                               task_pid_nr(p), p->comm, cpu);
         }
  
         return dest_cpu;
@@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
  #endif
+#ifdef CONFIG_SMP
         plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
  
         put_cpu();
  }
@@ -3549,7 +3418,7 @@ void sched_exec(void)
          * select_task_rq() can race against ->cpus_allowed
          */
         if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-           likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+           likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
                 struct migration_arg arg = { p, dest_cpu };
  
                 task_rq_unlock(rq, &flags);
@@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
                 if (task_thread_info(rq->curr) != owner || need_resched())
                         return 0;
  
-               cpu_relax();
+               arch_mutex_cpu_relax();
         }
  
         return 1;
@@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
   * This waits for either a completion of a specific task to be signaled or for a
   * specified timeout to expire. It is interruptible. The timeout is in jiffies.
   */
-unsigned long __sched
+long __sched
  wait_for_completion_interruptible_timeout(struct completion *x,
                                           unsigned long timeout)
  {
@@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
   * signaled or for a specified timeout to expire. It can be
   * interrupted by a kill signal. The timeout is in jiffies.
   */
-unsigned long __sched
+long __sched
  wait_for_completion_killable_timeout(struct completion *x,
                                      unsigned long timeout)
  {
@@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
  }
  
  static int __sched_setscheduler(struct task_struct *p, int policy,
-                               struct sched_param *param, bool user)
+                               const struct sched_param *param, bool user)
  {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         unsigned long flags;
@@ -5056,7 +4925,7 @@ recheck:
   * NOTE that the task may be already dead.
   */
  int sched_setscheduler(struct task_struct *p, int policy,
-                      struct sched_param *param)
+                      const struct sched_param *param)
  {
         return __sched_setscheduler(p, policy, param, true);
  }
@@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
   * but our caller might not have that capability.
   */
  int sched_setscheduler_nocheck(struct task_struct *p, int policy,
-                              struct sched_param *param)
+                              const struct sched_param *param)
  {
         return __sched_setscheduler(p, policy, param, false);
  }
@@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
         unsigned state;
  
         state = p->state ? __ffs(p->state) + 1 : 0;
-       printk(KERN_INFO "%-13.13s %c", p->comm,
+       printk(KERN_INFO "%-15.15s %c", p->comm,
                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
  #if BITS_PER_LONG == 32
         if (state == TASK_RUNNING)
@@ -5754,7 +5623,6 @@ static void update_sysctl(void)
         SET_SYSCTL(sched_min_granularity);
         SET_SYSCTL(sched_latency);
         SET_SYSCTL(sched_wakeup_granularity);
-       SET_SYSCTL(sched_shares_ratelimit);
  #undef SET_SYSCTL
  }
  
@@ -5830,7 +5698,7 @@ again:
                 goto out;
  
         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (migrate_task(p, dest_cpu)) {
+       if (migrate_task(p, rq)) {
                 struct migration_arg arg = { p, dest_cpu };
                 /* Need help from migration thread: drop lock and wait. */
                 task_rq_unlock(rq, &flags);
@@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data)
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
+
  /*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
   */
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
  {
-       struct rq *rq = cpu_rq(dead_cpu);
-       int needs_cpu, uninitialized_var(dest_cpu);
-       unsigned long flags;
+       struct mm_struct *mm = current->active_mm;
  
-       local_irq_save(flags);
+       BUG_ON(cpu_online(smp_processor_id()));
  
-       raw_spin_lock(&rq->lock);
-       needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
-       if (needs_cpu)
-               dest_cpu = select_fallback_rq(dead_cpu, p);
-       raw_spin_unlock(&rq->lock);
-       /*
-        * It can only fail if we race with set_cpus_allowed(),
-        * in the racer should migrate the task anyway.
-        */
-       if (needs_cpu)
-               __migrate_task(p, dead_cpu, dest_cpu);
-       local_irq_restore(flags);
+       if (mm != &init_mm)
+               switch_mm(mm, &init_mm, current);
+       mmdrop(mm);
  }
  
  /*
@@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  static void migrate_nr_uninterruptible(struct rq *rq_src)
  {
         struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-       unsigned long flags;
  
-       local_irq_save(flags);
-       double_rq_lock(rq_src, rq_dest);
         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
         rq_src->nr_uninterruptible = 0;
-       double_rq_unlock(rq_src, rq_dest);
-       local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
-       struct task_struct *p, *t;
-
-       read_lock(&tasklist_lock);
-
-       do_each_thread(t, p) {
-               if (p == current)
-                       continue;
-
-               if (task_cpu(p) == src_cpu)
-                       move_task_off_dead_cpu(src_cpu, p);
-       } while_each_thread(t, p);
-
-       read_unlock(&tasklist_lock);
  }
  
  /*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
   */
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
  {
-       int this_cpu = smp_processor_id();
-       struct rq *rq = cpu_rq(this_cpu);
-       struct task_struct *p = rq->idle;
-       unsigned long flags;
-
-       /* cpu has to be offline */
-       BUG_ON(cpu_online(this_cpu));
-
-       /*
-        * Strictly not necessary since rest of the CPUs are stopped by now
-        * and interrupts disabled on the current cpu.
-        */
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
-       activate_task(rq, p, 0);
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
  }
  
  /*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
   */
-void idle_task_exit(void)
-{
-       struct mm_struct *mm = current->active_mm;
-
-       BUG_ON(cpu_online(smp_processor_id()));
-
-       if (mm != &init_mm)
-               switch_mm(mm, &init_mm, current);
-       mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
  {
         struct rq *rq = cpu_rq(dead_cpu);
-
-       /* Must be exiting, otherwise would be on tasklist. */
-       BUG_ON(!p->exit_state);
-
-       /* Cannot have done final schedule yet: would have vanished. */
-       BUG_ON(p->state == TASK_DEAD);
-
-       get_task_struct(p);
+       struct task_struct *next, *stop = rq->stop;
+       int dest_cpu;
  
         /*
-        * Drop lock around migration; if someone else moves it,
-        * that's OK. No task can be added to this CPU, so iteration is
-        * fine.
+        * Fudge the rq selection such that the below task selection loop
+        * doesn't get stuck on the currently eligible stop task.
+        *
+        * We're currently inside stop_machine() and the rq is either stuck
+        * in the stop_machine_cpu_stop() loop, or we're executing this code,
+        * either way we should never end up calling schedule() until we're
+        * done here.
          */
-       raw_spin_unlock_irq(&rq->lock);
-       move_task_off_dead_cpu(dead_cpu, p);
-       raw_spin_lock_irq(&rq->lock);
-
-       put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
-       struct rq *rq = cpu_rq(dead_cpu);
-       struct task_struct *next;
+       rq->stop = NULL;
  
         for ( ; ; ) {
-               if (!rq->nr_running)
+               /*
+                * There's this thread running, bail when that's the only
+                * remaining thread.
+                */
+               if (rq->nr_running == 1)
                         break;
+
                 next = pick_next_task(rq);
-               if (!next)
-                       break;
+               BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
-               migrate_dead(dead_cpu, next);
  
+               /* Find suitable destination for @next, with force if needed. */
+               dest_cpu = select_fallback_rq(dead_cpu, next);
+               raw_spin_unlock(&rq->lock);
+
+               __migrate_task(next, dead_cpu, dest_cpu);
+
+               raw_spin_lock(&rq->lock);
         }
-}
  
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
-       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-       rq->calc_load_active = 0;
+       rq->stop = stop;
  }
+
  #endif /* CONFIG_HOTPLUG_CPU */
  
  #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         unsigned long flags;
         struct rq *rq = cpu_rq(cpu);
  
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
  
         case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
                 rq->calc_load_update = calc_load_update;
                 break;
  
         case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
@@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 break;
  
  #ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               migrate_live_tasks(cpu);
-               /* Idle task back to normal (off runqueue, low prio) */
-               raw_spin_lock_irq(&rq->lock);
-               deactivate_task(rq, rq->idle, 0);
-               __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
-               rq->idle->sched_class = &idle_sched_class;
-               migrate_dead_tasks(cpu);
-               raw_spin_unlock_irq(&rq->lock);
-               migrate_nr_uninterruptible(rq);
-               BUG_ON(rq->nr_running != 0);
-               calc_global_load_remove(rq);
-               break;
-
         case CPU_DYING:
-       case CPU_DYING_FROZEN:
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                         set_rq_offline(rq);
                 }
+               migrate_tasks(cpu);
+               BUG_ON(rq->nr_running != 1); /* the migration thread */
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+               migrate_nr_uninterruptible(rq);
+               calc_global_load_remove(rq);
                 break;
  #endif
         }
@@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-                               struct sched_entity *se, int cpu, int add,
+                               struct sched_entity *se, int cpu,
                                 struct sched_entity *parent)
  {
         struct rq *rq = cpu_rq(cpu);
         tg->cfs_rq[cpu] = cfs_rq;
         init_cfs_rq(cfs_rq, rq);
         cfs_rq->tg = tg;
-       if (add)
-               list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
  
         tg->se[cpu] = se;
         /* se could be NULL for init_task_group */
@@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                 se->cfs_rq = parent->my_q;
  
         se->my_q = cfs_rq;
-       se->load.weight = tg->shares;
-       se->load.inv_weight = 0;
+       update_load_set(&se->load, 0);
         se->parent = parent;
  }
  #endif
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-               struct sched_rt_entity *rt_se, int cpu, int add,
+               struct sched_rt_entity *rt_se, int cpu,
                 struct sched_rt_entity *parent)
  {
         struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
         init_rt_rq(rt_rq, rq);
         rt_rq->tg = tg;
         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-       if (add)
-               list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
  
         tg->rt_se[cpu] = rt_se;
         if (!rt_se)
@@ -8164,13 +7946,9 @@ void __init sched_init(void)
  #ifdef CONFIG_CGROUP_SCHED
         list_add(&init_task_group.list, &task_groups);
         INIT_LIST_HEAD(&init_task_group.children);
-
+       autogroup_init(&init_task);
  #endif /* CONFIG_CGROUP_SCHED */
  
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-       update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-                                           __alignof__(unsigned long));
-#endif
         for_each_possible_cpu(i) {
                 struct rq *rq;
  
@@ -8184,7 +7962,6 @@ void __init sched_init(void)
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 init_task_group.shares = init_task_group_load;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
                 /*
                  * How much cpu bandwidth does init_task_group get?
                  *
@@ -8204,16 +7981,13 @@ void __init sched_init(void)
                  * We achieve this by letting init_task_group's tasks sit
                  * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                  */
-               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
                 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
  #ifdef CONFIG_RT_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
-               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
  #endif
  
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8486,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                 if (!se)
                         goto err_free_rq;
  
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
         }
  
         return 1;
@@ -8497,15 +8271,21 @@ err:
         return 0;
  }
  
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
-                       &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
  {
-       list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       /*
+       * Only empty task groups can be destroyed; so we can speculatively
+       * check on_list without danger of it being re-added.
+       */
+       if (!tg->cfs_rq[cpu]->on_list)
+               return;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  #else /* !CONFG_FAIR_GROUP_SCHED */
  static inline void free_fair_sched_group(struct task_group *tg)
@@ -8518,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         return 1;
  }
  
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
  {
  }
@@ -8576,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                 if (!rt_se)
                         goto err_free_rq;
  
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
         }
  
         return 1;
@@ -8586,17 +8362,6 @@ err_free_rq:
  err:
         return 0;
  }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
-                       &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
  #else /* !CONFIG_RT_GROUP_SCHED */
  static inline void free_rt_sched_group(struct task_group *tg)
  {
@@ -8607,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  {
         return 1;
  }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
  #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_CGROUP_SCHED
@@ -8630,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
  {
         struct task_group *tg;
         unsigned long flags;
-       int i;
  
         tg = kzalloc(sizeof(*tg), GFP_KERNEL);
         if (!tg)
@@ -8643,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
                 goto err;
  
         spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
-               register_fair_sched_group(tg, i);
-               register_rt_sched_group(tg, i);
-       }
         list_add_rcu(&tg->list, &task_groups);
  
         WARN_ON(!parent); /* root should already exist */
@@ -8676,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
         unsigned long flags;
         int i;
  
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
+       /* end participation in shares distribution */
+       for_each_possible_cpu(i)
                 unregister_fair_sched_group(tg, i);
-               unregister_rt_sched_group(tg, i);
-       }
+
+       spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
         spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8727,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
  #endif /* CONFIG_CGROUP_SCHED */
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       int on_rq;
-
-       on_rq = se->on_rq;
-       if (on_rq)
-               dequeue_entity(cfs_rq, se, 0);
-
-       se->load.weight = shares;
-       se->load.inv_weight = 0;
-
-       if (on_rq)
-               enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       struct rq *rq = cfs_rq->rq;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       __set_se_shares(se, shares);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
  static DEFINE_MUTEX(shares_mutex);
  
  int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8776,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         if (tg->shares == shares)
                 goto done;
  
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
-       list_del_rcu(&tg->siblings);
-       spin_unlock_irqrestore(&task_group_lock, flags);
-
-       /* wait for any ongoing reference to this group to finish */
-       synchronize_sched();
-
-       /*
-        * Now we are free to modify the group's share on each cpu
-        * w/o tripping rebalance_share or load_balance_fair.
-        */
         tg->shares = shares;
         for_each_possible_cpu(i) {
-               /*
-                * force a rebalance
-                */
-               cfs_rq_set_shares(tg->cfs_rq[i], 0);
-               set_se_shares(tg->se[i], shares);
+               struct rq *rq = cpu_rq(i);
+               struct sched_entity *se;
+
+               se = tg->se[i];
+               /* Propagate contribution to hierarchy */
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               for_each_sched_entity(se)
+                       update_cfs_shares(group_cfs_rq(se), 0);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
         }
  
-       /*
-        * Enable load balance activity on this group, by inserting it back on
-        * each cpu's rq->leaf_cfs_rq_list.
-        */
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               register_fair_sched_group(tg, i);
-       list_add_rcu(&tg->siblings, &tg->parent->children);
-       spin_unlock_irqrestore(&task_group_lock, flags);
  done:
         mutex_unlock(&shares_mutex);
         return 0;
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c

new file mode 100644 (file)

index 0000000..c80fedc
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,238 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+static void autogroup_init(struct task_struct *init_task)
+{
+       autogroup_default.tg = &init_task_group;
+       init_task_group.autogroup = &autogroup_default;
+       kref_init(&autogroup_default.kref);
+       init_rwsem(&autogroup_default.lock);
+       init_task->signal->autogroup = &autogroup_default;
+}
+
+static inline void autogroup_free(struct task_group *tg)
+{
+       kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+       struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+       sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+       kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+       kref_get(&ag->kref);
+       return ag;
+}
+
+static inline struct autogroup *autogroup_task_get(struct task_struct *p)
+{
+       struct autogroup *ag;
+       unsigned long flags;
+
+       if (!lock_task_sighand(p, &flags))
+               return autogroup_kref_get(&autogroup_default);
+
+       ag = autogroup_kref_get(p->signal->autogroup);
+       unlock_task_sighand(p, &flags);
+
+       return ag;
+}
+
+static inline struct autogroup *autogroup_create(void)
+{
+       struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+       struct task_group *tg;
+
+       if (!ag)
+               goto out_fail;
+
+       tg = sched_create_group(&init_task_group);
+
+       if (IS_ERR(tg))
+               goto out_free;
+
+       kref_init(&ag->kref);
+       init_rwsem(&ag->lock);
+       ag->id = atomic_inc_return(&autogroup_seq_nr);
+       ag->tg = tg;
+       tg->autogroup = ag;
+
+       return ag;
+
+out_free:
+       kfree(ag);
+out_fail:
+       if (printk_ratelimit()) {
+               printk(KERN_WARNING "autogroup_create: %s failure.\n",
+                       ag ? "sched_create_group()" : "kmalloc()");
+       }
+
+       return autogroup_kref_get(&autogroup_default);
+}
+
+static inline bool
+task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+       if (tg != &root_task_group)
+               return false;
+
+       if (p->sched_class != &fair_sched_class)
+               return false;
+
+       /*
+        * We can only assume the task group can't go away on us if
+        * autogroup_move_group() can see us on ->thread_group list.
+        */
+       if (p->flags & PF_EXITING)
+               return false;
+
+       return true;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+       int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+       if (enabled && task_wants_autogroup(p, tg))
+               return p->signal->autogroup->tg;
+
+       return tg;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+       struct autogroup *prev;
+       struct task_struct *t;
+       unsigned long flags;
+
+       BUG_ON(!lock_task_sighand(p, &flags));
+
+       prev = p->signal->autogroup;
+       if (prev == ag) {
+               unlock_task_sighand(p, &flags);
+               return;
+       }
+
+       p->signal->autogroup = autogroup_kref_get(ag);
+
+       t = p;
+       do {
+               sched_move_task(t);
+       } while_each_thread(p, t);
+
+       unlock_task_sighand(p, &flags);
+       autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+       struct autogroup *ag = autogroup_create();
+
+       autogroup_move_group(p, ag);
+       /* drop extra refrence added by autogroup_create() */
+       autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock.  Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+       autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+       sig->autogroup = autogroup_task_get(current);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+       autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+       sysctl_sched_autogroup_enabled = 0;
+
+       return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+
+#ifdef CONFIG_PROC_FS
+
+int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+{
+       static unsigned long next = INITIAL_JIFFIES;
+       struct autogroup *ag;
+       int err;
+
+       if (*nice < -20 || *nice > 19)
+               return -EINVAL;
+
+       err = security_task_setnice(current, *nice);
+       if (err)
+               return err;
+
+       if (*nice < 0 && !can_nice(current, *nice))
+               return -EPERM;
+
+       /* this is a heavy operation taking global locks.. */
+       if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+               return -EAGAIN;
+
+       next = HZ / 10 + jiffies;
+       ag = autogroup_task_get(p);
+
+       down_write(&ag->lock);
+       err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+       if (!err)
+               ag->nice = *nice;
+       up_write(&ag->lock);
+
+       autogroup_kref_put(ag);
+
+       return err;
+}
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+       struct autogroup *ag = autogroup_task_get(p);
+
+       down_read(&ag->lock);
+       seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+       up_read(&ag->lock);
+
+       autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+       return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h

new file mode 100644 (file)

index 0000000..5358e24
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+struct autogroup {
+       struct kref             kref;
+       struct task_group       *tg;
+       struct rw_semaphore     lock;
+       unsigned long           id;
+       int                     nice;
+};
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) {  }
+static inline void autogroup_free(struct task_group *tg) { }
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+       return tg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+       return 0;
+}
+#endif
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c

index 52f1a149bfb15a871a362255498fadf90e357c57..9d8af0b3fb64544d9ca7076f3478d2239b46540e 100644 (file)
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
  }
  EXPORT_SYMBOL_GPL(sched_clock);
  
-static __read_mostly int sched_clock_running;
+__read_mostly int sched_clock_running;
  
  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
  __read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c

index 2e1b0d17dd9b6a8b4ac48891a988c025a8b07ed5..1dfae3d014b5934eba4b3be25bfbad746196b4b7 100644 (file)
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
  #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-static void print_cfs_group_stats(struct seq_file *m, int cpu,
-               struct task_group *tg)
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
  {
         struct sched_entity *se = tg->se[cpu];
         if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
  #endif
  
-#ifdef CONFIG_CGROUP_SCHED
-       {
-               char path[64];
-
-               rcu_read_lock();
-               cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
-               rcu_read_unlock();
-               SEQ_printf(m, " %s", path);
-       }
-#endif
         SEQ_printf(m, "\n");
  }
  
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
         read_unlock_irqrestore(&tasklist_lock, flags);
  }
  
-#if defined(CONFIG_CGROUP_SCHED) && \
-       (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
-static void task_group_path(struct task_group *tg, char *buf, int buflen)
-{
-       /* may be NULL if the underlying cgroup isn't fully-created yet */
-       if (!tg->css.cgroup) {
-               buf[0] = '\0';
-               return;
-       }
-       cgroup_path(tg->css.cgroup, buf, buflen);
-}
-#endif
-
  void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
  {
         s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
         struct sched_entity *last;
         unsigned long flags;
  
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-       char path[128];
-       struct task_group *tg = cfs_rq->tg;
-
-       task_group_path(tg, path, sizeof(path));
-
-       SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#else
         SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
-#endif
         SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
                         SPLIT_NS(cfs_rq->exec_clock));
  
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
         spread0 = min_vruntime - rq0_min_vruntime;
         SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread0",
                         SPLIT_NS(spread0));
-       SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
-       SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
-
         SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
                         cfs_rq->nr_spread_over);
+       SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+       SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_SMP
-       SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg",
+                       SPLIT_NS(cfs_rq->load_avg));
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period",
+                       SPLIT_NS(cfs_rq->load_period));
+       SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
+                       cfs_rq->load_contribution);
+       SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
+                       atomic_read(&cfs_rq->tg->load_weight));
  #endif
+
         print_cfs_group_stats(m, cpu, cfs_rq->tg);
  #endif
  }
  
  void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
  {
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
-       char path[128];
-       struct task_group *tg = rt_rq->tg;
-
-       task_group_path(tg, path, sizeof(path));
-
-       SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
-#else
         SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
-#endif
-
  
  #define P(x) \
         SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
  #undef P
  }
  
+extern __read_mostly int sched_clock_running;
+
  static void print_cpu(struct seq_file *m, int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
  
  static int sched_debug_show(struct seq_file *m, void *v)
  {
-       u64 now = ktime_to_ns(ktime_get());
+       u64 ktime, sched_clk, cpu_clk;
+       unsigned long flags;
         int cpu;
  
-       SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
+       local_irq_save(flags);
+       ktime = ktime_to_ns(ktime_get());
+       sched_clk = sched_clock();
+       cpu_clk = local_clock();
+       local_irq_restore(flags);
+
+       SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
                 init_utsname()->release,
                 (int)strcspn(init_utsname()->version, " "),
                 init_utsname()->version);
  
-       SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
+#define P(x) \
+       SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+       SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+       PN(ktime);
+       PN(sched_clk);
+       PN(cpu_clk);
+       P(jiffies);
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+       P(sched_clock_stable);
+#endif
+#undef PN
+#undef P
+
+       SEQ_printf(m, "\n");
+       SEQ_printf(m, "sysctl_sched\n");
  
  #define P(x) \
         SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
  #define PN(x) \
         SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
-       P(jiffies);
         PN(sysctl_sched_latency);
         PN(sysctl_sched_min_granularity);
         PN(sysctl_sched_wakeup_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 00ebd7686676bd87a6e5b3be513d27028e518796..c62ebae65cf0c5e5d1628b0368692a94cda37568 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  
  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  
+/*
+ * The exponential sliding  window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
  static const struct sched_class fair_sched_class;
  
  /**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
         return cfs_rq->tg->cfs_rq[this_cpu];
  }
  
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_rq->on_list) {
+               /*
+                * Ensure we either appear before our parent (if already
+                * enqueued) or force our parent to appear after us when it is
+                * enqueued.  The fact that we always enqueue bottom-up
+                * reduces this to two cases.
+                */
+               if (cfs_rq->tg->parent &&
+                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               } else {
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               }
+
+               cfs_rq->on_list = 1;
+       }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->on_list) {
+               list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+               cfs_rq->on_list = 0;
+       }
+}
+
  /* Iterate thr' all leaf cfs_rq's on a runqueue */
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
         return &cpu_rq(this_cpu)->cfs;
  }
  
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
  
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
         WRT_SYSCTL(sched_min_granularity);
         WRT_SYSCTL(sched_latency);
         WRT_SYSCTL(sched_wakeup_granularity);
-       WRT_SYSCTL(sched_shares_ratelimit);
  #undef WRT_SYSCTL
  
         return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
         return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
  /*
   * Update the current task's runtime statistics. Skip current tasks that
   * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  
         curr->vruntime += delta_exec_weighted;
         update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+       cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
  }
  
  static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 list_add(&se->group_node, &cfs_rq->tasks);
         }
         cfs_rq->nr_running++;
-       se->on_rq = 1;
  }
  
  static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 list_del_init(&se->group_node);
         }
         cfs_rq->nr_running--;
-       se->on_rq = 0;
  }
  
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+                                           int global_update)
+{
+       struct task_group *tg = cfs_rq->tg;
+       long load_avg;
+
+       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+       load_avg -= cfs_rq->load_contribution;
+
+       if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+               atomic_add(load_avg, &tg->load_weight);
+               cfs_rq->load_contribution += load_avg;
+       }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+       u64 period = sysctl_sched_shares_window;
+       u64 now, delta;
+       unsigned long load = cfs_rq->load.weight;
+
+       if (!cfs_rq)
+               return;
+
+       now = rq_of(cfs_rq)->clock;
+       delta = now - cfs_rq->load_stamp;
+
+       /* truncate load history at 4 idle periods */
+       if (cfs_rq->load_stamp > cfs_rq->load_last &&
+           now - cfs_rq->load_last > 4 * period) {
+               cfs_rq->load_period = 0;
+               cfs_rq->load_avg = 0;
+       }
+
+       cfs_rq->load_stamp = now;
+       cfs_rq->load_unacc_exec_time = 0;
+       cfs_rq->load_period += delta;
+       if (load) {
+               cfs_rq->load_last = now;
+               cfs_rq->load_avg += delta * load;
+       }
+
+       /* consider updating load contribution on each fold or truncate */
+       if (global_update || cfs_rq->load_period > period
+           || !cfs_rq->load_period)
+               update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+       while (cfs_rq->load_period > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (cfs_rq->load_period));
+               cfs_rq->load_period /= 2;
+               cfs_rq->load_avg /= 2;
+       }
+
+       if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+               list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                           unsigned long weight)
+{
+       if (se->on_rq) {
+               /* commit outstanding execution time */
+               if (cfs_rq->curr == se)
+                       update_curr(cfs_rq);
+               account_entity_dequeue(cfs_rq, se);
+       }
+
+       update_load_set(&se->load, weight);
+
+       if (se->on_rq)
+               account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+       struct task_group *tg;
+       struct sched_entity *se;
+       long load_weight, load, shares;
+
+       if (!cfs_rq)
+               return;
+
+       tg = cfs_rq->tg;
+       se = tg->se[cpu_of(rq_of(cfs_rq))];
+       if (!se)
+               return;
+
+       load = cfs_rq->load.weight + weight_delta;
+
+       load_weight = atomic_read(&tg->load_weight);
+       load_weight -= cfs_rq->load_contribution;
+       load_weight += load;
+
+       shares = (tg->shares * load);
+       if (load_weight)
+               shares /= load_weight;
+
+       if (shares < MIN_SHARES)
+               shares = MIN_SHARES;
+       if (shares > tg->shares)
+               shares = tg->shares;
+
+       reweight_entity(cfs_rq_of(se), se, shares);
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  #ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
+       update_cfs_load(cfs_rq, 0);
+       update_cfs_shares(cfs_rq, se->load.weight);
         account_entity_enqueue(cfs_rq, se);
  
         if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         check_spread(cfs_rq, se);
         if (se != cfs_rq->curr)
                 __enqueue_entity(cfs_rq, se);
+       se->on_rq = 1;
+
+       if (cfs_rq->nr_running == 1)
+               list_add_leaf_cfs_rq(cfs_rq);
  }
  
  static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  
         if (se != cfs_rq->curr)
                 __dequeue_entity(cfs_rq, se);
+       se->on_rq = 0;
+       update_cfs_load(cfs_rq, 0);
         account_entity_dequeue(cfs_rq, se);
         update_min_vruntime(cfs_rq);
+       update_cfs_shares(cfs_rq, 0);
  
         /*
          * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
          */
         update_curr(cfs_rq);
  
+       /*
+        * Update share accounting for long-running entities.
+        */
+       update_entity_shares_tick(cfs_rq);
+
  #ifdef CONFIG_SCHED_HRTICK
         /*
          * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 flags = ENQUEUE_WAKEUP;
         }
  
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
         hrtick_update(rq);
  }
  
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 dequeue_entity(cfs_rq, se, flags);
+
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight)
                         break;
                 flags |= DEQUEUE_SLEEP;
         }
  
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
         hrtick_update(rq);
  }
  
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
   * Adding load to a group doesn't make a group heavier, but can cause movement
   * of group shares between cpus. Assuming the shares were perfectly aligned one
   * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
   */
-static long effective_load(struct task_group *tg, int cpu,
-               long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
  {
         struct sched_entity *se = tg->se[cpu];
  
         if (!tg->parent)
                 return wl;
  
-       /*
-        * By not taking the decrease of shares on the other cpu into
-        * account our error leans towards reducing the affine wakeups.
-        */
-       if (!wl && sched_feat(ASYM_EFF_LOAD))
-               return wl;
-
         for_each_sched_entity(se) {
                 long S, rw, s, a, b;
-               long more_w;
-
-               /*
-                * Instead of using this increment, also add the difference
-                * between when the shares were last updated and now.
-                */
-               more_w = se->my_q->load.weight - se->my_q->rq_weight;
-               wl += more_w;
-               wg += more_w;
  
                 S = se->my_q->tg->shares;
-               s = se->my_q->shares;
-               rw = se->my_q->rq_weight;
+               s = se->load.weight;
+               rw = se->my_q->load.weight;
  
                 a = S*(rw + wl);
                 b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
                         sd = tmp;
         }
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       if (sched_feat(LB_SHARES_UPDATE)) {
-               /*
-                * Pick the largest domain to update shares over
-                */
-               tmp = sd;
-               if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-                       tmp = affine_sd;
-
-               if (tmp) {
-                       raw_spin_unlock(&rq->lock);
-                       update_shares(tmp);
-                       raw_spin_lock(&rq->lock);
-               }
-       }
-#endif
-
         if (affine_sd) {
                 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
                         return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       unsigned long flags;
+       struct rq *rq;
+
+       if (!tg->se[cpu])
+               return 0;
+
+       rq = cpu_rq(cpu);
+       cfs_rq = tg->cfs_rq[cpu];
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       update_rq_clock(rq);
+       update_cfs_load(cfs_rq, 1);
+
+       /*
+        * We need to update shares after updating tg->load_weight in
+        * order to adjust the weight of groups with long running tasks.
+        */
+       update_cfs_shares(cfs_rq, 0);
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       return 0;
+}
+
+static void update_shares(int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       struct rq *rq = cpu_rq(cpu);
+
+       rcu_read_lock();
+       for_each_leaf_cfs_rq(rq, cfs_rq)
+               update_shares_cpu(cfs_rq->tg, cpu);
+       rcu_read_unlock();
+}
+
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         return max_load_move - rem_load_move;
  }
  #else
+static inline void update_shares(int cpu)
+{
+}
+
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         schedstat_inc(sd, lb_count[idle]);
  
  redo:
-       update_shares(sd);
         group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                    cpus, balance);
  
@@ -3174,8 +3381,6 @@ out_one_pinned:
         else
                 ld_moved = 0;
  out:
-       if (ld_moved)
-               update_shares(sd);
         return ld_moved;
  }
  
@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
          */
         raw_spin_unlock(&this_rq->lock);
  
+       update_shares(this_cpu);
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
                 int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
         int update_next_balance = 0;
         int need_serialize;
  
+       update_shares(cpu);
+
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
                         continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h

index 185f920ec1a2e923b0d966f787c610ff26a7b6cb..68e69acc29b9570b10ecf8a5892ea62c48700c5c 100644 (file)
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
  SCHED_FEAT(HRTICK, 0)
  SCHED_FEAT(DOUBLE_TICK, 0)
  SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_SHARES_UPDATE, 1)
-SCHED_FEAT(ASYM_EFF_LOAD, 1)
  
  /*
   * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c

index bea7d79f7e9ca958bba514cbd8eb48ceab47bab3..c914ec747ca6709e25a177eb3c4152c75cb40aee 100644 (file)
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
         return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
  }
  
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+       list_add_rcu(&rt_rq->leaf_rt_rq_list,
+                       &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+       list_del_rcu(&rt_rq->leaf_rt_rq_list);
+}
+
  #define for_each_leaf_rt_rq(rt_rq, rq) \
         list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
  
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
         return ktime_to_ns(def_rt_bandwidth.rt_period);
  }
  
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
  #define for_each_leaf_rt_rq(rt_rq, rq) \
         for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
  
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
         if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                 return;
  
+       if (!rt_rq->rt_nr_running)
+               list_add_leaf_rt_rq(rt_rq);
+
         if (head)
                 list_add(&rt_se->run_list, queue);
         else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
                 __clear_bit(rt_se_prio(rt_se), array->bitmap);
  
         dec_rt_tasks(rt_se, rt_rq);
+       if (!rt_rq->rt_nr_running)
+               list_del_leaf_rt_rq(rt_rq);
  }
  
  /*
diff --git a/kernel/softirq.c b/kernel/softirq.c

index 18f4be0d5fe0bbf853935972d9b441e95bc61c5a..d4d918a91881407acd8abbde6691f77197cd013c 100644 (file)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
                              cpumask_any(cpu_online_mask));
         case CPU_DEAD:
         case CPU_DEAD_FROZEN: {
-               struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+               static struct sched_param param = {
+                       .sched_priority = MAX_RT_PRIO-1
+               };
  
                 p = per_cpu(ksoftirqd, hotcpu);
                 per_cpu(ksoftirqd, hotcpu) = NULL;
diff --git a/kernel/sys.c b/kernel/sys.c

index 7f5a0cd296a96ca44e43f0db028026094dbbb57a..2745dcdb6c6c5756a7bafd19e6497c31cc077d60 100644 (file)
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
         err = session;
  out:
         write_unlock_irq(&tasklist_lock);
-       if (err > 0)
+       if (err > 0) {
                 proc_sid_connector(group_leader);
+               sched_autogroup_create_attach(group_leader);
+       }
         return err;
  }
  
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 46404414d8a7d4b187903906927157e1f092020a..ae5cbb1e3ced15b8cc2e00b052496953709cf4c2 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns;                       /* 0 usecs */
  static int max_wakeup_granularity_ns = NSEC_PER_SEC;   /* 1 second */
  static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
  static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-static int min_sched_shares_ratelimit = 100000; /* 100 usec */
-static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
  #endif
  
  #ifdef CONFIG_COMPACTION
@@ -304,15 +302,6 @@ static struct ctl_table kern_table[] = {
                 .extra1         = &min_wakeup_granularity_ns,
                 .extra2         = &max_wakeup_granularity_ns,
         },
-       {
-               .procname       = "sched_shares_ratelimit",
-               .data           = &sysctl_sched_shares_ratelimit,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = sched_proc_update_handler,
-               .extra1         = &min_sched_shares_ratelimit,
-               .extra2         = &max_sched_shares_ratelimit,
-       },
         {
                 .procname       = "sched_tunable_scaling",
                 .data           = &sysctl_sched_tunable_scaling,
@@ -322,14 +311,6 @@ static struct ctl_table kern_table[] = {
                 .extra1         = &min_sched_tunable_scaling,
                 .extra2         = &max_sched_tunable_scaling,
         },
-       {
-               .procname       = "sched_shares_thresh",
-               .data           = &sysctl_sched_shares_thresh,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &zero,
-       },
         {
                 .procname       = "sched_migration_cost",
                 .data           = &sysctl_sched_migration_cost,
@@ -351,6 +332,13 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec,
         },
+       {
+               .procname       = "sched_shares_window",
+               .data           = &sysctl_sched_shares_window,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
         {
                 .procname       = "timer_migration",
                 .data           = &sysctl_timer_migration,
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec,
         },
+#ifdef CONFIG_SCHED_AUTOGROUP
+       {
+               .procname       = "sched_autogroup_enabled",
+               .data           = &sysctl_sched_autogroup_enabled,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif
  #ifdef CONFIG_PROVE_LOCKING
         {
                 .procname       = "prove_locking",
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c

index 155a415b3209c0c4e65936be1b593678aea42d27..562c56e048fdbc34b18dded38cd21ccb3cd87f08 100644 (file)
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
  static int trace_wakeup_test_thread(void *data)
  {
         /* Make this a RT thread, doesn't need to be too high */
-       struct sched_param param = { .sched_priority = 5 };
+       static struct sched_param param = { .sched_priority = 5 };
         struct completion *x = data;
  
         sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c

index aaa8dae0823619222010875d9fdc6f5dac182d2d..6e7b575ac33cf2dcba3f9dc749f7039e6805a3f0 100644 (file)
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -309,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
   */
  static int watchdog(void *unused)
  {
-       struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+       static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
         struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
  
         sched_setscheduler(current, SCHED_FIFO, &param);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 6 Jan 2011 18:23:33 +0000 (10:23 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 6 Jan 2011 18:23:33 +0000 (10:23 -0800)
Documentation/kernel-parameters.txt		patch \| blob \| history
arch/Kconfig		patch \| blob \| history
arch/s390/Kconfig		patch \| blob \| history
arch/s390/include/asm/mutex.h		patch \| blob \| history
fs/proc/base.c		patch \| blob \| history
include/linux/completion.h		patch \| blob \| history
include/linux/init_task.h		patch \| blob \| history
include/linux/mutex.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
init/Kconfig		patch \| blob \| history
kernel/cpu.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/irq/manage.c		patch \| blob \| history
kernel/kthread.c		patch \| blob \| history
kernel/mutex.c		patch \| blob \| history
kernel/printk.c		patch \| blob \| history
kernel/sched.c		patch \| blob \| history
kernel/sched_autogroup.c	[new file with mode: 0644]	patch \| blob
kernel/sched_autogroup.h	[new file with mode: 0644]	patch \| blob
kernel/sched_clock.c		patch \| blob \| history
kernel/sched_debug.c		patch \| blob \| history
kernel/sched_fair.c		patch \| blob \| history
kernel/sched_features.h		patch \| blob \| history
kernel/sched_rt.c		patch \| blob \| history
kernel/softirq.c		patch \| blob \| history
kernel/sys.c		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
kernel/trace/trace_selftest.c		patch \| blob \| history
kernel/watchdog.c		patch \| blob \| history