Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 11 Sep 2009 20:23:18 +0000 (13:23 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 11 Sep 2009 20:23:18 +0000 (13:23 -0700)
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (64 commits)
  sched: Fix sched::sched_stat_wait tracepoint field
  sched: Disable NEW_FAIR_SLEEPERS for now
  sched: Keep kthreads at default priority
  sched: Re-tune the scheduler latency defaults to decrease worst-case latencies
  sched: Turn off child_runs_first
  sched: Ensure that a child can't gain time over its parent after fork()
  sched: enable SD_WAKE_IDLE
  sched: Deal with low-load in wake_affine()
  sched: Remove short cut from select_task_rq_fair()
  sched: Turn on SD_BALANCE_NEWIDLE
  sched: Clean up topology.h
  sched: Fix dynamic power-balancing crash
  sched: Remove reciprocal for cpu_power
  sched: Try to deal with low capacity, fix update_sd_power_savings_stats()
  sched: Try to deal with low capacity
  sched: Scale down cpu_power due to RT tasks
  sched: Implement dynamic cpu_power
  sched: Add smt_gain
  sched: Update the cpu_power sum during load-balance
  sched: Add SD_PREFER_SIBLING
  ...

fs/locks.c
include/linux/hardirq.h
include/linux/sched.h
init/main.c
kernel/sched.c
kernel/sysctl.c
kernel/workqueue.c

diff --combined fs/locks.c
index 52366e877d7636a04078bdd14133f30be6459f5d,2eb81975c99c7ee2c247cf6c7aa11e348bb1e3fc..19ee18a6829b9ae443778d63d57a8d8924131748
@@@ -768,7 -768,7 +768,7 @@@ static int flock_lock_file(struct file 
         * give it the opportunity to lock the file.
         */
        if (found)
-               cond_resched_bkl();
+               cond_resched();
  
  find_conflict:
        for_each_lock(inode, before) {
@@@ -1591,7 -1591,7 +1591,7 @@@ SYSCALL_DEFINE2(flock, unsigned int, fd
        if (can_sleep)
                lock->fl_flags |= FL_SLEEP;
  
 -      error = security_file_lock(filp, cmd);
 +      error = security_file_lock(filp, lock->fl_type);
        if (error)
                goto out_free;
  
diff --combined include/linux/hardirq.h
index 330cb31bb496ac892fe685aca43d357e5fd1e860,0d885fd75111048f1e8359b1fd49572dd385ba5e..6d527ee82b2b2904b2d30f978da8f31a61783d6a
  #define HARDIRQ_OFFSET        (1UL << HARDIRQ_SHIFT)
  #define NMI_OFFSET    (1UL << NMI_SHIFT)
  
+ #ifndef PREEMPT_ACTIVE
+ #define PREEMPT_ACTIVE_BITS   1
+ #define PREEMPT_ACTIVE_SHIFT  (NMI_SHIFT + NMI_BITS)
+ #define PREEMPT_ACTIVE        (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
+ #endif
  #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
  #error PREEMPT_ACTIVE is too low!
  #endif
@@@ -132,7 -138,7 +138,7 @@@ static inline void account_system_vtime
  }
  #endif
  
 -#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
 +#if defined(CONFIG_NO_HZ)
  extern void rcu_irq_enter(void);
  extern void rcu_irq_exit(void);
  extern void rcu_nmi_enter(void);
@@@ -142,7 -148,7 +148,7 @@@ extern void rcu_nmi_exit(void)
  # define rcu_irq_exit() do { } while (0)
  # define rcu_nmi_enter() do { } while (0)
  # define rcu_nmi_exit() do { } while (0)
 -#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
 +#endif /* #if defined(CONFIG_NO_HZ) */
  
  /*
   * It is safe to do non-atomic ops on ->hardirq_context,
diff --combined include/linux/sched.h
index 379531c089757525a5829be14e2b946cef33f2b8,3a50e8222498c90ae42099f53d4645b0183fffff..f3d74bd04d184955326430d6704741950f574aba
@@@ -38,6 -38,8 +38,8 @@@
  #define SCHED_BATCH           3
  /* SCHED_ISO: reserved but not implemented yet */
  #define SCHED_IDLE            5
+ /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
+ #define SCHED_RESET_ON_FORK     0x40000000
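
  [Editorial note] The new SCHED_RESET_ON_FORK bit is ORed into the policy argument of sched_setscheduler(); the __sched_setscheduler() hunk further down masks it off and records it in p->sched_reset_on_fork. A minimal userspace sketch, not part of this patch: the local #define is only needed where the libc headers lack the constant, and SCHED_FIFO requires CAP_SYS_NICE.

  /* Sketch: set SCHED_FIFO on the caller but ask the kernel to revert
   * children to the default policy/priority on fork(). */
  #include <sched.h>
  #include <stdio.h>
  #include <unistd.h>
  #include <sys/wait.h>

  #ifndef SCHED_RESET_ON_FORK
  #define SCHED_RESET_ON_FORK 0x40000000	/* matches the define above */
  #endif

  int main(void)
  {
  	struct sched_param sp = { .sched_priority = 10 };

  	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) < 0) {
  		perror("sched_setscheduler");	/* needs CAP_SYS_NICE */
  		return 1;
  	}

  	if (fork() == 0) {
  		/* Should print 0 (SCHED_OTHER) if the reset flag took effect. */
  		printf("child policy: %d\n", sched_getscheduler(0));
  		_exit(0);
  	}
  	wait(NULL);
  	printf("parent policy: %d\n", sched_getscheduler(0));
  	return 0;
  }

  With the sys_sched_getscheduler() change below, the flag is reported back ORed into the returned policy, so the parent's printed value is SCHED_FIFO plus the flag bit while the child reverts to SCHED_OTHER.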
  
  #ifdef __KERNEL__
  
@@@ -796,18 -798,19 +798,19 @@@ enum cpu_idle_type 
  #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
  
  #ifdef CONFIG_SMP
- #define SD_LOAD_BALANCE               1       /* Do load balancing on this domain. */
- #define SD_BALANCE_NEWIDLE    2       /* Balance when about to become idle */
- #define SD_BALANCE_EXEC               4       /* Balance on exec */
- #define SD_BALANCE_FORK               8       /* Balance on fork, clone */
- #define SD_WAKE_IDLE          16      /* Wake to idle CPU on task wakeup */
- #define SD_WAKE_AFFINE                32      /* Wake task to waking CPU */
- #define SD_WAKE_BALANCE               64      /* Perform balancing at task wakeup */
- #define SD_SHARE_CPUPOWER     128     /* Domain members share cpu power */
- #define SD_POWERSAVINGS_BALANCE       256     /* Balance for power savings */
- #define SD_SHARE_PKG_RESOURCES        512     /* Domain members share cpu pkg resources */
- #define SD_SERIALIZE          1024    /* Only a single load balancing instance */
- #define SD_WAKE_IDLE_FAR      2048    /* Gain latency sacrificing cache hit */
+ #define SD_LOAD_BALANCE               0x0001  /* Do load balancing on this domain. */
+ #define SD_BALANCE_NEWIDLE    0x0002  /* Balance when about to become idle */
+ #define SD_BALANCE_EXEC               0x0004  /* Balance on exec */
+ #define SD_BALANCE_FORK               0x0008  /* Balance on fork, clone */
+ #define SD_WAKE_IDLE          0x0010  /* Wake to idle CPU on task wakeup */
+ #define SD_WAKE_AFFINE                0x0020  /* Wake task to waking CPU */
+ #define SD_WAKE_BALANCE               0x0040  /* Perform balancing at task wakeup */
+ #define SD_SHARE_CPUPOWER     0x0080  /* Domain members share cpu power */
+ #define SD_POWERSAVINGS_BALANCE       0x0100  /* Balance for power savings */
+ #define SD_SHARE_PKG_RESOURCES        0x0200  /* Domain members share cpu pkg resources */
+ #define SD_SERIALIZE          0x0400  /* Only a single load balancing instance */
+ #define SD_WAKE_IDLE_FAR      0x0800  /* Gain latency sacrificing cache hit */
+ #define SD_PREFER_SIBLING     0x1000  /* Prefer to place tasks in a sibling domain */
  
  enum powersavings_balance_level {
        POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
@@@ -827,7 -830,7 +830,7 @@@ static inline int sd_balance_for_mc_pow
        if (sched_smt_power_savings)
                return SD_POWERSAVINGS_BALANCE;
  
-       return 0;
+       return SD_PREFER_SIBLING;
  }
  
  static inline int sd_balance_for_package_power(void)
        if (sched_mc_power_savings | sched_smt_power_savings)
                return SD_POWERSAVINGS_BALANCE;
  
-       return 0;
+       return SD_PREFER_SIBLING;
  }
  
  /*
@@@ -857,15 -860,9 +860,9 @@@ struct sched_group 
  
        /*
         * CPU power of this group, SCHED_LOAD_SCALE being max power for a
-        * single CPU. This is read only (except for setup, hotplug CPU).
-        * Note : Never change cpu_power without recompute its reciprocal
-        */
-       unsigned int __cpu_power;
-       /*
-        * reciprocal value of cpu_power to avoid expensive divides
-        * (see include/linux/reciprocal_div.h)
+        * single CPU.
         */
-       u32 reciprocal_cpu_power;
+       unsigned int cpu_power;
  
        /*
         * The CPUs this group covers.
@@@ -918,6 -915,7 +915,7 @@@ struct sched_domain 
        unsigned int newidle_idx;
        unsigned int wake_idx;
        unsigned int forkexec_idx;
+       unsigned int smt_gain;
        int flags;                      /* See SD_* */
        enum sched_domain_level level;
  
@@@ -1045,7 -1043,6 +1043,6 @@@ struct sched_class 
                              struct rq *busiest, struct sched_domain *sd,
                              enum cpu_idle_type idle);
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
-       int (*needs_post_schedule) (struct rq *this_rq);
        void (*post_schedule) (struct rq *this_rq);
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
@@@ -1110,6 -1107,8 +1107,8 @@@ struct sched_entity 
        u64                     wait_max;
        u64                     wait_count;
        u64                     wait_sum;
+       u64                     iowait_count;
+       u64                     iowait_sum;
  
        u64                     sleep_start;
        u64                     sleep_max;
@@@ -1163,8 -1162,6 +1162,8 @@@ struct sched_rt_entity 
  #endif
  };
  
 +struct rcu_node;
 +
  struct task_struct {
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        void *stack;
        unsigned int policy;
        cpumask_t cpus_allowed;
  
 -#ifdef CONFIG_PREEMPT_RCU
 +#ifdef CONFIG_TREE_PREEMPT_RCU
        int rcu_read_lock_nesting;
 -      int rcu_flipctr_idx;
 -#endif /* #ifdef CONFIG_PREEMPT_RCU */
 +      char rcu_read_unlock_special;
 +      struct rcu_node *rcu_blocked_node;
 +      struct list_head rcu_node_entry;
 +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  
  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        struct sched_info sched_info;
        unsigned did_exec:1;
        unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
                                 * execve */
+       unsigned in_iowait:1;
+       /* Revert to default priority/policy when forking */
+       unsigned sched_reset_on_fork:1;
        pid_t pid;
        pid_t tgid;
  
+ #ifdef CONFIG_CC_STACKPROTECTOR
        /* Canary value for the -fstack-protector gcc feature */
        unsigned long stack_canary;
+ #endif
  
        /* 
         * pointers to (original) parent process, youngest child, younger sibling,
        struct mutex cred_guard_mutex;  /* guard against foreign influences on
                                         * credential calculations
                                         * (notably. ptrace) */
 +      struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */
  
        char comm[TASK_COMM_LEN]; /* executable name excluding path
                                     - access with [gs]et_task_comm (which lock
@@@ -1729,28 -1731,6 +1736,28 @@@ extern cputime_t task_gtime(struct task
  #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
  #define used_math() tsk_used_math(current)
  
 +#ifdef CONFIG_TREE_PREEMPT_RCU
 +
 +#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
 +#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
 +#define RCU_READ_UNLOCK_GOT_QS  (1 << 2) /* CPU has responded to RCU core. */
 +
 +static inline void rcu_copy_process(struct task_struct *p)
 +{
 +      p->rcu_read_lock_nesting = 0;
 +      p->rcu_read_unlock_special = 0;
 +      p->rcu_blocked_node = NULL;
 +      INIT_LIST_HEAD(&p->rcu_node_entry);
 +}
 +
 +#else
 +
 +static inline void rcu_copy_process(struct task_struct *p)
 +{
 +}
 +
 +#endif
 +
  #ifdef CONFIG_SMP
  extern int set_cpus_allowed_ptr(struct task_struct *p,
                                const struct cpumask *new_mask);
@@@ -1840,11 -1820,12 +1847,12 @@@ extern unsigned int sysctl_sched_min_gr
  extern unsigned int sysctl_sched_wakeup_granularity;
  extern unsigned int sysctl_sched_shares_ratelimit;
  extern unsigned int sysctl_sched_shares_thresh;
- #ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_child_runs_first;
+ #ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_features;
  extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
+ extern unsigned int sysctl_sched_time_avg;
  extern unsigned int sysctl_timer_migration;
  
  int sched_nr_latency_handler(struct ctl_table *table, int write,
@@@ -2104,7 -2085,7 +2112,7 @@@ static inline unsigned long wait_task_i
  #define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )
  
 -extern bool is_single_threaded(struct task_struct *);
 +extern bool current_is_single_threaded(void);
  
  /*
   * Careful: do_each_thread/while_each_thread is a double loop so
@@@ -2308,23 -2289,31 +2316,31 @@@ static inline int need_resched(void
   * cond_resched_softirq() will enable bhs before scheduling.
   */
  extern int _cond_resched(void);
- #ifdef CONFIG_PREEMPT_BKL
- static inline int cond_resched(void)
- {
-       return 0;
- }
+ #define cond_resched() ({                     \
+       __might_sleep(__FILE__, __LINE__, 0);   \
+       _cond_resched();                        \
+ })
+ extern int __cond_resched_lock(spinlock_t *lock);
+ #ifdef CONFIG_PREEMPT
+ #define PREEMPT_LOCK_OFFSET   PREEMPT_OFFSET
  #else
- static inline int cond_resched(void)
- {
-       return _cond_resched();
- }
+ #define PREEMPT_LOCK_OFFSET   0
  #endif
- extern int cond_resched_lock(spinlock_t * lock);
- extern int cond_resched_softirq(void);
- static inline int cond_resched_bkl(void)
- {
-       return _cond_resched();
- }
+ #define cond_resched_lock(lock) ({                            \
+       __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+       __cond_resched_lock(lock);                              \
+ })
+ extern int __cond_resched_softirq(void);
+ #define cond_resched_softirq() ({                             \
+       __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET);      \
+       __cond_resched_softirq();                               \
+ })
  
  /*
   * Does a critical section need to be broken due to another
diff --combined init/main.c
index 525f6fb2bd2294aa8ffa109b6c9d249fcbb3fcea,0ec75ce771ac3b6d272d91d6a0445ce405e2ea2f..b34fd8e5edef6b50a9bedeef4dc1ff8625a76394
@@@ -451,7 -451,6 +451,7 @@@ static noinline void __init_refok rest_
  {
        int pid;
  
 +      rcu_scheduler_starting();
        kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
        numa_default_policy();
        pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
         * at least once to get things moving:
         */
        init_idle_bootup_task(current);
 -      rcu_scheduler_starting();
        preempt_enable_no_resched();
        schedule();
        preempt_disable();
@@@ -631,7 -631,6 +631,6 @@@ asmlinkage void __init start_kernel(voi
        softirq_init();
        timekeeping_init();
        time_init();
-       sched_clock_init();
        profile_init();
        if (!irqs_disabled())
                printk(KERN_CRIT "start_kernel(): bug: interrupts were "
        numa_policy_init();
        if (late_time_init)
                late_time_init();
+       sched_clock_init();
        calibrate_delay();
        pidmap_init();
        anon_vma_init();
diff --combined kernel/sched.c
index 4066241ae9f433c880eb977a8efe2ed40efa53bf,796baf73197605fd098b1a4998f7a238ceb6a1c6..e27a53685ed9cb4a24d9e1b58650a32161e0a0ea
@@@ -64,7 -64,6 +64,6 @@@
  #include <linux/tsacct_kern.h>
  #include <linux/kprobes.h>
  #include <linux/delayacct.h>
- #include <linux/reciprocal_div.h>
  #include <linux/unistd.h>
  #include <linux/pagemap.h>
  #include <linux/hrtimer.h>
   */
  #define RUNTIME_INF   ((u64)~0ULL)
  
- #ifdef CONFIG_SMP
  static void double_rq_lock(struct rq *rq1, struct rq *rq2);
  
- /*
-  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
-  * Since cpu_power is a 'constant', we can use a reciprocal divide.
-  */
- static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
- {
-       return reciprocal_divide(load, sg->reciprocal_cpu_power);
- }
- /*
-  * Each time a sched group cpu_power is changed,
-  * we must compute its reciprocal value
-  */
- static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
- {
-       sg->__cpu_power += val;
-       sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
- }
- #endif
  static inline int rt_policy(int policy)
  {
        if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@@ -309,8 -286,8 +286,8 @@@ void set_tg_uid(struct user_struct *use
  
  /*
   * Root task group.
-  *    Every UID task group (including init_task_group aka UID-0) will
-  *    be a child to this group.
+  *    Every UID task group (including init_task_group aka UID-0) will
+  *    be a child to this group.
   */
  struct task_group root_task_group;
  
  /* Default task group's sched entity on each cpu */
  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
  /* Default task group's cfs_rq on each cpu */
- static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+ static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
@@@ -616,6 -593,7 +593,7 @@@ struct rq 
  
        unsigned char idle_at_tick;
        /* For active balancing */
+       int post_schedule;
        int active_balance;
        int push_cpu;
        /* cpu of this runqueue: */
  
        struct task_struct *migration_thread;
        struct list_head migration_queue;
+       u64 rt_avg;
+       u64 age_stamp;
  #endif
  
        /* calc_load related fields */
@@@ -693,6 -674,7 +674,7 @@@ static inline int cpu_of(struct rq *rq
  #define this_rq()             (&__get_cpu_var(runqueues))
  #define task_rq(p)            cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
+ #define raw_rq()              (&__raw_get_cpu_var(runqueues))
  
  inline void update_rq_clock(struct rq *rq)
  {
@@@ -860,6 -842,14 +842,14 @@@ unsigned int sysctl_sched_shares_rateli
   */
  unsigned int sysctl_sched_shares_thresh = 4;
  
+ /*
+  * period over which we average the RT time consumption, measured
+  * in ms.
+  *
+  * default: 1s
+  */
+ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
  /*
   * period over which we measure -rt task cpu usage in us.
   * default: 1s
@@@ -1278,12 -1268,37 +1268,37 @@@ void wake_up_idle_cpu(int cpu
  }
  #endif /* CONFIG_NO_HZ */
  
+ static u64 sched_avg_period(void)
+ {
+       return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+ }
+ static void sched_avg_update(struct rq *rq)
+ {
+       s64 period = sched_avg_period();
+       while ((s64)(rq->clock - rq->age_stamp) > period) {
+               rq->age_stamp += period;
+               rq->rt_avg /= 2;
+       }
+ }
+ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+ {
+       rq->rt_avg += rt_delta;
+       sched_avg_update(rq);
+ }
  #else /* !CONFIG_SMP */
  static void resched_task(struct task_struct *p)
  {
        assert_spin_locked(&task_rq(p)->lock);
        set_tsk_need_resched(p);
  }
+ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+ {
+ }
  #endif /* CONFIG_SMP */
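
  [Editorial note] To make the decay above concrete: sched_avg_period() is half of sysctl_sched_time_avg converted to nanoseconds, and sched_avg_update() halves rq->rt_avg once per elapsed period, so accumulated RT runtime decays geometrically. A standalone sketch with invented numbers (the 400 ms of RT time and the 2 s clock below are not from the patch; only the halving loop mirrors sched_avg_update()):

  #include <stdio.h>
  #include <stdint.h>

  #define NSEC_PER_MSEC 1000000ULL
  static const uint64_t sysctl_sched_time_avg = 1000;	/* ms, default 1s */

  static uint64_t sched_avg_period(void)
  {
  	/* Half the averaging window, in nanoseconds: 500 ms here. */
  	return sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
  }

  int main(void)
  {
  	uint64_t rt_avg = 400 * NSEC_PER_MSEC;	/* pretend 400 ms of RT time */
  	uint64_t age_stamp = 0;
  	uint64_t clock = 2000 * NSEC_PER_MSEC;	/* pretend 2 s of rq clock */
  	uint64_t period = sched_avg_period();

  	/* Same shape as sched_avg_update(): each elapsed period advances
  	 * age_stamp and halves the accumulated RT time. */
  	while (clock - age_stamp > period) {
  		age_stamp += period;
  		rt_avg /= 2;
  	}
  	printf("rt_avg after decay: %llu ms\n",
  	       (unsigned long long)(rt_avg / NSEC_PER_MSEC));
  	return 0;
  }

  With the default sysctl_sched_time_avg of 1000 ms, the period is 500 ms, so rt_avg halves every 500 ms of runqueue clock time.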
  
  #if BITS_PER_LONG == 32
@@@ -1513,28 -1528,35 +1528,35 @@@ static unsigned long cpu_avg_load_per_t
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
+ struct update_shares_data {
+       unsigned long rq_weight[NR_CPUS];
+ };
+ static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
  static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  
  /*
   * Calculate and set the cpu's group shares.
   */
- static void
- update_group_shares_cpu(struct task_group *tg, int cpu,
-                       unsigned long sd_shares, unsigned long sd_rq_weight)
+ static void update_group_shares_cpu(struct task_group *tg, int cpu,
+                                   unsigned long sd_shares,
+                                   unsigned long sd_rq_weight,
+                                   struct update_shares_data *usd)
  {
-       unsigned long shares;
-       unsigned long rq_weight;
+       unsigned long shares, rq_weight;
+       int boost = 0;
  
-       if (!tg->se[cpu])
-               return;
-       rq_weight = tg->cfs_rq[cpu]->rq_weight;
+       rq_weight = usd->rq_weight[cpu];
+       if (!rq_weight) {
+               boost = 1;
+               rq_weight = NICE_0_LOAD;
+       }
  
        /*
-        *           \Sum shares * rq_weight
-        * shares =  -----------------------
-        *               \Sum rq_weight
-        *
+        *             \Sum_j shares_j * rq_weight_i
+        * shares_i =  -----------------------------
+        *                  \Sum_j rq_weight_j
         */
        shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
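
  [Editorial note] A worked instance of the shares_i formula in the comment above; the group shares and per-CPU runqueue weights here are invented for illustration.

  #include <stdio.h>

  int main(void)
  {
  	unsigned long sd_shares = 1024;			/* \Sum_j shares_j */
  	unsigned long rq_weight[2] = { 3072, 1024 };	/* rq_weight_i     */
  	unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];
  	int i;

  	for (i = 0; i < 2; i++) {
  		unsigned long shares = (sd_shares * rq_weight[i]) / sd_rq_weight;
  		/* Prints 768 for cpu0 and 256 for cpu1: the busier CPU gets
  		 * the larger slice of the group's shares. */
  		printf("cpu%d shares = %lu\n", i, shares);
  	}
  	return 0;
  }

  The kernel result is additionally clamped to [MIN_SHARES, MAX_SHARES] by the clamp_t() line above.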
                unsigned long flags;
  
                spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->shares = shares;
+               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
        }
   */
  static int tg_shares_up(struct task_group *tg, void *data)
  {
-       unsigned long weight, rq_weight = 0;
-       unsigned long shares = 0;
+       unsigned long weight, rq_weight = 0, shares = 0;
+       struct update_shares_data *usd;
        struct sched_domain *sd = data;
+       unsigned long flags;
        int i;
  
+       if (!tg->se[0])
+               return 0;
+       local_irq_save(flags);
+       usd = &__get_cpu_var(update_shares_data);
        for_each_cpu(i, sched_domain_span(sd)) {
+               weight = tg->cfs_rq[i]->load.weight;
+               usd->rq_weight[i] = weight;
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
                 * run here it will not get delayed by group starvation.
                 */
-               weight = tg->cfs_rq[i]->load.weight;
                if (!weight)
                        weight = NICE_0_LOAD;
  
-               tg->cfs_rq[i]->rq_weight = weight;
                rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
                shares = tg->shares;
  
        for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight);
+               update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+       local_irq_restore(flags);
  
        return 0;
  }
@@@ -1616,8 -1648,14 +1648,14 @@@ static int tg_load_down(struct task_gro
  
  static void update_shares(struct sched_domain *sd)
  {
-       u64 now = cpu_clock(raw_smp_processor_id());
-       s64 elapsed = now - sd->last_update;
+       s64 elapsed;
+       u64 now;
+       if (root_task_group_empty())
+               return;
+       now = cpu_clock(raw_smp_processor_id());
+       elapsed = now - sd->last_update;
  
        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                sd->last_update = now;
  
  static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
  {
+       if (root_task_group_empty())
+               return;
        spin_unlock(&rq->lock);
        update_shares(sd);
        spin_lock(&rq->lock);
  
  static void update_h_load(long cpu)
  {
+       if (root_task_group_empty())
+               return;
        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
@@@ -2268,8 -2312,7 +2312,7 @@@ find_idlest_group(struct sched_domain *
                }
  
                /* Adjust by relative CPU power of the group */
-               avg_load = sg_div_cpu_power(group,
-                               avg_load * SCHED_LOAD_SCALE);
+               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
  
                if (local_group) {
                        this_load = avg_load;
@@@ -2637,9 -2680,32 +2680,32 @@@ void sched_fork(struct task_struct *p, 
        set_task_cpu(p, cpu);
  
        /*
-        * Make sure we do not leak PI boosting priority to the child:
+        * Make sure we do not leak PI boosting priority to the child.
         */
        p->prio = current->normal_prio;
+       /*
+        * Revert to default priority/policy on fork if requested.
+        */
+       if (unlikely(p->sched_reset_on_fork)) {
+               if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+                       p->policy = SCHED_NORMAL;
+               if (p->normal_prio < DEFAULT_PRIO)
+                       p->prio = DEFAULT_PRIO;
+               if (PRIO_TO_NICE(p->static_prio) < 0) {
+                       p->static_prio = NICE_TO_PRIO(0);
+                       set_load_weight(p);
+               }
+               /*
+                * We don't need the reset flag anymore after the fork. It has
+                * fulfilled its duty:
+                */
+               p->sched_reset_on_fork = 0;
+       }
        if (!rt_prio(p->prio))
                p->sched_class = &fair_sched_class;
  
@@@ -2796,12 -2862,6 +2862,6 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
- #ifdef CONFIG_SMP
-       int post_schedule = 0;
-       if (current->sched_class->needs_post_schedule)
-               post_schedule = current->sched_class->needs_post_schedule(rq);
- #endif
  
        rq->prev_mm = NULL;
  
        finish_arch_switch(prev);
        perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
- #ifdef CONFIG_SMP
-       if (post_schedule)
-               current->sched_class->post_schedule(rq);
- #endif
  
        fire_sched_in_preempt_notifiers(current);
        if (mm)
        }
  }
  
+ #ifdef CONFIG_SMP
+ /* assumes rq->lock is held */
+ static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+ {
+       if (prev->sched_class->pre_schedule)
+               prev->sched_class->pre_schedule(rq, prev);
+ }
+ /* rq->lock is NOT held, but preemption is disabled */
+ static inline void post_schedule(struct rq *rq)
+ {
+       if (rq->post_schedule) {
+               unsigned long flags;
+               spin_lock_irqsave(&rq->lock, flags);
+               if (rq->curr->sched_class->post_schedule)
+                       rq->curr->sched_class->post_schedule(rq);
+               spin_unlock_irqrestore(&rq->lock, flags);
+               rq->post_schedule = 0;
+       }
+ }
+ #else
+ static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+ {
+ }
+ static inline void post_schedule(struct rq *rq)
+ {
+ }
+ #endif
  /**
   * schedule_tail - first thing a freshly forked thread must call.
   * @prev: the thread we just switched away from.
@@@ -2848,6 -2940,13 +2940,13 @@@ asmlinkage void schedule_tail(struct ta
        struct rq *rq = this_rq();
  
        finish_task_switch(rq, prev);
+       /*
+        * FIXME: do we need to worry about rq being invalidated by the
+        * task_switch?
+        */
+       post_schedule(rq);
  #ifdef __ARCH_WANT_UNLOCKED_CTXSW
        /* In this case, finish_task_switch does not reenable preemption */
        preempt_enable();
@@@ -3379,9 -3478,10 +3478,10 @@@ static int move_one_task(struct rq *thi
  {
        const struct sched_class *class;
  
-       for (class = sched_class_highest; class; class = class->next)
+       for_each_class(class) {
                if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
                        return 1;
+       }
  
        return 0;
  }
@@@ -3544,7 -3644,7 +3644,7 @@@ static inline void update_sd_power_savi
         * capacity but still has some space to pick up some load
         * from other group and save more power
         */
-       if (sgs->sum_nr_running > sgs->group_capacity - 1)
+       if (sgs->sum_nr_running + 1 > sgs->group_capacity)
                return;
  
        if (sgs->sum_nr_running > sds->leader_nr_running ||
@@@ -3611,6 -3711,77 +3711,77 @@@ static inline int check_power_save_busi
  }
  #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
+ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+ {
+       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long smt_gain = sd->smt_gain;
+       smt_gain /= weight;
+       return smt_gain;
+ }
+ unsigned long scale_rt_power(int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+       u64 total, available;
+       sched_avg_update(rq);
+       total = sched_avg_period() + (rq->clock - rq->age_stamp);
+       available = total - rq->rt_avg;
+       if (unlikely((s64)total < SCHED_LOAD_SCALE))
+               total = SCHED_LOAD_SCALE;
+       total >>= SCHED_LOAD_SHIFT;
+       return div_u64(available, total);
+ }
+ static void update_cpu_power(struct sched_domain *sd, int cpu)
+ {
+       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long power = SCHED_LOAD_SCALE;
+       struct sched_group *sdg = sd->groups;
+       /* here we could scale based on cpufreq */
+       if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+               power *= arch_scale_smt_power(sd, cpu);
+               power >>= SCHED_LOAD_SHIFT;
+       }
+       power *= scale_rt_power(cpu);
+       power >>= SCHED_LOAD_SHIFT;
+       if (!power)
+               power = 1;
+       sdg->cpu_power = power;
+ }
+ static void update_group_power(struct sched_domain *sd, int cpu)
+ {
+       struct sched_domain *child = sd->child;
+       struct sched_group *group, *sdg = sd->groups;
+       unsigned long power;
+       if (!child) {
+               update_cpu_power(sd, cpu);
+               return;
+       }
+       power = 0;
+       group = child->groups;
+       do {
+               power += group->cpu_power;
+               group = group->next;
+       } while (group != child->groups);
+       sdg->cpu_power = power;
+ }
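
  [Editorial note] A standalone arithmetic sketch of update_cpu_power() above, assuming SCHED_LOAD_SHIFT = 10 and SCHED_LOAD_SCALE = 1024 as in kernels of this era; the smt_gain value and the RT usage figure are invented.

  #include <stdio.h>

  #define SCHED_LOAD_SHIFT	10
  #define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)

  int main(void)
  {
  	unsigned long power = SCHED_LOAD_SCALE;	/* 1024 */
  	unsigned long smt_gain = 1178;		/* assumed sd->smt_gain */
  	unsigned long weight = 2;		/* two SMT siblings */
  	unsigned long rt_scale = 768;		/* ~25% of time in RT tasks */

  	/* SMT step: each sibling gets smt_gain/weight of a full core. */
  	power *= smt_gain / weight;
  	power >>= SCHED_LOAD_SHIFT;

  	/* RT step: scale_rt_power() ~ 1024 * (non-RT time / total time). */
  	power *= rt_scale;
  	power >>= SCHED_LOAD_SHIFT;

  	if (!power)
  		power = 1;

  	/* Prints 441: roughly 1024 * (589/1024) * (768/1024). */
  	printf("cpu_power = %lu\n", power);
  	return 0;
  }

  update_group_power() then simply sums these per-CPU values over the child groups, so higher-level domains see the reduced capacity of SMT siblings and RT-heavy CPUs.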
  
  /**
   * update_sg_lb_stats - Update sched_group's statistics for load balancing.
   * @balance: Should we balance.
   * @sgs: variable to hold the statistics for this group.
   */
- static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+ static inline void update_sg_lb_stats(struct sched_domain *sd,
+                       struct sched_group *group, int this_cpu,
                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
                        int local_group, const struct cpumask *cpus,
                        int *balance, struct sg_lb_stats *sgs)
        unsigned long sum_avg_load_per_task;
        unsigned long avg_load_per_task;
  
-       if (local_group)
+       if (local_group) {
                balance_cpu = group_first_cpu(group);
+               if (balance_cpu == this_cpu)
+                       update_group_power(sd, this_cpu);
+       }
  
        /* Tally up the load of all CPUs in the group */
        sum_avg_load_per_task = avg_load_per_task = 0;
        }
  
        /* Adjust by relative CPU power of the group */
-       sgs->avg_load = sg_div_cpu_power(group,
-                       sgs->group_load * SCHED_LOAD_SCALE);
+       sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
  
  
        /*
         *      normalized nr_running number somewhere that negates
         *      the hierarchy?
         */
-       avg_load_per_task = sg_div_cpu_power(group,
-                       sum_avg_load_per_task * SCHED_LOAD_SCALE);
+       avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
+               group->cpu_power;
  
        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                sgs->group_imb = 1;
  
-       sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+       sgs->group_capacity =
+               DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
  }
  
  /**
@@@ -3723,9 -3897,13 +3897,13 @@@ static inline void update_sd_lb_stats(s
                        const struct cpumask *cpus, int *balance,
                        struct sd_lb_stats *sds)
  {
+       struct sched_domain *child = sd->child;
        struct sched_group *group = sd->groups;
        struct sg_lb_stats sgs;
-       int load_idx;
+       int load_idx, prefer_sibling = 0;
+       if (child && child->flags & SD_PREFER_SIBLING)
+               prefer_sibling = 1;
  
        init_sd_power_savings_stats(sd, sds, idle);
        load_idx = get_sd_load_idx(sd, idle);
                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_cpus(group));
                memset(&sgs, 0, sizeof(sgs));
-               update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+               update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
                                local_group, cpus, balance, &sgs);
  
                if (local_group && balance && !(*balance))
                        return;
  
                sds->total_load += sgs.group_load;
-               sds->total_pwr += group->__cpu_power;
+               sds->total_pwr += group->cpu_power;
+               /*
+                * In case the child domain prefers tasks go to siblings
+                * first, lower the group capacity to one so that we'll try
+                * and move all the excess tasks away.
+                */
+               if (prefer_sibling)
+                       sgs.group_capacity = min(sgs.group_capacity, 1UL);
  
                if (local_group) {
                        sds->this_load = sgs.avg_load;
                update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
  }
  
  /**
@@@ -3801,28 -3986,28 +3986,28 @@@ static inline void fix_small_imbalance(
         * moving them.
         */
  
-       pwr_now += sds->busiest->__cpu_power *
+       pwr_now += sds->busiest->cpu_power *
                        min(sds->busiest_load_per_task, sds->max_load);
-       pwr_now += sds->this->__cpu_power *
+       pwr_now += sds->this->cpu_power *
                        min(sds->this_load_per_task, sds->this_load);
        pwr_now /= SCHED_LOAD_SCALE;
  
        /* Amount of load we'd subtract */
-       tmp = sg_div_cpu_power(sds->busiest,
-                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+               sds->busiest->cpu_power;
        if (sds->max_load > tmp)
-               pwr_move += sds->busiest->__cpu_power *
+               pwr_move += sds->busiest->cpu_power *
                        min(sds->busiest_load_per_task, sds->max_load - tmp);
  
        /* Amount of load we'd add */
-       if (sds->max_load * sds->busiest->__cpu_power <
+       if (sds->max_load * sds->busiest->cpu_power <
                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
-               tmp = sg_div_cpu_power(sds->this,
-                       sds->max_load * sds->busiest->__cpu_power);
+               tmp = (sds->max_load * sds->busiest->cpu_power) /
+                       sds->this->cpu_power;
        else
-               tmp = sg_div_cpu_power(sds->this,
-                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
-       pwr_move += sds->this->__cpu_power *
+               tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+                       sds->this->cpu_power;
+       pwr_move += sds->this->cpu_power *
                        min(sds->this_load_per_task, sds->this_load + tmp);
        pwr_move /= SCHED_LOAD_SCALE;
  
@@@ -3857,8 -4042,8 +4042,8 @@@ static inline void calculate_imbalance(
                        sds->max_load - sds->busiest_load_per_task);
  
        /* How much load to actually move to equalise the imbalance */
-       *imbalance = min(max_pull * sds->busiest->__cpu_power,
-               (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
+       *imbalance = min(max_pull * sds->busiest->cpu_power,
+               (sds->avg_load - sds->this_load) * sds->this->cpu_power)
                        / SCHED_LOAD_SCALE;
  
        /*
        return NULL;
  }
  
+ static struct sched_group *group_of(int cpu)
+ {
+       struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+       if (!sd)
+               return NULL;
+       return sd->groups;
+ }
+ static unsigned long power_of(int cpu)
+ {
+       struct sched_group *group = group_of(cpu);
+       if (!group)
+               return SCHED_LOAD_SCALE;
+       return group->cpu_power;
+ }
  /*
   * find_busiest_queue - find the busiest runqueue among the cpus in group.
   */
@@@ -3988,15 -4193,18 +4193,18 @@@ find_busiest_queue(struct sched_group *
        int i;
  
        for_each_cpu(i, sched_group_cpus(group)) {
+               unsigned long power = power_of(i);
+               unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
                unsigned long wl;
  
                if (!cpumask_test_cpu(i, cpus))
                        continue;
  
                rq = cpu_rq(i);
-               wl = weighted_cpuload(i);
+               wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
+               wl /= power;
  
-               if (rq->nr_running == 1 && wl > imbalance)
+               if (capacity && rq->nr_running == 1 && wl > imbalance)
                        continue;
  
                if (wl > max_load) {
@@@ -5325,7 -5533,7 +5533,7 @@@ need_resched
        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
 -      rcu_qsctr_inc(cpu);
 +      rcu_sched_qs(cpu);
        prev = rq->curr;
        switch_count = &prev->nivcsw;
  
@@@ -5349,10 -5557,7 +5557,7 @@@ need_resched_nonpreemptible
                switch_count = &prev->nvcsw;
        }
  
- #ifdef CONFIG_SMP
-       if (prev->sched_class->pre_schedule)
-               prev->sched_class->pre_schedule(rq, prev);
- #endif
+       pre_schedule(rq, prev);
  
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
        } else
                spin_unlock_irq(&rq->lock);
  
+       post_schedule(rq);
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
  
@@@ -6123,17 -6330,25 +6330,25 @@@ static int __sched_setscheduler(struct 
        unsigned long flags;
        const struct sched_class *prev_class = p->sched_class;
        struct rq *rq;
+       int reset_on_fork;
  
        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
  recheck:
        /* double check policy once rq lock held */
-       if (policy < 0)
+       if (policy < 0) {
+               reset_on_fork = p->sched_reset_on_fork;
                policy = oldpolicy = p->policy;
-       else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-                       policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                       policy != SCHED_IDLE)
-               return -EINVAL;
+       } else {
+               reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+               policy &= ~SCHED_RESET_ON_FORK;
+               if (policy != SCHED_FIFO && policy != SCHED_RR &&
+                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+                               policy != SCHED_IDLE)
+                       return -EINVAL;
+       }
        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are
         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
                /* can't change other user's priorities */
                if (!check_same_owner(p))
                        return -EPERM;
+               /* Normal users shall not reset the sched_reset_on_fork flag */
+               if (p->sched_reset_on_fork && !reset_on_fork)
+                       return -EPERM;
        }
  
        if (user) {
        if (running)
                p->sched_class->put_prev_task(rq, p);
  
+       p->sched_reset_on_fork = reset_on_fork;
        oldprio = p->prio;
        __setscheduler(rq, p, policy, param->sched_priority);
  
@@@ -6336,14 -6557,15 +6557,15 @@@ SYSCALL_DEFINE1(sched_getscheduler, pid
        if (p) {
                retval = security_task_getscheduler(p);
                if (!retval)
-                       retval = p->policy;
+                       retval = p->policy
+                               | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
        }
        read_unlock(&tasklist_lock);
        return retval;
  }
  
  /**
-  * sys_sched_getscheduler - get the RT priority of a thread
+  * sys_sched_getparam - get the RT priority of a thread
   * @pid: the pid in question.
   * @param: structure containing the RT priority.
   */
@@@ -6571,19 -6793,9 +6793,9 @@@ static inline int should_resched(void
  
  static void __cond_resched(void)
  {
- #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-       __might_sleep(__FILE__, __LINE__);
- #endif
-       /*
-        * The BKS might be reacquired before we have dropped
-        * PREEMPT_ACTIVE, which could trigger a second
-        * cond_resched() call.
-        */
-       do {
-               add_preempt_count(PREEMPT_ACTIVE);
-               schedule();
-               sub_preempt_count(PREEMPT_ACTIVE);
-       } while (need_resched());
+       add_preempt_count(PREEMPT_ACTIVE);
+       schedule();
+       sub_preempt_count(PREEMPT_ACTIVE);
  }
  
  int __sched _cond_resched(void)
  EXPORT_SYMBOL(_cond_resched);
  
  /*
-  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
   * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
- int cond_resched_lock(spinlock_t *lock)
+ int __cond_resched_lock(spinlock_t *lock)
  {
        int resched = should_resched();
        int ret = 0;
  
 +      lockdep_assert_held(lock);
 +
        if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
                if (resched)
        }
        return ret;
  }
- EXPORT_SYMBOL(cond_resched_lock);
+ EXPORT_SYMBOL(__cond_resched_lock);
  
- int __sched cond_resched_softirq(void)
+ int __sched __cond_resched_softirq(void)
  {
        BUG_ON(!in_softirq());
  
        }
        return 0;
  }
- EXPORT_SYMBOL(cond_resched_softirq);
+ EXPORT_SYMBOL(__cond_resched_softirq);
  
  /**
   * yield - yield the current processor to other threads.
@@@ -6660,11 -6870,13 +6872,13 @@@ EXPORT_SYMBOL(yield)
   */
  void __sched io_schedule(void)
  {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
  
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       current->in_iowait = 1;
        schedule();
+       current->in_iowait = 0;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
  }
@@@ -6672,12 -6884,14 +6886,14 @@@ EXPORT_SYMBOL(io_schedule)
  
  long __sched io_schedule_timeout(long timeout)
  {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
        long ret;
  
        delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
+       current->in_iowait = 1;
        ret = schedule_timeout(timeout);
+       current->in_iowait = 0;
        atomic_dec(&rq->nr_iowait);
        delayacct_blkio_end();
        return ret;
@@@ -6994,8 -7208,12 +7210,12 @@@ int set_cpus_allowed_ptr(struct task_st
  
        if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
+               struct task_struct *mt = rq->migration_thread;
+               get_task_struct(mt);
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
+               put_task_struct(mt);
                wait_for_completion(&req.done);
                tlb_migrate_finish(p->mm);
                return 0;
@@@ -7053,11 -7271,6 +7273,11 @@@ fail
        return ret;
  }
  
 +#define RCU_MIGRATION_IDLE    0
 +#define RCU_MIGRATION_NEED_QS 1
 +#define RCU_MIGRATION_GOT_QS  2
 +#define RCU_MIGRATION_MUST_SYNC       3
 +
  /*
   * migration_thread - this is a highprio system thread that performs
   * thread migration by bumping thread off CPU then 'pushing' onto
   */
  static int migration_thread(void *data)
  {
 +      int badcpu;
        int cpu = (long)data;
        struct rq *rq;
  
                req = list_entry(head->next, struct migration_req, list);
                list_del_init(head->next);
  
 -              spin_unlock(&rq->lock);
 -              __migrate_task(req->task, cpu, req->dest_cpu);
 +              if (req->task != NULL) {
 +                      spin_unlock(&rq->lock);
 +                      __migrate_task(req->task, cpu, req->dest_cpu);
 +              } else if (likely(cpu == (badcpu = smp_processor_id()))) {
 +                      req->dest_cpu = RCU_MIGRATION_GOT_QS;
 +                      spin_unlock(&rq->lock);
 +              } else {
 +                      req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
 +                      spin_unlock(&rq->lock);
 +                      WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
 +              }
                local_irq_enable();
  
                complete(&req->done);
@@@ -7642,7 -7845,7 +7862,7 @@@ static int __init migration_init(void
        migration_call(&migration_notifier, CPU_ONLINE, cpu);
        register_cpu_notifier(&migration_notifier);
  
-       return err;
+       return 0;
  }
  early_initcall(migration_init);
  #endif
@@@ -7689,7 -7892,7 +7909,7 @@@ static int sched_domain_debug_one(struc
                        break;
                }
  
-               if (!group->__cpu_power) {
+               if (!group->cpu_power) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: domain->cpu_power not "
                                        "set\n");
                cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
  
                printk(KERN_CONT " %s", str);
-               if (group->__cpu_power != SCHED_LOAD_SCALE) {
-                       printk(KERN_CONT " (__cpu_power = %d)",
-                               group->__cpu_power);
+               if (group->cpu_power != SCHED_LOAD_SCALE) {
+                       printk(KERN_CONT " (cpu_power = %d)",
+                               group->cpu_power);
                }
  
                group = group->next;
@@@ -7858,7 -8061,7 +8078,7 @@@ static void rq_attach_root(struct rq *r
        rq->rd = rd;
  
        cpumask_set_cpu(rq->cpu, rd->span);
-       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+       if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
                set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
@@@ -8000,7 -8203,7 +8220,7 @@@ init_sched_build_groups(const struct cp
                        continue;
  
                cpumask_clear(sched_group_cpus(sg));
-               sg->__cpu_power = 0;
+               sg->cpu_power = 0;
  
                for_each_cpu(j, span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@@ -8108,6 -8311,39 +8328,39 @@@ struct static_sched_domain 
        DECLARE_BITMAP(span, CONFIG_NR_CPUS);
  };
  
+ struct s_data {
+ #ifdef CONFIG_NUMA
+       int                     sd_allnodes;
+       cpumask_var_t           domainspan;
+       cpumask_var_t           covered;
+       cpumask_var_t           notcovered;
+ #endif
+       cpumask_var_t           nodemask;
+       cpumask_var_t           this_sibling_map;
+       cpumask_var_t           this_core_map;
+       cpumask_var_t           send_covered;
+       cpumask_var_t           tmpmask;
+       struct sched_group      **sched_group_nodes;
+       struct root_domain      *rd;
+ };
+ enum s_alloc {
+       sa_sched_groups = 0,
+       sa_rootdomain,
+       sa_tmpmask,
+       sa_send_covered,
+       sa_this_core_map,
+       sa_this_sibling_map,
+       sa_nodemask,
+       sa_sched_group_nodes,
+ #ifdef CONFIG_NUMA
+       sa_notcovered,
+       sa_covered,
+       sa_domainspan,
+ #endif
+       sa_none,
+ };
  /*
   * SMT sched-domains:
   */
@@@ -8225,11 -8461,76 +8478,76 @@@ static void init_numa_sched_groups_powe
                                continue;
                        }
  
-                       sg_inc_cpu_power(sg, sd->groups->__cpu_power);
+                       sg->cpu_power += sd->groups->cpu_power;
                }
                sg = sg->next;
        } while (sg != group_head);
  }
+ static int build_numa_sched_groups(struct s_data *d,
+                                  const struct cpumask *cpu_map, int num)
+ {
+       struct sched_domain *sd;
+       struct sched_group *sg, *prev;
+       int n, j;
+       cpumask_clear(d->covered);
+       cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
+       if (cpumask_empty(d->nodemask)) {
+               d->sched_group_nodes[num] = NULL;
+               goto out;
+       }
+       sched_domain_node_span(num, d->domainspan);
+       cpumask_and(d->domainspan, d->domainspan, cpu_map);
+       sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                         GFP_KERNEL, num);
+       if (!sg) {
+               printk(KERN_WARNING "Can not alloc domain group for node %d\n",
+                      num);
+               return -ENOMEM;
+       }
+       d->sched_group_nodes[num] = sg;
+       for_each_cpu(j, d->nodemask) {
+               sd = &per_cpu(node_domains, j).sd;
+               sd->groups = sg;
+       }
+       sg->cpu_power = 0;
+       cpumask_copy(sched_group_cpus(sg), d->nodemask);
+       sg->next = sg;
+       cpumask_or(d->covered, d->covered, d->nodemask);
+       prev = sg;
+       for (j = 0; j < nr_node_ids; j++) {
+               n = (num + j) % nr_node_ids;
+               cpumask_complement(d->notcovered, d->covered);
+               cpumask_and(d->tmpmask, d->notcovered, cpu_map);
+               cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
+               if (cpumask_empty(d->tmpmask))
+                       break;
+               cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
+               if (cpumask_empty(d->tmpmask))
+                       continue;
+               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                 GFP_KERNEL, num);
+               if (!sg) {
+                       printk(KERN_WARNING
+                              "Can not alloc domain group for node %d\n", j);
+                       return -ENOMEM;
+               }
+               sg->cpu_power = 0;
+               cpumask_copy(sched_group_cpus(sg), d->tmpmask);
+               sg->next = prev->next;
+               cpumask_or(d->covered, d->covered, d->tmpmask);
+               prev->next = sg;
+               prev = sg;
+       }
+ out:
+       return 0;
+ }
  #endif /* CONFIG_NUMA */
  
  #ifdef CONFIG_NUMA
@@@ -8283,15 -8584,13 +8601,13 @@@ static void free_sched_groups(const str
   * there are asymmetries in the topology. If there are asymmetries, group
   * having more cpu_power will pickup more load compared to the group having
   * less cpu_power.
-  *
-  * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
-  * the maximum number of tasks a group can handle in the presence of other idle
-  * or lightly loaded groups in the same sched domain.
   */
  static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  {
        struct sched_domain *child;
        struct sched_group *group;
+       long power;
+       int weight;
  
        WARN_ON(!sd || !sd->groups);
  
  
        child = sd->child;
  
-       sd->groups->__cpu_power = 0;
+       sd->groups->cpu_power = 0;
  
-       /*
-        * For perf policy, if the groups in child domain share resources
-        * (for example cores sharing some portions of the cache hierarchy
-        * or SMT), then set this domain groups cpu_power such that each group
-        * can handle only one task, when there are other idle groups in the
-        * same sched domain.
-        */
-       if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
-                      (child->flags &
-                       (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
-               sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
+       if (!child) {
+               power = SCHED_LOAD_SCALE;
+               weight = cpumask_weight(sched_domain_span(sd));
+               /*
+                * SMT siblings share the power of a single core.
+                * Usually multiple threads get a better yield out of
+                * that one core than a single thread would have,
+                * reflect that in sd->smt_gain.
+                */
+               if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+                       power *= sd->smt_gain;
+                       power /= weight;
+                       power >>= SCHED_LOAD_SHIFT;
+               }
+               sd->groups->cpu_power += power;
                return;
        }
  
        /*
-        * add cpu_power of each child group to this groups cpu_power
+        * Add cpu_power of each child group to this groups cpu_power.
         */
        group = child->groups;
        do {
-               sg_inc_cpu_power(sd->groups, group->__cpu_power);
+               sd->groups->cpu_power += group->cpu_power;
                group = group->next;
        } while (group != child->groups);
  }
@@@ -8395,280 -8698,285 +8715,285 @@@ static void set_domain_attribute(struc
        }
  }
  
- /*
-  * Build sched domains for a given set of cpus and attach the sched domains
-  * to the individual cpus
-  */
- static int __build_sched_domains(const struct cpumask *cpu_map,
-                                struct sched_domain_attr *attr)
- {
-       int i, err = -ENOMEM;
-       struct root_domain *rd;
-       cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
-               tmpmask;
+ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+                                const struct cpumask *cpu_map)
+ {
+       switch (what) {
+       case sa_sched_groups:
+               free_sched_groups(cpu_map, d->tmpmask); /* fall through */
+               d->sched_group_nodes = NULL;
+       case sa_rootdomain:
+               free_rootdomain(d->rd); /* fall through */
+       case sa_tmpmask:
+               free_cpumask_var(d->tmpmask); /* fall through */
+       case sa_send_covered:
+               free_cpumask_var(d->send_covered); /* fall through */
+       case sa_this_core_map:
+               free_cpumask_var(d->this_core_map); /* fall through */
+       case sa_this_sibling_map:
+               free_cpumask_var(d->this_sibling_map); /* fall through */
+       case sa_nodemask:
+               free_cpumask_var(d->nodemask); /* fall through */
+       case sa_sched_group_nodes:
  #ifdef CONFIG_NUMA
-       cpumask_var_t domainspan, covered, notcovered;
-       struct sched_group **sched_group_nodes = NULL;
-       int sd_allnodes = 0;
-       if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
-               goto out;
-       if (!alloc_cpumask_var(&covered, GFP_KERNEL))
-               goto free_domainspan;
-       if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
-               goto free_covered;
- #endif
-       if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
-               goto free_notcovered;
-       if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
-               goto free_nodemask;
-       if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
-               goto free_this_sibling_map;
-       if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
-               goto free_this_core_map;
-       if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
-               goto free_send_covered;
+               kfree(d->sched_group_nodes); /* fall through */
+       case sa_notcovered:
+               free_cpumask_var(d->notcovered); /* fall through */
+       case sa_covered:
+               free_cpumask_var(d->covered); /* fall through */
+       case sa_domainspan:
+               free_cpumask_var(d->domainspan); /* fall through */
+ #endif
+       case sa_none:
+               break;
+       }
+ }
  
+ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+                                                  const struct cpumask *cpu_map)
+ {
  #ifdef CONFIG_NUMA
-       /*
-        * Allocate the per-node list of sched groups
-        */
-       sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
-                                   GFP_KERNEL);
-       if (!sched_group_nodes) {
+       if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
+               return sa_none;
+       if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
+               return sa_domainspan;
+       if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
+               return sa_covered;
+       /* Allocate the per-node list of sched groups */
+       d->sched_group_nodes = kcalloc(nr_node_ids,
+                                     sizeof(struct sched_group *), GFP_KERNEL);
+       if (!d->sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
-               goto free_tmpmask;
-       }
- #endif
-       rd = alloc_rootdomain();
-       if (!rd) {
+               return sa_notcovered;
+       }
+       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
+ #endif
+       if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
+               return sa_sched_group_nodes;
+       if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
+               return sa_nodemask;
+       if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
+               return sa_this_sibling_map;
+       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+               return sa_this_core_map;
+       if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
+               return sa_send_covered;
+       d->rd = alloc_rootdomain();
+       if (!d->rd) {
                printk(KERN_WARNING "Cannot alloc root domain\n");
-               goto free_sched_groups;
+               return sa_tmpmask;
        }
+       return sa_rootdomain;
+ }
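
Together, these two helpers replace the old goto ladder: the allocator reports how far it got as an s_alloc state, and the fall-through switch above unwinds exactly that much. Below is a minimal userspace sketch of the same idiom; the resources, field names and enum values are made up for illustration.

    #include <stdlib.h>

    enum s_alloc { sa_none, sa_buf_a, sa_buf_b, sa_all };

    struct s_data {
            char *buf_a;
            char *buf_b;
    };

    /* Free everything implied by 'what'; each case falls through so a
     * deeper state also releases every shallower one. */
    static void free_allocs(struct s_data *d, enum s_alloc what)
    {
            switch (what) {
            case sa_all:                    /* fall through */
            case sa_buf_b:
                    free(d->buf_b);         /* fall through */
            case sa_buf_a:
                    free(d->buf_a);         /* fall through */
            case sa_none:
                    break;
            }
    }

    /* Allocate in order.  On failure, return the state naming what is
     * already live, so the caller can pass it straight to free_allocs(). */
    static enum s_alloc visit_allocs(struct s_data *d)
    {
            d->buf_a = malloc(64);
            if (!d->buf_a)
                    return sa_none;
            d->buf_b = malloc(64);
            if (!d->buf_b)
                    return sa_buf_a;
            return sa_all;
    }

    int main(void)
    {
            struct s_data d = { NULL, NULL };
            enum s_alloc state = visit_allocs(&d);

            if (state != sa_all) {
                    free_allocs(&d, state);         /* partial unwind */
                    return 1;
            }
            /* ... use d ... */
            free_allocs(&d, sa_all);                /* full teardown */
            return 0;
    }

__build_sched_domains() below follows the same contract: on failure it hands the returned state to __free_domain_allocs(), and on success it first bumps the state to sa_sched_groups so that a later error also releases the groups.
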
  
+ static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+ {
+       struct sched_domain *sd = NULL;
  #ifdef CONFIG_NUMA
-       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
- #endif
-       /*
-        * Set up domains for cpus specified by the cpu_map.
-        */
-       for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd = NULL, *p;
+       struct sched_domain *parent;
  
-               cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
- #ifdef CONFIG_NUMA
-               if (cpumask_weight(cpu_map) >
-                               SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
-                       sd = &per_cpu(allnodes_domains, i).sd;
-                       SD_INIT(sd, ALLNODES);
-                       set_domain_attribute(sd, attr);
-                       cpumask_copy(sched_domain_span(sd), cpu_map);
-                       cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
-                       p = sd;
-                       sd_allnodes = 1;
-               } else
-                       p = NULL;
-               sd = &per_cpu(node_domains, i).sd;
-               SD_INIT(sd, NODE);
+       d->sd_allnodes = 0;
+       if (cpumask_weight(cpu_map) >
+           SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
+               sd = &per_cpu(allnodes_domains, i).sd;
+               SD_INIT(sd, ALLNODES);
                set_domain_attribute(sd, attr);
-               sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
-               sd->parent = p;
-               if (p)
-                       p->child = sd;
-               cpumask_and(sched_domain_span(sd),
-                           sched_domain_span(sd), cpu_map);
+               cpumask_copy(sched_domain_span(sd), cpu_map);
+               cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
+               d->sd_allnodes = 1;
+       }
+       parent = sd;
+       sd = &per_cpu(node_domains, i).sd;
+       SD_INIT(sd, NODE);
+       set_domain_attribute(sd, attr);
+       sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
+       sd->parent = parent;
+       if (parent)
+               parent->child = sd;
+       cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
  #endif
+       return sd;
+ }
  
-               p = sd;
-               sd = &per_cpu(phys_domains, i).sd;
-               SD_INIT(sd, CPU);
-               set_domain_attribute(sd, attr);
-               cpumask_copy(sched_domain_span(sd), nodemask);
-               sd->parent = p;
-               if (p)
-                       p->child = sd;
-               cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
+ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)
+ {
+       struct sched_domain *sd;
+       sd = &per_cpu(phys_domains, i).sd;
+       SD_INIT(sd, CPU);
+       set_domain_attribute(sd, attr);
+       cpumask_copy(sched_domain_span(sd), d->nodemask);
+       sd->parent = parent;
+       if (parent)
+               parent->child = sd;
+       cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
+       return sd;
+ }
  
+ static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)
+ {
+       struct sched_domain *sd = parent;
  #ifdef CONFIG_SCHED_MC
-               p = sd;
-               sd = &per_cpu(core_domains, i).sd;
-               SD_INIT(sd, MC);
-               set_domain_attribute(sd, attr);
-               cpumask_and(sched_domain_span(sd), cpu_map,
-                                                  cpu_coregroup_mask(i));
-               sd->parent = p;
-               p->child = sd;
-               cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
+       sd = &per_cpu(core_domains, i).sd;
+       SD_INIT(sd, MC);
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
+       sd->parent = parent;
+       parent->child = sd;
+       cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
  #endif
+       return sd;
+ }
  
+ static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)
+ {
+       struct sched_domain *sd = parent;
  #ifdef CONFIG_SCHED_SMT
-               p = sd;
-               sd = &per_cpu(cpu_domains, i).sd;
-               SD_INIT(sd, SIBLING);
-               set_domain_attribute(sd, attr);
-               cpumask_and(sched_domain_span(sd),
-                           topology_thread_cpumask(i), cpu_map);
-               sd->parent = p;
-               p->child = sd;
-               cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
+       sd = &per_cpu(cpu_domains, i).sd;
+       SD_INIT(sd, SIBLING);
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
+       sd->parent = parent;
+       parent->child = sd;
+       cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
  #endif
-       }
+       return sd;
+ }
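
Each of the four per-level builders hangs its domain beneath the one produced by the previous call, so after the per-CPU loop in __build_sched_domains() every CPU carries a parent/child chain from the widest level down to the innermost one. A small standalone sketch of that chaining; the level names follow the kernel's, the rest is illustrative.

    #include <stdio.h>

    struct dom {
            const char *name;
            struct dom *parent, *child;
    };

    /* Link 'sd' below 'parent', the way each __build_*_sched_domain()
     * helper chains the new level under the one built before it. */
    static struct dom *link_level(struct dom *sd, struct dom *parent)
    {
            sd->parent = parent;
            if (parent)
                    parent->child = sd;
            return sd;
    }

    int main(void)
    {
            struct dom node = { "NODE" }, phys = { "CPU" };
            struct dom mc = { "MC" }, smt = { "SIBLING" };
            struct dom *sd = NULL;

            sd = link_level(&node, sd);     /* widest: NUMA node */
            sd = link_level(&phys, sd);     /* physical package */
            sd = link_level(&mc, sd);       /* multi-core */
            sd = link_level(&smt, sd);      /* innermost: SMT siblings */

            /* walk from the level attached to the CPU up to the widest */
            for (; sd; sd = sd->parent)
                    printf("%s\n", sd->name);
            return 0;
    }
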
  
+ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
+                              const struct cpumask *cpu_map, int cpu)
+ {
+       switch (l) {
  #ifdef CONFIG_SCHED_SMT
-       /* Set up CPU (sibling) groups */
-       for_each_cpu(i, cpu_map) {
-               cpumask_and(this_sibling_map,
-                           topology_thread_cpumask(i), cpu_map);
-               if (i != cpumask_first(this_sibling_map))
-                       continue;
-               init_sched_build_groups(this_sibling_map, cpu_map,
-                                       &cpu_to_cpu_group,
-                                       send_covered, tmpmask);
-       }
+       case SD_LV_SIBLING: /* set up CPU (sibling) groups */
+               cpumask_and(d->this_sibling_map, cpu_map,
+                           topology_thread_cpumask(cpu));
+               if (cpu == cpumask_first(d->this_sibling_map))
+                       init_sched_build_groups(d->this_sibling_map, cpu_map,
+                                               &cpu_to_cpu_group,
+                                               d->send_covered, d->tmpmask);
+               break;
  #endif
  #ifdef CONFIG_SCHED_MC
-       /* Set up multi-core groups */
-       for_each_cpu(i, cpu_map) {
-               cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
-               if (i != cpumask_first(this_core_map))
-                       continue;
-               init_sched_build_groups(this_core_map, cpu_map,
-                                       &cpu_to_core_group,
-                                       send_covered, tmpmask);
-       }
+       case SD_LV_MC: /* set up multi-core groups */
+               cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
+               if (cpu == cpumask_first(d->this_core_map))
+                       init_sched_build_groups(d->this_core_map, cpu_map,
+                                               &cpu_to_core_group,
+                                               d->send_covered, d->tmpmask);
+               break;
  #endif
-       /* Set up physical groups */
-       for (i = 0; i < nr_node_ids; i++) {
-               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-               if (cpumask_empty(nodemask))
-                       continue;
-               init_sched_build_groups(nodemask, cpu_map,
-                                       &cpu_to_phys_group,
-                                       send_covered, tmpmask);
-       }
+       case SD_LV_CPU: /* set up physical groups */
+               cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
+               if (!cpumask_empty(d->nodemask))
+                       init_sched_build_groups(d->nodemask, cpu_map,
+                                               &cpu_to_phys_group,
+                                               d->send_covered, d->tmpmask);
+               break;
  #ifdef CONFIG_NUMA
-       /* Set up node groups */
-       if (sd_allnodes) {
-               init_sched_build_groups(cpu_map, cpu_map,
-                                       &cpu_to_allnodes_group,
-                                       send_covered, tmpmask);
+       case SD_LV_ALLNODES:
+               init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
+                                       d->send_covered, d->tmpmask);
+               break;
+ #endif
+       default:
+               break;
        }
+ }
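
Note the cpumask_first() guard in each case: every CPU recomputes its group's mask, but only the lowest-numbered CPU in that mask actually calls init_sched_build_groups(), so each group is built exactly once. A tiny standalone sketch of that deduplication; the sibling-pair grouping is invented for illustration.

    #include <stdio.h>

    int main(void)
    {
            int cpu;

            for (cpu = 0; cpu < 6; cpu++) {
                    /* pretend CPUs {0,1}, {2,3}, {4,5} are sibling groups */
                    int first = (cpu / 2) * 2;

                    if (cpu != first)
                            continue;       /* only the first sibling builds the group */
                    printf("cpu %d builds group {%d,%d}\n", cpu, first, first + 1);
            }
            return 0;
    }
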
  
-       for (i = 0; i < nr_node_ids; i++) {
-               /* Set up node groups */
-               struct sched_group *sg, *prev;
-               int j;
-               cpumask_clear(covered);
-               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-               if (cpumask_empty(nodemask)) {
-                       sched_group_nodes[i] = NULL;
-                       continue;
-               }
+ /*
+  * Build sched domains for a given set of cpus and attach the sched domains
+  * to the individual cpus
+  */
+ static int __build_sched_domains(const struct cpumask *cpu_map,
+                                struct sched_domain_attr *attr)
+ {
+       enum s_alloc alloc_state = sa_none;
+       struct s_data d;
+       struct sched_domain *sd;
+       int i;
+ #ifdef CONFIG_NUMA
+       d.sd_allnodes = 0;
+ #endif
  
-               sched_domain_node_span(i, domainspan);
-               cpumask_and(domainspan, domainspan, cpu_map);
+       alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+       if (alloc_state != sa_rootdomain)
+               goto error;
+       alloc_state = sa_sched_groups;
  
-               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
-                                 GFP_KERNEL, i);
-               if (!sg) {
-                       printk(KERN_WARNING "Can not alloc domain group for "
-                               "node %d\n", i);
-                       goto error;
-               }
-               sched_group_nodes[i] = sg;
-               for_each_cpu(j, nodemask) {
-                       struct sched_domain *sd;
+       /*
+        * Set up domains for cpus specified by the cpu_map.
+        */
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
+                           cpu_map);
  
-                       sd = &per_cpu(node_domains, j).sd;
-                       sd->groups = sg;
-               }
-               sg->__cpu_power = 0;
-               cpumask_copy(sched_group_cpus(sg), nodemask);
-               sg->next = sg;
-               cpumask_or(covered, covered, nodemask);
-               prev = sg;
+               sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
+               sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+               sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
+               sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
+       }
  
-               for (j = 0; j < nr_node_ids; j++) {
-                       int n = (i + j) % nr_node_ids;
+       for_each_cpu(i, cpu_map) {
+               build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+               build_sched_groups(&d, SD_LV_MC, cpu_map, i);
+       }
  
-                       cpumask_complement(notcovered, covered);
-                       cpumask_and(tmpmask, notcovered, cpu_map);
-                       cpumask_and(tmpmask, tmpmask, domainspan);
-                       if (cpumask_empty(tmpmask))
-                               break;
+       /* Set up physical groups */
+       for (i = 0; i < nr_node_ids; i++)
+               build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
  
-                       cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
-                       if (cpumask_empty(tmpmask))
-                               continue;
+ #ifdef CONFIG_NUMA
+       /* Set up node groups */
+       if (d.sd_allnodes)
+               build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
  
-                       sg = kmalloc_node(sizeof(struct sched_group) +
-                                         cpumask_size(),
-                                         GFP_KERNEL, i);
-                       if (!sg) {
-                               printk(KERN_WARNING
-                               "Can not alloc domain group for node %d\n", j);
-                               goto error;
-                       }
-                       sg->__cpu_power = 0;
-                       cpumask_copy(sched_group_cpus(sg), tmpmask);
-                       sg->next = prev->next;
-                       cpumask_or(covered, covered, tmpmask);
-                       prev->next = sg;
-                       prev = sg;
-               }
-       }
+       for (i = 0; i < nr_node_ids; i++)
+               if (build_numa_sched_groups(&d, cpu_map, i))
+                       goto error;
  #endif
  
        /* Calculate CPU power for physical packages and nodes */
  #ifdef CONFIG_SCHED_SMT
        for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
+               sd = &per_cpu(cpu_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
  #endif
  #ifdef CONFIG_SCHED_MC
        for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd = &per_cpu(core_domains, i).sd;
+               sd = &per_cpu(core_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
  #endif
  
        for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
+               sd = &per_cpu(phys_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
  
  #ifdef CONFIG_NUMA
        for (i = 0; i < nr_node_ids; i++)
-               init_numa_sched_groups_power(sched_group_nodes[i]);
+               init_numa_sched_groups_power(d.sched_group_nodes[i]);
  
-       if (sd_allnodes) {
+       if (d.sd_allnodes) {
                struct sched_group *sg;
  
                cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
-                                                               tmpmask);
+                                                               d.tmpmask);
                init_numa_sched_groups_power(sg);
        }
  #endif
  
        /* Attach the domains */
        for_each_cpu(i, cpu_map) {
-               struct sched_domain *sd;
  #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i).sd;
  #elif defined(CONFIG_SCHED_MC)
                sd = &per_cpu(core_domains, i).sd;
  #else
                sd = &per_cpu(phys_domains, i).sd;
  #endif
-               cpu_attach_domain(sd, rd, i);
+               cpu_attach_domain(sd, d.rd, i);
        }
  
-       err = 0;
- free_tmpmask:
-       free_cpumask_var(tmpmask);
- free_send_covered:
-       free_cpumask_var(send_covered);
- free_this_core_map:
-       free_cpumask_var(this_core_map);
- free_this_sibling_map:
-       free_cpumask_var(this_sibling_map);
- free_nodemask:
-       free_cpumask_var(nodemask);
- free_notcovered:
- #ifdef CONFIG_NUMA
-       free_cpumask_var(notcovered);
- free_covered:
-       free_cpumask_var(covered);
- free_domainspan:
-       free_cpumask_var(domainspan);
- out:
- #endif
-       return err;
- free_sched_groups:
- #ifdef CONFIG_NUMA
-       kfree(sched_group_nodes);
- #endif
-       goto free_tmpmask;
+       d.sched_group_nodes = NULL; /* don't free this, we still need it */
+       __free_domain_allocs(&d, sa_tmpmask, cpu_map);
+       return 0;
  
- #ifdef CONFIG_NUMA
  error:
-       free_sched_groups(cpu_map, tmpmask);
-       free_rootdomain(rd);
-       goto free_tmpmask;
- #endif
+       __free_domain_allocs(&d, alloc_state, cpu_map);
+       return -ENOMEM;
  }
  
  static int build_sched_domains(const struct cpumask *cpu_map)
@@@ -9321,11 -9601,11 +9618,11 @@@ void __init sched_init(void
                 * system cpu resource, based on the weight assigned to root
                 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
                 * by letting tasks of init_task_group sit in a separate cfs_rq
-                * (init_cfs_rq) and having one entity represent this group of
+                * (init_tg_cfs_rq) and having one entity represent this group of
                 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
                 */
                init_tg_cfs_entry(&init_task_group,
-                               &per_cpu(init_cfs_rq, i),
+                               &per_cpu(init_tg_cfs_rq, i),
                                &per_cpu(init_sched_entity, i), i, 1,
                                root_task_group.se[i]);
  
  #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
+               rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
  }
  
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
- void __might_sleep(char *file, int line)
+ static inline int preempt_count_equals(int preempt_offset)
+ {
+       int nested = preempt_count() & ~PREEMPT_ACTIVE;
+       return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+ }
+ void __might_sleep(char *file, int line, int preempt_offset)
  {
  #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
  
-       if ((!in_atomic() && !irqs_disabled()) ||
-                   system_state != SYSTEM_RUNNING || oops_in_progress)
+       if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+           system_state != SYSTEM_RUNNING || oops_in_progress)
                return;
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
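
The new preempt_offset argument lets a call site declare how much preempt count it legitimately holds (for example one spinlock), and preempt_count_equals() strips PREEMPT_ACTIVE first because a task that is merely being preempted is not in atomic context. A standalone sketch of that comparison; the DEMO_ constants are stand-ins, not the kernel's real bit layout.

    #include <stdio.h>

    #define DEMO_PREEMPT_ACTIVE     0x10000000      /* illustrative value */
    #define DEMO_INATOMIC_BASE      0               /* illustrative value */

    /* Mirror of the check above: mask off the "being preempted" marker,
     * then require the remaining nesting to match the allowed offset. */
    static int count_equals(int count, int preempt_offset)
    {
            int nested = count & ~DEMO_PREEMPT_ACTIVE;

            return nested == DEMO_INATOMIC_BASE + preempt_offset;
    }

    int main(void)
    {
            printf("%d\n", count_equals(0, 0));                     /* 1: no locks, none allowed */
            printf("%d\n", count_equals(DEMO_PREEMPT_ACTIVE, 0));   /* 1: preemption marker ignored */
            printf("%d\n", count_equals(1, 1));                     /* 1: one lock held, one allowed */
            printf("%d\n", count_equals(1, 0));                     /* 0: would trigger the warning */
            return 0;
    }
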
@@@ -10598,113 -10886,3 +10903,113 @@@ struct cgroup_subsys cpuacct_subsys = 
        .subsys_id = cpuacct_subsys_id,
  };
  #endif        /* CONFIG_CGROUP_CPUACCT */
 +
 +#ifndef CONFIG_SMP
 +
 +int rcu_expedited_torture_stats(char *page)
 +{
 +      return 0;
 +}
 +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
 +
 +void synchronize_sched_expedited(void)
 +{
 +}
 +EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 +
 +#else /* #ifndef CONFIG_SMP */
 +
 +static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
 +static DEFINE_MUTEX(rcu_sched_expedited_mutex);
 +
 +#define RCU_EXPEDITED_STATE_POST -2
 +#define RCU_EXPEDITED_STATE_IDLE -1
 +
 +static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
 +
 +int rcu_expedited_torture_stats(char *page)
 +{
 +      int cnt = 0;
 +      int cpu;
 +
 +      cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
 +      for_each_online_cpu(cpu) {
 +               cnt += sprintf(&page[cnt], " %d:%d",
 +                              cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
 +      }
 +      cnt += sprintf(&page[cnt], "\n");
 +      return cnt;
 +}
 +EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
 +
 +static long synchronize_sched_expedited_count;
 +
 +/*
 + * Wait for an rcu-sched grace period to elapse, but use "big hammer"
 + * approach to force grace period to end quickly.  This consumes
 + * significant time on all CPUs, and is thus not recommended for
 + * any sort of common-case code.
 + *
 + * Note that it is illegal to call this function while holding any
 + * lock that is acquired by a CPU-hotplug notifier.  Failing to
 + * observe this restriction will result in deadlock.
 + */
 +void synchronize_sched_expedited(void)
 +{
 +      int cpu;
 +      unsigned long flags;
 +      bool need_full_sync = 0;
 +      struct rq *rq;
 +      struct migration_req *req;
 +      long snap;
 +      int trycount = 0;
 +
 +      smp_mb();  /* ensure prior mod happens before capturing snap. */
 +      snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
 +      get_online_cpus();
 +      while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
 +              put_online_cpus();
 +              if (trycount++ < 10)
 +                      udelay(trycount * num_online_cpus());
 +              else {
 +                      synchronize_sched();
 +                      return;
 +              }
 +              if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
 +                      smp_mb(); /* ensure test happens before caller kfree */
 +                      return;
 +              }
 +              get_online_cpus();
 +      }
 +      rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
 +      for_each_online_cpu(cpu) {
 +              rq = cpu_rq(cpu);
 +              req = &per_cpu(rcu_migration_req, cpu);
 +              init_completion(&req->done);
 +              req->task = NULL;
 +              req->dest_cpu = RCU_MIGRATION_NEED_QS;
 +              spin_lock_irqsave(&rq->lock, flags);
 +              list_add(&req->list, &rq->migration_queue);
 +              spin_unlock_irqrestore(&rq->lock, flags);
 +              wake_up_process(rq->migration_thread);
 +      }
 +      for_each_online_cpu(cpu) {
 +              rcu_expedited_state = cpu;
 +              req = &per_cpu(rcu_migration_req, cpu);
 +              rq = cpu_rq(cpu);
 +              wait_for_completion(&req->done);
 +              spin_lock_irqsave(&rq->lock, flags);
 +              if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
 +                      need_full_sync = 1;
 +              req->dest_cpu = RCU_MIGRATION_IDLE;
 +              spin_unlock_irqrestore(&rq->lock, flags);
 +      }
 +      rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
 +      mutex_unlock(&rcu_sched_expedited_mutex);
 +      put_online_cpus();
 +      if (need_full_sync)
 +              synchronize_sched();
 +}
 +EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 +
 +#endif /* #else #ifndef CONFIG_SMP */
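
A hedged sketch of how an update path might pair synchronize_sched_expedited() with an RCU-sched-protected list: readers stay inside rcu_read_lock_sched(), the writer unlinks under a spinlock, waits, then frees. Only synchronize_sched_expedited() itself comes from the patch above; the list, lock and item names are hypothetical module code.

    #include <linux/list.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct item {
            struct list_head node;
            int key;
    };

    static LIST_HEAD(items);
    static DEFINE_SPINLOCK(items_lock);

    /* Reader: an rcu-sched read-side critical section, which is what
     * synchronize_sched_expedited() waits for. */
    static bool item_exists(int key)
    {
            struct item *it;
            bool found = false;

            rcu_read_lock_sched();
            list_for_each_entry_rcu(it, &items, node) {
                    if (it->key == key) {
                            found = true;
                            break;
                    }
            }
            rcu_read_unlock_sched();
            return found;
    }

    /* Writer: unlink, wait for readers, then free.  The expedited variant
     * burns CPU on every online CPU to shorten the wait, so it only suits
     * rare, latency-critical updates -- and, per the comment above, must
     * never be called while holding a lock taken by a CPU-hotplug notifier. */
    static void item_remove(struct item *it)
    {
            spin_lock(&items_lock);
            list_del_rcu(&it->node);
            spin_unlock(&items_lock);

            synchronize_sched_expedited();
            kfree(it);
    }
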
diff --combined kernel/sysctl.c
index 71d8dc7f99208629b63eb73800a69ea85e119c43,25d6bf3383bea9f8d2f95caca8a34d5bb2df7808..3125cff1c570d8f0797b79ddce8c0b94f6f8df70
@@@ -49,6 -49,7 +49,6 @@@
  #include <linux/acpi.h>
  #include <linux/reboot.h>
  #include <linux/ftrace.h>
 -#include <linux/security.h>
  #include <linux/slow-work.h>
  #include <linux/perf_counter.h>
  
@@@ -245,6 -246,14 +245,14 @@@ static int max_wakeup_granularity_ns = 
  #endif
  
  static struct ctl_table kern_table[] = {
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "sched_child_runs_first",
+               .data           = &sysctl_sched_child_runs_first,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
  #ifdef CONFIG_SCHED_DEBUG
        {
                .ctl_name       = CTL_UNNUMBERED,
                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
-       {
-               .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "sched_child_runs_first",
-               .data           = &sysctl_sched_child_runs_first,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_features",
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "sched_time_avg",
+               .data           = &sysctl_sched_time_avg,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "timer_migration",
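
Both new entries are plain integers registered as kernel.* sysctls, so they appear under /proc/sys/kernel/. A small userspace sketch that reads them; the paths follow directly from the procnames above.

    #include <stdio.h>

    static long read_sysctl(const char *path)
    {
            FILE *f = fopen(path, "r");
            long val = -1;

            if (!f)
                    return -1;
            if (fscanf(f, "%ld", &val) != 1)
                    val = -1;
            fclose(f);
            return val;
    }

    int main(void)
    {
            printf("sched_child_runs_first = %ld\n",
                   read_sysctl("/proc/sys/kernel/sched_child_runs_first"));
            printf("sched_time_avg = %ld\n",
                   read_sysctl("/proc/sys/kernel/sched_time_avg"));
            return 0;
    }

The same values can be set with the sysctl tool, e.g. sysctl -w kernel.sched_child_runs_first=0, subject to the 0644 mode declared in the table.
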
diff --combined kernel/workqueue.c
index 3c44b56b0da71d3c18560d010a6c7b114c8c8798,ea1b4e7674d564c0a1dd48f4a711f896e418925a..addfe2df93b13d8982235054d986c1d059fc0894
@@@ -317,8 -317,6 +317,6 @@@ static int worker_thread(void *__cwq
        if (cwq->wq->freezeable)
                set_freezable();
  
-       set_user_nice(current, -5);
        for (;;) {
                prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
                if (!freezing(current) &&
@@@ -600,12 -598,7 +598,12 @@@ static struct workqueue_struct *keventd
   * schedule_work - put work task in global workqueue
   * @work: job to be done
   *
 - * This puts a job in the kernel-global workqueue.
 + * Returns zero if @work was already on the kernel-global workqueue and
 + * non-zero otherwise.
 + *
 + * This puts a job in the kernel-global workqueue if it was not already
 + * queued and leaves it in the same position on the kernel-global
 + * workqueue otherwise.
   */
  int schedule_work(struct work_struct *work)
  {