git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge branch 'locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 31 Mar 2009 00:17:35 +0000 (17:17 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 31 Mar 2009 00:17:35 +0000 (17:17 -0700)
* 'locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (33 commits)
  lockdep: fix deadlock in lockdep_trace_alloc
  lockdep: annotate reclaim context (__GFP_NOFS), fix SLOB
  lockdep: annotate reclaim context (__GFP_NOFS), fix
  lockdep: build fix for !PROVE_LOCKING
  lockstat: warn about disabled lock debugging
  lockdep: use stringify.h
  lockdep: simplify check_prev_add_irq()
  lockdep: get_user_chars() redo
  lockdep: simplify get_user_chars()
  lockdep: add comments to mark_lock_irq()
  lockdep: remove macro usage from mark_held_locks()
  lockdep: fully reduce mark_lock_irq()
  lockdep: merge the !_READ mark_lock_irq() helpers
  lockdep: merge the _READ mark_lock_irq() helpers
  lockdep: simplify mark_lock_irq() helpers #3
  lockdep: further simplify mark_lock_irq() helpers
  lockdep: simplify the mark_lock_irq() helpers
  lockdep: split up mark_lock_irq()
  lockdep: generate usage strings
  lockdep: generate the state bit definitions
  ...

include/linux/sched.h
include/linux/timer.h
kernel/sched.c
kernel/sched_features.h
kernel/timer.c
mm/page_alloc.c
mm/slab.c
mm/slob.c
mm/slub.c
mm/vmscan.c

diff --combined include/linux/sched.h
index 1d19c025f9d2e78b34692b3f350d9bef6fe522ed,b00a77f4999ed64d2fe668532819e2a29dfb6fd6..29df6374d2de517e078c7dd29dd5eb6b7c18f82b
@@@ -331,7 -331,9 +331,9 @@@ extern signed long schedule_timeout(sig
  extern signed long schedule_timeout_interruptible(signed long timeout);
  extern signed long schedule_timeout_killable(signed long timeout);
  extern signed long schedule_timeout_uninterruptible(signed long timeout);
+ asmlinkage void __schedule(void);
  asmlinkage void schedule(void);
+ extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
  
  struct nsproxy;
  struct user_namespace;
@@@ -443,6 -445,7 +445,6 @@@ struct pacct_struct 
   * @utime:            time spent in user mode, in &cputime_t units
   * @stime:            time spent in kernel mode, in &cputime_t units
   * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
 - * @lock:             lock for fields in this struct
   *
   * This structure groups together three kinds of CPU time that are
   * tracked for threads and thread groups.  Most things considering
@@@ -453,33 -456,23 +455,33 @@@ struct task_cputime 
        cputime_t utime;
        cputime_t stime;
        unsigned long long sum_exec_runtime;
 -      spinlock_t lock;
  };
  /* Alternate field names when used to cache expirations. */
  #define prof_exp      stime
  #define virt_exp      utime
  #define sched_exp     sum_exec_runtime
  
 +#define INIT_CPUTIME  \
 +      (struct task_cputime) {                                 \
 +              .utime = cputime_zero,                          \
 +              .stime = cputime_zero,                          \
 +              .sum_exec_runtime = 0,                          \
 +      }
 +
  /**
 - * struct thread_group_cputime - thread group interval timer counts
 - * @totals:           thread group interval timers; substructure for
 - *                    uniprocessor kernel, per-cpu for SMP kernel.
 + * struct thread_group_cputimer - thread group interval timer counts
 + * @cputime:          thread group interval timers.
 + * @running:          non-zero when there are timers running and
 + *                    @cputime receives updates.
 + * @lock:             lock for fields in this struct.
   *
   * This structure contains the version of task_cputime, above, that is
 - * used for thread group CPU clock calculations.
 + * used for thread group CPU timer calculations.
   */
 -struct thread_group_cputime {
 -      struct task_cputime totals;
 +struct thread_group_cputimer {
 +      struct task_cputime cputime;
 +      int running;
 +      spinlock_t lock;
  };
  
  /*
@@@ -528,10 -521,10 +530,10 @@@ struct signal_struct 
        cputime_t it_prof_incr, it_virt_incr;
  
        /*
 -       * Thread group totals for process CPU clocks.
 -       * See thread_group_cputime(), et al, for details.
 +       * Thread group totals for process CPU timers.
 +       * See thread_group_cputimer(), et al, for details.
         */
 -      struct thread_group_cputime cputime;
 +      struct thread_group_cputimer cputimer;
  
        /* Earliest-expiration cache. */
        struct task_cputime cputime_expires;
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
 -      cputime_t cutime, cstime;
 +      cputime_t utime, stime, cutime, cstime;
        cputime_t gtime;
        cputime_t cgtime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long inblock, oublock, cinblock, coublock;
        struct task_io_accounting ioac;
  
 +      /*
 +       * Cumulative ns of scheduled CPU time of dead threads in the
 +       * group, not including a zombie group leader. (This only differs
 +       * from jiffies_to_ns(utime + stime) if sched_clock uses something
 +       * other than jiffies.)
 +       */
 +      unsigned long long sum_sched_runtime;
 +
        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
@@@ -998,7 -983,6 +1000,7 @@@ struct sched_class 
                              struct rq *busiest, struct sched_domain *sd,
                              enum cpu_idle_type idle);
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 +      int (*needs_post_schedule) (struct rq *this_rq);
        void (*post_schedule) (struct rq *this_rq);
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
@@@ -1053,10 -1037,6 +1055,10 @@@ struct sched_entity 
        u64                     last_wakeup;
        u64                     avg_overlap;
  
 +      u64                     start_runtime;
 +      u64                     avg_wakeup;
 +      u64                     nr_migrations;
 +
  #ifdef CONFIG_SCHEDSTATS
        u64                     wait_start;
        u64                     wait_max;
        u64                     exec_max;
        u64                     slice_max;
  
 -      u64                     nr_migrations;
        u64                     nr_migrations_cold;
        u64                     nr_failed_migrations_affine;
        u64                     nr_failed_migrations_running;
@@@ -1168,7 -1149,6 +1170,7 @@@ struct task_struct 
  #endif
  
        struct list_head tasks;
 +      struct plist_node pushable_tasks;
  
        struct mm_struct *mm, *active_mm;
  
        /* ??? */
        unsigned int personality;
        unsigned did_exec:1;
 +      unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
 +                               * execve */
        pid_t pid;
        pid_t tgid;
  
 -#ifdef CONFIG_CC_STACKPROTECTOR
        /* Canary value for the -fstack-protector gcc feature */
        unsigned long stack_canary;
 -#endif
 +
        /* 
         * pointers to (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with 
        int lockdep_depth;
        unsigned int lockdep_recursion;
        struct held_lock held_locks[MAX_LOCK_DEPTH];
+       gfp_t lockdep_reclaim_gfp;
  #endif
  
  /* journalling filesystem info */
  #endif
  };
  
 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
 +#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
 +
  /*
   * Priority of a process goes from 0..MAX_PRIO-1, valid RT
   * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@@ -1679,16 -1656,6 +1682,16 @@@ static inline int set_cpus_allowed(stru
        return set_cpus_allowed_ptr(p, &new_mask);
  }
  
 +/*
 + * Architectures can set this to 1 if they have specified
 + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
 + * but then during bootup it turns out that sched_clock()
 + * is reliable after all:
 + */
 +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 +extern int sched_clock_stable;
 +#endif
 +
  extern unsigned long long sched_clock(void);
  
  extern void sched_clock_init(void);
@@@ -2106,19 -2073,6 +2109,19 @@@ static inline int object_is_on_stack(vo
  
  extern void thread_info_cache_init(void);
  
 +#ifdef CONFIG_DEBUG_STACK_USAGE
 +static inline unsigned long stack_not_used(struct task_struct *p)
 +{
 +      unsigned long *n = end_of_stack(p);
 +
 +      do {    /* Skip over canary */
 +              n++;
 +      } while (!*n);
 +
 +      return (unsigned long)n - (unsigned long)end_of_stack(p);
 +}
 +#endif
 +
  /* set thread flags in other task's structures
   * - see asm/thread_info.h for TIF_xxxx flags available
   */
@@@ -2232,14 -2186,27 +2235,14 @@@ static inline int spin_needbreak(spinlo
  /*
   * Thread group CPU time accounting.
   */
 -
 -static inline
 -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 -{
 -      struct task_cputime *totals = &tsk->signal->cputime.totals;
 -      unsigned long flags;
 -
 -      spin_lock_irqsave(&totals->lock, flags);
 -      *times = *totals;
 -      spin_unlock_irqrestore(&totals->lock, flags);
 -}
 +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
  
  static inline void thread_group_cputime_init(struct signal_struct *sig)
  {
 -      sig->cputime.totals = (struct task_cputime){
 -              .utime = cputime_zero,
 -              .stime = cputime_zero,
 -              .sum_exec_runtime = 0,
 -      };
 -
 -      spin_lock_init(&sig->cputime.totals.lock);
 +      sig->cputimer.cputime = INIT_CPUTIME;
 +      spin_lock_init(&sig->cputimer.lock);
 +      sig->cputimer.running = 0;
  }
  
  static inline void thread_group_cputime_free(struct signal_struct *sig)
@@@ -2323,13 -2290,9 +2326,13 @@@ extern long sched_group_rt_runtime(stru
  extern int sched_group_set_rt_period(struct task_group *tg,
                                      long rt_period_us);
  extern long sched_group_rt_period(struct task_group *tg);
 +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
  #endif
  #endif
  
 +extern int task_can_switch_user(struct user_struct *up,
 +                                      struct task_struct *tsk);
 +
  #ifdef CONFIG_TASK_XACCT
  static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
  {
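
The sched.h hunks above introduce the sched_clock_stable flag, together with the comment explaining that an architecture may select CONFIG_HAVE_UNSTABLE_SCHED_CLOCK and then discover at boot that sched_clock() is reliable after all. A minimal sketch of how architecture setup code might flip the flag once it has verified its clocksource; the init hook and the reliability check are hypothetical names, not part of this commit:

#include <linux/init.h>
#include <linux/sched.h>

extern int my_arch_clock_is_reliable(void);	/* hypothetical check */

/* Hypothetical arch init hook, e.g. run after clocksource validation. */
void __init my_arch_clock_init(void)
{
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
	/* Tell the scheduler that sched_clock() turned out to be stable. */
	if (my_arch_clock_is_reliable())
		sched_clock_stable = 1;
#endif
}
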
diff --combined include/linux/timer.h
index e2d662e3416e38026bbc7385f744481de1b7d0a0,51774eb87cc6b825b3321afef833e35912a619e5..6cdb6f3331f11b6a80d7cd2c85a45706937b16d3
@@@ -5,6 -5,7 +5,7 @@@
  #include <linux/ktime.h>
  #include <linux/stddef.h>
  #include <linux/debugobjects.h>
+ #include <linux/stringify.h>
  
  struct tvec_base;
  
@@@ -21,52 -22,126 +22,126 @@@ struct timer_list 
        char start_comm[16];
        int start_pid;
  #endif
+ #ifdef CONFIG_LOCKDEP
+       struct lockdep_map lockdep_map;
+ #endif
  };
  
  extern struct tvec_base boot_tvec_bases;
  
+ #ifdef CONFIG_LOCKDEP
+ /*
+  * NB: because we have to copy the lockdep_map, setting the lockdep_map key
+  * (second argument) here is required, otherwise it could be initialised to
+  * the copy of the lockdep_map later! We use the pointer to and the string
+  * "<file>:<line>" as the key resp. the name of the lockdep_map.
+  */
+ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)                          \
+       .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn),
+ #else
+ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
+ #endif
  #define TIMER_INITIALIZER(_function, _expires, _data) {               \
                .entry = { .prev = TIMER_ENTRY_STATIC },        \
                .function = (_function),                        \
                .expires = (_expires),                          \
                .data = (_data),                                \
                .base = &boot_tvec_bases,                       \
+               __TIMER_LOCKDEP_MAP_INITIALIZER(                \
+                       __FILE__ ":" __stringify(__LINE__))     \
        }
  
  #define DEFINE_TIMER(_name, _function, _expires, _data)               \
        struct timer_list _name =                               \
                TIMER_INITIALIZER(_function, _expires, _data)
  
- void init_timer(struct timer_list *timer);
- void init_timer_deferrable(struct timer_list *timer);
+ void init_timer_key(struct timer_list *timer,
+                   const char *name,
+                   struct lock_class_key *key);
+ void init_timer_deferrable_key(struct timer_list *timer,
+                              const char *name,
+                              struct lock_class_key *key);
+ #ifdef CONFIG_LOCKDEP
+ #define init_timer(timer)                                             \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               init_timer_key((timer), #timer, &__key);                \
+       } while (0)
+ #define init_timer_deferrable(timer)                                  \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               init_timer_deferrable_key((timer), #timer, &__key);     \
+       } while (0)
+ #define init_timer_on_stack(timer)                                    \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               init_timer_on_stack_key((timer), #timer, &__key);       \
+       } while (0)
+ #define setup_timer(timer, fn, data)                                  \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               setup_timer_key((timer), #timer, &__key, (fn), (data));\
+       } while (0)
+ #define setup_timer_on_stack(timer, fn, data)                         \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               setup_timer_on_stack_key((timer), #timer, &__key,       \
+                                        (fn), (data));                 \
+       } while (0)
+ #else
+ #define init_timer(timer)\
+       init_timer_key((timer), NULL, NULL)
+ #define init_timer_deferrable(timer)\
+       init_timer_deferrable_key((timer), NULL, NULL)
+ #define init_timer_on_stack(timer)\
+       init_timer_on_stack_key((timer), NULL, NULL)
+ #define setup_timer(timer, fn, data)\
+       setup_timer_key((timer), NULL, NULL, (fn), (data))
+ #define setup_timer_on_stack(timer, fn, data)\
+       setup_timer_on_stack_key((timer), NULL, NULL, (fn), (data))
+ #endif
  
  #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
- extern void init_timer_on_stack(struct timer_list *timer);
+ extern void init_timer_on_stack_key(struct timer_list *timer,
+                                   const char *name,
+                                   struct lock_class_key *key);
  extern void destroy_timer_on_stack(struct timer_list *timer);
  #else
  static inline void destroy_timer_on_stack(struct timer_list *timer) { }
- static inline void init_timer_on_stack(struct timer_list *timer)
+ static inline void init_timer_on_stack_key(struct timer_list *timer,
+                                          const char *name,
+                                          struct lock_class_key *key)
  {
-       init_timer(timer);
+       init_timer_key(timer, name, key);
  }
  #endif
  
- static inline void setup_timer(struct timer_list * timer,
+ static inline void setup_timer_key(struct timer_list * timer,
+                               const char *name,
+                               struct lock_class_key *key,
                                void (*function)(unsigned long),
                                unsigned long data)
  {
        timer->function = function;
        timer->data = data;
-       init_timer(timer);
+       init_timer_key(timer, name, key);
  }
  
- static inline void setup_timer_on_stack(struct timer_list *timer,
+ static inline void setup_timer_on_stack_key(struct timer_list *timer,
+                                       const char *name,
+                                       struct lock_class_key *key,
                                        void (*function)(unsigned long),
                                        unsigned long data)
  {
        timer->function = function;
        timer->data = data;
-       init_timer_on_stack(timer);
+       init_timer_on_stack_key(timer, name, key);
  }
  
  /**
@@@ -86,8 -161,8 +161,8 @@@ static inline int timer_pending(const s
  
  extern void add_timer_on(struct timer_list *timer, int cpu);
  extern int del_timer(struct timer_list * timer);
 -extern int __mod_timer(struct timer_list *timer, unsigned long expires);
  extern int mod_timer(struct timer_list *timer, unsigned long expires);
 +extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
  
  /*
   * The jiffies value which is added to now, when there is no timer
@@@ -146,7 -221,25 +221,7 @@@ static inline void timer_stats_timer_cl
  }
  #endif
  
 -/**
 - * add_timer - start a timer
 - * @timer: the timer to be added
 - *
 - * The kernel will do a ->function(->data) callback from the
 - * timer interrupt at the ->expires point in the future. The
 - * current time is 'jiffies'.
 - *
 - * The timer's ->expires, ->function (and if the handler uses it, ->data)
 - * fields must be set prior calling this function.
 - *
 - * Timers with an ->expires field in the past will be executed in the next
 - * timer tick.
 - */
 -static inline void add_timer(struct timer_list *timer)
 -{
 -      BUG_ON(timer_pending(timer));
 -      __mod_timer(timer, timer->expires);
 -}
 +extern void add_timer(struct timer_list *timer);
  
  #ifdef CONFIG_SMP
    extern int try_to_del_timer_sync(struct timer_list *timer);
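
The timer.h changes above route init_timer(), setup_timer() and friends through new *_key() entry points so lockdep can give every statically declared timer its own lock class, and they turn add_timer() from an inline wrapper into an out-of-line function. A minimal sketch of a module using the unchanged macro API; the module and callback names are illustrative and not part of this commit:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

/* Timer callbacks in this era take the ->data value as an unsigned long. */
static void demo_timer_fn(unsigned long data)
{
	pr_info("demo timer fired, data=%lu\n", data);
}

static int __init demo_init(void)
{
	/* With CONFIG_LOCKDEP, setup_timer() expands to setup_timer_key()
	 * and registers a static lock_class_key for this timer. */
	setup_timer(&demo_timer, demo_timer_fn, 0UL);
	mod_timer(&demo_timer, jiffies + HZ);
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
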
diff --combined kernel/sched.c
index 5757e03cfac0bdf7cd50f3625a318645c562b973,186c6fd08acf8de949c936c5dc0116c3b90765fe..196d48babbef87c088214e4f252b4397d2bf25be
@@@ -223,7 -223,7 +223,7 @@@ static void start_rt_bandwidth(struct r
  {
        ktime_t now;
  
 -      if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 +      if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;
  
        if (hrtimer_active(&rt_b->rt_period_timer))
@@@ -331,13 -331,6 +331,13 @@@ static DEFINE_PER_CPU(struct rt_rq, ini
   */
  static DEFINE_SPINLOCK(task_group_lock);
  
 +#ifdef CONFIG_SMP
 +static int root_task_group_empty(void)
 +{
 +      return list_empty(&root_task_group.children);
 +}
 +#endif
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_USER_SCHED
  # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@@ -398,13 -391,6 +398,13 @@@ static inline void set_task_rq(struct t
  
  #else
  
 +#ifdef CONFIG_SMP
 +static int root_task_group_empty(void)
 +{
 +      return 1;
 +}
 +#endif
 +
  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
  static inline struct task_group *task_group(struct task_struct *p)
  {
@@@ -481,17 -467,11 +481,17 @@@ struct rt_rq 
        struct rt_prio_array active;
        unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 -      int highest_prio; /* highest queued rt task prio */
 +      struct {
 +              int curr; /* highest queued rt task prio */
 +#ifdef CONFIG_SMP
 +              int next; /* next highest */
 +#endif
 +      } highest_prio;
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
 +      struct plist_head pushable_tasks;
  #endif
        int rt_throttled;
        u64 rt_time;
@@@ -569,6 -549,7 +569,6 @@@ struct rq 
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 -      unsigned char idle_at_tick;
  #ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
        struct root_domain *rd;
        struct sched_domain *sd;
  
 +      unsigned char idle_at_tick;
        /* For active balancing */
        int active_balance;
        int push_cpu;
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
 -      unsigned int yld_exp_empty;
 -      unsigned int yld_act_empty;
 -      unsigned int yld_both_empty;
        unsigned int yld_count;
  
        /* schedule() stats */
@@@ -1200,10 -1183,10 +1200,10 @@@ static void resched_task(struct task_st
  
        assert_spin_locked(&task_rq(p)->lock);
  
 -      if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 +      if (test_tsk_need_resched(p))
                return;
  
 -      set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 +      set_tsk_need_resched(p);
  
        cpu = task_cpu(p);
        if (cpu == smp_processor_id())
@@@ -1259,7 -1242,7 +1259,7 @@@ void wake_up_idle_cpu(int cpu
         * lockless. The worst case is that the other CPU runs the
         * idle task through an additional NOOP schedule()
         */
 -      set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
 +      set_tsk_need_resched(rq->idle);
  
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
@@@ -1627,42 -1610,21 +1627,42 @@@ static inline void update_shares_locked
  
  #endif
  
 +#ifdef CONFIG_PREEMPT
 +
  /*
 - * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 + * fair double_lock_balance: Safely acquires both rq->locks in a fair
 + * way at the expense of forcing extra atomic operations in all
 + * invocations.  This assures that the double_lock is acquired using the
 + * same underlying policy as the spinlock_t on this architecture, which
 + * reduces latency compared to the unfair variant below.  However, it
 + * also adds more overhead and therefore may reduce throughput.
   */
 -static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +      __releases(this_rq->lock)
 +      __acquires(busiest->lock)
 +      __acquires(this_rq->lock)
 +{
 +      spin_unlock(&this_rq->lock);
 +      double_rq_lock(this_rq, busiest);
 +
 +      return 1;
 +}
 +
 +#else
 +/*
 + * Unfair double_lock_balance: Optimizes throughput at the expense of
 + * latency by eliminating extra atomic operations when the locks are
 + * already in proper order on entry.  This favors lower cpu-ids and will
 + * grant the double lock to lower cpus over higher ids under contention,
 + * regardless of entry order into the function.
 + */
 +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
  {
        int ret = 0;
  
 -      if (unlikely(!irqs_disabled())) {
 -              /* printk() doesn't work good under rq->lock */
 -              spin_unlock(&this_rq->lock);
 -              BUG_ON(1);
 -      }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
        return ret;
  }
  
 +#endif /* CONFIG_PREEMPT */
 +
 +/*
 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 + */
 +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +{
 +      if (unlikely(!irqs_disabled())) {
 +              /* printk() doesn't work good under rq->lock */
 +              spin_unlock(&this_rq->lock);
 +              BUG_ON(1);
 +      }
 +
 +      return _double_lock_balance(this_rq, busiest);
 +}
 +
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
  {
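
When the trylock in the unfair _double_lock_balance() above fails, the code falls back to address ordering: the runqueue at the lower address is always locked first, so two CPUs contending for the same pair of locks can never deadlock. A generic illustration of that ordering pattern with two plain spinlocks; the helper name is illustrative and not taken from this commit:

#include <linux/spinlock.h>
#include <linux/lockdep.h>

static void lock_pair_ordered(spinlock_t *a, spinlock_t *b)
{
	/* Take the lower-addressed lock first; the second acquisition is
	 * annotated as a nested lock so lockdep does not flag it. */
	if (a < b) {
		spin_lock(a);
		spin_lock_nested(b, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(b);
		spin_lock_nested(a, SINGLE_DEPTH_NESTING);
	}
}
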
@@@ -1759,9 -1705,6 +1759,9 @@@ static void update_avg(u64 *avg, u64 sa
  
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
 +      if (wakeup)
 +              p->se.start_runtime = p->se.sum_exec_runtime;
 +
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, wakeup);
        p->se.on_rq = 1;
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
 -      if (sleep && p->se.last_wakeup) {
 -              update_avg(&p->se.avg_overlap,
 -                         p->se.sum_exec_runtime - p->se.last_wakeup);
 -              p->se.last_wakeup = 0;
 +      if (sleep) {
 +              if (p->se.last_wakeup) {
 +                      update_avg(&p->se.avg_overlap,
 +                              p->se.sum_exec_runtime - p->se.last_wakeup);
 +                      p->se.last_wakeup = 0;
 +              } else {
 +                      update_avg(&p->se.avg_wakeup,
 +                              sysctl_sched_wakeup_granularity);
 +              }
        }
  
        sched_info_dequeued(p);
@@@ -2079,7 -2017,7 +2079,7 @@@ unsigned long wait_task_inactive(struc
                 * it must be off the runqueue _entirely_, and not
                 * preempted!
                 *
 -               * So if it wa still runnable (but just not actively
 +               * So if it was still runnable (but just not actively
                 * running right now), it's preempted, and we should
                 * yield - it could be a while.
                 */
@@@ -2328,8 -2266,18 +2328,8 @@@ static int try_to_wake_up(struct task_s
        if (!sched_feat(SYNC_WAKEUPS))
                sync = 0;
  
 -      if (!sync) {
 -              if (current->se.avg_overlap < sysctl_sched_migration_cost &&
 -                        p->se.avg_overlap < sysctl_sched_migration_cost)
 -                      sync = 1;
 -      } else {
 -              if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
 -                        p->se.avg_overlap >= sysctl_sched_migration_cost)
 -                      sync = 0;
 -      }
 -
  #ifdef CONFIG_SMP
 -      if (sched_feat(LB_WAKEUP_UPDATE)) {
 +      if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
                struct sched_domain *sd;
  
                this_cpu = raw_smp_processor_id();
@@@ -2407,22 -2355,6 +2407,22 @@@ out_activate
        activate_task(rq, p, 1);
        success = 1;
  
 +      /*
 +       * Only attribute actual wakeups done by this task.
 +       */
 +      if (!in_interrupt()) {
 +              struct sched_entity *se = &current->se;
 +              u64 sample = se->sum_exec_runtime;
 +
 +              if (se->last_wakeup)
 +                      sample -= se->last_wakeup;
 +              else
 +                      sample -= se->start_runtime;
 +              update_avg(&se->avg_wakeup, sample);
 +
 +              se->last_wakeup = se->sum_exec_runtime;
 +      }
 +
  out_running:
        trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
                p->sched_class->task_wake_up(rq, p);
  #endif
  out:
 -      current->se.last_wakeup = current->se.sum_exec_runtime;
 -
        task_rq_unlock(rq, &flags);
  
        return success;
@@@ -2462,8 -2396,6 +2462,8 @@@ static void __sched_fork(struct task_st
        p->se.prev_sum_exec_runtime     = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
 +      p->se.start_runtime             = 0;
 +      p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                = 0;
@@@ -2526,8 -2458,6 +2526,8 @@@ void sched_fork(struct task_struct *p, 
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
  #endif
 +      plist_node_init(&p->pushable_tasks, MAX_PRIO);
 +
        put_cpu();
  }
  
@@@ -2571,7 -2501,7 +2571,7 @@@ void wake_up_new_task(struct task_struc
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
  /**
 - * preempt_notifier_register - tell me when current is being being preempted & rescheduled
 + * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
@@@ -2668,12 -2598,6 +2668,12 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
 +#ifdef CONFIG_SMP
 +      int post_schedule = 0;
 +
 +      if (current->sched_class->needs_post_schedule)
 +              post_schedule = current->sched_class->needs_post_schedule(rq);
 +#endif
  
        rq->prev_mm = NULL;
  
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
 -      if (current->sched_class->post_schedule)
 +      if (post_schedule)
                current->sched_class->post_schedule(rq);
  #endif
  
@@@ -2999,7 -2923,6 +2999,7 @@@ int can_migrate_task(struct task_struc
                     struct sched_domain *sd, enum cpu_idle_type idle,
                     int *all_pinned)
  {
 +      int tsk_cache_hot = 0;
        /*
         * We do not migrate tasks that are:
         * 1) running (obviously), or
         * 2) too many balance attempts have failed.
         */
  
 -      if (!task_hot(p, rq->clock, sd) ||
 -                      sd->nr_balance_failed > sd->cache_nice_tries) {
 +      tsk_cache_hot = task_hot(p, rq->clock, sd);
 +      if (!tsk_cache_hot ||
 +              sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
 -              if (task_hot(p, rq->clock, sd)) {
 +              if (tsk_cache_hot) {
                        schedstat_inc(sd, lb_hot_gained[idle]);
                        schedstat_inc(p, se.nr_forced_migrations);
                }
                return 1;
        }
  
 -      if (task_hot(p, rq->clock, sd)) {
 +      if (tsk_cache_hot) {
                schedstat_inc(p, se.nr_failed_migrations_hot);
                return 0;
        }
@@@ -3075,16 -2997,6 +3075,16 @@@ next
        pulled++;
        rem_load_move -= p->se.load.weight;
  
 +#ifdef CONFIG_PREEMPT
 +      /*
 +       * NEWIDLE balancing is a source of latency, so preemptible kernels
 +       * will stop after the first task is pulled to minimize the critical
 +       * section.
 +       */
 +      if (idle == CPU_NEWLY_IDLE)
 +              goto out;
 +#endif
 +
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@@ -3131,15 -3043,9 +3131,15 @@@ static int move_tasks(struct rq *this_r
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
  
 +#ifdef CONFIG_PREEMPT
 +              /*
 +               * NEWIDLE balancing is a source of latency, so preemptible
 +               * kernels will stop after the first task is pulled to minimize
 +               * the critical section.
 +               */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
 -
 +#endif
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -3189,480 -3095,246 +3189,480 @@@ static int move_one_task(struct rq *thi
  
        return 0;
  }
 -
 +/********** Helpers for find_busiest_group ************************/
  /*
 - * find_busiest_group finds and returns the busiest CPU group within the
 - * domain. It calculates and returns the amount of weighted load which
 - * should be moved to restore balance via the imbalance parameter.
 + * sd_lb_stats - Structure to store the statistics of a sched_domain
 + *            during load balancing.
   */
 -static struct sched_group *
 -find_busiest_group(struct sched_domain *sd, int this_cpu,
 -                 unsigned long *imbalance, enum cpu_idle_type idle,
 -                 int *sd_idle, const struct cpumask *cpus, int *balance)
 -{
 -      struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 -      unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 -      unsigned long max_pull;
 -      unsigned long busiest_load_per_task, busiest_nr_running;
 -      unsigned long this_load_per_task, this_nr_running;
 -      int load_idx, group_imb = 0;
 +struct sd_lb_stats {
 +      struct sched_group *busiest; /* Busiest group in this sd */
 +      struct sched_group *this;  /* Local group in this sd */
 +      unsigned long total_load;  /* Total load of all groups in sd */
 +      unsigned long total_pwr;   /*   Total power of all groups in sd */
 +      unsigned long avg_load;    /* Average load across all groups in sd */
 +
 +      /** Statistics of this group */
 +      unsigned long this_load;
 +      unsigned long this_load_per_task;
 +      unsigned long this_nr_running;
 +
 +      /* Statistics of the busiest group */
 +      unsigned long max_load;
 +      unsigned long busiest_load_per_task;
 +      unsigned long busiest_nr_running;
 +
 +      int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -      int power_savings_balance = 1;
 -      unsigned long leader_nr_running = 0, min_load_per_task = 0;
 -      unsigned long min_nr_running = ULONG_MAX;
 -      struct sched_group *group_min = NULL, *group_leader = NULL;
 +      int power_savings_balance; /* Is powersave balance needed for this sd */
 +      struct sched_group *group_min; /* Least loaded group in sd */
 +      struct sched_group *group_leader; /* Group which relieves group_min */
 +      unsigned long min_load_per_task; /* load_per_task in group_min */
 +      unsigned long leader_nr_running; /* Nr running of group_leader */
 +      unsigned long min_nr_running; /* Nr running of group_min */
  #endif
 +};
 +
 +/*
 + * sg_lb_stats - stats of a sched_group required for load_balancing
 + */
 +struct sg_lb_stats {
 +      unsigned long avg_load; /*Avg load across the CPUs of the group */
 +      unsigned long group_load; /* Total load over the CPUs of the group */
 +      unsigned long sum_nr_running; /* Nr tasks running in the group */
 +      unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 +      unsigned long group_capacity;
 +      int group_imb; /* Is there an imbalance in the group ? */
 +};
  
 -      max_load = this_load = total_load = total_pwr = 0;
 -      busiest_load_per_task = busiest_nr_running = 0;
 -      this_load_per_task = this_nr_running = 0;
 +/**
 + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 + * @group: The group whose first cpu is to be returned.
 + */
 +static inline unsigned int group_first_cpu(struct sched_group *group)
 +{
 +      return cpumask_first(sched_group_cpus(group));
 +}
  
 -      if (idle == CPU_NOT_IDLE)
 +/**
 + * get_sd_load_idx - Obtain the load index for a given sched domain.
 + * @sd: The sched_domain whose load_idx is to be obtained.
 + * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 + */
 +static inline int get_sd_load_idx(struct sched_domain *sd,
 +                                      enum cpu_idle_type idle)
 +{
 +      int load_idx;
 +
 +      switch (idle) {
 +      case CPU_NOT_IDLE:
                load_idx = sd->busy_idx;
 -      else if (idle == CPU_NEWLY_IDLE)
 +              break;
 +
 +      case CPU_NEWLY_IDLE:
                load_idx = sd->newidle_idx;
 -      else
 +              break;
 +      default:
                load_idx = sd->idle_idx;
 +              break;
 +      }
  
 -      do {
 -              unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
 -              int local_group;
 -              int i;
 -              int __group_imb = 0;
 -              unsigned int balance_cpu = -1, first_idle_cpu = 0;
 -              unsigned long sum_nr_running, sum_weighted_load;
 -              unsigned long sum_avg_load_per_task;
 -              unsigned long avg_load_per_task;
 +      return load_idx;
 +}
  
 -              local_group = cpumask_test_cpu(this_cpu,
 -                                             sched_group_cpus(group));
  
 -              if (local_group)
 -                      balance_cpu = cpumask_first(sched_group_cpus(group));
 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 +/**
 + * init_sd_power_savings_stats - Initialize power savings statistics for
 + * the given sched_domain, during load balancing.
 + *
 + * @sd: Sched domain whose power-savings statistics are to be initialized.
 + * @sds: Variable containing the statistics for sd.
 + * @idle: Idle status of the CPU at which we're performing load-balancing.
 + */
 +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
 +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
 +{
 +      /*
 +       * Busy processors will not participate in power savings
 +       * balance.
 +       */
 +      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 +              sds->power_savings_balance = 0;
 +      else {
 +              sds->power_savings_balance = 1;
 +              sds->min_nr_running = ULONG_MAX;
 +              sds->leader_nr_running = 0;
 +      }
 +}
 +
 +/**
 + * update_sd_power_savings_stats - Update the power saving stats for a
 + * sched_domain while performing load balancing.
 + *
 + * @group: sched_group belonging to the sched_domain under consideration.
 + * @sds: Variable containing the statistics of the sched_domain
 + * @local_group: Does group contain the CPU for which we're performing
 + *            load balancing ?
 + * @sgs: Variable containing the statistics of the group.
 + */
 +static inline void update_sd_power_savings_stats(struct sched_group *group,
 +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
 +{
  
 -              /* Tally up the load of all CPUs in the group */
 -              sum_weighted_load = sum_nr_running = avg_load = 0;
 -              sum_avg_load_per_task = avg_load_per_task = 0;
 +      if (!sds->power_savings_balance)
 +              return;
  
 -              max_cpu_load = 0;
 -              min_cpu_load = ~0UL;
 +      /*
 +       * If the local group is idle or completely loaded
 +       * no need to do power savings balance at this domain
 +       */
 +      if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
 +                              !sds->this_nr_running))
 +              sds->power_savings_balance = 0;
  
 -              for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 -                      struct rq *rq = cpu_rq(i);
 +      /*
 +       * If a group is already running at full capacity or idle,
 +       * don't include that group in power savings calculations
 +       */
 +      if (!sds->power_savings_balance ||
 +              sgs->sum_nr_running >= sgs->group_capacity ||
 +              !sgs->sum_nr_running)
 +              return;
  
 -                      if (*sd_idle && rq->nr_running)
 -                              *sd_idle = 0;
 +      /*
 +       * Calculate the group which has the least non-idle load.
 +       * This is the group from where we need to pick up the load
 +       * for saving power
 +       */
 +      if ((sgs->sum_nr_running < sds->min_nr_running) ||
 +          (sgs->sum_nr_running == sds->min_nr_running &&
 +           group_first_cpu(group) > group_first_cpu(sds->group_min))) {
 +              sds->group_min = group;
 +              sds->min_nr_running = sgs->sum_nr_running;
 +              sds->min_load_per_task = sgs->sum_weighted_load /
 +                                              sgs->sum_nr_running;
 +      }
  
 -                      /* Bias balancing toward cpus of our domain */
 -                      if (local_group) {
 -                              if (idle_cpu(i) && !first_idle_cpu) {
 -                                      first_idle_cpu = 1;
 -                                      balance_cpu = i;
 -                              }
 +      /*
 +       * Calculate the group which is almost near its
 +       * capacity but still has some space to pick up some load
 +       * from other group and save more power
 +       */
 +      if (sgs->sum_nr_running > sgs->group_capacity - 1)
 +              return;
  
 -                              load = target_load(i, load_idx);
 -                      } else {
 -                              load = source_load(i, load_idx);
 -                              if (load > max_cpu_load)
 -                                      max_cpu_load = load;
 -                              if (min_cpu_load > load)
 -                                      min_cpu_load = load;
 -                      }
 +      if (sgs->sum_nr_running > sds->leader_nr_running ||
 +          (sgs->sum_nr_running == sds->leader_nr_running &&
 +           group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
 +              sds->group_leader = group;
 +              sds->leader_nr_running = sgs->sum_nr_running;
 +      }
 +}
  
 -                      avg_load += load;
 -                      sum_nr_running += rq->nr_running;
 -                      sum_weighted_load += weighted_cpuload(i);
 +/**
 + * check_power_save_busiest_group - see if there is potential for some power-savings balance
 + * @sds: Variable containing the statistics of the sched_domain
 + *    under consideration.
 + * @this_cpu: Cpu at which we're currently performing load-balancing.
 + * @imbalance: Variable to store the imbalance.
 + *
 + * Description:
 + * Check if we have potential to perform some power-savings balance.
 + * If yes, set the busiest group to be the least loaded group in the
 + * sched_domain, so that its CPUs can be put to idle.
 + *
 + * Returns 1 if there is potential to perform power-savings balance.
 + * Else returns 0.
 + */
 +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 +                                      int this_cpu, unsigned long *imbalance)
 +{
 +      if (!sds->power_savings_balance)
 +              return 0;
  
 -                      sum_avg_load_per_task += cpu_avg_load_per_task(i);
 -              }
 +      if (sds->this != sds->group_leader ||
 +                      sds->group_leader == sds->group_min)
 +              return 0;
  
 -              /*
 -               * First idle cpu or the first cpu(busiest) in this sched group
 -               * is eligible for doing load balancing at this and above
 -               * domains. In the newly idle case, we will allow all the cpu's
 -               * to do the newly idle load balance.
 -               */
 -              if (idle != CPU_NEWLY_IDLE && local_group &&
 -                  balance_cpu != this_cpu && balance) {
 -                      *balance = 0;
 -                      goto ret;
 -              }
 +      *imbalance = sds->min_load_per_task;
 +      sds->busiest = sds->group_min;
  
 -              total_load += avg_load;
 -              total_pwr += group->__cpu_power;
 +      if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 +              cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
 +                      group_first_cpu(sds->group_leader);
 +      }
  
 -              /* Adjust by relative CPU power of the group */
 -              avg_load = sg_div_cpu_power(group,
 -                              avg_load * SCHED_LOAD_SCALE);
 +      return 1;
  
 +}
 +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
 +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
 +{
 +      return;
 +}
  
 -              /*
 -               * Consider the group unbalanced when the imbalance is larger
 -               * than the average weight of two tasks.
 -               *
 -               * APZ: with cgroup the avg task weight can vary wildly and
 -               *      might not be a suitable number - should we keep a
 -               *      normalized nr_running number somewhere that negates
 -               *      the hierarchy?
 -               */
 -              avg_load_per_task = sg_div_cpu_power(group,
 -                              sum_avg_load_per_task * SCHED_LOAD_SCALE);
 +static inline void update_sd_power_savings_stats(struct sched_group *group,
 +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
 +{
 +      return;
 +}
  
 -              if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 -                      __group_imb = 1;
 +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 +                                      int this_cpu, unsigned long *imbalance)
 +{
 +      return 0;
 +}
 +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
 -              group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
  
 +/**
 + * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 + * @group: sched_group whose statistics are to be updated.
 + * @this_cpu: Cpu for which load balance is currently performed.
 + * @idle: Idle status of this_cpu
 + * @load_idx: Load index of sched_domain of this_cpu for load calc.
 + * @sd_idle: Idle status of the sched_domain containing group.
 + * @local_group: Does group contain this_cpu.
 + * @cpus: Set of cpus considered for load balancing.
 + * @balance: Should we balance.
 + * @sgs: variable to hold the statistics for this group.
 + */
 +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
 +                      enum cpu_idle_type idle, int load_idx, int *sd_idle,
 +                      int local_group, const struct cpumask *cpus,
 +                      int *balance, struct sg_lb_stats *sgs)
 +{
 +      unsigned long load, max_cpu_load, min_cpu_load;
 +      int i;
 +      unsigned int balance_cpu = -1, first_idle_cpu = 0;
 +      unsigned long sum_avg_load_per_task;
 +      unsigned long avg_load_per_task;
 +
 +      if (local_group)
 +              balance_cpu = group_first_cpu(group);
 +
 +      /* Tally up the load of all CPUs in the group */
 +      sum_avg_load_per_task = avg_load_per_task = 0;
 +      max_cpu_load = 0;
 +      min_cpu_load = ~0UL;
 +
 +      for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 +              struct rq *rq = cpu_rq(i);
 +
 +              if (*sd_idle && rq->nr_running)
 +                      *sd_idle = 0;
 +
 +              /* Bias balancing toward cpus of our domain */
                if (local_group) {
 -                      this_load = avg_load;
 -                      this = group;
 -                      this_nr_running = sum_nr_running;
 -                      this_load_per_task = sum_weighted_load;
 -              } else if (avg_load > max_load &&
 -                         (sum_nr_running > group_capacity || __group_imb)) {
 -                      max_load = avg_load;
 -                      busiest = group;
 -                      busiest_nr_running = sum_nr_running;
 -                      busiest_load_per_task = sum_weighted_load;
 -                      group_imb = __group_imb;
 +                      if (idle_cpu(i) && !first_idle_cpu) {
 +                              first_idle_cpu = 1;
 +                              balance_cpu = i;
 +                      }
 +
 +                      load = target_load(i, load_idx);
 +              } else {
 +                      load = source_load(i, load_idx);
 +                      if (load > max_cpu_load)
 +                              max_cpu_load = load;
 +                      if (min_cpu_load > load)
 +                              min_cpu_load = load;
                }
  
 -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -              /*
 -               * Busy processors will not participate in power savings
 -               * balance.
 -               */
 -              if (idle == CPU_NOT_IDLE ||
 -                              !(sd->flags & SD_POWERSAVINGS_BALANCE))
 -                      goto group_next;
 +              sgs->group_load += load;
 +              sgs->sum_nr_running += rq->nr_running;
 +              sgs->sum_weighted_load += weighted_cpuload(i);
  
 -              /*
 -               * If the local group is idle or completely loaded
 -               * no need to do power savings balance at this domain
 -               */
 -              if (local_group && (this_nr_running >= group_capacity ||
 -                                  !this_nr_running))
 -                      power_savings_balance = 0;
 +              sum_avg_load_per_task += cpu_avg_load_per_task(i);
 +      }
  
 -              /*
 -               * If a group is already running at full capacity or idle,
 -               * don't include that group in power savings calculations
 -               */
 -              if (!power_savings_balance || sum_nr_running >= group_capacity
 -                  || !sum_nr_running)
 -                      goto group_next;
 +      /*
 +       * First idle cpu or the first cpu(busiest) in this sched group
 +       * is eligible for doing load balancing at this and above
 +       * domains. In the newly idle case, we will allow all the cpu's
 +       * to do the newly idle load balance.
 +       */
 +      if (idle != CPU_NEWLY_IDLE && local_group &&
 +          balance_cpu != this_cpu && balance) {
 +              *balance = 0;
 +              return;
 +      }
  
 -              /*
 -               * Calculate the group which has the least non-idle load.
 -               * This is the group from where we need to pick up the load
 -               * for saving power
 -               */
 -              if ((sum_nr_running < min_nr_running) ||
 -                  (sum_nr_running == min_nr_running &&
 -                   cpumask_first(sched_group_cpus(group)) >
 -                   cpumask_first(sched_group_cpus(group_min)))) {
 -                      group_min = group;
 -                      min_nr_running = sum_nr_running;
 -                      min_load_per_task = sum_weighted_load /
 -                                              sum_nr_running;
 -              }
 +      /* Adjust by relative CPU power of the group */
 +      sgs->avg_load = sg_div_cpu_power(group,
 +                      sgs->group_load * SCHED_LOAD_SCALE);
  
 -              /*
 -               * Calculate the group which is almost near its
 -               * capacity but still has some space to pick up some load
 -               * from other group and save more power
 -               */
 -              if (sum_nr_running <= group_capacity - 1) {
 -                      if (sum_nr_running > leader_nr_running ||
 -                          (sum_nr_running == leader_nr_running &&
 -                           cpumask_first(sched_group_cpus(group)) <
 -                           cpumask_first(sched_group_cpus(group_leader)))) {
 -                              group_leader = group;
 -                              leader_nr_running = sum_nr_running;
 -                      }
 +
 +      /*
 +       * Consider the group unbalanced when the imbalance is larger
 +       * than the average weight of two tasks.
 +       *
 +       * APZ: with cgroup the avg task weight can vary wildly and
 +       *      might not be a suitable number - should we keep a
 +       *      normalized nr_running number somewhere that negates
 +       *      the hierarchy?
 +       */
 +      avg_load_per_task = sg_div_cpu_power(group,
 +                      sum_avg_load_per_task * SCHED_LOAD_SCALE);
 +
 +      if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 +              sgs->group_imb = 1;
 +
 +      sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 +
 +}
 +
 +/**
 + * update_sd_lb_stats - Update sched_group's statistics for load balancing.
 + * @sd: sched_domain whose statistics are to be updated.
 + * @this_cpu: Cpu for which load balance is currently performed.
 + * @idle: Idle status of this_cpu
 + * @sd_idle: Idle status of the sched_domain containing group.
 + * @cpus: Set of cpus considered for load balancing.
 + * @balance: Should we balance.
 + * @sds: variable to hold the statistics for this sched_domain.
 + */
 +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 +                      enum cpu_idle_type idle, int *sd_idle,
 +                      const struct cpumask *cpus, int *balance,
 +                      struct sd_lb_stats *sds)
 +{
 +      struct sched_group *group = sd->groups;
 +      struct sg_lb_stats sgs;
 +      int load_idx;
 +
 +      init_sd_power_savings_stats(sd, sds, idle);
 +      load_idx = get_sd_load_idx(sd, idle);
 +
 +      do {
 +              int local_group;
 +
 +              local_group = cpumask_test_cpu(this_cpu,
 +                                             sched_group_cpus(group));
 +              memset(&sgs, 0, sizeof(sgs));
 +              update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
 +                              local_group, cpus, balance, &sgs);
 +
 +              if (local_group && balance && !(*balance))
 +                      return;
 +
 +              sds->total_load += sgs.group_load;
 +              sds->total_pwr += group->__cpu_power;
 +
 +              if (local_group) {
 +                      sds->this_load = sgs.avg_load;
 +                      sds->this = group;
 +                      sds->this_nr_running = sgs.sum_nr_running;
 +                      sds->this_load_per_task = sgs.sum_weighted_load;
 +              } else if (sgs.avg_load > sds->max_load &&
 +                         (sgs.sum_nr_running > sgs.group_capacity ||
 +                              sgs.group_imb)) {
 +                      sds->max_load = sgs.avg_load;
 +                      sds->busiest = group;
 +                      sds->busiest_nr_running = sgs.sum_nr_running;
 +                      sds->busiest_load_per_task = sgs.sum_weighted_load;
 +                      sds->group_imb = sgs.group_imb;
                }
 -group_next:
 -#endif
 +
 +              update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
  
 -      if (!busiest || this_load >= max_load || busiest_nr_running == 0)
 -              goto out_balanced;
 -
 -      avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 +}
  
 -      if (this_load >= avg_load ||
 -                      100*max_load <= sd->imbalance_pct*this_load)
 -              goto out_balanced;
 +/**
 + * fix_small_imbalance - Calculate the minor imbalance that exists
 + *                    amongst the groups of a sched_domain, during
 + *                    load balancing.
 + * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 + * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 + * @imbalance: Variable to store the imbalance.
 + */
 +static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 +                              int this_cpu, unsigned long *imbalance)
 +{
 +      unsigned long tmp, pwr_now = 0, pwr_move = 0;
 +      unsigned int imbn = 2;
 +
 +      if (sds->this_nr_running) {
 +              sds->this_load_per_task /= sds->this_nr_running;
 +              if (sds->busiest_load_per_task >
 +                              sds->this_load_per_task)
 +                      imbn = 1;
 +      } else
 +              sds->this_load_per_task =
 +                      cpu_avg_load_per_task(this_cpu);
  
 -      busiest_load_per_task /= busiest_nr_running;
 -      if (group_imb)
 -              busiest_load_per_task = min(busiest_load_per_task, avg_load);
 +      if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
 +                      sds->busiest_load_per_task * imbn) {
 +              *imbalance = sds->busiest_load_per_task;
 +              return;
 +      }
  
        /*
 -       * We're trying to get all the cpus to the average_load, so we don't
 -       * want to push ourselves above the average load, nor do we wish to
 -       * reduce the max loaded cpu below the average load, as either of these
 -       * actions would just result in more rebalancing later, and ping-pong
 -       * tasks around. Thus we look for the minimum possible imbalance.
 -       * Negative imbalances (*we* are more loaded than anyone else) will
 -       * be counted as no imbalance for these purposes -- we can't fix that
 -       * by pulling tasks to us. Be careful of negative numbers as they'll
 -       * appear as very large values with unsigned longs.
 +       * OK, we don't have enough imbalance to justify moving tasks,
 +       * however we may be able to increase total CPU power used by
 +       * moving them.
         */
 -      if (max_load <= busiest_load_per_task)
 -              goto out_balanced;
  
 +      pwr_now += sds->busiest->__cpu_power *
 +                      min(sds->busiest_load_per_task, sds->max_load);
 +      pwr_now += sds->this->__cpu_power *
 +                      min(sds->this_load_per_task, sds->this_load);
 +      pwr_now /= SCHED_LOAD_SCALE;
 +
 +      /* Amount of load we'd subtract */
 +      tmp = sg_div_cpu_power(sds->busiest,
 +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 +      if (sds->max_load > tmp)
 +              pwr_move += sds->busiest->__cpu_power *
 +                      min(sds->busiest_load_per_task, sds->max_load - tmp);
 +
 +      /* Amount of load we'd add */
 +      if (sds->max_load * sds->busiest->__cpu_power <
 +              sds->busiest_load_per_task * SCHED_LOAD_SCALE)
 +              tmp = sg_div_cpu_power(sds->this,
 +                      sds->max_load * sds->busiest->__cpu_power);
 +      else
 +              tmp = sg_div_cpu_power(sds->this,
 +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 +      pwr_move += sds->this->__cpu_power *
 +                      min(sds->this_load_per_task, sds->this_load + tmp);
 +      pwr_move /= SCHED_LOAD_SCALE;
 +
 +      /* Move if we gain throughput */
 +      if (pwr_move > pwr_now)
 +              *imbalance = sds->busiest_load_per_task;
 +}
 +
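The new fix_small_imbalance() helper above decides whether migrating a single task from the busiest group still pays off by comparing the capacity usefully consumed before and after a hypothetical move. Below is a minimal userspace sketch of that comparison, with SCALE standing in for SCHED_LOAD_SCALE, sg_div_cpu_power() approximated by plain scaled division, and the "amount of load we'd add" reduced to one branch of the original min(); all identifiers are illustrative, not kernel API.

#include <stdio.h>

#define SCALE 1024UL	/* stands in for SCHED_LOAD_SCALE */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

struct group_view {
	unsigned long load;		/* group's average load, scaled */
	unsigned long power;		/* __cpu_power analogue */
	unsigned long load_per_task;	/* weighted load per runnable task */
};

/* Nonzero when moving one busiest-group task would raise total capacity use. */
static int move_gains_throughput(const struct group_view *busiest,
				 const struct group_view *local)
{
	unsigned long pwr_now = 0, pwr_move = 0, tmp;

	/* capacity usefully consumed as things stand */
	pwr_now += busiest->power * min_ul(busiest->load_per_task, busiest->load);
	pwr_now += local->power * min_ul(local->load_per_task, local->load);
	pwr_now /= SCALE;

	/* load the busiest group would shed by giving up one task */
	tmp = busiest->load_per_task * SCALE / busiest->power;
	if (busiest->load > tmp)
		pwr_move += busiest->power *
			min_ul(busiest->load_per_task, busiest->load - tmp);

	/* load the local group would absorb (one branch of the original min) */
	tmp = busiest->load_per_task * SCALE / local->power;
	pwr_move += local->power * min_ul(local->load_per_task, local->load + tmp);
	pwr_move /= SCALE;

	return pwr_move > pwr_now;
}

int main(void)
{
	struct group_view busiest = { 3 * SCALE, SCALE, SCALE };
	struct group_view local = { 0, SCALE, SCALE / 2 };

	printf("worth moving a task: %s\n",
	       move_gains_throughput(&busiest, &local) ? "yes" : "no");
	return 0;
}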
 +/**
 + * calculate_imbalance - Calculate the amount of imbalance present within the
 + *                     groups of a given sched_domain during load balance.
 + * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 + * @this_cpu: Cpu for which currently load balance is being performed.
 + * @imbalance: The variable to store the imbalance.
 + */
 +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 +              unsigned long *imbalance)
 +{
 +      unsigned long max_pull;
        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
 -      if (max_load < avg_load) {
 +      if (sds->max_load < sds->avg_load) {
                *imbalance = 0;
 -              goto small_imbalance;
 +              return fix_small_imbalance(sds, this_cpu, imbalance);
        }
  
        /* Don't want to pull so many tasks that a group would go idle */
 -      max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 +      max_pull = min(sds->max_load - sds->avg_load,
 +                      sds->max_load - sds->busiest_load_per_task);
  
        /* How much load to actually move to equalise the imbalance */
 -      *imbalance = min(max_pull * busiest->__cpu_power,
 -                              (avg_load - this_load) * this->__cpu_power)
 +      *imbalance = min(max_pull * sds->busiest->__cpu_power,
 +              (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                        / SCHED_LOAD_SCALE;
  
        /*
         * a think about bumping its value to force at least one task to be
         * moved
         */
 -      if (*imbalance < busiest_load_per_task) {
 -              unsigned long tmp, pwr_now, pwr_move;
 -              unsigned int imbn;
 -
 -small_imbalance:
 -              pwr_move = pwr_now = 0;
 -              imbn = 2;
 -              if (this_nr_running) {
 -                      this_load_per_task /= this_nr_running;
 -                      if (busiest_load_per_task > this_load_per_task)
 -                              imbn = 1;
 -              } else
 -                      this_load_per_task = cpu_avg_load_per_task(this_cpu);
 +      if (*imbalance < sds->busiest_load_per_task)
 +              return fix_small_imbalance(sds, this_cpu, imbalance);
  
 -              if (max_load - this_load + busiest_load_per_task >=
 -                                      busiest_load_per_task * imbn) {
 -                      *imbalance = busiest_load_per_task;
 -                      return busiest;
 -              }
 +}
 +/******* find_busiest_group() helpers end here *********************/
  
 -              /*
 -               * OK, we don't have enough imbalance to justify moving tasks,
 -               * however we may be able to increase total CPU power used by
 -               * moving them.
 -               */
 +/**
 + * find_busiest_group - Returns the busiest group within the sched_domain
 + * if there is an imbalance. If there isn't an imbalance, and
 + * the user has opted for power-savings, it returns a group whose
 + * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 + * such a group exists.
 + *
 + * Also calculates the amount of weighted load which should be moved
 + * to restore balance.
 + *
 + * @sd: The sched_domain whose busiest group is to be returned.
 + * @this_cpu: The cpu for which load balancing is currently being performed.
 + * @imbalance: Variable which stores amount of weighted load which should
 + *            be moved to restore balance/put a group to idle.
 + * @idle: The idle status of this_cpu.
 + * @sd_idle: The idleness of sd
 + * @cpus: The set of CPUs under consideration for load-balancing.
 + * @balance: Pointer to a variable indicating if this_cpu
  + *    is the appropriate cpu to perform load balancing at this level.
 + *
 + * Returns:   - the busiest group if imbalance exists.
 + *            - If no imbalance and user has opted for power-savings balance,
 + *               return the least loaded group whose CPUs can be
 + *               put to idle by rebalancing its tasks onto our group.
 + */
 +static struct sched_group *
 +find_busiest_group(struct sched_domain *sd, int this_cpu,
 +                 unsigned long *imbalance, enum cpu_idle_type idle,
 +                 int *sd_idle, const struct cpumask *cpus, int *balance)
 +{
 +      struct sd_lb_stats sds;
  
 -              pwr_now += busiest->__cpu_power *
 -                              min(busiest_load_per_task, max_load);
 -              pwr_now += this->__cpu_power *
 -                              min(this_load_per_task, this_load);
 -              pwr_now /= SCHED_LOAD_SCALE;
 -
 -              /* Amount of load we'd subtract */
 -              tmp = sg_div_cpu_power(busiest,
 -                              busiest_load_per_task * SCHED_LOAD_SCALE);
 -              if (max_load > tmp)
 -                      pwr_move += busiest->__cpu_power *
 -                              min(busiest_load_per_task, max_load - tmp);
 -
 -              /* Amount of load we'd add */
 -              if (max_load * busiest->__cpu_power <
 -                              busiest_load_per_task * SCHED_LOAD_SCALE)
 -                      tmp = sg_div_cpu_power(this,
 -                                      max_load * busiest->__cpu_power);
 -              else
 -                      tmp = sg_div_cpu_power(this,
 -                              busiest_load_per_task * SCHED_LOAD_SCALE);
 -              pwr_move += this->__cpu_power *
 -                              min(this_load_per_task, this_load + tmp);
 -              pwr_move /= SCHED_LOAD_SCALE;
 +      memset(&sds, 0, sizeof(sds));
  
 -              /* Move if we gain throughput */
 -              if (pwr_move > pwr_now)
 -                      *imbalance = busiest_load_per_task;
 -      }
 +      /*
  +       * Compute the various statistics relevant for load balancing at
 +       * this level.
 +       */
 +      update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
 +                                      balance, &sds);
 +
 +      /* Cases where imbalance does not exist from POV of this_cpu */
 +      /* 1) this_cpu is not the appropriate cpu to perform load balancing
 +       *    at this level.
 +       * 2) There is no busy sibling group to pull from.
 +       * 3) This group is the busiest group.
  +       * 4) This group is busier than the average busyness at this
 +       *    sched_domain.
 +       * 5) The imbalance is within the specified limit.
 +       * 6) Any rebalance would lead to ping-pong
 +       */
 +      if (balance && !(*balance))
 +              goto ret;
  
 -      return busiest;
 +      if (!sds.busiest || sds.busiest_nr_running == 0)
 +              goto out_balanced;
  
 -out_balanced:
 -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 -              goto ret;
 +      if (sds.this_load >= sds.max_load)
 +              goto out_balanced;
  
 -      if (this == group_leader && group_leader != group_min) {
 -              *imbalance = min_load_per_task;
 -              if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 -                      cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
 -                              cpumask_first(sched_group_cpus(group_leader));
 -              }
 -              return group_min;
 -      }
 -#endif
 +      sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 +
 +      if (sds.this_load >= sds.avg_load)
 +              goto out_balanced;
 +
 +      if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 +              goto out_balanced;
 +
 +      sds.busiest_load_per_task /= sds.busiest_nr_running;
 +      if (sds.group_imb)
 +              sds.busiest_load_per_task =
 +                      min(sds.busiest_load_per_task, sds.avg_load);
 +
 +      /*
 +       * We're trying to get all the cpus to the average_load, so we don't
 +       * want to push ourselves above the average load, nor do we wish to
 +       * reduce the max loaded cpu below the average load, as either of these
 +       * actions would just result in more rebalancing later, and ping-pong
 +       * tasks around. Thus we look for the minimum possible imbalance.
 +       * Negative imbalances (*we* are more loaded than anyone else) will
 +       * be counted as no imbalance for these purposes -- we can't fix that
 +       * by pulling tasks to us. Be careful of negative numbers as they'll
 +       * appear as very large values with unsigned longs.
 +       */
 +      if (sds.max_load <= sds.busiest_load_per_task)
 +              goto out_balanced;
 +
 +      /* Looks like there is an imbalance. Compute it */
 +      calculate_imbalance(&sds, this_cpu, imbalance);
 +      return sds.busiest;
 +
 +out_balanced:
 +      /*
 +       * There is no obvious imbalance. But check if we can do some balancing
 +       * to save power.
 +       */
 +      if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
 +              return sds.busiest;
  ret:
        *imbalance = 0;
        return NULL;
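find_busiest_group() now bails out through a series of cheap checks before computing an imbalance: the caller must be the right cpu, there must be a busy sibling group, and the gap between this group and the busiest one must exceed the domain's tolerance. A self-contained sketch of those tests follows; IMBALANCE_PCT approximates sd->imbalance_pct (the real initializers use values around 110..125, so the exact figure here is an assumption).

#include <stdio.h>

#define SCALE		1024UL	/* SCHED_LOAD_SCALE analogue */
#define IMBALANCE_PCT	125UL	/* assumed sd->imbalance_pct */

static int worth_balancing(unsigned long this_load, unsigned long max_load,
			   unsigned long total_load, unsigned long total_pwr)
{
	unsigned long avg_load = SCALE * total_load / total_pwr;

	if (this_load >= max_load)
		return 0;	/* we already carry the most load */
	if (this_load >= avg_load)
		return 0;	/* pulling more would overshoot the mean */
	if (100 * max_load <= IMBALANCE_PCT * this_load)
		return 0;	/* the gap is inside the tolerated band */
	return 1;
}

int main(void)
{
	/* two groups of equal power: local carries 1.0, busiest carries 2.0 */
	printf("balance? %d\n",
	       worth_balancing(1 * SCALE, 2 * SCALE, 3 * SCALE, 2 * SCALE));
	return 0;
}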
@@@ -4250,24 -3890,19 +4250,24 @@@ int select_nohz_load_balancer(int stop_
        int cpu = smp_processor_id();
  
        if (stop_tick) {
 -              cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
  
 -              /*
 -               * If we are going offline and still the leader, give up!
 -               */
 -              if (!cpu_active(cpu) &&
 -                  atomic_read(&nohz.load_balancer) == cpu) {
 +              if (!cpu_active(cpu)) {
 +                      if (atomic_read(&nohz.load_balancer) != cpu)
 +                              return 0;
 +
 +                      /*
 +                       * If we are going offline and still the leader,
 +                       * give up!
 +                       */
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                BUG();
 +
                        return 0;
                }
  
 +              cpumask_set_cpu(cpu, nohz.cpu_mask);
 +
                /* time for ilb owner also to sleep */
                if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
@@@ -4427,11 -4062,6 +4427,11 @@@ static void run_rebalance_domains(struc
  #endif
  }
  
 +static inline int on_null_domain(int cpu)
 +{
 +      return !rcu_dereference(cpu_rq(cpu)->sd);
 +}
 +
  /*
   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
   *
@@@ -4489,9 -4119,7 +4489,9 @@@ static inline void trigger_load_balance
            cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
 -      if (time_after_eq(jiffies, rq->next_balance))
 +      /* Don't need to rebalance while attached to NULL domain */
 +      if (time_after_eq(jiffies, rq->next_balance) &&
 +          likely(!on_null_domain(cpu)))
                raise_softirq(SCHED_SOFTIRQ);
  }
  
@@@ -4885,33 -4513,11 +4885,33 @@@ static inline void schedule_debug(struc
  #endif
  }
  
 +static void put_prev_task(struct rq *rq, struct task_struct *prev)
 +{
 +      if (prev->state == TASK_RUNNING) {
 +              u64 runtime = prev->se.sum_exec_runtime;
 +
 +              runtime -= prev->se.prev_sum_exec_runtime;
 +              runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
 +
 +              /*
 +               * In order to avoid avg_overlap growing stale when we are
 +               * indeed overlapping and hence not getting put to sleep, grow
 +               * the avg_overlap on preemption.
 +               *
 +               * We use the average preemption runtime because that
 +               * correlates to the amount of cache footprint a task can
 +               * build up.
 +               */
 +              update_avg(&prev->se.avg_overlap, runtime);
 +      }
 +      prev->sched_class->put_prev_task(rq, prev);
 +}
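put_prev_task() above folds the clamped runtime of the task being switched out into se.avg_overlap through update_avg(). update_avg() itself is not part of this hunk; kernels of this vintage implement it as a running average that moves one eighth of the way toward each new sample, and that weight is an assumption in the standalone sketch below.

#include <stdint.h>
#include <stdio.h>

/* assumed 1/8-weight running average; the kernel uses an arithmetic shift */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)sample - (int64_t)*avg;

	*avg = (uint64_t)((int64_t)*avg + diff / 8);
}

int main(void)
{
	uint64_t avg_overlap = 0;
	uint64_t samples[] = { 800, 800, 800, 100, 100 };	/* ns, made up */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_overlap, samples[i]);
		printf("sample %llu -> avg_overlap %llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg_overlap);
	}
	return 0;
}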
 +
  /*
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
 -pick_next_task(struct rq *rq, struct task_struct *prev)
 +pick_next_task(struct rq *rq)
  {
        const struct sched_class *class;
        struct task_struct *p;
  /*
   * schedule() is the main scheduler function.
   */
- asmlinkage void __sched schedule(void)
+ asmlinkage void __sched __schedule(void)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
  
- need_resched:
-       preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
@@@ -4985,8 -4589,8 +4983,8 @@@ need_resched_nonpreemptible
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
  
 -      prev->sched_class->put_prev_task(rq, prev);
 -      next = pick_next_task(rq, prev);
 +      put_prev_task(rq, prev);
 +      next = pick_next_task(rq);
  
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
  
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
+ }
  
+ asmlinkage void __sched schedule(void)
+ {
+ need_resched:
+       preempt_disable();
+       __schedule();
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
  
+ #ifdef CONFIG_SMP
+ /*
+  * Look out! "owner" is an entirely speculative pointer
+  * access and not reliable.
+  */
+ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+ {
+       unsigned int cpu;
+       struct rq *rq;
+       if (!sched_feat(OWNER_SPIN))
+               return 0;
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+       /*
+        * Need to access the cpu field knowing that
+        * DEBUG_PAGEALLOC could have unmapped it if
+        * the mutex owner just released it and exited.
+        */
+       if (probe_kernel_address(&owner->cpu, cpu))
+               goto out;
+ #else
+       cpu = owner->cpu;
+ #endif
+       /*
+        * Even if the access succeeded (likely case),
+        * the cpu field may no longer be valid.
+        */
+       if (cpu >= nr_cpumask_bits)
+               goto out;
+       /*
+        * We need to validate that we can do a
+        * get_cpu() and that we have the percpu area.
+        */
+       if (!cpu_online(cpu))
+               goto out;
+       rq = cpu_rq(cpu);
+       for (;;) {
+               /*
+                * Owner changed, break to re-assess state.
+                */
+               if (lock->owner != owner)
+                       break;
+               /*
+                * Is that owner really running on that cpu?
+                */
+               if (task_thread_info(rq->curr) != owner || need_resched())
+                       return 0;
+               cpu_relax();
+       }
+ out:
+       return 1;
+ }
+ #endif
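mutex_spin_on_owner() keeps the would-be acquirer spinning only while the same owner both holds the lock and is still running on a CPU; as soon as either condition breaks, the caller either retries the acquisition or goes to sleep. A hedged userspace analogue of that decision, with no relation to the kernel's struct mutex or runqueue layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_owner {
	atomic_bool on_cpu;	/* stands in for "rq->curr is still the owner" */
};

struct toy_mutex {
	_Atomic(struct toy_owner *) owner;
};

/* true: owner released or changed, retry the trylock; false: go to sleep */
static bool spin_on_owner(struct toy_mutex *lock, struct toy_owner *owner)
{
	for (;;) {
		if (atomic_load(&lock->owner) != owner)
			return true;		/* owner changed: re-assess */
		if (!atomic_load(&owner->on_cpu))
			return false;		/* owner scheduled out: sleep */
		/* a cpu_relax() equivalent would sit here */
	}
}

int main(void)
{
	struct toy_owner me;
	struct toy_mutex m;

	atomic_init(&me.on_cpu, false);
	atomic_init(&m.owner, &me);
	printf("decision: %s\n", spin_on_owner(&m, &me) ? "retry" : "sleep");
	return 0;
}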
  #ifdef CONFIG_PREEMPT
  /*
   * this is the entry point to schedule() from in-kernel preemption
@@@ -5041,7 -4712,7 +5106,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
 -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
 +      } while (need_resched());
  }
  EXPORT_SYMBOL(preempt_schedule);
  
@@@ -5070,7 -4741,7 +5135,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
 -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
 +      } while (need_resched());
  }
  
  #endif /* CONFIG_PREEMPT */
@@@ -5544,7 -5215,7 +5609,7 @@@ SYSCALL_DEFINE1(nice, int, increment
        if (increment > 40)
                increment = 40;
  
 -      nice = PRIO_TO_NICE(current->static_prio) + increment;
 +      nice = TASK_NICE(current) + increment;
        if (nice < -20)
                nice = -20;
        if (nice > 19)
@@@ -6343,7 -6014,12 +6408,7 @@@ void sched_show_task(struct task_struc
                printk(KERN_CONT " %016lx ", thread_saved_pc(p));
  #endif
  #ifdef CONFIG_DEBUG_STACK_USAGE
 -      {
 -              unsigned long *n = end_of_stack(p);
 -              while (!*n)
 -                      n++;
 -              free = (unsigned long)n - (unsigned long)end_of_stack(p);
 -      }
 +      free = stack_not_used(p);
  #endif
        printk(KERN_CONT "%5lu %5d %6d\n", free,
                task_pid_nr(p), task_pid_nr(p->real_parent));
@@@ -6817,7 -6493,7 +6882,7 @@@ static void migrate_dead_tasks(unsigne
                if (!rq->nr_running)
                        break;
                update_rq_clock(rq);
 -              next = pick_next_task(rq, rq->curr);
 +              next = pick_next_task(rq);
                if (!next)
                        break;
                next->sched_class->put_prev_task(rq, next);
@@@ -7338,26 -7014,20 +7403,26 @@@ static void free_rootdomain(struct root
  
  static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
 +      struct root_domain *old_rd = NULL;
        unsigned long flags;
  
        spin_lock_irqsave(&rq->lock, flags);
  
        if (rq->rd) {
 -              struct root_domain *old_rd = rq->rd;
 +              old_rd = rq->rd;
  
                if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
  
                cpumask_clear_cpu(rq->cpu, old_rd->span);
  
 -              if (atomic_dec_and_test(&old_rd->refcount))
 -                      free_rootdomain(old_rd);
 +              /*
  +               * If we don't want to free the old_rd yet then
 +               * set old_rd to NULL to skip the freeing later
 +               * in this function:
 +               */
 +              if (!atomic_dec_and_test(&old_rd->refcount))
 +                      old_rd = NULL;
        }
  
        atomic_inc(&rd->refcount);
                set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
 +
 +      if (old_rd)
 +              free_rootdomain(old_rd);
  }
  
  static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
@@@ -8612,15 -8279,11 +8677,15 @@@ static void init_rt_rq(struct rt_rq *rt
        __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 -      rt_rq->highest_prio = MAX_RT_PRIO;
 +      rt_rq->highest_prio.curr = MAX_RT_PRIO;
 +#ifdef CONFIG_SMP
 +      rt_rq->highest_prio.next = MAX_RT_PRIO;
 +#endif
  #endif
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
 +      plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -9622,16 -9285,6 +9687,16 @@@ static int sched_rt_global_constraints(
  
        return ret;
  }
 +
 +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 +{
 +      /* Don't accept realtime tasks when there is no way for them to run */
 +      if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
 +              return 0;
 +
 +      return 1;
 +}
 +
  #else /* !CONFIG_RT_GROUP_SCHED */
  static int sched_rt_global_constraints(void)
  {
@@@ -9725,7 -9378,8 +9790,7 @@@ cpu_cgroup_can_attach(struct cgroup_sub
                      struct task_struct *tsk)
  {
  #ifdef CONFIG_RT_GROUP_SCHED
 -      /* Don't accept realtime tasks when there is no way for them to run */
 -      if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
 +      if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
                return -EINVAL;
  #else
        /* We don't support RT-tasks being in separate groups */
@@@ -9888,7 -9542,7 +9953,7 @@@ cpuacct_destroy(struct cgroup_subsys *s
  
  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
  {
 -      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +      u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
        u64 data;
  
  #ifndef CONFIG_64BIT
  
  static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
  {
 -      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +      u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
  
  #ifndef CONFIG_64BIT
        /*
@@@ -9996,14 -9650,14 +10061,14 @@@ static void cpuacct_charge(struct task_
        struct cpuacct *ca;
        int cpu;
  
 -      if (!cpuacct_subsys.active)
 +      if (unlikely(!cpuacct_subsys.active))
                return;
  
        cpu = task_cpu(tsk);
        ca = task_ca(tsk);
  
        for (; ca; ca = ca->parent) {
 -              u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +              u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
  }
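cpuacct_charge() walks from the task's accounting group up through its parents, adding the sample to the current CPU's slot at every level; the per_cpu_ptr() conversion above only changes how that slot is reached. A rough userspace model, with a fixed array standing in for the per-CPU allocation and all names invented:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

struct toy_cpuacct {
	struct toy_cpuacct *parent;
	uint64_t cpuusage[NR_CPUS];	/* one slot per CPU, like ca->cpuusage */
};

static void toy_charge(struct toy_cpuacct *ca, int cpu, uint64_t cputime)
{
	for (; ca; ca = ca->parent)	/* charge every level of the hierarchy */
		ca->cpuusage[cpu] += cputime;
}

int main(void)
{
	struct toy_cpuacct root = { .parent = NULL };
	struct toy_cpuacct child = { .parent = &root };

	toy_charge(&child, 2, 1000);
	printf("child[2]=%llu root[2]=%llu\n",
	       (unsigned long long)child.cpuusage[2],
	       (unsigned long long)root.cpuusage[2]);
	return 0;
}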
diff --combined kernel/sched_features.h
index 76f61756e677647dda4ae01ede5ac942329a09c8,07bc02e99ab1998a0b32117c2132a3c853937ca2..4569bfa7df9b9fee6f057e22aca18299d3c45f3b
@@@ -1,6 -1,5 +1,6 @@@
  SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
 -SCHED_FEAT(NORMALIZED_SLEEPER, 1)
 +SCHED_FEAT(NORMALIZED_SLEEPER, 0)
 +SCHED_FEAT(ADAPTIVE_GRAN, 1)
  SCHED_FEAT(WAKEUP_PREEMPT, 1)
  SCHED_FEAT(START_DEBIT, 1)
  SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@@ -14,3 -13,4 +14,4 @@@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1
  SCHED_FEAT(ASYM_EFF_LOAD, 1)
  SCHED_FEAT(WAKEUP_OVERLAP, 0)
  SCHED_FEAT(LAST_BUDDY, 1)
+ SCHED_FEAT(OWNER_SPIN, 1)
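sched_features.h is written to be included several times under different definitions of SCHED_FEAT(), so the one list can yield an enum, a default mask and a name table. A generic sketch of that X-macro idiom, reusing feature names from the hunk above; the FEATURE_LIST and FEAT_ spellings are invented for the example and are not the kernel's macros.

#include <stdio.h>

#define FEATURE_LIST		\
	F(NEW_FAIR_SLEEPERS, 1)	\
	F(ADAPTIVE_GRAN, 1)	\
	F(OWNER_SPIN, 1)	\
	F(NORMALIZED_SLEEPER, 0)

enum {
#define F(name, enabled) FEAT_##name,
	FEATURE_LIST
#undef F
	FEAT_NR
};

static const char *feat_names[] = {
#define F(name, enabled) #name,
	FEATURE_LIST
#undef F
};

static const unsigned int feat_default =
#define F(name, enabled) ((enabled) << FEAT_##name) |
	FEATURE_LIST
#undef F
	0;

int main(void)
{
	for (int i = 0; i < FEAT_NR; i++)
		printf("%-20s %s\n", feat_names[i],
		       (feat_default >> i) & 1 ? "on" : "off");
	return 0;
}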
diff --combined kernel/timer.c
index 9b77fc9a9ac8286074b1784771dac7c0eb2a08e2,ef1c385bc5726ee6185575d4fcb3c3cab987f3ed..b4555568b4e4ad16f34a887eabed6f21e05abfba
@@@ -491,14 -491,18 +491,18 @@@ static inline void debug_timer_free(str
        debug_object_free(timer, &timer_debug_descr);
  }
  
- static void __init_timer(struct timer_list *timer);
+ static void __init_timer(struct timer_list *timer,
+                        const char *name,
+                        struct lock_class_key *key);
  
- void init_timer_on_stack(struct timer_list *timer)
+ void init_timer_on_stack_key(struct timer_list *timer,
+                            const char *name,
+                            struct lock_class_key *key)
  {
        debug_object_init_on_stack(timer, &timer_debug_descr);
-       __init_timer(timer);
+       __init_timer(timer, name, key);
  }
- EXPORT_SYMBOL_GPL(init_timer_on_stack);
+ EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
  
  void destroy_timer_on_stack(struct timer_list *timer)
  {
@@@ -512,7 -516,9 +516,9 @@@ static inline void debug_timer_activate
  static inline void debug_timer_deactivate(struct timer_list *timer) { }
  #endif
  
- static void __init_timer(struct timer_list *timer)
+ static void __init_timer(struct timer_list *timer,
+                        const char *name,
+                        struct lock_class_key *key)
  {
        timer->entry.next = NULL;
        timer->base = __raw_get_cpu_var(tvec_bases);
        timer->start_pid = -1;
        memset(timer->start_comm, 0, TASK_COMM_LEN);
  #endif
+       lockdep_init_map(&timer->lockdep_map, name, key, 0);
  }
  
  /**
   * init_timer() must be done to a timer prior to calling *any* of the
   * other timer functions.
   */
- void init_timer(struct timer_list *timer)
+ void init_timer_key(struct timer_list *timer,
+                   const char *name,
+                   struct lock_class_key *key)
  {
        debug_timer_init(timer);
-       __init_timer(timer);
+       __init_timer(timer, name, key);
  }
- EXPORT_SYMBOL(init_timer);
+ EXPORT_SYMBOL(init_timer_key);
  
- void init_timer_deferrable(struct timer_list *timer)
+ void init_timer_deferrable_key(struct timer_list *timer,
+                              const char *name,
+                              struct lock_class_key *key)
  {
-       init_timer(timer);
+       init_timer_key(timer, name, key);
        timer_set_deferrable(timer);
  }
- EXPORT_SYMBOL(init_timer_deferrable);
+ EXPORT_SYMBOL(init_timer_deferrable_key);
  
  static inline void detach_timer(struct timer_list *timer,
                                int clear_pending)
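The point of threading a name and a struct lock_class_key through the new *_key() variants is that a wrapper macro at each init_timer() call site can plant one static key and pass its address plus a stringified name down to __init_timer(), which hands them to lockdep_init_map(); every init location then gets its own lock class. A hedged sketch of that per-call-site pattern in plain C (the toy_* names are illustrative, not the real timer.h wrappers):

#include <stdio.h>

struct toy_key { int dummy; };

struct toy_timer {
	const char *class_name;
	struct toy_key *class_key;
};

static void toy_init_timer_key(struct toy_timer *t, const char *name,
			       struct toy_key *key)
{
	t->class_name = name;
	t->class_key = key;
}

/* one static key per textual call site: same spot in the code, same class */
#define toy_init_timer(t)				\
	do {						\
		static struct toy_key __key;		\
		toy_init_timer_key((t), #t, &__key);	\
	} while (0)

int main(void)
{
	struct toy_timer a, b;

	toy_init_timer(&a);	/* call site 1 gets its own key */
	toy_init_timer(&b);	/* call site 2 gets a different one */
	printf("a: %s key=%p\nb: %s key=%p\n",
	       a.class_name, (void *)a.class_key,
	       b.class_name, (void *)b.class_key);
	return 0;
}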
@@@ -589,14 -600,11 +600,14 @@@ static struct tvec_base *lock_timer_bas
        }
  }
  
 -int __mod_timer(struct timer_list *timer, unsigned long expires)
 +static inline int
 +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
  {
        struct tvec_base *base, *new_base;
        unsigned long flags;
 -      int ret = 0;
 +      int ret;
 +
 +      ret = 0;
  
        timer_stats_timer_set_start_info(timer);
        BUG_ON(!timer->function);
        if (timer_pending(timer)) {
                detach_timer(timer, 0);
                ret = 1;
 +      } else {
 +              if (pending_only)
 +                      goto out_unlock;
        }
  
        debug_timer_activate(timer);
  
        timer->expires = expires;
        internal_add_timer(base, timer);
 +
 +out_unlock:
        spin_unlock_irqrestore(&base->lock, flags);
  
        return ret;
  }
  
 -EXPORT_SYMBOL(__mod_timer);
 -
  /**
 - * add_timer_on - start a timer on a particular CPU
 - * @timer: the timer to be added
 - * @cpu: the CPU to start it on
 + * mod_timer_pending - modify a pending timer's timeout
 + * @timer: the pending timer to be modified
 + * @expires: new timeout in jiffies
   *
 - * This is not very scalable on SMP. Double adds are not possible.
 + * mod_timer_pending() is the same for pending timers as mod_timer(),
 + * but will not re-activate and modify already deleted timers.
 + *
 + * It is useful for unserialized use of timers.
   */
 -void add_timer_on(struct timer_list *timer, int cpu)
 +int mod_timer_pending(struct timer_list *timer, unsigned long expires)
  {
 -      struct tvec_base *base = per_cpu(tvec_bases, cpu);
 -      unsigned long flags;
 -
 -      timer_stats_timer_set_start_info(timer);
 -      BUG_ON(timer_pending(timer) || !timer->function);
 -      spin_lock_irqsave(&base->lock, flags);
 -      timer_set_base(timer, base);
 -      debug_timer_activate(timer);
 -      internal_add_timer(base, timer);
 -      /*
 -       * Check whether the other CPU is idle and needs to be
 -       * triggered to reevaluate the timer wheel when nohz is
 -       * active. We are protected against the other CPU fiddling
 -       * with the timer by holding the timer base lock. This also
 -       * makes sure that a CPU on the way to idle can not evaluate
 -       * the timer wheel.
 -       */
 -      wake_up_idle_cpu(cpu);
 -      spin_unlock_irqrestore(&base->lock, flags);
 +      return __mod_timer(timer, expires, true);
  }
 +EXPORT_SYMBOL(mod_timer_pending);
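mod_timer_pending() reuses __mod_timer() with pending_only set, so it only ever adjusts a timer that is still armed; a timer that has already been deleted stays deleted, which is the property unserialized users rely on. A toy model of that distinction, with no real timer wheel behind it:

#include <stdbool.h>
#include <stdio.h>

struct toy_timer {
	bool pending;
	unsigned long expires;
};

/* returns 1 if the timer was pending before the call, 0 otherwise */
static int toy_mod_timer(struct toy_timer *t, unsigned long expires,
			 bool pending_only)
{
	int was_pending = t->pending;

	if (!was_pending && pending_only)
		return 0;		/* a deleted timer stays deleted */

	t->expires = expires;
	t->pending = true;
	return was_pending;
}

int main(void)
{
	struct toy_timer t = { .pending = false, .expires = 0 };

	toy_mod_timer(&t, 100, false);	/* mod_timer(): arms the timer */
	t.pending = false;		/* del_timer() analogue */
	toy_mod_timer(&t, 200, true);	/* mod_timer_pending(): no-op now */
	printf("pending=%d expires=%lu\n", t.pending, t.expires);
	return 0;
}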
  
  /**
   * mod_timer - modify a timer's timeout
   */
  int mod_timer(struct timer_list *timer, unsigned long expires)
  {
 -      BUG_ON(!timer->function);
 -
 -      timer_stats_timer_set_start_info(timer);
        /*
         * This is a common optimization triggered by the
         * networking code - if the timer is re-modified
        if (timer->expires == expires && timer_pending(timer))
                return 1;
  
 -      return __mod_timer(timer, expires);
 +      return __mod_timer(timer, expires, false);
  }
 -
  EXPORT_SYMBOL(mod_timer);
  
 +/**
 + * add_timer - start a timer
 + * @timer: the timer to be added
 + *
 + * The kernel will do a ->function(->data) callback from the
 + * timer interrupt at the ->expires point in the future. The
 + * current time is 'jiffies'.
 + *
 + * The timer's ->expires, ->function (and if the handler uses it, ->data)
 + * fields must be set prior calling this function.
  +  * fields must be set prior to calling this function.
 + * Timers with an ->expires field in the past will be executed in the next
 + * timer tick.
 + */
 +void add_timer(struct timer_list *timer)
 +{
 +      BUG_ON(timer_pending(timer));
 +      mod_timer(timer, timer->expires);
 +}
 +EXPORT_SYMBOL(add_timer);
 +
 +/**
 + * add_timer_on - start a timer on a particular CPU
 + * @timer: the timer to be added
 + * @cpu: the CPU to start it on
 + *
 + * This is not very scalable on SMP. Double adds are not possible.
 + */
 +void add_timer_on(struct timer_list *timer, int cpu)
 +{
 +      struct tvec_base *base = per_cpu(tvec_bases, cpu);
 +      unsigned long flags;
 +
 +      timer_stats_timer_set_start_info(timer);
 +      BUG_ON(timer_pending(timer) || !timer->function);
 +      spin_lock_irqsave(&base->lock, flags);
 +      timer_set_base(timer, base);
 +      debug_timer_activate(timer);
 +      internal_add_timer(base, timer);
 +      /*
 +       * Check whether the other CPU is idle and needs to be
 +       * triggered to reevaluate the timer wheel when nohz is
 +       * active. We are protected against the other CPU fiddling
 +       * with the timer by holding the timer base lock. This also
 +       * makes sure that a CPU on the way to idle can not evaluate
 +       * the timer wheel.
 +       */
 +      wake_up_idle_cpu(cpu);
 +      spin_unlock_irqrestore(&base->lock, flags);
 +}
 +
  /**
   * del_timer - deactivate a timer.
   * @timer: the timer to be deactivated
@@@ -772,6 -744,7 +783,6 @@@ int del_timer(struct timer_list *timer
  
        return ret;
  }
 -
  EXPORT_SYMBOL(del_timer);
  
  #ifdef CONFIG_SMP
@@@ -805,6 -778,7 +816,6 @@@ out
  
        return ret;
  }
 -
  EXPORT_SYMBOL(try_to_del_timer_sync);
  
  /**
   */
  int del_timer_sync(struct timer_list *timer)
  {
+ #ifdef CONFIG_LOCKDEP
+       unsigned long flags;
+       local_irq_save(flags);
+       lock_map_acquire(&timer->lockdep_map);
+       lock_map_release(&timer->lockdep_map);
+       local_irq_restore(flags);
+ #endif
        for (;;) {
                int ret = try_to_del_timer_sync(timer);
                if (ret >= 0)
                cpu_relax();
        }
  }
 -
  EXPORT_SYMBOL(del_timer_sync);
  #endif
  
@@@ -897,10 -881,36 +917,36 @@@ static inline void __run_timers(struct 
  
                        set_running_timer(base, timer);
                        detach_timer(timer, 1);
                        spin_unlock_irq(&base->lock);
                        {
                                int preempt_count = preempt_count();
+ #ifdef CONFIG_LOCKDEP
+                               /*
+                                * It is permissible to free the timer from
+                                * inside the function that is called from
+                                * it; we need to take this into account for
+                                * lockdep too. To avoid bogus "held lock
+                                * freed" warnings as well as problems when
+                                * looking into timer->lockdep_map, make a
+                                * copy and use that here.
+                                */
+                               struct lockdep_map lockdep_map =
+                                       timer->lockdep_map;
+ #endif
+                               /*
+                                * Couple the lock chain with the lock chain at
+                                * del_timer_sync() by acquiring the lock_map
+                                * around the fn() call here and in
+                                * del_timer_sync().
+                                */
+                               lock_map_acquire(&lockdep_map);
                                fn(data);
+                               lock_map_release(&lockdep_map);
                                if (preempt_count != preempt_count()) {
                                        printk(KERN_ERR "huh, entered %p "
                                               "with preempt_count %08x, exited"
@@@ -1304,7 -1314,7 +1350,7 @@@ signed long __sched schedule_timeout(si
        expire = timeout + jiffies;
  
        setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
 -      __mod_timer(&timer, expire);
 +      __mod_timer(&timer, expire, false);
        schedule();
        del_singleshot_timer_sync(&timer);
  
diff --combined mm/page_alloc.c
index 5c44ed49ca93c5a1d7a39f5570b0d1254a45de80,22b15a4cde8a7f79d8e0ec8407e6123750f69288..a3803ea8c27d5aaaccb5bd2d1f8141786dc9cc27
@@@ -1479,6 -1479,8 +1479,8 @@@ __alloc_pages_internal(gfp_t gfp_mask, 
        unsigned long did_some_progress;
        unsigned long pages_reclaimed = 0;
  
+       lockdep_trace_alloc(gfp_mask);
        might_sleep_if(wait);
  
        if (should_fail_alloc_page(gfp_mask, order))
@@@ -1578,12 -1580,15 +1580,15 @@@ nofail_alloc
         */
        cpuset_update_task_memory_state();
        p->flags |= PF_MEMALLOC;
+       lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
  
        did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
  
        p->reclaim_state = NULL;
+       lockdep_clear_current_reclaim_state();
        p->flags &= ~PF_MEMALLOC;
  
        cond_resched();
@@@ -2989,7 -2994,7 +2994,7 @@@ static int __meminit next_active_region
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
 -int __meminit early_pfn_to_nid(unsigned long pfn)
 +int __meminit __early_pfn_to_nid(unsigned long pfn)
  {
        int i;
  
                if (start_pfn <= pfn && pfn < end_pfn)
                        return early_node_map[i].nid;
        }
 +      /* This is a memory hole */
 +      return -1;
 +}
 +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 +
 +int __meminit early_pfn_to_nid(unsigned long pfn)
 +{
 +      int nid;
  
 +      nid = __early_pfn_to_nid(pfn);
 +      if (nid >= 0)
 +              return nid;
 +      /* just returns 0 */
        return 0;
  }
 -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 +
 +#ifdef CONFIG_NODES_SPAN_OTHER_NODES
 +bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 +{
 +      int nid;
 +
 +      nid = __early_pfn_to_nid(pfn);
 +      if (nid >= 0 && nid != node)
 +              return false;
 +      return true;
 +}
 +#endif
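The split above turns the old lookup into __early_pfn_to_nid(), which reports a hole as -1, while the public early_pfn_to_nid() keeps the historical fall-back to node 0 and early_pfn_in_nid() treats a hole as matching any node. A self-contained sketch of that wrapper pattern over a made-up early_node_map:

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long start, end; int nid; };

static const struct range early_map[] = {
	{ 0x000, 0x100, 0 },
	{ 0x200, 0x300, 1 },	/* note the hole at [0x100, 0x200) */
};

static int lookup_nid(unsigned long pfn)	/* __early_pfn_to_nid() analogue */
{
	for (unsigned int i = 0; i < sizeof(early_map) / sizeof(early_map[0]); i++)
		if (early_map[i].start <= pfn && pfn < early_map[i].end)
			return early_map[i].nid;
	return -1;				/* memory hole */
}

static int pfn_to_nid(unsigned long pfn)	/* early_pfn_to_nid() analogue */
{
	int nid = lookup_nid(pfn);

	return nid >= 0 ? nid : 0;		/* holes fall back to node 0 */
}

static bool pfn_in_nid(unsigned long pfn, int node)	/* early_pfn_in_nid() */
{
	int nid = lookup_nid(pfn);

	return nid < 0 || nid == node;		/* a hole matches any node */
}

int main(void)
{
	printf("pfn 0x180: nid=%d, in node 1? %d\n",
	       pfn_to_nid(0x180), pfn_in_nid(0x180, 1));
	return 0;
}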
  
  /* Basic iterator support to walk early_node_map[] */
  #define for_each_active_range_index_in_nid(i, nid) \
diff --combined mm/slab.c
index 4d00855629c4921e3c590406335549683e6c6daf,6b61de8543ec25532d55f75f82c2576860d58974..825c606f691d11bfb3372627aff50f115358aff3
+++ b/mm/slab.c
@@@ -3318,6 -3318,8 +3318,8 @@@ __cache_alloc_node(struct kmem_cache *c
        unsigned long save_flags;
        void *ptr;
  
+       lockdep_trace_alloc(flags);
        if (slab_should_failslab(cachep, flags))
                return NULL;
  
@@@ -3394,6 -3396,8 +3396,8 @@@ __cache_alloc(struct kmem_cache *cachep
        unsigned long save_flags;
        void *objp;
  
+       lockdep_trace_alloc(flags);
        if (slab_should_failslab(cachep, flags))
                return NULL;
  
@@@ -4457,4 -4461,3 +4461,4 @@@ size_t ksize(const void *objp
  
        return obj_size(virt_to_cache(objp));
  }
 +EXPORT_SYMBOL(ksize);
diff --combined mm/slob.c
index 0bfa680a8981127307d1d922231dca3d66f9b2bd,4b1c0c1d63cb173a1dca86a1ae0ef8423332ea58..7a3411524dacd555e9ce70e287a4232072391e10
+++ b/mm/slob.c
@@@ -126,9 -126,9 +126,9 @@@ static LIST_HEAD(free_slob_medium)
  static LIST_HEAD(free_slob_large);
  
  /*
 - * slob_page: True for all slob pages (false for bigblock pages)
 + * is_slob_page: True for all slob pages (false for bigblock pages)
   */
 -static inline int slob_page(struct slob_page *sp)
 +static inline int is_slob_page(struct slob_page *sp)
  {
        return PageSlobPage((struct page *)sp);
  }
@@@ -143,11 -143,6 +143,11 @@@ static inline void clear_slob_page(stru
        __ClearPageSlobPage((struct page *)sp);
  }
  
 +static inline struct slob_page *slob_page(const void *addr)
 +{
 +      return (struct slob_page *)virt_to_page(addr);
 +}
 +
  /*
   * slob_page_free: true for pages on free_slob_pages list.
   */
@@@ -235,7 -230,7 +235,7 @@@ static int slob_last(slob_t *s
        return !((unsigned long)slob_next(s) & ~PAGE_MASK);
  }
  
 -static void *slob_new_page(gfp_t gfp, int order, int node)
 +static void *slob_new_pages(gfp_t gfp, int order, int node)
  {
        void *page;
  
        return page_address(page);
  }
  
 +static void slob_free_pages(void *b, int order)
 +{
 +      free_pages((unsigned long)b, order);
 +}
 +
  /*
   * Allocate a slob block within a given slob_page sp.
   */
  static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
  {
 -      slob_t *prev, *cur, *aligned = 0;
 +      slob_t *prev, *cur, *aligned = NULL;
        int delta = 0, units = SLOB_UNITS(size);
  
        for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
@@@ -359,10 -349,10 +359,10 @@@ static void *slob_alloc(size_t size, gf
  
        /* Not enough space: must allocate a new page */
        if (!b) {
 -              b = slob_new_page(gfp & ~__GFP_ZERO, 0, node);
 +              b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
                if (!b)
 -                      return 0;
 -              sp = (struct slob_page *)virt_to_page(b);
 +                      return NULL;
 +              sp = slob_page(b);
                set_slob_page(sp);
  
                spin_lock_irqsave(&slob_lock, flags);
@@@ -394,7 -384,7 +394,7 @@@ static void slob_free(void *block, int 
                return;
        BUG_ON(!size);
  
 -      sp = (struct slob_page *)virt_to_page(block);
 +      sp = slob_page(block);
        units = SLOB_UNITS(size);
  
        spin_lock_irqsave(&slob_lock, flags);
                /* Go directly to page allocator. Do not pass slob allocator */
                if (slob_page_free(sp))
                        clear_slob_page_free(sp);
 +              spin_unlock_irqrestore(&slob_lock, flags);
                clear_slob_page(sp);
                free_slob_page(sp);
                free_page((unsigned long)b);
 -              goto out;
 +              return;
        }
  
        if (!slob_page_free(sp)) {
@@@ -475,6 -464,8 +475,8 @@@ void *__kmalloc_node(size_t size, gfp_
        unsigned int *m;
        int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
  
+       lockdep_trace_alloc(gfp);
        if (size < PAGE_SIZE - align) {
                if (!size)
                        return ZERO_SIZE_PTR;
        } else {
                void *ret;
  
 -              ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
 +              ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
                if (ret) {
                        struct page *page;
                        page = virt_to_page(ret);
@@@ -505,8 -496,8 +507,8 @@@ void kfree(const void *block
        if (unlikely(ZERO_OR_NULL_PTR(block)))
                return;
  
 -      sp = (struct slob_page *)virt_to_page(block);
 -      if (slob_page(sp)) {
 +      sp = slob_page(block);
 +      if (is_slob_page(sp)) {
                int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
                unsigned int *m = (unsigned int *)(block - align);
                slob_free(m, *m + align);
@@@ -524,15 -515,14 +526,15 @@@ size_t ksize(const void *block
        if (unlikely(block == ZERO_SIZE_PTR))
                return 0;
  
 -      sp = (struct slob_page *)virt_to_page(block);
 -      if (slob_page(sp)) {
 +      sp = slob_page(block);
 +      if (is_slob_page(sp)) {
                int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
                unsigned int *m = (unsigned int *)(block - align);
                return SLOB_UNITS(*m) * SLOB_UNIT;
        } else
                return sp->page.private;
  }
 +EXPORT_SYMBOL(ksize);
  
  struct kmem_cache {
        unsigned int size, align;
@@@ -584,7 -574,7 +586,7 @@@ void *kmem_cache_alloc_node(struct kmem
        if (c->size < PAGE_SIZE)
                b = slob_alloc(c->size, flags, c->align, node);
        else
 -              b = slob_new_page(flags, get_order(c->size), node);
 +              b = slob_new_pages(flags, get_order(c->size), node);
  
        if (c->ctor)
                c->ctor(b);
@@@ -598,7 -588,7 +600,7 @@@ static void __kmem_cache_free(void *b, 
        if (size < PAGE_SIZE)
                slob_free(b, size);
        else
 -              free_pages((unsigned long)b, get_order(size));
 +              slob_free_pages(b, get_order(size));
  }
  
  static void kmem_rcu_free(struct rcu_head *head)
diff --combined mm/slub.c
index c65a4edafc3343e83a2513f8edab9b0ed82f3ab5,214eb207c513f46276eb06106e94a92e2aa2060b..c4ea9158c9fbd0e4630062aa0098406000be45c3
+++ b/mm/slub.c
@@@ -374,8 -374,14 +374,8 @@@ static struct track *get_track(struct k
  static void set_track(struct kmem_cache *s, void *object,
                        enum track_item alloc, unsigned long addr)
  {
 -      struct track *p;
 -
 -      if (s->offset)
 -              p = object + s->offset + sizeof(void *);
 -      else
 -              p = object + s->inuse;
 +      struct track *p = get_track(s, object, alloc);
  
 -      p += alloc;
        if (addr) {
                p->addr = addr;
                p->cpu = smp_processor_id();
@@@ -1329,7 -1335,7 +1329,7 @@@ static struct page *get_any_partial(str
                n = get_node(s, zone_to_nid(zone));
  
                if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
 -                              n->nr_partial > n->min_partial) {
 +                              n->nr_partial > s->min_partial) {
                        page = get_partial_node(n);
                        if (page)
                                return page;
@@@ -1381,7 -1387,7 +1381,7 @@@ static void unfreeze_slab(struct kmem_c
                slab_unlock(page);
        } else {
                stat(c, DEACTIVATE_EMPTY);
 -              if (n->nr_partial < n->min_partial) {
 +              if (n->nr_partial < s->min_partial) {
                        /*
                         * Adding an empty slab to the partial slabs in order
                         * to avoid page allocator overhead. This slab needs
@@@ -1590,6 -1596,7 +1590,7 @@@ static __always_inline void *slab_alloc
        unsigned long flags;
        unsigned int objsize;
  
+       lockdep_trace_alloc(gfpflags);
        might_sleep_if(gfpflags & __GFP_WAIT);
  
        if (should_failslab(s->objsize, gfpflags))
@@@ -1718,7 -1725,7 +1719,7 @@@ static __always_inline void slab_free(s
        c = get_cpu_slab(s, smp_processor_id());
        debug_check_no_locks_freed(object, c->objsize);
        if (!(s->flags & SLAB_DEBUG_OBJECTS))
 -              debug_check_no_obj_freed(object, s->objsize);
 +              debug_check_no_obj_freed(object, c->objsize);
        if (likely(page == c->page && c->node >= 0)) {
                object[c->offset] = c->freelist;
                c->freelist = object;
@@@ -1838,7 -1845,6 +1839,7 @@@ static inline int calculate_order(int s
        int order;
        int min_objects;
        int fraction;
 +      int max_objects;
  
        /*
         * Attempt to find best configuration for a slab. This
        min_objects = slub_min_objects;
        if (!min_objects)
                min_objects = 4 * (fls(nr_cpu_ids) + 1);
 +      max_objects = (PAGE_SIZE << slub_max_order)/size;
 +      min_objects = min(min_objects, max_objects);
 +
        while (min_objects > 1) {
                fraction = 16;
                while (fraction >= 4) {
                                return order;
                        fraction /= 2;
                }
 -              min_objects /= 2;
 +              min_objects --;
        }
  
        /*
@@@ -1926,6 -1929,17 +1927,6 @@@ static voi
  init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
  {
        n->nr_partial = 0;
 -
 -      /*
 -       * The larger the object size is, the more pages we want on the partial
 -       * list to avoid pounding the page allocator excessively.
 -       */
 -      n->min_partial = ilog2(s->size);
 -      if (n->min_partial < MIN_PARTIAL)
 -              n->min_partial = MIN_PARTIAL;
 -      else if (n->min_partial > MAX_PARTIAL)
 -              n->min_partial = MAX_PARTIAL;
 -
        spin_lock_init(&n->list_lock);
        INIT_LIST_HEAD(&n->partial);
  #ifdef CONFIG_SLUB_DEBUG
@@@ -2168,15 -2182,6 +2169,15 @@@ static int init_kmem_cache_nodes(struc
  }
  #endif
  
 +static void set_min_partial(struct kmem_cache *s, unsigned long min)
 +{
 +      if (min < MIN_PARTIAL)
 +              min = MIN_PARTIAL;
 +      else if (min > MAX_PARTIAL)
 +              min = MAX_PARTIAL;
 +      s->min_partial = min;
 +}
 +
  /*
   * calculate_sizes() determines the order and the distribution of data within
   * a slab object.
@@@ -2315,11 -2320,6 +2316,11 @@@ static int kmem_cache_open(struct kmem_
        if (!calculate_sizes(s, -1))
                goto error;
  
 +      /*
 +       * The larger the object size is, the more pages we want on the partial
 +       * list to avoid pounding the page allocator excessively.
 +       */
 +      set_min_partial(s, ilog2(s->size));
        s->refcount = 1;
  #ifdef CONFIG_NUMA
        s->remote_node_defrag_ratio = 1000;
@@@ -2476,7 -2476,7 +2477,7 @@@ EXPORT_SYMBOL(kmem_cache_destroy)
   *            Kmalloc subsystem
   *******************************************************************/
  
 -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
 +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
  EXPORT_SYMBOL(kmalloc_caches);
  
  static int __init setup_slub_min_order(char *str)
@@@ -2538,7 -2538,7 +2539,7 @@@ panic
  }
  
  #ifdef CONFIG_ZONE_DMA
 -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
 +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
  
  static void sysfs_add_func(struct work_struct *w)
  {
@@@ -2659,7 -2659,7 +2660,7 @@@ void *__kmalloc(size_t size, gfp_t flag
  {
        struct kmem_cache *s;
  
 -      if (unlikely(size > PAGE_SIZE))
 +      if (unlikely(size > SLUB_MAX_SIZE))
                return kmalloc_large(size, flags);
  
        s = get_slab(size, flags);
@@@ -2687,7 -2687,7 +2688,7 @@@ void *__kmalloc_node(size_t size, gfp_
  {
        struct kmem_cache *s;
  
 -      if (unlikely(size > PAGE_SIZE))
 +      if (unlikely(size > SLUB_MAX_SIZE))
                return kmalloc_large_node(size, flags, node);
  
        s = get_slab(size, flags);
@@@ -2737,7 -2737,6 +2738,7 @@@ size_t ksize(const void *object
         */
        return s->size;
  }
 +EXPORT_SYMBOL(ksize);
  
  void kfree(const void *x)
  {
@@@ -2987,7 -2986,7 +2988,7 @@@ void __init kmem_cache_init(void
                caches++;
        }
  
 -      for (i = KMALLOC_SHIFT_LOW; i <PAGE_SHIFT; i++) {
 +      for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
                create_kmalloc_cache(&kmalloc_caches[i],
                        "kmalloc", 1 << i, GFP_KERNEL);
                caches++;
        slab_state = UP;
  
        /* Provide the correct kmalloc names now that the caches are up */
 -      for (i = KMALLOC_SHIFT_LOW; i <PAGE_SHIFT; i++)
 +      for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
                kmalloc_caches[i]. name =
                        kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
  
@@@ -3224,7 -3223,7 +3225,7 @@@ void *__kmalloc_track_caller(size_t siz
  {
        struct kmem_cache *s;
  
 -      if (unlikely(size > PAGE_SIZE))
 +      if (unlikely(size > SLUB_MAX_SIZE))
                return kmalloc_large(size, gfpflags);
  
        s = get_slab(size, gfpflags);
@@@ -3240,7 -3239,7 +3241,7 @@@ void *__kmalloc_node_track_caller(size_
  {
        struct kmem_cache *s;
  
 -      if (unlikely(size > PAGE_SIZE))
 +      if (unlikely(size > SLUB_MAX_SIZE))
                return kmalloc_large_node(size, gfpflags, node);
  
        s = get_slab(size, gfpflags);
@@@ -3837,26 -3836,6 +3838,26 @@@ static ssize_t order_show(struct kmem_c
  }
  SLAB_ATTR(order);
  
 +static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
 +{
 +      return sprintf(buf, "%lu\n", s->min_partial);
 +}
 +
 +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
 +                               size_t length)
 +{
 +      unsigned long min;
 +      int err;
 +
 +      err = strict_strtoul(buf, 10, &min);
 +      if (err)
 +              return err;
 +
 +      set_min_partial(s, min);
 +      return length;
 +}
 +SLAB_ATTR(min_partial);
 +
  static ssize_t ctor_show(struct kmem_cache *s, char *buf)
  {
        if (s->ctor) {
@@@ -4172,7 -4151,6 +4173,7 @@@ static struct attribute *slab_attrs[] 
        &object_size_attr.attr,
        &objs_per_slab_attr.attr,
        &order_attr.attr,
 +      &min_partial_attr.attr,
        &objects_attr.attr,
        &objects_partial_attr.attr,
        &total_objects_attr.attr,
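Earlier in this file's diff, calculate_order() is changed to cap the desired object count at what the largest allowed order can hold and to relax it one object at a time instead of halving it, so fewer sizes fall through to the high-order fallback. A hedged, self-contained model of that search; PAGE_SZ, MAX_ORDER and the 16..4 waste ladder mirror the spirit of the code rather than its exact constants.

#include <stdio.h>

#define PAGE_SZ		4096UL
#define MAX_ORDER	3

/* smallest order that fits min_objects and wastes at most 1/fract of the slab */
static int slab_order(unsigned long size, unsigned long min_objects,
		      unsigned long fract_leftover)
{
	for (int order = 0; order <= MAX_ORDER; order++) {
		unsigned long slab = PAGE_SZ << order;

		if (slab / size < min_objects)
			continue;
		if (slab % size <= slab / fract_leftover)
			return order;
	}
	return -1;
}

static int calculate_order(unsigned long size, unsigned long min_objects)
{
	unsigned long max_objects = (PAGE_SZ << MAX_ORDER) / size;

	if (min_objects > max_objects)
		min_objects = max_objects;	/* the new cap */

	while (min_objects > 1) {
		for (unsigned long fraction = 16; fraction >= 4; fraction /= 2) {
			int order = slab_order(size, min_objects, fraction);

			if (order >= 0)
				return order;
		}
		min_objects--;			/* was: min_objects /= 2 */
	}
	return slab_order(size, 1, 1);		/* last resort: accept any fit */
}

int main(void)
{
	printf("order for 700-byte objects: %d\n", calculate_order(700, 16));
	return 0;
}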
diff --combined mm/vmscan.c
index 56ddf41149eb77a55158ced628c02f817b990618,cf844134527759ced5c2093b89174779006c747d..479e46719394cceccad44d55c28db3e2942616e7
@@@ -1262,6 -1262,7 +1262,6 @@@ static void shrink_active_list(unsigne
         * Move the pages to the [file or anon] inactive list.
         */
        pagevec_init(&pvec, 1);
 -      pgmoved = 0;
        lru = LRU_BASE + file * LRU_FILE;
  
        spin_lock_irq(&zone->lru_lock);
         */
        reclaim_stat->recent_rotated[!!file] += pgmoved;
  
 +      pgmoved = 0;
        while (!list_empty(&l_inactive)) {
                page = lru_to_page(&l_inactive);
                prefetchw_prev_lru_page(page, &l_inactive, flags);
@@@ -1469,7 -1469,7 +1469,7 @@@ static void shrink_zone(int priority, s
                int file = is_file_lru(l);
                int scan;
  
 -              scan = zone_page_state(zone, NR_LRU_BASE + l);
 +              scan = zone_nr_pages(zone, sc, l);
                if (priority) {
                        scan >>= priority;
                        scan = (scan * percent[file]) / 100;
@@@ -1965,6 -1965,8 +1965,8 @@@ static int kswapd(void *p
        };
        node_to_cpumask_ptr(cpumask, pgdat->node_id);
  
+       lockdep_set_current_reclaim_state(GFP_KERNEL);
        if (!cpumask_empty(cpumask))
                set_cpus_allowed_ptr(tsk, cpumask);
        current->reclaim_state = &reclaim_state;
@@@ -2057,31 -2059,31 +2059,31 @@@ static unsigned long shrink_all_zones(u
                                      int pass, struct scan_control *sc)
  {
        struct zone *zone;
 -      unsigned long nr_to_scan, ret = 0;
 -      enum lru_list l;
 +      unsigned long ret = 0;
  
        for_each_zone(zone) {
 +              enum lru_list l;
  
                if (!populated_zone(zone))
                        continue;
 -
                if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
                        continue;
  
                for_each_evictable_lru(l) {
 +                      enum zone_stat_item ls = NR_LRU_BASE + l;
 +                      unsigned long lru_pages = zone_page_state(zone, ls);
 +
                        /* For pass = 0, we don't shrink the active list */
 -                      if (pass == 0 &&
 -                              (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
 +                      if (pass == 0 && (l == LRU_ACTIVE_ANON ||
 +                                              l == LRU_ACTIVE_FILE))
                                continue;
  
 -                      zone->lru[l].nr_scan +=
 -                              (zone_page_state(zone, NR_LRU_BASE + l)
 -                                                              >> prio) + 1;
 +                      zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
                        if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
 +                              unsigned long nr_to_scan;
 +
                                zone->lru[l].nr_scan = 0;
 -                              nr_to_scan = min(nr_pages,
 -                                      zone_page_state(zone,
 -                                                      NR_LRU_BASE + l));
 +                              nr_to_scan = min(nr_pages, lru_pages);
                                ret += shrink_list(l, nr_to_scan, zone,
                                                                sc, prio);
                                if (ret >= nr_pages)
                        }
                }
        }
 -
        return ret;
  }
  
@@@ -2111,6 -2114,7 +2113,6 @@@ unsigned long shrink_all_memory(unsigne
                .may_swap = 0,
                .swap_cluster_max = nr_pages,
                .may_writepage = 1,
 -              .swappiness = vm_swappiness,
                .isolate_pages = isolate_pages_global,
        };
  
                int prio;
  
                /* Force reclaiming mapped pages in the passes #3 and #4 */
 -              if (pass > 2) {
 +              if (pass > 2)
                        sc.may_swap = 1;
 -                      sc.swappiness = 100;
 -              }
  
                for (prio = DEF_PRIORITY; prio >= 0; prio--) {
                        unsigned long nr_to_scan = nr_pages - ret;