git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
Merge branch 'locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 31 Mar 2009 00:17:35 +0000 (17:17 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 31 Mar 2009 00:17:35 +0000 (17:17 -0700)
* 'locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (33 commits)
  lockdep: fix deadlock in lockdep_trace_alloc
  lockdep: annotate reclaim context (__GFP_NOFS), fix SLOB
  lockdep: annotate reclaim context (__GFP_NOFS), fix
  lockdep: build fix for !PROVE_LOCKING
  lockstat: warn about disabled lock debugging
  lockdep: use stringify.h
  lockdep: simplify check_prev_add_irq()
  lockdep: get_user_chars() redo
  lockdep: simplify get_user_chars()
  lockdep: add comments to mark_lock_irq()
  lockdep: remove macro usage from mark_held_locks()
  lockdep: fully reduce mark_lock_irq()
  lockdep: merge the !_READ mark_lock_irq() helpers
  lockdep: merge the _READ mark_lock_irq() helpers
  lockdep: simplify mark_lock_irq() helpers #3
  lockdep: further simplify mark_lock_irq() helpers
  lockdep: simplify the mark_lock_irq() helpers
  lockdep: split up mark_lock_irq()
  lockdep: generate usage strings
  lockdep: generate the state bit definitions
  ...

include/linux/sched.h
include/linux/timer.h
kernel/sched.c
kernel/sched_features.h
kernel/timer.c
mm/page_alloc.c
mm/slab.c
mm/slob.c
mm/slub.c
mm/vmscan.c

diff --combined include/linux/sched.h
index 1d19c025f9d2e78b34692b3f350d9bef6fe522ed,b00a77f4999ed64d2fe668532819e2a29dfb6fd6..29df6374d2de517e078c7dd29dd5eb6b7c18f82b
@@@ -331,7 -331,9 +331,9 @@@ extern signed long schedule_timeout(sig
  extern signed long schedule_timeout_interruptible(signed long timeout);
  extern signed long schedule_timeout_killable(signed long timeout);
  extern signed long schedule_timeout_uninterruptible(signed long timeout);
+ asmlinkage void __schedule(void);
  asmlinkage void schedule(void);
+ extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
  
  struct nsproxy;
  struct user_namespace;
@@@ -443,6 -445,7 +445,6 @@@ struct pacct_struct 
   * @utime:            time spent in user mode, in &cputime_t units
   * @stime:            time spent in kernel mode, in &cputime_t units
   * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
 - * @lock:             lock for fields in this struct
   *
   * This structure groups together three kinds of CPU time that are
   * tracked for threads and thread groups.  Most things considering
@@@ -453,33 -456,23 +455,33 @@@ struct task_cputime 
        cputime_t utime;
        cputime_t stime;
        unsigned long long sum_exec_runtime;
 -      spinlock_t lock;
  };
  /* Alternate field names when used to cache expirations. */
  #define prof_exp      stime
  #define virt_exp      utime
  #define sched_exp     sum_exec_runtime
  
 +#define INIT_CPUTIME  \
 +      (struct task_cputime) {                                 \
 +              .utime = cputime_zero,                          \
 +              .stime = cputime_zero,                          \
 +              .sum_exec_runtime = 0,                          \
 +      }
 +
  /**
 - * struct thread_group_cputime - thread group interval timer counts
 - * @totals:           thread group interval timers; substructure for
 - *                    uniprocessor kernel, per-cpu for SMP kernel.
 + * struct thread_group_cputimer - thread group interval timer counts
 + * @cputime:          thread group interval timers.
 + * @running:          non-zero when there are timers running and
 + *                    @cputime receives updates.
 + * @lock:             lock for fields in this struct.
   *
   * This structure contains the version of task_cputime, above, that is
 - * used for thread group CPU clock calculations.
 + * used for thread group CPU timer calculations.
   */
 -struct thread_group_cputime {
 -      struct task_cputime totals;
 +struct thread_group_cputimer {
 +      struct task_cputime cputime;
 +      int running;
 +      spinlock_t lock;
  };
  
  /*
@@@ -528,10 -521,10 +530,10 @@@ struct signal_struct 
        cputime_t it_prof_incr, it_virt_incr;
  
        /*
 -       * Thread group totals for process CPU clocks.
 -       * See thread_group_cputime(), et al, for details.
 +       * Thread group totals for process CPU timers.
 +       * See thread_group_cputimer(), et al, for details.
         */
 -      struct thread_group_cputime cputime;
 +      struct thread_group_cputimer cputimer;
  
        /* Earliest-expiration cache. */
        struct task_cputime cputime_expires;
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
 -      cputime_t cutime, cstime;
 +      cputime_t utime, stime, cutime, cstime;
        cputime_t gtime;
        cputime_t cgtime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long inblock, oublock, cinblock, coublock;
        struct task_io_accounting ioac;
  
 +      /*
 +       * Cumulative ns of scheduled CPU time of dead threads in the
 +       * group, not including a zombie group leader. (This only differs
 +       * from jiffies_to_ns(utime + stime) if sched_clock uses something
 +       * other than jiffies.)
 +       */
 +      unsigned long long sum_sched_runtime;
 +
        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
@@@ -998,7 -983,6 +1000,7 @@@ struct sched_class 
                              struct rq *busiest, struct sched_domain *sd,
                              enum cpu_idle_type idle);
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 +      int (*needs_post_schedule) (struct rq *this_rq);
        void (*post_schedule) (struct rq *this_rq);
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
@@@ -1053,10 -1037,6 +1055,10 @@@ struct sched_entity 
        u64                     last_wakeup;
        u64                     avg_overlap;
  
 +      u64                     start_runtime;
 +      u64                     avg_wakeup;
 +      u64                     nr_migrations;
 +
  #ifdef CONFIG_SCHEDSTATS
        u64                     wait_start;
        u64                     wait_max;
        u64                     exec_max;
        u64                     slice_max;
  
 -      u64                     nr_migrations;
        u64                     nr_migrations_cold;
        u64                     nr_failed_migrations_affine;
        u64                     nr_failed_migrations_running;
@@@ -1168,7 -1149,6 +1170,7 @@@ struct task_struct 
  #endif
  
        struct list_head tasks;
 +      struct plist_node pushable_tasks;
  
        struct mm_struct *mm, *active_mm;
  
        /* ??? */
        unsigned int personality;
        unsigned did_exec:1;
 +      unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
 +                               * execve */
        pid_t pid;
        pid_t tgid;
  
 -#ifdef CONFIG_CC_STACKPROTECTOR
        /* Canary value for the -fstack-protector gcc feature */
        unsigned long stack_canary;
 -#endif
 +
        /* 
         * pointers to (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with 
        int lockdep_depth;
        unsigned int lockdep_recursion;
        struct held_lock held_locks[MAX_LOCK_DEPTH];
+       gfp_t lockdep_reclaim_gfp;
  #endif
  
  /* journalling filesystem info */
  #endif
  };
  
 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
 +#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
 +
  /*
   * Priority of a process goes from 0..MAX_PRIO-1, valid RT
   * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@@ -1679,16 -1656,6 +1682,16 @@@ static inline int set_cpus_allowed(stru
        return set_cpus_allowed_ptr(p, &new_mask);
  }
  
 +/*
 + * Architectures can set this to 1 if they have specified
 + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
 + * but then during bootup it turns out that sched_clock()
 + * is reliable after all:
 + */
 +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 +extern int sched_clock_stable;
 +#endif
 +
  extern unsigned long long sched_clock(void);
  
  extern void sched_clock_init(void);
@@@ -2106,19 -2073,6 +2109,19 @@@ static inline int object_is_on_stack(vo
  
  extern void thread_info_cache_init(void);
  
 +#ifdef CONFIG_DEBUG_STACK_USAGE
 +static inline unsigned long stack_not_used(struct task_struct *p)
 +{
 +      unsigned long *n = end_of_stack(p);
 +
 +      do {    /* Skip over canary */
 +              n++;
 +      } while (!*n);
 +
 +      return (unsigned long)n - (unsigned long)end_of_stack(p);
 +}
 +#endif
 +
  /* set thread flags in other task's structures
   * - see asm/thread_info.h for TIF_xxxx flags available
   */
@@@ -2232,14 -2186,27 +2235,14 @@@ static inline int spin_needbreak(spinlo
  /*
   * Thread group CPU time accounting.
   */
 -
 -static inline
 -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 -{
 -      struct task_cputime *totals = &tsk->signal->cputime.totals;
 -      unsigned long flags;
 -
 -      spin_lock_irqsave(&totals->lock, flags);
 -      *times = *totals;
 -      spin_unlock_irqrestore(&totals->lock, flags);
 -}
 +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
  
  static inline void thread_group_cputime_init(struct signal_struct *sig)
  {
 -      sig->cputime.totals = (struct task_cputime){
 -              .utime = cputime_zero,
 -              .stime = cputime_zero,
 -              .sum_exec_runtime = 0,
 -      };
 -
 -      spin_lock_init(&sig->cputime.totals.lock);
 +      sig->cputimer.cputime = INIT_CPUTIME;
 +      spin_lock_init(&sig->cputimer.lock);
 +      sig->cputimer.running = 0;
  }
  
  static inline void thread_group_cputime_free(struct signal_struct *sig)
@@@ -2323,13 -2290,9 +2326,13 @@@ extern long sched_group_rt_runtime(stru
  extern int sched_group_set_rt_period(struct task_group *tg,
                                      long rt_period_us);
  extern long sched_group_rt_period(struct task_group *tg);
 +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
  #endif
  #endif
  
 +extern int task_can_switch_user(struct user_struct *up,
 +                                      struct task_struct *tsk);
 +
  #ifdef CONFIG_TASK_XACCT
  static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
  {
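
The sched.h hunks above introduce the sched_clock_stable flag, together with the comment explaining that an architecture may select CONFIG_HAVE_UNSTABLE_SCHED_CLOCK and then discover at boot that sched_clock() is reliable after all. A minimal sketch of how architecture setup code might flip the flag once it has verified its clocksource; the init hook and the reliability check are hypothetical names, not part of this commit:

#include <linux/init.h>
#include <linux/sched.h>

extern int my_arch_clock_is_reliable(void);	/* hypothetical check */

/* Hypothetical arch init hook, e.g. run after clocksource validation. */
void __init my_arch_clock_init(void)
{
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
	/* Tell the scheduler that sched_clock() turned out to be stable. */
	if (my_arch_clock_is_reliable())
		sched_clock_stable = 1;
#endif
}
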
diff --combined include/linux/timer.h
index e2d662e3416e38026bbc7385f744481de1b7d0a0,51774eb87cc6b825b3321afef833e35912a619e5..6cdb6f3331f11b6a80d7cd2c85a45706937b16d3
@@@ -5,6 -5,7 +5,7 @@@
  #include <linux/ktime.h>
  #include <linux/stddef.h>
  #include <linux/debugobjects.h>
+ #include <linux/stringify.h>
  
  struct tvec_base;
  
@@@ -21,52 -22,126 +22,126 @@@ struct timer_list 
        char start_comm[16];
        int start_pid;
  #endif
+ #ifdef CONFIG_LOCKDEP
+       struct lockdep_map lockdep_map;
+ #endif
  };
  
  extern struct tvec_base boot_tvec_bases;
  
+ #ifdef CONFIG_LOCKDEP
+ /*
+  * NB: because we have to copy the lockdep_map, setting the lockdep_map key
+  * (second argument) here is required, otherwise it could be initialised to
+  * the copy of the lockdep_map later! We use the pointer to and the string
+  * "<file>:<line>" as the key resp. the name of the lockdep_map.
+  */
+ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)                          \
+       .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn),
+ #else
+ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
+ #endif
  #define TIMER_INITIALIZER(_function, _expires, _data) {               \
                .entry = { .prev = TIMER_ENTRY_STATIC },        \
                .function = (_function),                        \
                .expires = (_expires),                          \
                .data = (_data),                                \
                .base = &boot_tvec_bases,                       \
+               __TIMER_LOCKDEP_MAP_INITIALIZER(                \
+                       __FILE__ ":" __stringify(__LINE__))     \
        }
  
  #define DEFINE_TIMER(_name, _function, _expires, _data)               \
        struct timer_list _name =                               \
                TIMER_INITIALIZER(_function, _expires, _data)
  
- void init_timer(struct timer_list *timer);
- void init_timer_deferrable(struct timer_list *timer);
+ void init_timer_key(struct timer_list *timer,
+                   const char *name,
+                   struct lock_class_key *key);
+ void init_timer_deferrable_key(struct timer_list *timer,
+                              const char *name,
+                              struct lock_class_key *key);
+ #ifdef CONFIG_LOCKDEP
+ #define init_timer(timer)                                             \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               init_timer_key((timer), #timer, &__key);                \
+       } while (0)
+ #define init_timer_deferrable(timer)                                  \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               init_timer_deferrable_key((timer), #timer, &__key);     \
+       } while (0)
+ #define init_timer_on_stack(timer)                                    \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               init_timer_on_stack_key((timer), #timer, &__key);       \
+       } while (0)
+ #define setup_timer(timer, fn, data)                                  \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               setup_timer_key((timer), #timer, &__key, (fn), (data));\
+       } while (0)
+ #define setup_timer_on_stack(timer, fn, data)                         \
+       do {                                                            \
+               static struct lock_class_key __key;                     \
+               setup_timer_on_stack_key((timer), #timer, &__key,       \
+                                        (fn), (data));                 \
+       } while (0)
+ #else
+ #define init_timer(timer)\
+       init_timer_key((timer), NULL, NULL)
+ #define init_timer_deferrable(timer)\
+       init_timer_deferrable_key((timer), NULL, NULL)
+ #define init_timer_on_stack(timer)\
+       init_timer_on_stack_key((timer), NULL, NULL)
+ #define setup_timer(timer, fn, data)\
+       setup_timer_key((timer), NULL, NULL, (fn), (data))
+ #define setup_timer_on_stack(timer, fn, data)\
+       setup_timer_on_stack_key((timer), NULL, NULL, (fn), (data))
+ #endif
  
  #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
- extern void init_timer_on_stack(struct timer_list *timer);
+ extern void init_timer_on_stack_key(struct timer_list *timer,
+                                   const char *name,
+                                   struct lock_class_key *key);
  extern void destroy_timer_on_stack(struct timer_list *timer);
  #else
  static inline void destroy_timer_on_stack(struct timer_list *timer) { }
- static inline void init_timer_on_stack(struct timer_list *timer)
+ static inline void init_timer_on_stack_key(struct timer_list *timer,
+                                          const char *name,
+                                          struct lock_class_key *key)
  {
-       init_timer(timer);
+       init_timer_key(timer, name, key);
  }
  #endif
  
- static inline void setup_timer(struct timer_list * timer,
+ static inline void setup_timer_key(struct timer_list * timer,
+                               const char *name,
+                               struct lock_class_key *key,
                                void (*function)(unsigned long),
                                unsigned long data)
  {
        timer->function = function;
        timer->data = data;
-       init_timer(timer);
+       init_timer_key(timer, name, key);
  }
  
- static inline void setup_timer_on_stack(struct timer_list *timer,
+ static inline void setup_timer_on_stack_key(struct timer_list *timer,
+                                       const char *name,
+                                       struct lock_class_key *key,
                                        void (*function)(unsigned long),
                                        unsigned long data)
  {
        timer->function = function;
        timer->data = data;
-       init_timer_on_stack(timer);
+       init_timer_on_stack_key(timer, name, key);
  }
  
  /**
@@@ -86,8 -161,8 +161,8 @@@ static inline int timer_pending(const s
  
  extern void add_timer_on(struct timer_list *timer, int cpu);
  extern int del_timer(struct timer_list * timer);
 -extern int __mod_timer(struct timer_list *timer, unsigned long expires);
  extern int mod_timer(struct timer_list *timer, unsigned long expires);
 +extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
  
  /*
   * The jiffies value which is added to now, when there is no timer
@@@ -146,7 -221,25 +221,7 @@@ static inline void timer_stats_timer_cl
  }
  #endif
  
 -/**
 - * add_timer - start a timer
 - * @timer: the timer to be added
 - *
 - * The kernel will do a ->function(->data) callback from the
 - * timer interrupt at the ->expires point in the future. The
 - * current time is 'jiffies'.
 - *
 - * The timer's ->expires, ->function (and if the handler uses it, ->data)
 - * fields must be set prior calling this function.
 - *
 - * Timers with an ->expires field in the past will be executed in the next
 - * timer tick.
 - */
 -static inline void add_timer(struct timer_list *timer)
 -{
 -      BUG_ON(timer_pending(timer));
 -      __mod_timer(timer, timer->expires);
 -}
 +extern void add_timer(struct timer_list *timer);
  
  #ifdef CONFIG_SMP
    extern int try_to_del_timer_sync(struct timer_list *timer);
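
The timer.h changes above route init_timer(), setup_timer() and friends through new *_key() entry points so lockdep can give every statically declared timer its own lock class, and they turn add_timer() from an inline wrapper into an out-of-line function. A minimal sketch of a module using the unchanged macro API; the module and callback names are illustrative and not part of this commit:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

/* Timer callbacks in this era take the ->data value as an unsigned long. */
static void demo_timer_fn(unsigned long data)
{
	pr_info("demo timer fired, data=%lu\n", data);
}

static int __init demo_init(void)
{
	/* With CONFIG_LOCKDEP, setup_timer() expands to setup_timer_key()
	 * and registers a static lock_class_key for this timer. */
	setup_timer(&demo_timer, demo_timer_fn, 0UL);
	mod_timer(&demo_timer, jiffies + HZ);
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
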
diff --combined kernel/sched.c
index 5757e03cfac0bdf7cd50f3625a318645c562b973,186c6fd08acf8de949c936c5dc0116c3b90765fe..196d48babbef87c088214e4f252b4397d2bf25be
@@@ -223,7 -223,7 +223,7 @@@ static void start_rt_bandwidth(struct r
  {
        ktime_t now;
  
 -      if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 +      if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;
  
        if (hrtimer_active(&rt_b->rt_period_timer))
@@@ -331,13 -331,6 +331,13 @@@ static DEFINE_PER_CPU(struct rt_rq, ini
   */
  static DEFINE_SPINLOCK(task_group_lock);
  
 +#ifdef CONFIG_SMP
 +static int root_task_group_empty(void)
 +{
 +      return list_empty(&root_task_group.children);
 +}
 +#endif
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_USER_SCHED
  # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@@ -398,13 -391,6 +398,13 @@@ static inline void set_task_rq(struct t
  
  #else
  
 +#ifdef CONFIG_SMP
 +static int root_task_group_empty(void)
 +{
 +      return 1;
 +}
 +#endif
 +
  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
  static inline struct task_group *task_group(struct task_struct *p)
  {
@@@ -481,17 -467,11 +481,17 @@@ struct rt_rq 
        struct rt_prio_array active;
        unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 -      int highest_prio; /* highest queued rt task prio */
 +      struct {
 +              int curr; /* highest queued rt task prio */
 +#ifdef CONFIG_SMP
 +              int next; /* next highest */
 +#endif
 +      } highest_prio;
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
 +      struct plist_head pushable_tasks;
  #endif
        int rt_throttled;
        u64 rt_time;
@@@ -569,6 -549,7 +569,6 @@@ struct rq 
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 -      unsigned char idle_at_tick;
  #ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
        struct root_domain *rd;
        struct sched_domain *sd;
  
 +      unsigned char idle_at_tick;
        /* For active balancing */
        int active_balance;
        int push_cpu;
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
 -      unsigned int yld_exp_empty;
 -      unsigned int yld_act_empty;
 -      unsigned int yld_both_empty;
        unsigned int yld_count;
  
        /* schedule() stats */
@@@ -1200,10 -1183,10 +1200,10 @@@ static void resched_task(struct task_st
  
        assert_spin_locked(&task_rq(p)->lock);
  
 -      if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 +      if (test_tsk_need_resched(p))
                return;
  
 -      set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 +      set_tsk_need_resched(p);
  
        cpu = task_cpu(p);
        if (cpu == smp_processor_id())
@@@ -1259,7 -1242,7 +1259,7 @@@ void wake_up_idle_cpu(int cpu
         * lockless. The worst case is that the other CPU runs the
         * idle task through an additional NOOP schedule()
         */
 -      set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
 +      set_tsk_need_resched(rq->idle);
  
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
@@@ -1627,42 -1610,21 +1627,42 @@@ static inline void update_shares_locked
  
  #endif
  
 +#ifdef CONFIG_PREEMPT
 +
  /*
 - * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 + * fair double_lock_balance: Safely acquires both rq->locks in a fair
 + * way at the expense of forcing extra atomic operations in all
 + * invocations.  This assures that the double_lock is acquired using the
 + * same underlying policy as the spinlock_t on this architecture, which
 + * reduces latency compared to the unfair variant below.  However, it
 + * also adds more overhead and therefore may reduce throughput.
   */
 -static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +      __releases(this_rq->lock)
 +      __acquires(busiest->lock)
 +      __acquires(this_rq->lock)
 +{
 +      spin_unlock(&this_rq->lock);
 +      double_rq_lock(this_rq, busiest);
 +
 +      return 1;
 +}
 +
 +#else
 +/*
 + * Unfair double_lock_balance: Optimizes throughput at the expense of
 + * latency by eliminating extra atomic operations when the locks are
 + * already in proper order on entry.  This favors lower cpu-ids and will
 + * grant the double lock to lower cpus over higher ids under contention,
 + * regardless of entry order into the function.
 + */
 +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
  {
        int ret = 0;
  
 -      if (unlikely(!irqs_disabled())) {
 -              /* printk() doesn't work good under rq->lock */
 -              spin_unlock(&this_rq->lock);
 -              BUG_ON(1);
 -      }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
        return ret;
  }
  
 +#endif /* CONFIG_PREEMPT */
 +
 +/*
 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 + */
 +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +{
 +      if (unlikely(!irqs_disabled())) {
 +              /* printk() doesn't work good under rq->lock */
 +              spin_unlock(&this_rq->lock);
 +              BUG_ON(1);
 +      }
 +
 +      return _double_lock_balance(this_rq, busiest);
 +}
 +
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
  {
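
When the trylock in the unfair _double_lock_balance() above fails, the code falls back to address ordering: the runqueue at the lower address is always locked first, so two CPUs contending for the same pair of locks can never deadlock. A generic illustration of that ordering pattern with two plain spinlocks; the helper name is illustrative and not taken from this commit:

#include <linux/spinlock.h>
#include <linux/lockdep.h>

static void lock_pair_ordered(spinlock_t *a, spinlock_t *b)
{
	/* Take the lower-addressed lock first; the second acquisition is
	 * annotated as a nested lock so lockdep does not flag it. */
	if (a < b) {
		spin_lock(a);
		spin_lock_nested(b, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(b);
		spin_lock_nested(a, SINGLE_DEPTH_NESTING);
	}
}
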
@@@ -1759,9 -1705,6 +1759,9 @@@ static void update_avg(u64 *avg, u64 sa
  
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
 +      if (wakeup)
 +              p->se.start_runtime = p->se.sum_exec_runtime;
 +
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, wakeup);
        p->se.on_rq = 1;
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
 -      if (sleep && p->se.last_wakeup) {
 -              update_avg(&p->se.avg_overlap,
 -                         p->se.sum_exec_runtime - p->se.last_wakeup);
 -              p->se.last_wakeup = 0;
 +      if (sleep) {
 +              if (p->se.last_wakeup) {
 +                      update_avg(&p->se.avg_overlap,
 +                              p->se.sum_exec_runtime - p->se.last_wakeup);
 +                      p->se.last_wakeup = 0;
 +              } else {
 +                      update_avg(&p->se.avg_wakeup,
 +                              sysctl_sched_wakeup_granularity);
 +              }
        }
  
        sched_info_dequeued(p);
@@@ -2079,7 -2017,7 +2079,7 @@@ unsigned long wait_task_inactive(struc
                 * it must be off the runqueue _entirely_, and not
                 * preempted!
                 *
 -               * So if it wa still runnable (but just not actively
 +               * So if it was still runnable (but just not actively
                 * running right now), it's preempted, and we should
                 * yield - it could be a while.
                 */
@@@ -2328,8 -2266,18 +2328,8 @@@ static int try_to_wake_up(struct task_s
        if (!sched_feat(SYNC_WAKEUPS))
                sync = 0;
  
 -      if (!sync) {
 -              if (current->se.avg_overlap < sysctl_sched_migration_cost &&
 -                        p->se.avg_overlap < sysctl_sched_migration_cost)
 -                      sync = 1;
 -      } else {
 -              if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
 -                        p->se.avg_overlap >= sysctl_sched_migration_cost)
 -                      sync = 0;
 -      }
 -
  #ifdef CONFIG_SMP
 -      if (sched_feat(LB_WAKEUP_UPDATE)) {
 +      if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
                struct sched_domain *sd;
  
                this_cpu = raw_smp_processor_id();
@@@ -2407,22 -2355,6 +2407,22 @@@ out_activate
        activate_task(rq, p, 1);
        success = 1;
  
 +      /*
 +       * Only attribute actual wakeups done by this task.
 +       */
 +      if (!in_interrupt()) {
 +              struct sched_entity *se = &current->se;
 +              u64 sample = se->sum_exec_runtime;
 +
 +              if (se->last_wakeup)
 +                      sample -= se->last_wakeup;
 +              else
 +                      sample -= se->start_runtime;
 +              update_avg(&se->avg_wakeup, sample);
 +
 +              se->last_wakeup = se->sum_exec_runtime;
 +      }
 +
  out_running:
        trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
                p->sched_class->task_wake_up(rq, p);
  #endif
  out:
 -      current->se.last_wakeup = current->se.sum_exec_runtime;
 -
        task_rq_unlock(rq, &flags);
  
        return success;
@@@ -2462,8 -2396,6 +2462,8 @@@ static void __sched_fork(struct task_st
        p->se.prev_sum_exec_runtime     = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
 +      p->se.start_runtime             = 0;
 +      p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                = 0;
@@@ -2526,8 -2458,6 +2526,8 @@@ void sched_fork(struct task_struct *p, 
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
  #endif
 +      plist_node_init(&p->pushable_tasks, MAX_PRIO);
 +
        put_cpu();
  }
  
@@@ -2571,7 -2501,7 +2571,7 @@@ void wake_up_new_task(struct task_struc
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
  /**
 - * preempt_notifier_register - tell me when current is being being preempted & rescheduled
 + * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
@@@ -2668,12 -2598,6 +2668,12 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
 +#ifdef CONFIG_SMP
 +      int post_schedule = 0;
 +
 +      if (current->sched_class->needs_post_schedule)
 +              post_schedule = current->sched_class->needs_post_schedule(rq);
 +#endif
  
        rq->prev_mm = NULL;
  
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
 -      if (current->sched_class->post_schedule)
 +      if (post_schedule)
                current->sched_class->post_schedule(rq);
  #endif
  
@@@ -2999,7 -2923,6 +2999,7 @@@ int can_migrate_task(struct task_struc
                     struct sched_domain *sd, enum cpu_idle_type idle,
                     int *all_pinned)
  {
 +      int tsk_cache_hot = 0;
        /*
         * We do not migrate tasks that are:
         * 1) running (obviously), or
         * 2) too many balance attempts have failed.
         */
  
 -      if (!task_hot(p, rq->clock, sd) ||
 -                      sd->nr_balance_failed > sd->cache_nice_tries) {
 +      tsk_cache_hot = task_hot(p, rq->clock, sd);
 +      if (!tsk_cache_hot ||
 +              sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
 -              if (task_hot(p, rq->clock, sd)) {
 +              if (tsk_cache_hot) {
                        schedstat_inc(sd, lb_hot_gained[idle]);
                        schedstat_inc(p, se.nr_forced_migrations);
                }
                return 1;
        }
  
 -      if (task_hot(p, rq->clock, sd)) {
 +      if (tsk_cache_hot) {
                schedstat_inc(p, se.nr_failed_migrations_hot);
                return 0;
        }
@@@ -3075,16 -2997,6 +3075,16 @@@ next
        pulled++;
        rem_load_move -= p->se.load.weight;
  
 +#ifdef CONFIG_PREEMPT
 +      /*
 +       * NEWIDLE balancing is a source of latency, so preemptible kernels
 +       * will stop after the first task is pulled to minimize the critical
 +       * section.
 +       */
 +      if (idle == CPU_NEWLY_IDLE)
 +              goto out;
 +#endif
 +
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@@ -3131,15 -3043,9 +3131,15 @@@ static int move_tasks(struct rq *this_r
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
  
 +#ifdef CONFIG_PREEMPT
 +              /*
 +               * NEWIDLE balancing is a source of latency, so preemptible
 +               * kernels will stop after the first task is pulled to minimize
 +               * the critical section.
 +               */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
 -
 +#endif
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -3189,480 -3095,246 +3189,480 @@@ static int move_one_task(struct rq *thi
  
        return 0;
  }
 -
 +/********** Helpers for find_busiest_group ************************/
  /*
 - * find_busiest_group finds and returns the busiest CPU group within the
 - * domain. It calculates and returns the amount of weighted load which
 - * should be moved to restore balance via the imbalance parameter.
 + * sd_lb_stats - Structure to store the statistics of a sched_domain
 + *            during load balancing.
   */
 -static struct sched_group *
 -find_busiest_group(struct sched_domain *sd, int this_cpu,
 -                 unsigned long *imbalance, enum cpu_idle_type idle,
 -                 int *sd_idle, const struct cpumask *cpus, int *balance)
 -{
 -      struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 -      unsigned long max_load, avg_load, total_load, this_load, total_pwr;
 -      unsigned long max_pull;
 -      unsigned long busiest_load_per_task, busiest_nr_running;
 -      unsigned long this_load_per_task, this_nr_running;
 -      int load_idx, group_imb = 0;
 +struct sd_lb_stats {
 +      struct sched_group *busiest; /* Busiest group in this sd */
 +      struct sched_group *this;  /* Local group in this sd */
 +      unsigned long total_load;  /* Total load of all groups in sd */
 +      unsigned long total_pwr;   /*   Total power of all groups in sd */
 +      unsigned long avg_load;    /* Average load across all groups in sd */
 +
 +      /** Statistics of this group */
 +      unsigned long this_load;
 +      unsigned long this_load_per_task;
 +      unsigned long this_nr_running;
 +
 +      /* Statistics of the busiest group */
 +      unsigned long max_load;
 +      unsigned long busiest_load_per_task;
 +      unsigned long busiest_nr_running;
 +
 +      int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -      int power_savings_balance = 1;
 -      unsigned long leader_nr_running = 0, min_load_per_task = 0;
 -      unsigned long min_nr_running = ULONG_MAX;
 -      struct sched_group *group_min = NULL, *group_leader = NULL;
 +      int power_savings_balance; /* Is powersave balance needed for this sd */
 +      struct sched_group *group_min; /* Least loaded group in sd */
 +      struct sched_group *group_leader; /* Group which relieves group_min */
 +      unsigned long min_load_per_task; /* load_per_task in group_min */
 +      unsigned long leader_nr_running; /* Nr running of group_leader */
 +      unsigned long min_nr_running; /* Nr running of group_min */
  #endif
 +};
 +
 +/*
 + * sg_lb_stats - stats of a sched_group required for load_balancing
 + */
 +struct sg_lb_stats {
 +      unsigned long avg_load; /*Avg load across the CPUs of the group */
 +      unsigned long group_load; /* Total load over the CPUs of the group */
 +      unsigned long sum_nr_running; /* Nr tasks running in the group */
 +      unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 +      unsigned long group_capacity;
 +      int group_imb; /* Is there an imbalance in the group ? */
 +};
  
 -      max_load = this_load = total_load = total_pwr = 0;
 -      busiest_load_per_task = busiest_nr_running = 0;
 -      this_load_per_task = this_nr_running = 0;
 +/**
 + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 + * @group: The group whose first cpu is to be returned.
 + */
 +static inline unsigned int group_first_cpu(struct sched_group *group)
 +{
 +      return cpumask_first(sched_group_cpus(group));
 +}
  
 -      if (idle == CPU_NOT_IDLE)
 +/**
 + * get_sd_load_idx - Obtain the load index for a given sched domain.
 + * @sd: The sched_domain whose load_idx is to be obtained.
 + * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 + */
 +static inline int get_sd_load_idx(struct sched_domain *sd,
 +                                      enum cpu_idle_type idle)
 +{
 +      int load_idx;
 +
 +      switch (idle) {
 +      case CPU_NOT_IDLE:
                load_idx = sd->busy_idx;
 -      else if (idle == CPU_NEWLY_IDLE)
 +              break;
 +
 +      case CPU_NEWLY_IDLE:
                load_idx = sd->newidle_idx;
 -      else
 +              break;
 +      default:
                load_idx = sd->idle_idx;
 +              break;
 +      }
  
 -      do {
 -              unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
 -              int local_group;
 -              int i;
 -              int __group_imb = 0;
 -              unsigned int balance_cpu = -1, first_idle_cpu = 0;
 -              unsigned long sum_nr_running, sum_weighted_load;
 -              unsigned long sum_avg_load_per_task;
 -              unsigned long avg_load_per_task;
 +      return load_idx;
 +}
  
 -              local_group = cpumask_test_cpu(this_cpu,
 -                                             sched_group_cpus(group));
  
 -              if (local_group)
 -                      balance_cpu = cpumask_first(sched_group_cpus(group));
 +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 +/**
 + * init_sd_power_savings_stats - Initialize power savings statistics for
 + * the given sched_domain, during load balancing.
 + *
 + * @sd: Sched domain whose power-savings statistics are to be initialized.
 + * @sds: Variable containing the statistics for sd.
 + * @idle: Idle status of the CPU at which we're performing load-balancing.
 + */
 +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
 +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
 +{
 +      /*
 +       * Busy processors will not participate in power savings
 +       * balance.
 +       */
 +      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 +              sds->power_savings_balance = 0;
 +      else {
 +              sds->power_savings_balance = 1;
 +              sds->min_nr_running = ULONG_MAX;
 +              sds->leader_nr_running = 0;
 +      }
 +}
 +
 +/**
 + * update_sd_power_savings_stats - Update the power saving stats for a
 + * sched_domain while performing load balancing.
 + *
 + * @group: sched_group belonging to the sched_domain under consideration.
 + * @sds: Variable containing the statistics of the sched_domain
 + * @local_group: Does group contain the CPU for which we're performing
 + *            load balancing ?
 + * @sgs: Variable containing the statistics of the group.
 + */
 +static inline void update_sd_power_savings_stats(struct sched_group *group,
 +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
 +{
  
 -              /* Tally up the load of all CPUs in the group */
 -              sum_weighted_load = sum_nr_running = avg_load = 0;
 -              sum_avg_load_per_task = avg_load_per_task = 0;
 +      if (!sds->power_savings_balance)
 +              return;
  
 -              max_cpu_load = 0;
 -              min_cpu_load = ~0UL;
 +      /*
 +       * If the local group is idle or completely loaded
 +       * no need to do power savings balance at this domain
 +       */
 +      if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
 +                              !sds->this_nr_running))
 +              sds->power_savings_balance = 0;
  
 -              for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 -                      struct rq *rq = cpu_rq(i);
 +      /*
 +       * If a group is already running at full capacity or idle,
 +       * don't include that group in power savings calculations
 +       */
 +      if (!sds->power_savings_balance ||
 +              sgs->sum_nr_running >= sgs->group_capacity ||
 +              !sgs->sum_nr_running)
 +              return;
  
 -                      if (*sd_idle && rq->nr_running)
 -                              *sd_idle = 0;
 +      /*
 +       * Calculate the group which has the least non-idle load.
 +       * This is the group from where we need to pick up the load
 +       * for saving power
 +       */
 +      if ((sgs->sum_nr_running < sds->min_nr_running) ||
 +          (sgs->sum_nr_running == sds->min_nr_running &&
 +           group_first_cpu(group) > group_first_cpu(sds->group_min))) {
 +              sds->group_min = group;
 +              sds->min_nr_running = sgs->sum_nr_running;
 +              sds->min_load_per_task = sgs->sum_weighted_load /
 +                                              sgs->sum_nr_running;
 +      }
  
 -                      /* Bias balancing toward cpus of our domain */
 -                      if (local_group) {
 -                              if (idle_cpu(i) && !first_idle_cpu) {
 -                                      first_idle_cpu = 1;
 -                                      balance_cpu = i;
 -                              }
 +      /*
 +       * Calculate the group which is almost near its
 +       * capacity but still has some space to pick up some load
 +       * from other group and save more power
 +       */
 +      if (sgs->sum_nr_running > sgs->group_capacity - 1)
 +              return;
  
 -                              load = target_load(i, load_idx);
 -                      } else {
 -                              load = source_load(i, load_idx);
 -                              if (load > max_cpu_load)
 -                                      max_cpu_load = load;
 -                              if (min_cpu_load > load)
 -                                      min_cpu_load = load;
 -                      }
 +      if (sgs->sum_nr_running > sds->leader_nr_running ||
 +          (sgs->sum_nr_running == sds->leader_nr_running &&
 +           group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
 +              sds->group_leader = group;
 +              sds->leader_nr_running = sgs->sum_nr_running;
 +      }
 +}
  
 -                      avg_load += load;
 -                      sum_nr_running += rq->nr_running;
 -                      sum_weighted_load += weighted_cpuload(i);
 +/**
 + * check_power_save_busiest_group - see if there is potential for some power-savings balance
 + * @sds: Variable containing the statistics of the sched_domain
 + *    under consideration.
 + * @this_cpu: Cpu at which we're currently performing load-balancing.
 + * @imbalance: Variable to store the imbalance.
 + *
 + * Description:
 + * Check if we have potential to perform some power-savings balance.
 + * If yes, set the busiest group to be the least loaded group in the
 + * sched_domain, so that its CPUs can be put to idle.
 + *
 + * Returns 1 if there is potential to perform power-savings balance.
 + * Else returns 0.
 + */
 +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 +                                      int this_cpu, unsigned long *imbalance)
 +{
 +      if (!sds->power_savings_balance)
 +              return 0;
  
 -                      sum_avg_load_per_task += cpu_avg_load_per_task(i);
 -              }
 +      if (sds->this != sds->group_leader ||
 +                      sds->group_leader == sds->group_min)
 +              return 0;
  
 -              /*
 -               * First idle cpu or the first cpu(busiest) in this sched group
 -               * is eligible for doing load balancing at this and above
 -               * domains. In the newly idle case, we will allow all the cpu's
 -               * to do the newly idle load balance.
 -               */
 -              if (idle != CPU_NEWLY_IDLE && local_group &&
 -                  balance_cpu != this_cpu && balance) {
 -                      *balance = 0;
 -                      goto ret;
 -              }
 +      *imbalance = sds->min_load_per_task;
 +      sds->busiest = sds->group_min;
  
 -              total_load += avg_load;
 -              total_pwr += group->__cpu_power;
 +      if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 +              cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
 +                      group_first_cpu(sds->group_leader);
 +      }
  
 -              /* Adjust by relative CPU power of the group */
 -              avg_load = sg_div_cpu_power(group,
 -                              avg_load * SCHED_LOAD_SCALE);
 +      return 1;
  
 +}
 +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
 +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
 +{
 +      return;
 +}
  
 -              /*
 -               * Consider the group unbalanced when the imbalance is larger
 -               * than the average weight of two tasks.
 -               *
 -               * APZ: with cgroup the avg task weight can vary wildly and
 -               *      might not be a suitable number - should we keep a
 -               *      normalized nr_running number somewhere that negates
 -               *      the hierarchy?
 -               */
 -              avg_load_per_task = sg_div_cpu_power(group,
 -                              sum_avg_load_per_task * SCHED_LOAD_SCALE);
 +static inline void update_sd_power_savings_stats(struct sched_group *group,
 +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
 +{
 +      return;
 +}
  
 -              if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 -                      __group_imb = 1;
 +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 +                                      int this_cpu, unsigned long *imbalance)
 +{
 +      return 0;
 +}
 +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
 -              group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
  
 +/**
 + * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 + * @group: sched_group whose statistics are to be updated.
 + * @this_cpu: Cpu for which load balance is currently performed.
 + * @idle: Idle status of this_cpu
 + * @load_idx: Load index of sched_domain of this_cpu for load calc.
 + * @sd_idle: Idle status of the sched_domain containing group.
 + * @local_group: Does group contain this_cpu.
 + * @cpus: Set of cpus considered for load balancing.
 + * @balance: Should we balance.
 + * @sgs: variable to hold the statistics for this group.
 + */
 +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
 +                      enum cpu_idle_type idle, int load_idx, int *sd_idle,
 +                      int local_group, const struct cpumask *cpus,
 +                      int *balance, struct sg_lb_stats *sgs)
 +{
 +      unsigned long load, max_cpu_load, min_cpu_load;
 +      int i;
 +      unsigned int balance_cpu = -1, first_idle_cpu = 0;
 +      unsigned long sum_avg_load_per_task;
 +      unsigned long avg_load_per_task;
 +
 +      if (local_group)
 +              balance_cpu = group_first_cpu(group);
 +
 +      /* Tally up the load of all CPUs in the group */
 +      sum_avg_load_per_task = avg_load_per_task = 0;
 +      max_cpu_load = 0;
 +      min_cpu_load = ~0UL;
 +
 +      for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 +              struct rq *rq = cpu_rq(i);
 +
 +              if (*sd_idle && rq->nr_running)
 +                      *sd_idle = 0;
 +
 +              /* Bias balancing toward cpus of our domain */
                if (local_group) {
 -                      this_load = avg_load;
 -                      this = group;
 -                      this_nr_running = sum_nr_running;
 -                      this_load_per_task = sum_weighted_load;
 -              } else if (avg_load > max_load &&
 -                         (sum_nr_running > group_capacity || __group_imb)) {
 -                      max_load = avg_load;
 -                      busiest = group;
 -                      busiest_nr_running = sum_nr_running;
 -                      busiest_load_per_task = sum_weighted_load;
 -                      group_imb = __group_imb;
 +                      if (idle_cpu(i) && !first_idle_cpu) {
 +                              first_idle_cpu = 1;
 +                              balance_cpu = i;
 +                      }
 +
 +                      load = target_load(i, load_idx);
 +              } else {
 +                      load = source_load(i, load_idx);
 +                      if (load > max_cpu_load)
 +                              max_cpu_load = load;
 +                      if (min_cpu_load > load)
 +                              min_cpu_load = load;
                }
  
 -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -              /*
 -               * Busy processors will not participate in power savings
 -               * balance.
 -               */
 -              if (idle == CPU_NOT_IDLE ||
 -                              !(sd->flags & SD_POWERSAVINGS_BALANCE))
 -                      goto group_next;
 +              sgs->group_load += load;
 +              sgs->sum_nr_running += rq->nr_running;
 +              sgs->sum_weighted_load += weighted_cpuload(i);
  
 -              /*
 -               * If the local group is idle or completely loaded
 -               * no need to do power savings balance at this domain
 -               */
 -              if (local_group && (this_nr_running >= group_capacity ||
 -                                  !this_nr_running))
 -                      power_savings_balance = 0;
 +              sum_avg_load_per_task += cpu_avg_load_per_task(i);
 +      }
  
 -              /*
 -               * If a group is already running at full capacity or idle,
 -               * don't include that group in power savings calculations
 -               */
 -              if (!power_savings_balance || sum_nr_running >= group_capacity
 -                  || !sum_nr_running)
 -                      goto group_next;
 +      /*
 +       * First idle cpu or the first cpu(busiest) in this sched group
 +       * is eligible for doing load balancing at this and above
 +       * domains. In the newly idle case, we will allow all the cpu's
 +       * to do the newly idle load balance.
 +       */
 +      if (idle != CPU_NEWLY_IDLE && local_group &&
 +          balance_cpu != this_cpu && balance) {
 +              *balance = 0;
 +              return;
 +      }
  
 -              /*
 -               * Calculate the group which has the least non-idle load.
 -               * This is the group from where we need to pick up the load
 -               * for saving power
 -               */
 -              if ((sum_nr_running < min_nr_running) ||
 -                  (sum_nr_running == min_nr_running &&
 -                   cpumask_first(sched_group_cpus(group)) >
 -                   cpumask_first(sched_group_cpus(group_min)))) {
 -                      group_min = group;
 -                      min_nr_running = sum_nr_running;
 -                      min_load_per_task = sum_weighted_load /
 -                                              sum_nr_running;
 -              }
 +      /* Adjust by relative CPU power of the group */
 +      sgs->avg_load = sg_div_cpu_power(group,
 +                      sgs->group_load * SCHED_LOAD_SCALE);
  
 -              /*
 -               * Calculate the group which is almost near its
 -               * capacity but still has some space to pick up some load
 -               * from other group and save more power
 -               */
 -              if (sum_nr_running <= group_capacity - 1) {
 -                      if (sum_nr_running > leader_nr_running ||
 -                          (sum_nr_running == leader_nr_running &&
 -                           cpumask_first(sched_group_cpus(group)) <
 -                           cpumask_first(sched_group_cpus(group_leader)))) {
 -                              group_leader = group;
 -                              leader_nr_running = sum_nr_running;
 -                      }
 +
 +      /*
 +       * Consider the group unbalanced when the imbalance is larger
 +       * than the average weight of two tasks.
 +       *
 +       * APZ: with cgroup the avg task weight can vary wildly and
 +       *      might not be a suitable number - should we keep a
 +       *      normalized nr_running number somewhere that negates
 +       *      the hierarchy?
 +       */
 +      avg_load_per_task = sg_div_cpu_power(group,
 +                      sum_avg_load_per_task * SCHED_LOAD_SCALE);
 +
 +      if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 +              sgs->group_imb = 1;
 +
 +      sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 +
 +}
 +
 +/**
 + * update_sd_lb_stats - Update sched_group's statistics for load balancing.
 + * @sd: sched_domain whose statistics are to be updated.
 + * @this_cpu: Cpu for which load balance is currently performed.
 + * @idle: Idle status of this_cpu
 + * @sd_idle: Idle status of the sched_domain containing group.
 + * @cpus: Set of cpus considered for load balancing.
 + * @balance: Should we balance.
 + * @sds: variable to hold the statistics for this sched_domain.
 + */
 +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 +                      enum cpu_idle_type idle, int *sd_idle,
 +                      const struct cpumask *cpus, int *balance,
 +                      struct sd_lb_stats *sds)
 +{
 +      struct sched_group *group = sd->groups;
 +      struct sg_lb_stats sgs;
 +      int load_idx;
 +
 +      init_sd_power_savings_stats(sd, sds, idle);
 +      load_idx = get_sd_load_idx(sd, idle);
 +
 +      do {
 +              int local_group;
 +
 +              local_group = cpumask_test_cpu(this_cpu,
 +                                             sched_group_cpus(group));
 +              memset(&sgs, 0, sizeof(sgs));
 +              update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
 +                              local_group, cpus, balance, &sgs);
 +
 +              if (local_group && balance && !(*balance))
 +                      return;
 +
 +              sds->total_load += sgs.group_load;
 +              sds->total_pwr += group->__cpu_power;
 +
 +              if (local_group) {
 +                      sds->this_load = sgs.avg_load;
 +                      sds->this = group;
 +                      sds->this_nr_running = sgs.sum_nr_running;
 +                      sds->this_load_per_task = sgs.sum_weighted_load;
 +              } else if (sgs.avg_load > sds->max_load &&
 +                         (sgs.sum_nr_running > sgs.group_capacity ||
 +                              sgs.group_imb)) {
 +                      sds->max_load = sgs.avg_load;
 +                      sds->busiest = group;
 +                      sds->busiest_nr_running = sgs.sum_nr_running;
 +                      sds->busiest_load_per_task = sgs.sum_weighted_load;
 +                      sds->group_imb = sgs.group_imb;
                }
 -group_next:
 -#endif
 +
 +              update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
  
 -      if (!busiest || this_load >= max_load || busiest_nr_running == 0)
 -              goto out_balanced;
 -
 -      avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 +}
  
 -      if (this_load >= avg_load ||
 -                      100*max_load <= sd->imbalance_pct*this_load)
 -              goto out_balanced;
 +/**
 + * fix_small_imbalance - Calculate the minor imbalance that exists
 + *                    amongst the groups of a sched_domain, during
 + *                    load balancing.
 + * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 + * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 + * @imbalance: Variable to store the imbalance.
 + */
 +static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 +                              int this_cpu, unsigned long *imbalance)
 +{
 +      unsigned long tmp, pwr_now = 0, pwr_move = 0;
 +      unsigned int imbn = 2;
 +
 +      if (sds->this_nr_running) {
 +              sds->this_load_per_task /= sds->this_nr_running;
 +              if (sds->busiest_load_per_task >
 +                              sds->this_load_per_task)
 +                      imbn = 1;
 +      } else
 +              sds->this_load_per_task =
 +                      cpu_avg_load_per_task(this_cpu);
  
 -      busiest_load_per_task /= busiest_nr_running;
 -      if (group_imb)
 -              busiest_load_per_task = min(busiest_load_per_task, avg_load);
 +      if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
 +                      sds->busiest_load_per_task * imbn) {
 +              *imbalance = sds->busiest_load_per_task;
 +              return;
 +      }
  
        /*
 -       * We're trying to get all the cpus to the average_load, so we don't
 -       * want to push ourselves above the average load, nor do we wish to
 -       * reduce the max loaded cpu below the average load, as either of these
 -       * actions would just result in more rebalancing later, and ping-pong
 -       * tasks around. Thus we look for the minimum possible imbalance.
 -       * Negative imbalances (*we* are more loaded than anyone else) will
 -       * be counted as no imbalance for these purposes -- we can't fix that
 -       * by pulling tasks to us. Be careful of negative numbers as they'll
 -       * appear as very large values with unsigned longs.
 +       * OK, we don't have enough imbalance to justify moving tasks,
 +       * however we may be able to increase total CPU power used by
 +       * moving them.
         */
 -      if (max_load <= busiest_load_per_task)
 -              goto out_balanced;
  
 +      pwr_now += sds->busiest->__cpu_power *
 +                      min(sds->busiest_load_per_task, sds->max_load);
 +      pwr_now += sds->this->__cpu_power *
 +                      min(sds->this_load_per_task, sds->this_load);
 +      pwr_now /= SCHED_LOAD_SCALE;
 +
 +      /* Amount of load we'd subtract */
 +      tmp = sg_div_cpu_power(sds->busiest,
 +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 +      if (sds->max_load > tmp)
 +              pwr_move += sds->busiest->__cpu_power *
 +                      min(sds->busiest_load_per_task, sds->max_load - tmp);
 +
 +      /* Amount of load we'd add */
 +      if (sds->max_load * sds->busiest->__cpu_power <
 +              sds->busiest_load_per_task * SCHED_LOAD_SCALE)
 +              tmp = sg_div_cpu_power(sds->this,
 +                      sds->max_load * sds->busiest->__cpu_power);
 +      else
 +              tmp = sg_div_cpu_power(sds->this,
 +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
 +      pwr_move += sds->this->__cpu_power *
 +                      min(sds->this_load_per_task, sds->this_load + tmp);
 +      pwr_move /= SCHED_LOAD_SCALE;
 +
 +      /* Move if we gain throughput */
 +      if (pwr_move > pwr_now)
 +              *imbalance = sds->busiest_load_per_task;
 +}
 +
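The new fix_small_imbalance() helper above decides whether migrating a single task from the busiest group still pays off by comparing the capacity usefully consumed before and after a hypothetical move. Below is a minimal userspace sketch of that comparison, with SCALE standing in for SCHED_LOAD_SCALE, sg_div_cpu_power() approximated by plain scaled division, and the "amount of load we'd add" reduced to one branch of the original min(); all identifiers are illustrative, not kernel API.

#include <stdio.h>

#define SCALE 1024UL	/* stands in for SCHED_LOAD_SCALE */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

struct group_view {
	unsigned long load;		/* group's average load, scaled */
	unsigned long power;		/* __cpu_power analogue */
	unsigned long load_per_task;	/* weighted load per runnable task */
};

/* Nonzero when moving one busiest-group task would raise total capacity use. */
static int move_gains_throughput(const struct group_view *busiest,
				 const struct group_view *local)
{
	unsigned long pwr_now = 0, pwr_move = 0, tmp;

	/* capacity usefully consumed as things stand */
	pwr_now += busiest->power * min_ul(busiest->load_per_task, busiest->load);
	pwr_now += local->power * min_ul(local->load_per_task, local->load);
	pwr_now /= SCALE;

	/* load the busiest group would shed by giving up one task */
	tmp = busiest->load_per_task * SCALE / busiest->power;
	if (busiest->load > tmp)
		pwr_move += busiest->power *
			min_ul(busiest->load_per_task, busiest->load - tmp);

	/* load the local group would absorb (one branch of the original min) */
	tmp = busiest->load_per_task * SCALE / local->power;
	pwr_move += local->power * min_ul(local->load_per_task, local->load + tmp);
	pwr_move /= SCALE;

	return pwr_move > pwr_now;
}

int main(void)
{
	struct group_view busiest = { 3 * SCALE, SCALE, SCALE };
	struct group_view local = { 0, SCALE, SCALE / 2 };

	printf("worth moving a task: %s\n",
	       move_gains_throughput(&busiest, &local) ? "yes" : "no");
	return 0;
}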
 +/**
 + * calculate_imbalance - Calculate the amount of imbalance present within the
 + *                     groups of a given sched_domain during load balance.
 + * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 + * @this_cpu: Cpu for which currently load balance is being performed.
 + * @imbalance: The variable to store the imbalance.
 + */
 +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 +              unsigned long *imbalance)
 +{
 +      unsigned long max_pull;
        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
 -      if (max_load < avg_load) {
 +      if (sds->max_load < sds->avg_load) {
                *imbalance = 0;
 -              goto small_imbalance;
 +              return fix_small_imbalance(sds, this_cpu, imbalance);
        }
  
        /* Don't want to pull so many tasks that a group would go idle */
 -      max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 +      max_pull = min(sds->max_load - sds->avg_load,
 +                      sds->max_load - sds->busiest_load_per_task);
  
        /* How much load to actually move to equalise the imbalance */
 -      *imbalance = min(max_pull * busiest->__cpu_power,
 -                              (avg_load - this_load) * this->__cpu_power)
 +      *imbalance = min(max_pull * sds->busiest->__cpu_power,
 +              (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                        / SCHED_LOAD_SCALE;
  
        /*
         * a think about bumping its value to force at least one task to be
         * moved
         */
 -      if (*imbalance < busiest_load_per_task) {
 -              unsigned long tmp, pwr_now, pwr_move;
 -              unsigned int imbn;
 -
 -small_imbalance:
 -              pwr_move = pwr_now = 0;
 -              imbn = 2;
 -              if (this_nr_running) {
 -                      this_load_per_task /= this_nr_running;
 -                      if (busiest_load_per_task > this_load_per_task)
 -                              imbn = 1;
 -              } else
 -                      this_load_per_task = cpu_avg_load_per_task(this_cpu);
 +      if (*imbalance < sds->busiest_load_per_task)
 +              return fix_small_imbalance(sds, this_cpu, imbalance);
  
 -              if (max_load - this_load + busiest_load_per_task >=
 -                                      busiest_load_per_task * imbn) {
 -                      *imbalance = busiest_load_per_task;
 -                      return busiest;
 -              }
 +}
 +/******* find_busiest_group() helpers end here *********************/
  
 -              /*
 -               * OK, we don't have enough imbalance to justify moving tasks,
 -               * however we may be able to increase total CPU power used by
 -               * moving them.
 -               */
 +/**
 + * find_busiest_group - Returns the busiest group within the sched_domain
 + * if there is an imbalance. If there isn't an imbalance, and
 + * the user has opted for power-savings, it returns a group whose
 + * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 + * such a group exists.
 + *
 + * Also calculates the amount of weighted load which should be moved
 + * to restore balance.
 + *
 + * @sd: The sched_domain whose busiest group is to be returned.
 + * @this_cpu: The cpu for which load balancing is currently being performed.
 + * @imbalance: Variable which stores amount of weighted load which should
 + *            be moved to restore balance/put a group to idle.
 + * @idle: The idle status of this_cpu.
 + * @sd_idle: The idleness of sd
 + * @cpus: The set of CPUs under consideration for load-balancing.
 + * @balance: Pointer to a variable indicating if this_cpu
  + *    is the appropriate cpu to perform load balancing at this level.
 + *
 + * Returns:   - the busiest group if imbalance exists.
 + *            - If no imbalance and user has opted for power-savings balance,
 + *               return the least loaded group whose CPUs can be
 + *               put to idle by rebalancing its tasks onto our group.
 + */
 +static struct sched_group *
 +find_busiest_group(struct sched_domain *sd, int this_cpu,
 +                 unsigned long *imbalance, enum cpu_idle_type idle,
 +                 int *sd_idle, const struct cpumask *cpus, int *balance)
 +{
 +      struct sd_lb_stats sds;
  
 -              pwr_now += busiest->__cpu_power *
 -                              min(busiest_load_per_task, max_load);
 -              pwr_now += this->__cpu_power *
 -                              min(this_load_per_task, this_load);
 -              pwr_now /= SCHED_LOAD_SCALE;
 -
 -              /* Amount of load we'd subtract */
 -              tmp = sg_div_cpu_power(busiest,
 -                              busiest_load_per_task * SCHED_LOAD_SCALE);
 -              if (max_load > tmp)
 -                      pwr_move += busiest->__cpu_power *
 -                              min(busiest_load_per_task, max_load - tmp);
 -
 -              /* Amount of load we'd add */
 -              if (max_load * busiest->__cpu_power <
 -                              busiest_load_per_task * SCHED_LOAD_SCALE)
 -                      tmp = sg_div_cpu_power(this,
 -                                      max_load * busiest->__cpu_power);
 -              else
 -                      tmp = sg_div_cpu_power(this,
 -                              busiest_load_per_task * SCHED_LOAD_SCALE);
 -              pwr_move += this->__cpu_power *
 -                              min(this_load_per_task, this_load + tmp);
 -              pwr_move /= SCHED_LOAD_SCALE;
 +      memset(&sds, 0, sizeof(sds));
  
 -              /* Move if we gain throughput */
 -              if (pwr_move > pwr_now)
 -                      *imbalance = busiest_load_per_task;
 -      }
 +      /*
  +       * Compute the various statistics relevant for load balancing at
 +       * this level.
 +       */
 +      update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
 +                                      balance, &sds);
 +
 +      /* Cases where imbalance does not exist from POV of this_cpu */
 +      /* 1) this_cpu is not the appropriate cpu to perform load balancing
 +       *    at this level.
 +       * 2) There is no busy sibling group to pull from.
 +       * 3) This group is the busiest group.
  +       * 4) This group is busier than the average busyness at this
 +       *    sched_domain.
 +       * 5) The imbalance is within the specified limit.
 +       * 6) Any rebalance would lead to ping-pong
 +       */
 +      if (balance && !(*balance))
 +              goto ret;
  
 -      return busiest;
 +      if (!sds.busiest || sds.busiest_nr_running == 0)
 +              goto out_balanced;
  
 -out_balanced:
 -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 -              goto ret;
 +      if (sds.this_load >= sds.max_load)
 +              goto out_balanced;
  
 -      if (this == group_leader && group_leader != group_min) {
 -              *imbalance = min_load_per_task;
 -              if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 -                      cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
 -                              cpumask_first(sched_group_cpus(group_leader));
 -              }
 -              return group_min;
 -      }
 -#endif
 +      sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 +
 +      if (sds.this_load >= sds.avg_load)
 +              goto out_balanced;
 +
 +      if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 +              goto out_balanced;
 +
 +      sds.busiest_load_per_task /= sds.busiest_nr_running;
 +      if (sds.group_imb)
 +              sds.busiest_load_per_task =
 +                      min(sds.busiest_load_per_task, sds.avg_load);
 +
 +      /*
 +       * We're trying to get all the cpus to the average_load, so we don't
 +       * want to push ourselves above the average load, nor do we wish to
 +       * reduce the max loaded cpu below the average load, as either of these
 +       * actions would just result in more rebalancing later, and ping-pong
 +       * tasks around. Thus we look for the minimum possible imbalance.
 +       * Negative imbalances (*we* are more loaded than anyone else) will
 +       * be counted as no imbalance for these purposes -- we can't fix that
 +       * by pulling tasks to us. Be careful of negative numbers as they'll
 +       * appear as very large values with unsigned longs.
 +       */
 +      if (sds.max_load <= sds.busiest_load_per_task)
 +              goto out_balanced;
 +
 +      /* Looks like there is an imbalance. Compute it */
 +      calculate_imbalance(&sds, this_cpu, imbalance);
 +      return sds.busiest;
 +
 +out_balanced:
 +      /*
 +       * There is no obvious imbalance. But check if we can do some balancing
 +       * to save power.
 +       */
 +      if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
 +              return sds.busiest;
  ret:
        *imbalance = 0;
        return NULL;
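find_busiest_group() now bails out through a series of cheap checks before computing an imbalance: the caller must be the right cpu, there must be a busy sibling group, and the gap between this group and the busiest one must exceed the domain's tolerance. A self-contained sketch of those tests follows; IMBALANCE_PCT approximates sd->imbalance_pct (the real initializers use values around 110..125, so the exact figure here is an assumption).

#include <stdio.h>

#define SCALE		1024UL	/* SCHED_LOAD_SCALE analogue */
#define IMBALANCE_PCT	125UL	/* assumed sd->imbalance_pct */

static int worth_balancing(unsigned long this_load, unsigned long max_load,
			   unsigned long total_load, unsigned long total_pwr)
{
	unsigned long avg_load = SCALE * total_load / total_pwr;

	if (this_load >= max_load)
		return 0;	/* we already carry the most load */
	if (this_load >= avg_load)
		return 0;	/* pulling more would overshoot the mean */
	if (100 * max_load <= IMBALANCE_PCT * this_load)
		return 0;	/* the gap is inside the tolerated band */
	return 1;
}

int main(void)
{
	/* two groups of equal power: local carries 1.0, busiest carries 2.0 */
	printf("balance? %d\n",
	       worth_balancing(1 * SCALE, 2 * SCALE, 3 * SCALE, 2 * SCALE));
	return 0;
}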
@@@ -4250,24 -3890,19 +4250,24 @@@ int select_nohz_load_balancer(int stop_
        int cpu = smp_processor_id();
  
        if (stop_tick) {
 -              cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
  
 -              /*
 -               * If we are going offline and still the leader, give up!
 -               */
 -              if (!cpu_active(cpu) &&
 -                  atomic_read(&nohz.load_balancer) == cpu) {
 +              if (!cpu_active(cpu)) {
 +                      if (atomic_read(&nohz.load_balancer) != cpu)
 +                              return 0;
 +
 +                      /*
 +                       * If we are going offline and still the leader,
 +                       * give up!
 +                       */
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                BUG();
 +
                        return 0;
                }
  
 +              cpumask_set_cpu(cpu, nohz.cpu_mask);
 +
                /* time for ilb owner also to sleep */
                if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
@@@ -4427,11 -4062,6 +4427,11 @@@ static void run_rebalance_domains(struc
  #endif
  }
  
 +static inline int on_null_domain(int cpu)
 +{
 +      return !rcu_dereference(cpu_rq(cpu)->sd);
 +}
 +
  /*
   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
   *
@@@ -4489,9 -4119,7 +4489,9 @@@ static inline void trigger_load_balance
            cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
  #endif
 -      if (time_after_eq(jiffies, rq->next_balance))
 +      /* Don't need to rebalance while attached to NULL domain */
 +      if (time_after_eq(jiffies, rq->next_balance) &&
 +          likely(!on_null_domain(cpu)))
                raise_softirq(SCHED_SOFTIRQ);
  }
  
@@@ -4885,33 -4513,11 +4885,33 @@@ static inline void schedule_debug(struc
  #endif
  }
  
 +static void put_prev_task(struct rq *rq, struct task_struct *prev)
 +{
 +      if (prev->state == TASK_RUNNING) {
 +              u64 runtime = prev->se.sum_exec_runtime;
 +
 +              runtime -= prev->se.prev_sum_exec_runtime;
 +              runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
 +
 +              /*
 +               * In order to avoid avg_overlap growing stale when we are
 +               * indeed overlapping and hence not getting put to sleep, grow
 +               * the avg_overlap on preemption.
 +               *
 +               * We use the average preemption runtime because that
 +               * correlates to the amount of cache footprint a task can
 +               * build up.
 +               */
 +              update_avg(&prev->se.avg_overlap, runtime);
 +      }
 +      prev->sched_class->put_prev_task(rq, prev);
 +}
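put_prev_task() above folds the clamped runtime of the task being switched out into se.avg_overlap through update_avg(). update_avg() itself is not part of this hunk; kernels of this vintage implement it as a running average that moves one eighth of the way toward each new sample, and that weight is an assumption in the standalone sketch below.

#include <stdint.h>
#include <stdio.h>

/* assumed 1/8-weight running average; the kernel uses an arithmetic shift */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)sample - (int64_t)*avg;

	*avg = (uint64_t)((int64_t)*avg + diff / 8);
}

int main(void)
{
	uint64_t avg_overlap = 0;
	uint64_t samples[] = { 800, 800, 800, 100, 100 };	/* ns, made up */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_overlap, samples[i]);
		printf("sample %llu -> avg_overlap %llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg_overlap);
	}
	return 0;
}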
 +
  /*
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
 -pick_next_task(struct rq *rq, struct task_struct *prev)
 +pick_next_task(struct rq *rq)
  {
        const struct sched_class *class;
        struct task_struct *p;
  /*
   * schedule() is the main scheduler function.
   */
- asmlinkage void __sched schedule(void)
+ asmlinkage void __sched __schedule(void)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
  
- need_resched:
-       preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
@@@ -4985,8 -4589,8 +4983,8 @@@ need_resched_nonpreemptible
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
  
 -      prev->sched_class->put_prev_task(rq, prev);
 -      next = pick_next_task(rq, prev);
 +      put_prev_task(rq, prev);
 +      next = pick_next_task(rq);
  
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
  
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
+ }
  
+ asmlinkage void __sched schedule(void)
+ {
+ need_resched:
+       preempt_disable();
+       __schedule();
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
  
+ #ifdef CONFIG_SMP
+ /*
+  * Look out! "owner" is an entirely speculative pointer
+  * access and not reliable.
+  */
+ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+ {
+       unsigned int cpu;
+       struct rq *rq;
+       if (!sched_feat(OWNER_SPIN))
+               return 0;
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+       /*
+        * Need to access the cpu field knowing that
+        * DEBUG_PAGEALLOC could have unmapped it if
+        * the mutex owner just released it and exited.
+        */
+       if (probe_kernel_address(&owner->cpu, cpu))
+               goto out;
+ #else
+       cpu = owner->cpu;
+ #endif
+       /*
+        * Even if the access succeeded (likely case),
+        * the cpu field may no longer be valid.
+        */
+       if (cpu >= nr_cpumask_bits)
+               goto out;
+       /*
+        * We need to validate that we can do a
+        * get_cpu() and that we have the percpu area.
+        */
+       if (!cpu_online(cpu))
+               goto out;
+       rq = cpu_rq(cpu);
+       for (;;) {
+               /*
+                * Owner changed, break to re-assess state.
+                */
+               if (lock->owner != owner)
+                       break;
+               /*
+                * Is that owner really running on that cpu?
+                */
+               if (task_thread_info(rq->curr) != owner || need_resched())
+                       return 0;
+               cpu_relax();
+       }
+ out:
+       return 1;
+ }
+ #endif
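mutex_spin_on_owner() keeps the would-be acquirer spinning only while the same owner both holds the lock and is still running on a CPU; as soon as either condition breaks, the caller either retries the acquisition or goes to sleep. A hedged userspace analogue of that decision, with no relation to the kernel's struct mutex or runqueue layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_owner {
	atomic_bool on_cpu;	/* stands in for "rq->curr is still the owner" */
};

struct toy_mutex {
	_Atomic(struct toy_owner *) owner;
};

/* true: owner released or changed, retry the trylock; false: go to sleep */
static bool spin_on_owner(struct toy_mutex *lock, struct toy_owner *owner)
{
	for (;;) {
		if (atomic_load(&lock->owner) != owner)
			return true;		/* owner changed: re-assess */
		if (!atomic_load(&owner->on_cpu))
			return false;		/* owner scheduled out: sleep */
		/* a cpu_relax() equivalent would sit here */
	}
}

int main(void)
{
	struct toy_owner me;
	struct toy_mutex m;

	atomic_init(&me.on_cpu, false);
	atomic_init(&m.owner, &me);
	printf("decision: %s\n", spin_on_owner(&m, &me) ? "retry" : "sleep");
	return 0;
}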
  #ifdef CONFIG_PREEMPT
  /*
   * this is the entry point to schedule() from in-kernel preemption
@@@ -5041,7 -4712,7 +5106,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
 -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
 +      } while (need_resched());
  }
  EXPORT_SYMBOL(preempt_schedule);
  
@@@ -5070,7 -4741,7 +5135,7 @@@ asmlinkage void __sched preempt_schedul
                 * between schedule and now.
                 */
                barrier();
 -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
 +      } while (need_resched());
  }
  
  #endif /* CONFIG_PREEMPT */
@@@ -5544,7 -5215,7 +5609,7 @@@ SYSCALL_DEFINE1(nice, int, increment
        if (increment > 40)
                increment = 40;
  
 -      nice = PRIO_TO_NICE(current->static_prio) + increment;
 +      nice = TASK_NICE(current) + increment;
        if (nice < -20)
                nice = -20;
        if (nice > 19)
@@@ -6343,7 -6014,12 +6408,7 @@@ void sched_show_task(struct task_struc
                printk(KERN_CONT " %016lx ", thread_saved_pc(p));
  #endif
  #ifdef CONFIG_DEBUG_STACK_USAGE
 -      {
 -              unsigned long *n = end_of_stack(p);
 -              while (!*n)
 -                      n++;
 -              free = (unsigned long)n - (unsigned long)end_of_stack(p);
 -      }
 +      free = stack_not_used(p);
  #endif
        printk(KERN_CONT "%5lu %5d %6d\n", free,
                task_pid_nr(p), task_pid_nr(p->real_parent));
@@@ -6817,7 -6493,7 +6882,7 @@@ static void migrate_dead_tasks(unsigne
                if (!rq->nr_running)
                        break;
                update_rq_clock(rq);
 -              next = pick_next_task(rq, rq->curr);
 +              next = pick_next_task(rq);
                if (!next)
                        break;
                next->sched_class->put_prev_task(rq, next);
@@@ -7338,26 -7014,20 +7403,26 @@@ static void free_rootdomain(struct root
  
  static void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
 +      struct root_domain *old_rd = NULL;
        unsigned long flags;
  
        spin_lock_irqsave(&rq->lock, flags);
  
        if (rq->rd) {
 -              struct root_domain *old_rd = rq->rd;
 +              old_rd = rq->rd;
  
                if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
  
                cpumask_clear_cpu(rq->cpu, old_rd->span);
  
 -              if (atomic_dec_and_test(&old_rd->refcount))
 -                      free_rootdomain(old_rd);
 +              /*
  +               * If we don't want to free the old_rd yet then
 +               * set old_rd to NULL to skip the freeing later
 +               * in this function:
 +               */
 +              if (!atomic_dec_and_test(&old_rd->refcount))
 +                      old_rd = NULL;
        }
  
        atomic_inc(&rd->refcount);
                set_rq_online(rq);
  
        spin_unlock_irqrestore(&rq->lock, flags);
 +
 +      if (old_rd)
 +              free_rootdomain(old_rd);
  }
  
  static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
@@@ -8612,15 -8279,11 +8677,15 @@@ static void init_rt_rq(struct rt_rq *rt
        __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 -      rt_rq->highest_prio = MAX_RT_PRIO;
 +      rt_rq->highest_prio.curr = MAX_RT_PRIO;
 +#ifdef CONFIG_SMP
 +      rt_rq->highest_prio.next = MAX_RT_PRIO;
 +#endif
  #endif
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
 +      plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -9622,16 -9285,6 +9687,16 @@@ static int sched_rt_global_constraints(
  
        return ret;
  }
 +
 +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 +{
 +      /* Don't accept realtime tasks when there is no way for them to run */
 +      if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
 +              return 0;
 +
 +      return 1;
 +}
 +
  #else /* !CONFIG_RT_GROUP_SCHED */
  static int sched_rt_global_constraints(void)
  {
@@@ -9725,7 -9378,8 +9790,7 @@@ cpu_cgroup_can_attach(struct cgroup_sub
                      struct task_struct *tsk)
  {
  #ifdef CONFIG_RT_GROUP_SCHED
 -      /* Don't accept realtime tasks when there is no way for them to run */
 -      if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
 +      if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
                return -EINVAL;
  #else
        /* We don't support RT-tasks being in separate groups */
@@@ -9888,7 -9542,7 +9953,7 @@@ cpuacct_destroy(struct cgroup_subsys *s
  
  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
  {
 -      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +      u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
        u64 data;
  
  #ifndef CONFIG_64BIT
  
  static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
  {
 -      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +      u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
  
  #ifndef CONFIG_64BIT
        /*
@@@ -9996,14 -9650,14 +10061,14 @@@ static void cpuacct_charge(struct task_
        struct cpuacct *ca;
        int cpu;
  
 -      if (!cpuacct_subsys.active)
 +      if (unlikely(!cpuacct_subsys.active))
                return;
  
        cpu = task_cpu(tsk);
        ca = task_ca(tsk);
  
        for (; ca; ca = ca->parent) {
 -              u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +              u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
  }
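cpuacct_charge() walks from the task's accounting group up through its parents, adding the sample to the current CPU's slot at every level; the per_cpu_ptr() conversion above only changes how that slot is reached. A rough userspace model, with a fixed array standing in for the per-CPU allocation and all names invented:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

struct toy_cpuacct {
	struct toy_cpuacct *parent;
	uint64_t cpuusage[NR_CPUS];	/* one slot per CPU, like ca->cpuusage */
};

static void toy_charge(struct toy_cpuacct *ca, int cpu, uint64_t cputime)
{
	for (; ca; ca = ca->parent)	/* charge every level of the hierarchy */
		ca->cpuusage[cpu] += cputime;
}

int main(void)
{
	struct toy_cpuacct root = { .parent = NULL };
	struct toy_cpuacct child = { .parent = &root };

	toy_charge(&child, 2, 1000);
	printf("child[2]=%llu root[2]=%llu\n",
	       (unsigned long long)child.cpuusage[2],
	       (unsigned long long)root.cpuusage[2]);
	return 0;
}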
diff --combined kernel/sched_features.h
index 76f61756e677647dda4ae01ede5ac942329a09c8,07bc02e99ab1998a0b32117c2132a3c853937ca2..4569bfa7df9b9fee6f057e22aca18299d3c45f3b
@@@ -1,6 -1,5 +1,6 @@@
  SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
 -SCHED_FEAT(NORMALIZED_SLEEPER, 1)
 +SCHED_FEAT(NORMALIZED_SLEEPER, 0)
 +SCHED_FEAT(ADAPTIVE_GRAN, 1)
  SCHED_FEAT(WAKEUP_PREEMPT, 1)
  SCHED_FEAT(START_DEBIT, 1)
  SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@@ -14,3 -13,4 +14,4 @@@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1
  SCHED_FEAT(ASYM_EFF_LOAD, 1)
  SCHED_FEAT(WAKEUP_OVERLAP, 0)
  SCHED_FEAT(LAST_BUDDY, 1)
+ SCHED_FEAT(OWNER_SPIN, 1)
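sched_features.h is written to be included several times under different definitions of SCHED_FEAT(), so the one list can yield an enum, a default mask and a name table. A generic sketch of that X-macro idiom, reusing feature names from the hunk above; the FEATURE_LIST and FEAT_ spellings are invented for the example and are not the kernel's macros.

#include <stdio.h>

#define FEATURE_LIST		\
	F(NEW_FAIR_SLEEPERS, 1)	\
	F(ADAPTIVE_GRAN, 1)	\
	F(OWNER_SPIN, 1)	\
	F(NORMALIZED_SLEEPER, 0)

enum {
#define F(name, enabled) FEAT_##name,
	FEATURE_LIST
#undef F
	FEAT_NR
};

static const char *feat_names[] = {
#define F(name, enabled) #name,
	FEATURE_LIST
#undef F
};

static const unsigned int feat_default =
#define F(name, enabled) ((enabled) << FEAT_##name) |
	FEATURE_LIST
#undef F
	0;

int main(void)
{
	for (int i = 0; i < FEAT_NR; i++)
		printf("%-20s %s\n", feat_names[i],
		       (feat_default >> i) & 1 ? "on" : "off");
	return 0;
}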
diff --combined kernel/timer.c
index 9b77fc9a9ac8286074b1784771dac7c0eb2a08e2,ef1c385bc5726ee6185575d4fcb3c3cab987f3ed..b4555568b4e4ad16f34a887eabed6f21e05abfba
@@@ -491,14 -491,18 +491,18 @@@ static inline void debug_timer_free(str
        debug_object_free(timer, &timer_debug_descr);
  }
  
- static void __init_timer(struct timer_list *timer);
+ static void __init_timer(struct timer_list *timer,
+                        const char *name,
+                        struct lock_class_key *key);
  
- void init_timer_on_stack(struct timer_list *timer)
+ void init_timer_on_stack_key(struct timer_list *timer,
+                            const char *name,
+                            struct lock_class_key *key)
  {
        debug_object_init_on_stack(timer, &timer_debug_descr);
-       __init_timer(timer);
+       __init_timer(timer, name, key);
  }
- EXPORT_SYMBOL_GPL(init_timer_on_stack);
+ EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
  
  void destroy_timer_on_stack(struct timer_list *timer)
  {
@@@ -512,7 -516,9 +516,9 @@@ static inline void debug_timer_activate
  static inline void debug_timer_deactivate(struct timer_list *timer) { }
  #endif
  
- static void __init_timer(struct timer_list *timer)
+ static void __init_timer(struct timer_list *timer,
+                        const char *name,
+                        struct lock_class_key *key)
  {
        timer->entry.next = NULL;
        timer->base = __raw_get_cpu_var(tvec_bases);
        timer->start_pid = -1;
        memset(timer->start_comm, 0, TASK_COMM_LEN);
  #endif
+       lockdep_init_map(&timer->lockdep_map, name, key, 0);
  }
  
  /**
   * init_timer() must be done to a timer prior to calling *any* of the
   * other timer functions.
   */
- void init_timer(struct timer_list *timer)
+ void init_timer_key(struct timer_list *timer,
+                   const char *name,
+                   struct lock_class_key *key)
  {
        debug_timer_init(timer);
-       __init_timer(timer);
+       __init_timer(timer, name, key);
  }
- EXPORT_SYMBOL(init_timer);
+ EXPORT_SYMBOL(init_timer_key);
  
- void init_timer_deferrable(struct timer_list *timer)
+ void init_timer_deferrable_key(struct timer_list *timer,
+                              const char *name,
+                              struct lock_class_key *key)
  {
-       init_timer(timer);
+       init_timer_key(timer, name, key);
        timer_set_deferrable(timer);
  }
- EXPORT_SYMBOL(init_timer_deferrable);
+ EXPORT_SYMBOL(init_timer_deferrable_key);
  
  static inline void detach_timer(struct timer_list *timer,
                                int clear_pending)
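The point of threading a name and a struct lock_class_key through the new *_key() variants is that a wrapper macro at each init_timer() call site can plant one static key and pass its address plus a stringified name down to __init_timer(), which hands them to lockdep_init_map(); every init location then gets its own lock class. A hedged sketch of that per-call-site pattern in plain C (the toy_* names are illustrative, not the real timer.h wrappers):

#include <stdio.h>

struct toy_key { int dummy; };

struct toy_timer {
	const char *class_name;
	struct toy_key *class_key;
};

static void toy_init_timer_key(struct toy_timer *t, const char *name,
			       struct toy_key *key)
{
	t->class_name = name;
	t->class_key = key;
}

/* one static key per textual call site: same spot in the code, same class */
#define toy_init_timer(t)				\
	do {						\
		static struct toy_key __key;		\
		toy_init_timer_key((t), #t, &__key);	\
	} while (0)

int main(void)
{
	struct toy_timer a, b;

	toy_init_timer(&a);	/* call site 1 gets its own key */
	toy_init_timer(&b);	/* call site 2 gets a different one */
	printf("a: %s key=%p\nb: %s key=%p\n",
	       a.class_name, (void *)a.class_key,
	       b.class_name, (void *)b.class_key);
	return 0;
}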
@@@ -589,14 -600,11 +600,14 @@@ static struct tvec_base *lock_timer_bas
        }
  }
  
 -int __mod_timer(struct timer_list *timer, unsigned long expires)
 +static inline int
 +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
  {
        struct tvec_base *base, *new_base;
        unsigned long flags;
 -      int ret = 0;
 +      int ret;
 +
 +      ret = 0;
  
        timer_stats_timer_set_start_info(timer);
        BUG_ON(!timer->function);
        if (timer_pending(timer)) {
                detach_timer(timer, 0);
                ret = 1;
 +      } else {
 +              if (pending_only)
 +                      goto out_unlock;
        }
  
        debug_timer_activate(timer);
  
        timer->expires = expires;
        internal_add_timer(base, timer);
 +
 +out_unlock:
        spin_unlock_irqrestore(&base->lock, flags);
  
        return ret;
  }
  
 -EXPORT_SYMBOL(__mod_timer);
 -
  /**
 - * add_timer_on - start a timer on a particular CPU
 - * @timer: the timer to be added
 - * @cpu: the CPU to start it on
 + * mod_timer_pending - modify a pending timer's timeout
 + * @timer: the pending timer to be modified
 + * @expires: new timeout in jiffies
   *
 - * This is not very scalable on SMP. Double adds are not possible.
 + * mod_timer_pending() is the same for pending timers as mod_timer(),
 + * but will not re-activate and modify already deleted timers.
 + *
 + * It is useful for unserialized use of timers.
   */
 -void add_timer_on(struct timer_list *timer, int cpu)
 +int mod_timer_pending(struct timer_list *timer, unsigned long expires)
  {
 -      struct tvec_base *base = per_cpu(tvec_bases, cpu);
 -      unsigned long flags;
 -
 -      timer_stats_timer_set_start_info(timer);
 -      BUG_ON(timer_pending(timer) || !timer->function);
 -      spin_lock_irqsave(&base->lock, flags);
 -      timer_set_base(timer, base);
 -      debug_timer_activate(timer);
 -      internal_add_timer(base, timer);
 -      /*
 -       * Check whether the other CPU is idle and needs to be
 -       * triggered to reevaluate the timer wheel when nohz is
 -       * active. We are protected against the other CPU fiddling
 -       * with the timer by holding the timer base lock. This also
 -       * makes sure that a CPU on the way to idle can not evaluate
 -       * the timer wheel.
 -       */
 -      wake_up_idle_cpu(cpu);
 -      spin_unlock_irqrestore(&base->lock, flags);
 +      return __mod_timer(timer, expires, true);
  }
 +EXPORT_SYMBOL(mod_timer_pending);
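mod_timer_pending() reuses __mod_timer() with pending_only set, so it only ever adjusts a timer that is still armed; a timer that has already been deleted stays deleted, which is the property unserialized users rely on. A toy model of that distinction, with no real timer wheel behind it:

#include <stdbool.h>
#include <stdio.h>

struct toy_timer {
	bool pending;
	unsigned long expires;
};

/* returns 1 if the timer was pending before the call, 0 otherwise */
static int toy_mod_timer(struct toy_timer *t, unsigned long expires,
			 bool pending_only)
{
	int was_pending = t->pending;

	if (!was_pending && pending_only)
		return 0;		/* a deleted timer stays deleted */

	t->expires = expires;
	t->pending = true;
	return was_pending;
}

int main(void)
{
	struct toy_timer t = { .pending = false, .expires = 0 };

	toy_mod_timer(&t, 100, false);	/* mod_timer(): arms the timer */
	t.pending = false;		/* del_timer() analogue */
	toy_mod_timer(&t, 200, true);	/* mod_timer_pending(): no-op now */
	printf("pending=%d expires=%lu\n", t.pending, t.expires);
	return 0;
}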
  
  /**
   * mod_timer - modify a timer's timeout
   */
  int mod_timer(struct timer_list *timer, unsigned long expires)
  {
 -      BUG_ON(!timer->function);
 -
 -      timer_stats_timer_set_start_info(timer);
        /*
         * This is a common optimization triggered by the
         * networking code - if the timer is re-modified
        if (timer->expires == expires && timer_pending(timer))
                return 1;
  
 -      return __mod_timer(timer, expires);
 +      return __mod_timer(timer, expires, false);
  }
 -
  EXPORT_SYMBOL(mod_timer);
  
 +/**
 + * add_timer - start a timer
 + * @timer: the timer to be added
 + *
 + * The kernel will do a ->function(->data) callback from the
 + * timer interrupt at the ->expires point in the future. The
 + * current time is 'jiffies'.
 + *
 + * The timer's ->expires, ->function (and if the handler uses it, ->data)
 + * fields must be set prior calling this function.
  +  * fields must be set prior to calling this function.
 + * Timers with an ->expires field in the past will be executed in the next
 + * timer tick.
 + */
 +void add_timer(struct timer_list *timer)
 +{
 +      BUG_ON(timer_pending(timer));
 +      mod_timer(timer, timer->expires);
 +}
 +EXPORT_SYMBOL(add_timer);
 +
 +/**
 + * add_timer_on - start a timer on a particular CPU
 + * @timer: the timer to be added
 + * @cpu: the CPU to start it on
 + *
 + * This is not very scalable on SMP. Double adds are not possible.
 + */
 +void add_timer_on(struct timer_list *timer, int cpu)
 +{
 +      struct tvec_base *base = per_cpu(tvec_bases, cpu);
 +      unsigned long flags;
 +
 +      timer_stats_timer_set_start_info(timer);
 +      BUG_ON(timer_pending(timer) || !timer->function);
 +      spin_lock_irqsave(&base->lock, flags);
 +      timer_set_base(timer, base);
 +      debug_timer_activate(timer);
 +      internal_add_timer(base, timer);
 +      /*
 +       * Check whether the other CPU is idle and needs to be
 +       * triggered to reevaluate the timer wheel when nohz is
 +       * active. We are protected against the other CPU fiddling
 +       * with the timer by holding the timer base lock. This also
 +       * makes sure that a CPU on the way to idle can not evaluate
 +       * the timer wheel.
 +       */
 +      wake_up_idle_cpu(cpu);
 +      spin_unlock_irqrestore(&base->lock, flags);
 +}
 +
  /**
   * del_timer - deactivate a timer.
   * @timer: the timer to be deactivated
@@@ -772,6 -744,7 +783,6 @@@ int del_timer(struct timer_list *timer
  
        return ret;
  }
 -
  EXPORT_SYMBOL(del_timer);
  
  #ifdef CONFIG_SMP
@@@ -805,6 -778,7 +816,6 @@@ out
  
        return ret;
  }
 -
  EXPORT_SYMBOL(try_to_del_timer_sync);
  
  /**
   */
  int del_timer_sync(struct timer_list *timer)
  {
+ #ifdef CONFIG_LOCKDEP
+       unsigned long flags;
+       local_irq_save(flags);
+       lock_map_acquire(&timer->lockdep_map);
+       lock_map_release(&timer->lockdep_map);
+       local_irq_restore(flags);
+ #endif
        for (;;) {
                int ret = try_to_del_timer_sync(timer);
                if (ret >= 0)
                cpu_relax();
        }
  }
 -
  EXPORT_SYMBOL(del_timer_sync);
  #endif
  
@@@ -897,10 -881,36 +917,36 @@@ static inline void __run_timers(struct 
  
                        set_running_timer(base, timer);
                        detach_timer(timer, 1);
                        spin_unlock_irq(&base->lock);
                        {
                                int preempt_count = preempt_count();
+ #ifdef CONFIG_LOCKDEP
+                               /*
+                                * It is permissible to free the timer from
+                                * inside the function that is called from
+                                * it; we need to take this into account for
+                                * lockdep too. To avoid bogus "held lock
+                                * freed" warnings as well as problems when
+                                * looking into timer->lockdep_map, make a
+                                * copy and use that here.
+                                */
+                               struct lockdep_map lockdep_map =
+                                       timer->lockdep_map;
+ #endif
+                               /*
+                                * Couple the lock chain with the lock chain at
+                                * del_timer_sync() by acquiring the lock_map
+                                * around the fn() call here and in
+                                * del_timer_sync().
+                                */
+                               lock_map_acquire(&lockdep_map);
                                fn(data);
+                               lock_map_release(&lockdep_map);
                                if (preempt_count != preempt_count()) {
                                        printk(KERN_ERR "huh, entered %p "
                                               "with preempt_count %08x, exited"
@@@ -1304,7 -1314,7 +1350,7 @@@ signed long __sched schedule_timeout(si
        expire = timeout + jiffies;
  
        setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
 -      __mod_timer(&timer, expire);
 +      __mod_timer(&timer, expire, false);
        schedule();
        del_singleshot_timer_sync(&timer);
  
diff --combined mm/page_alloc.c
index 5c44ed49ca93c5a1d7a39f5570b0d1254a45de80,22b15a4cde8a7f79d8e0ec8407e6123750f69288..a3803ea8c27d5aaaccb5bd2d1f8141786dc9cc27
@@@ -1479,6 -1479,8 +1479,8 @@@ __alloc_pages_internal(gfp_t gfp_mask, 
        unsigned long did_some_progress;
        unsigned long pages_reclaimed = 0;
  
+       lockdep_trace_alloc(gfp_mask);
        might_sleep_if(wait);
  
        if (should_fail_alloc_page(gfp_mask, order))
@@@ -1578,12 -1580,15 +1580,15 @@@ nofail_alloc
         */
        cpuset_update_task_memory_state();
        p->flags |= PF_MEMALLOC;
+       lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
  
        did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
  
        p->reclaim_state = NULL;
+       lockdep_clear_current_reclaim_state();
        p->flags &= ~PF_MEMALLOC;
  
        cond_resched();
@@@ -2989,7 -2994,7 +2994,7 @@@ static int __meminit next_active_region
   * was used and there are no special requirements, this is a convenient
   * alternative
   */
 -int __meminit early_pfn_to_nid(unsigned long pfn)
 +int __meminit __early_pfn_to_nid(unsigned long pfn)
  {
        int i;
  
                if (start_pfn <= pfn && pfn < end_pfn)
                        return early_node_map[i].nid;
        }
 +      /* This is a memory hole */
 +      return -1;
 +}
 +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 +
 +int __meminit early_pfn_to_nid(unsigned long pfn)
 +{
 +      int nid;
  
 +      nid = __early_pfn_to_nid(pfn);
 +      if (nid >= 0)
 +              return nid;
 +      /* just returns 0 */
        return 0;
  }
 -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 +
 +#ifdef CONFIG_NODES_SPAN_OTHER_NODES
 +bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 +{
 +      int nid;
 +
 +      nid = __early_pfn_to_nid(pfn);
 +      if (nid >= 0 && nid != node)
 +              return false;
 +      return true;
 +}
 +#endif
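The split above turns the old lookup into __early_pfn_to_nid(), which reports a hole as -1, while the public early_pfn_to_nid() keeps the historical fall-back to node 0 and early_pfn_in_nid() treats a hole as matching any node. A self-contained sketch of that wrapper pattern over a made-up early_node_map:

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long start, end; int nid; };

static const struct range early_map[] = {
	{ 0x000, 0x100, 0 },
	{ 0x200, 0x300, 1 },	/* note the hole at [0x100, 0x200) */
};

static int lookup_nid(unsigned long pfn)	/* __early_pfn_to_nid() analogue */
{
	for (unsigned int i = 0; i < sizeof(early_map) / sizeof(early_map[0]); i++)
		if (early_map[i].start <= pfn && pfn < early_map[i].end)
			return early_map[i].nid;
	return -1;				/* memory hole */
}

static int pfn_to_nid(unsigned long pfn)	/* early_pfn_to_nid() analogue */
{
	int nid = lookup_nid(pfn);

	return nid >= 0 ? nid : 0;		/* holes fall back to node 0 */
}

static bool pfn_in_nid(unsigned long pfn, int node)	/* early_pfn_in_nid() */
{
	int nid = lookup_nid(pfn);

	return nid < 0 || nid == node;		/* a hole matches any node */
}

int main(void)
{
	printf("pfn 0x180: nid=%d, in node 1? %d\n",
	       pfn_to_nid(0x180), pfn_in_nid(0x180, 1));
	return 0;
}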
  
  /* Basic iterator support to walk early_node_map[] */
  #define for_each_active_range_index_in_nid(i, nid) \
diff --combined mm/slab.c
index 4d00855629c4921e3c590406335549683e6c6daf,6b61de8543ec25532d55f75f82c2576860d58974..825c606f691d11bfb3372627aff50f115358aff3
+++ b/mm/slab.c
@@@ -3318,6 -3318,8 +3318,8 @@@ __cache_alloc_node(struct kmem_cache *c
        unsigned long save_flags;
        void *ptr;
  
+       lockdep_trace_alloc(flags);
        if (slab_should_failslab(cachep, flags))
                return NULL;
  
@@@ -3394,6 -3396,8 +3396,8 @@@ __cache_alloc(struct kmem_cache *cachep
        unsigned long save_flags;
        void *objp;
  
+       lockdep_trace_alloc(flags);
        if (slab_should_failslab(cachep, flags))
                return NULL;
  
@@@ -4457,4 -4461,3 +4461,4 @@@ size_t ksize(const void *objp
  
        return obj_size(virt_to_cache(objp));
  }
 +EXPORT_SYMBOL(ksize);
diff --combined mm/slob.c
index 0bfa680a8981127307d1d922231dca3d66f9b2bd,4b1c0c1d63cb173a1dca86a1ae0ef8423332ea58..7a3411524dacd555e9ce70e287a4232072391e10
+++ b/mm/slob.c
@@@ -126,9 -126,9 +126,9 @@@ static LIST_HEAD(free_slob_medium)
  static LIST_HEAD(free_slob_large);
  
  /*
 - * slob_page: True for all slob pages (false for bigblock pages)
 + * is_slob_page: True for all slob pages (false for bigblock pages)
   */
 -static inline int slob_page(struct slob_page *sp)
 +static inline int is_slob_page(struct slob_page *sp)
  {
        return PageSlobPage((struct page *)sp);
  }
@@@ -143,11 -143,6 +143,11 @@@ static inline void clear_slob_page(stru
        __ClearPageSlobPage((struct page *)sp);
  }
  
 +static inline struct slob_page *slob_page(const void *addr)
 +{
 +      return (struct slob_page *)virt_to_page(addr);
 +}
 +
  /*
   * slob_page_free: true for pages on free_slob_pages list.
   */
@@@ -235,7 -230,7 +235,7 @@@ static int slob_last(slob_t *s
        return !((unsigned long)slob_next(s) & ~PAGE_MASK);
  }
  
 -static void *slob_new_page(gfp_t gfp, int order, int node)
 +static void *slob_new_pages(gfp_t gfp, int order, int node)
  {
        void *page;
  
        return page_address(page);
  }
  
 +static void slob_free_pages(void *b, int order)
 +{
 +      free_pages((unsigned long)b, order);
 +}
 +
  /*
   * Allocate a slob block within a given slob_page sp.
   */
  static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
  {
 -      slob_t *prev, *cur, *aligned = 0;
 +      slob_t *prev, *cur, *aligned = NULL;
        int delta = 0, units = SLOB_UNITS(size);
  
        for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
@@@ -359,10 -349,10 +359,10 @@@ static void *slob_alloc(size_t size, gf
  
        /* Not enough space: must allocate a new page */
        if (!b) {
 -              b = slob_new_page(gfp & ~__GFP_ZERO, 0, node);
 +              b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
                if (!b)
 -                      return 0;
 -              sp = (struct slob_page *)virt_to_page(b);
 +                      return NULL;
 +              sp = slob_page(b);
                set_slob_page(sp);
  
                spin_lock_irqsave(&slob_lock, flags);
@@@ -394,7 -384,7 +394,7 @@@ static void slob_free(void *block, int 
                return;
        BUG_ON(!size);
  
 -      sp = (struct slob_page *)virt_to_page(block);
 +      sp = slob_page(block);
        units = SLOB_UNITS(size);
  
        spin_lock_irqsave(&slob_lock, flags);
                /* Go directly to page allocator. Do not pass slob allocator */
                if (slob_page_free(sp))
                        clear_slob_page_free(sp);
 +              spin_unlock_irqrestore(&slob_lock, flags);
                clear_slob_page(sp);
                free_slob_page(sp);
                free_page((unsigned long)b);
 -              goto out;
 +              return;
        }
  
        if (!slob_page_free(sp)) {
@@@ -475,6 -464,8 +475,8 @@@ void *__kmalloc_node(size_t size, gfp_
        unsigned int *m;
        int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
  
+       lockdep_trace_alloc(gfp);
        if (size < PAGE_SIZE - align) {
                if (!size)
                        return ZERO_SIZE_PTR;
        } else {
                void *ret;
  
 -              ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
 +              ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
                if (ret) {
                        struct page *page;
                        page = virt_to_page(ret);
@@@ -505,8 -496,8 +507,8 @@@ void kfree(const void *block
        if (unlikely(ZERO_OR_NULL_PTR(block)))
                return;
  
 -      sp = (struct slob_page *)virt_to_page(block);
 -      if (slob_page(sp)) {
 +      sp = slob_page(block);
 +      if (is_slob_page(sp)) {
                int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
                unsigned int *m = (unsigned int *)(block - align);
                slob_free(m, *m + align);
@@@ -524,15 -515,14 +526,15 @@@ size_t ksize(const void *block
        if (unlikely(block == ZERO_SIZE_PTR))
                return 0;
  
 -      sp = (struct slob_page *)virt_to_page(block);
 -      if (slob_page(sp)) {
 +      sp = slob_page(block);
 +      if (is_slob_page(sp)) {
                int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
                unsigned int *m = (unsigned int *)(block - align);
                return SLOB_UNITS(*m) * SLOB_UNIT;
        } else
                return sp->page.private;
  }
 +EXPORT_SYMBOL(ksize);
  
  struct kmem_cache {
        unsigned int size, align;
@@@ -584,7 -574,7 +586,7 @@@ void *kmem_cache_alloc_node(struct kmem
        if (c->size < PAGE_SIZE)
                b = slob_alloc(c->size, flags, c->align, node);
        else
 -              b = slob_new_page(flags, get_order(c->size), node);
 +              b = slob_new_pages(flags, get_order(c->size), node);
  
        if (c->ctor)
                c->ctor(b);
@@@ -598,7 -588,7 +600,7 @@@ static void __kmem_cache_free(void *b, 
        if (size < PAGE_SIZE)
                slob_free(b, size);
        else
 -              free_pages((unsigned long)b, get_order(size));
 +              slob_free_pages(b, get_order(size));
  }
  
  static void kmem_rcu_free(struct rcu_head *head)
diff --combined mm/slub.c
index c65a4edafc3343e83a2513f8edab9b0ed82f3ab5,214eb207c513f46276eb06106e94a92e2aa2060b..c4ea9158c9fbd0e4630062aa0098406000be45c3
+++ b/mm/slub.c
@@@ -374,8 -374,14 +374,8 @@@ static struct track *get_track(struct k
  static void set_track(struct kmem_cache *s, void *object,
                        enum track_item alloc, unsigned long addr)
  {
 -      struct track *p;
 -
 -      if (s->offset)
 -              p = object + s->offset + sizeof(void *);
 -      else
 -              p = object + s->inuse;
 +      struct track *p = get_track(s, object, alloc);
  
 -      p += alloc;
        if (addr) {
                p->addr = addr;
                p->cpu = smp_processor_id();
@@@ -1329,7 -1335,7 +1329,7 @@@ static struct page *get_any_partial(str
                n = get_node(s, zone_to_nid(zone));
  
                if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
 -                              n->nr_partial > n->min_partial) {
 +                              n->nr_partial > s->min_partial) {
                        page = get_partial_node(n);
                        if (page)
                                return page;
@@@ -1381,7 -1387,7 +1381,7 @@@ static void unfreeze_slab(struct kmem_c
                slab_unlock(page);
        } else {
                stat(c, DEACTIVATE_EMPTY);
 -              if (n->nr_partial < n->min_partial) {
 +              if (n->nr_partial < s->min_partial) {
                        /*
                         * Adding an empty slab to the partial slabs in order
                         * to avoid page allocator overhead. This slab needs
@@@ -1590,6 -1596,7 +1590,7 @@@ static __always_inline void *slab_alloc
        unsigned long flags;
        unsigned int objsize;
  
+       lockdep_trace_alloc(gfpflags);
        might_sleep_if(gfpflags & __GFP_WAIT);
  
        if (should_failslab(s->objsize, gfpflags))
@@@ -1718,7 -1725,7 +1719,7 @@@ static __always_inline void slab_free(s
        c = get_cpu_slab(s, smp_processor_id());
        debug_check_no_locks_freed(object, c->objsize);
        if (!(s->flags & SLAB_DEBUG_OBJECTS))
 -              debug_check_no_obj_freed(object, s->objsize);
 +              debug_check_no_obj_freed(object, c->objsize);
        if (likely(page == c->page && c->node >= 0)) {
                object[c->offset] = c->freelist;
                c->freelist = object;
@@@ -1838,7 -1845,6 +1839,7 @@@ static inline int calculate_order(int s
        int order;
        int min_objects;
        int fraction;
 +      int max_objects;
  
        /*
         * Attempt to find best configuration for a slab. This
        min_objects = slub_min_objects;
        if (!min_objects)
                min_objects = 4 * (fls(nr_cpu_ids) + 1);
 +      max_objects = (PAGE_SIZE << slub_max_order)/size;
 +      min_objects = min(min_objects, max_objects);
 +
        while (min_objects > 1) {
                fraction = 16;
                while (fraction >= 4) {
                                return order;
                        fraction /= 2;
                }
 -              min_objects /= 2;
 +              min_objects --;
        }
  
        /*
@@@ -1926,6 -1929,17 +1927,6 @@@ static voi
  init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
  {
        n->nr_partial = 0;
 -
 -      /*
 -       * The larger the object size is, the more pages we want on the partial
 -       * list to avoid pounding the page allocator excessively.
 -       */
 -      n->min_partial = ilog2(s->size);
 -      if (n->min_partial < MIN_PARTIAL)
 -              n->min_partial = MIN_PARTIAL;
 -      else if (n->min_partial > MAX_PARTIAL)
 -              n->min_partial = MAX_PARTIAL;
 -
        spin_lock_init(&n->list_lock);
        INIT_LIST_HEAD(&n->partial);
  #ifdef CONFIG_SLUB_DEBUG
@@@ -2168,15 -2182,6 +2169,15 @@@ static int init_kmem_cache_nodes(struc
  }
  #endif
  
 +static void set_min_partial(struct kmem_cache *s, unsigned long min)
 +{
 +      if (min < MIN_PARTIAL)
 +              min = MIN_PARTIAL;
 +      else if (min > MAX_PARTIAL)
 +              min = MAX_PARTIAL;
 +      s->min_partial = min;
 +}
 +
  /*
   * calculate_sizes() determines the order and the distribution of data within
   * a slab object.
@@@ -2315,11 -2320,6 +2316,11 @@@ static int kmem_cache_open(struct kmem_
        if (!calculate_sizes(s, -1))
                goto error;
  
 +      /*
 +       * The larger the object size is, the more pages we want on the partial
 +       * list to avoid pounding the page allocator excessively.
 +       */
 +      set_min_partial(s, ilog2(s->size));
        s->refcount = 1;
  #ifdef CONFIG_NUMA
        s->remote_node_defrag_ratio = 1000;
@@@ -2476,7 -2476,7 +2477,7 @@@ EXPORT_SYMBOL(kmem_cache_destroy)
   *            Kmalloc subsystem
   *******************************************************************/
  
 -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
 +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
  EXPORT_SYMBOL(kmalloc_caches);
  
  static int __init setup_slub_min_order(char *str)
@@@ -2538,7 -2538,7 +2539,7 @@@ panic
  }
  
  #ifdef CONFIG_ZONE_DMA
 -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
 +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
  
  static void sysfs_add_func(struct work_struct *w)
  {
@@@ -2659,7 -2659,7 +2660,7 @@@ void *__kmalloc(size_t size, gfp_t flag
  {
        struct kmem_cache *s;
  
 -      if (unlikely(size > PAGE_SIZE))
 +      if (unlikely(size > SLUB_MAX_SIZE))
                return kmalloc_large(size, flags);
  
        s = get_slab(size, flags);
@@@ -2687,7 -2687,7 +2688,7 @@@ void *__kmalloc_node(size_t size, gfp_
  {
        struct kmem_cache *s;
  
 -      if (unlikely(size > PAGE_SIZE))
 +      if (unlikely(size > SLUB_MAX_SIZE))
                return kmalloc_large_node(size, flags, node);
  
        s = get_slab(size, flags);
@@@ -2737,7 -2737,6 +2738,7 @@@ size_t ksize(const void *object
         */
        return s->size;
  }
 +EXPORT_SYMBOL(ksize);
  
  void kfree(const void *x)
  {
@@@ -2987,7 -2986,7 +2988,7 @@@ void __init kmem_cache_init(void
                caches++;
        }
  
 -      for (i = KMALLOC_SHIFT_LOW; i <PAGE_SHIFT; i++) {
 +      for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
                create_kmalloc_cache(&kmalloc_caches[i],
                        "kmalloc", 1 << i, GFP_KERNEL);
                caches++;
        slab_state = UP;
  
        /* Provide the correct kmalloc names now that the caches are up */
 -      for (i = KMALLOC_SHIFT_LOW; i <PAGE_SHIFT; i++)
 +      for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
                kmalloc_caches[i]. name =
                        kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
  
@@@ -3224,7 -3223,7 +3225,7 @@@ void *__kmalloc_track_caller(size_t siz
  {
        struct kmem_cache *s;
  
 -      if (unlikely(size > PAGE_SIZE))
 +      if (unlikely(size > SLUB_MAX_SIZE))
                return kmalloc_large(size, gfpflags);
  
        s = get_slab(size, gfpflags);
@@@ -3240,7 -3239,7 +3241,7 @@@ void *__kmalloc_node_track_caller(size_
  {
        struct kmem_cache *s;
  
 -      if (unlikely(size > PAGE_SIZE))
 +      if (unlikely(size > SLUB_MAX_SIZE))
                return kmalloc_large_node(size, gfpflags, node);
  
        s = get_slab(size, gfpflags);
@@@ -3837,26 -3836,6 +3838,26 @@@ static ssize_t order_show(struct kmem_c
  }
  SLAB_ATTR(order);
  
 +static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
 +{
 +      return sprintf(buf, "%lu\n", s->min_partial);
 +}
 +
 +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
 +                               size_t length)
 +{
 +      unsigned long min;
 +      int err;
 +
 +      err = strict_strtoul(buf, 10, &min);
 +      if (err)
 +              return err;
 +
 +      set_min_partial(s, min);
 +      return length;
 +}
 +SLAB_ATTR(min_partial);
 +
  static ssize_t ctor_show(struct kmem_cache *s, char *buf)
  {
        if (s->ctor) {
@@@ -4172,7 -4151,6 +4173,7 @@@ static struct attribute *slab_attrs[] 
        &object_size_attr.attr,
        &objs_per_slab_attr.attr,
        &order_attr.attr,
 +      &min_partial_attr.attr,
        &objects_attr.attr,
        &objects_partial_attr.attr,
        &total_objects_attr.attr,
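Earlier in this file's diff, calculate_order() is changed to cap the desired object count at what the largest allowed order can hold and to relax it one object at a time instead of halving it, so fewer sizes fall through to the high-order fallback. A hedged, self-contained model of that search; PAGE_SZ, MAX_ORDER and the 16..4 waste ladder mirror the spirit of the code rather than its exact constants.

#include <stdio.h>

#define PAGE_SZ		4096UL
#define MAX_ORDER	3

/* smallest order that fits min_objects and wastes at most 1/fract of the slab */
static int slab_order(unsigned long size, unsigned long min_objects,
		      unsigned long fract_leftover)
{
	for (int order = 0; order <= MAX_ORDER; order++) {
		unsigned long slab = PAGE_SZ << order;

		if (slab / size < min_objects)
			continue;
		if (slab % size <= slab / fract_leftover)
			return order;
	}
	return -1;
}

static int calculate_order(unsigned long size, unsigned long min_objects)
{
	unsigned long max_objects = (PAGE_SZ << MAX_ORDER) / size;

	if (min_objects > max_objects)
		min_objects = max_objects;	/* the new cap */

	while (min_objects > 1) {
		for (unsigned long fraction = 16; fraction >= 4; fraction /= 2) {
			int order = slab_order(size, min_objects, fraction);

			if (order >= 0)
				return order;
		}
		min_objects--;			/* was: min_objects /= 2 */
	}
	return slab_order(size, 1, 1);		/* last resort: accept any fit */
}

int main(void)
{
	printf("order for 700-byte objects: %d\n", calculate_order(700, 16));
	return 0;
}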
diff --combined mm/vmscan.c
index 56ddf41149eb77a55158ced628c02f817b990618,cf844134527759ced5c2093b89174779006c747d..479e46719394cceccad44d55c28db3e2942616e7
@@@ -1262,6 -1262,7 +1262,6 @@@ static void shrink_active_list(unsigne
         * Move the pages to the [file or anon] inactive list.
         */
        pagevec_init(&pvec, 1);
 -      pgmoved = 0;
        lru = LRU_BASE + file * LRU_FILE;
  
        spin_lock_irq(&zone->lru_lock);
         */
        reclaim_stat->recent_rotated[!!file] += pgmoved;
  
 +      pgmoved = 0;
        while (!list_empty(&l_inactive)) {
                page = lru_to_page(&l_inactive);
                prefetchw_prev_lru_page(page, &l_inactive, flags);
@@@ -1469,7 -1469,7 +1469,7 @@@ static void shrink_zone(int priority, s
                int file = is_file_lru(l);
                int scan;
  
 -              scan = zone_page_state(zone, NR_LRU_BASE + l);
 +              scan = zone_nr_pages(zone, sc, l);
                if (priority) {
                        scan >>= priority;
                        scan = (scan * percent[file]) / 100;
@@@ -1965,6 -1965,8 +1965,8 @@@ static int kswapd(void *p
        };
        node_to_cpumask_ptr(cpumask, pgdat->node_id);
  
+       lockdep_set_current_reclaim_state(GFP_KERNEL);
        if (!cpumask_empty(cpumask))
                set_cpus_allowed_ptr(tsk, cpumask);
        current->reclaim_state = &reclaim_state;
@@@ -2057,31 -2059,31 +2059,31 @@@ static unsigned long shrink_all_zones(u
                                      int pass, struct scan_control *sc)
  {
        struct zone *zone;
 -      unsigned long nr_to_scan, ret = 0;
 -      enum lru_list l;
 +      unsigned long ret = 0;
  
        for_each_zone(zone) {
 +              enum lru_list l;
  
                if (!populated_zone(zone))
                        continue;
 -
                if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
                        continue;
  
                for_each_evictable_lru(l) {
 +                      enum zone_stat_item ls = NR_LRU_BASE + l;
 +                      unsigned long lru_pages = zone_page_state(zone, ls);
 +
                        /* For pass = 0, we don't shrink the active list */
 -                      if (pass == 0 &&
 -                              (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
 +                      if (pass == 0 && (l == LRU_ACTIVE_ANON ||
 +                                              l == LRU_ACTIVE_FILE))
                                continue;
  
 -                      zone->lru[l].nr_scan +=
 -                              (zone_page_state(zone, NR_LRU_BASE + l)
 -                                                              >> prio) + 1;
 +                      zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
                        if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
 +                              unsigned long nr_to_scan;
 +
                                zone->lru[l].nr_scan = 0;
 -                              nr_to_scan = min(nr_pages,
 -                                      zone_page_state(zone,
 -                                                      NR_LRU_BASE + l));
 +                              nr_to_scan = min(nr_pages, lru_pages);
                                ret += shrink_list(l, nr_to_scan, zone,
                                                                sc, prio);
                                if (ret >= nr_pages)
                        }
                }
        }
 -
        return ret;
  }
  
@@@ -2111,6 -2114,7 +2113,6 @@@ unsigned long shrink_all_memory(unsigne
                .may_swap = 0,
                .swap_cluster_max = nr_pages,
                .may_writepage = 1,
 -              .swappiness = vm_swappiness,
                .isolate_pages = isolate_pages_global,
        };
  
                int prio;
  
                /* Force reclaiming mapped pages in the passes #3 and #4 */
 -              if (pass > 2) {
 +              if (pass > 2)
                        sc.may_swap = 1;
 -                      sc.swappiness = 100;
 -              }
  
                for (prio = DEF_PRIORITY; prio >= 0; prio--) {
                        unsigned long nr_to_scan = nr_pages - ret;