Merge branches 'sched/rt' and 'sched/urgent' into sched/core

author    Ingo Molnar <mingo@elte.hu>, Sun, 8 Feb 2009 19:12:46 +0000 (20:12 +0100)
committer Ingo Molnar <mingo@elte.hu>, Sun, 8 Feb 2009 19:12:46 +0000 (20:12 +0100)

diff --combined include/linux/init_task.h

index 2f3c2d4ef73b1b0df8b4e96998a9021480d5706e,9d85d9f03d183e50e1ab2f4272f54b28d0327144,ea0ea1a4c36fa0c36f2a0db8be447a2d8b6ce817..fd2b11f59077b40c06302aff39b6a0655a7d05bd
--- 1/include/linux/init_task.h
--- 2/include/linux/init_task.h
--- 3/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@@@ -48,6 -48,6 -48,12 +48,12 @@@@ extern struct fs_struct init_fs
         .posix_timers    = LIST_HEAD_INIT(sig.posix_timers),            \
         .cpu_timers     = INIT_CPU_TIMERS(sig.cpu_timers),              \
         .rlim           = INIT_RLIMITS,                                 \
++      .cputime        = { .totals = {                                 \
++              .utime = cputime_zero,                                  \
++              .stime = cputime_zero,                                  \
++              .sum_exec_runtime = 0,                                  \
++              .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock),  \
++      }, },                                                           \
    }
    
    extern struct nsproxy init_nsproxy;
@@@@ -142,6 -142,7 -148,6 +148,7 @@@@ extern struct cred init_cred
                 .nr_cpus_allowed = NR_CPUS,                             \
         },                                                              \
         .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
+ +     .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
         .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
         .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
         .real_parent    = &tsk,                                         \
diff --combined include/linux/sched.h

index 5d56b54350a5bccde5a64c5cd5862e8f83d66fe8,c37c5141037b8cb398acd2712652a02110d5fb5c,2127e959e0f4ac86565ff0832063586ac6387dab..61d9b6c1ebc9777b4ab9083fc9a27ee53cd81d20
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
--- 3/include/linux/sched.h
+++ b/include/linux/sched.h
@@@@ -293,6 -293,6 -293,9 +293,9 @@@@ extern void sched_show_task(struct task
    extern void softlockup_tick(void);
    extern void touch_softlockup_watchdog(void);
    extern void touch_all_softlockup_watchdogs(void);
++ extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
++                                  struct file *filp, void __user *buffer,
++                                  size_t *lenp, loff_t *ppos);
    extern unsigned int  softlockup_panic;
    extern unsigned long sysctl_hung_task_check_count;
    extern unsigned long sysctl_hung_task_timeout_secs;
@@@@ -440,6 -440,6 -443,7 +443,7 @@@@ struct pacct_struct 
     * @utime:           time spent in user mode, in &cputime_t units
     * @stime:           time spent in kernel mode, in &cputime_t units
     * @sum_exec_runtime:        total time spent on the CPU, in nanoseconds
++  * @lock:            lock for fields in this struct
     *
     * This structure groups together three kinds of CPU time that are
     * tracked for threads and thread groups.  Most things considering
@@@@ -450,6 -450,6 -454,7 +454,7 @@@@ struct task_cputime 
         cputime_t utime;
         cputime_t stime;
         unsigned long long sum_exec_runtime;
++      spinlock_t lock;
    };
    /* Alternate field names when used to cache expirations. */
    #define prof_exp     stime
@@@@ -465,7 -465,7 -470,7 +470,7 @@@@
     * used for thread group CPU clock calculations.
     */
    struct thread_group_cputime {
--      struct task_cputime *totals;
++      struct task_cputime totals;
    };
    
    /*
@@@@ -626,7 -626,7 -631,6 +631,6 @@@@ struct user_struct 
         atomic_t inotify_devs;  /* How many inotify devs does this user have opened? */
    #endif
    #ifdef CONFIG_EPOLL
--      atomic_t epoll_devs;    /* The number of epoll descriptors currently open */
         atomic_t epoll_watches; /* The number of file descriptors currently watched */
    #endif
    #ifdef CONFIG_POSIX_MQUEUE
@@@@ -977,6 -977,7 -981,6 +981,7 @@@@ struct sched_class 
                               struct rq *busiest, struct sched_domain *sd,
                               enum cpu_idle_type idle);
         void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+ +     int (*needs_post_schedule) (struct rq *this_rq);
         void (*post_schedule) (struct rq *this_rq);
         void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
    
@@@@ -1031,10 -1032,6 -1035,6 +1036,10 @@@@ struct sched_entity 
         u64                     last_wakeup;
         u64                     avg_overlap;
    
+ ++     u64                     start_runtime;
+ ++     u64                     avg_wakeup;
+ ++     u64                     nr_migrations;
+ ++
    #ifdef CONFIG_SCHEDSTATS
         u64                     wait_start;
         u64                     wait_max;
@@@@ -1050,6 -1047,7 -1050,7 +1055,6 @@@@
         u64                     exec_max;
         u64                     slice_max;
    
- --     u64                     nr_migrations;
         u64                     nr_migrations_cold;
         u64                     nr_failed_migrations_affine;
         u64                     nr_failed_migrations_running;
@@@@ -1146,6 -1144,7 -1147,6 +1151,7 @@@@ struct task_struct 
    #endif
    
         struct list_head tasks;
+ +     struct plist_node pushable_tasks;
    
         struct mm_struct *mm, *active_mm;
    
@@@@ -2183,24 -2182,24 -2184,30 +2189,30 @@@@ static inline int spin_needbreak(spinlo
     * Thread group CPU time accounting.
     */
    
-- extern int thread_group_cputime_alloc(struct task_struct *);
-- extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
-- 
-- static inline void thread_group_cputime_init(struct signal_struct *sig)
++ static inline
++ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
    {
--      sig->cputime.totals = NULL;
++      struct task_cputime *totals = &tsk->signal->cputime.totals;
++      unsigned long flags;
++ 
++      spin_lock_irqsave(&totals->lock, flags);
++      *times = *totals;
++      spin_unlock_irqrestore(&totals->lock, flags);
    }
    
-- static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
++ static inline void thread_group_cputime_init(struct signal_struct *sig)
    {
--      if (curr->signal->cputime.totals)
--              return 0;
--      return thread_group_cputime_alloc(curr);
++      sig->cputime.totals = (struct task_cputime){
++              .utime = cputime_zero,
++              .stime = cputime_zero,
++              .sum_exec_runtime = 0,
++      };
++ 
++      spin_lock_init(&sig->cputime.totals.lock);
    }
    
    static inline void thread_group_cputime_free(struct signal_struct *sig)
    {
--      free_percpu(sig->cputime.totals);
    }
    
    /*
diff --combined init/Kconfig

index a724a149bf3f232aba760c6b584db3f52dfd7e32,19b78aa010e30e1d1b7d91a528426feca5d029dc,f068071fcc5d952a0a6c7d745797ea094d7b3999..a90fcbeeb9d23bbd542a091cace403b970ab371d
--- 1/init/Kconfig
--- 2/init/Kconfig
--- 3/init/Kconfig
+++ b/init/Kconfig
@@@@ -238,6 -238,6 -238,98 +238,98 @@@@ config AUDIT_TRE
         def_bool y
         depends on AUDITSYSCALL && INOTIFY
    
++ menu "RCU Subsystem"
++ 
++ choice
++      prompt "RCU Implementation"
++      default CLASSIC_RCU
++ 
++ config CLASSIC_RCU
++      bool "Classic RCU"
++      help
++        This option selects the classic RCU implementation that is
++        designed for best read-side performance on non-realtime
++        systems.
++ 
++        Select this option if you are unsure.
++ 
++ config TREE_RCU
++      bool "Tree-based hierarchical RCU"
++      help
++        This option selects the RCU implementation that is
++        designed for very large SMP system with hundreds or
++        thousands of CPUs.
++ 
++ config PREEMPT_RCU
++      bool "Preemptible RCU"
++      depends on PREEMPT
++      help
++        This option reduces the latency of the kernel by making certain
++        RCU sections preemptible. Normally RCU code is non-preemptible, if
++        this option is selected then read-only RCU sections become
++        preemptible. This helps latency, but may expose bugs due to
++        now-naive assumptions about each RCU read-side critical section
++        remaining on a given CPU through its execution.
++ 
++ endchoice
++ 
++ config RCU_TRACE
++      bool "Enable tracing for RCU"
++      depends on TREE_RCU || PREEMPT_RCU
++      help
++        This option provides tracing in RCU which presents stats
++        in debugfs for debugging RCU implementation.
++ 
++        Say Y here if you want to enable RCU tracing
++        Say N if you are unsure.
++ 
++ config RCU_FANOUT
++      int "Tree-based hierarchical RCU fanout value"
++      range 2 64 if 64BIT
++      range 2 32 if !64BIT
++      depends on TREE_RCU
++      default 64 if 64BIT
++      default 32 if !64BIT
++      help
++        This option controls the fanout of hierarchical implementations
++        of RCU, allowing RCU to work efficiently on machines with
++        large numbers of CPUs.  This value must be at least the cube
++        root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
++        systems and up to 262,144 for 64-bit systems.
++ 
++        Select a specific number if testing RCU itself.
++        Take the default if unsure.
++ 
++ config RCU_FANOUT_EXACT
++      bool "Disable tree-based hierarchical RCU auto-balancing"
++      depends on TREE_RCU
++      default n
++      help
++        This option forces use of the exact RCU_FANOUT value specified,
++        regardless of imbalances in the hierarchy.  This is useful for
++        testing RCU itself, and might one day be useful on systems with
++        strong NUMA behavior.
++ 
++        Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
++ 
++        Say N if unsure.
++ 
++ config TREE_RCU_TRACE
++      def_bool RCU_TRACE && TREE_RCU
++      select DEBUG_FS
++      help
++        This option provides tracing for the TREE_RCU implementation,
++        permitting Makefile to trivially select kernel/rcutree_trace.c.
++ 
++ config PREEMPT_RCU_TRACE
++      def_bool RCU_TRACE && PREEMPT_RCU
++      select DEBUG_FS
++      help
++        This option provides tracing for the PREEMPT_RCU implementation,
++        permitting Makefile to trivially select kernel/rcupreempt_trace.c.
++ 
++ endmenu # "RCU Subsystem"
++ 
    config IKCONFIG
         tristate "Kernel .config support"
         ---help---
@@@@ -323,26 -323,26 -415,26 +415,26 @@@@ config CGROUP_SCHE
           This option allows you to create arbitrary task groups
           using the "cgroup" pseudo filesystem and control
           the cpu bandwidth allocated to each such task group.
--        Refer to Documentation/cgroups.txt for more information
--        on "cgroup" pseudo filesystem.
++        Refer to Documentation/cgroups/cgroups.txt for more
++        information on "cgroup" pseudo filesystem.
    
    endchoice
    
-- menu "Control Group support"
-- config CGROUPS
--      bool "Control Group support"
++ menuconfig CGROUPS
++      boolean "Control Group support"
         help
--        This option add support for grouping sets of processes together, for
++        This option adds support for grouping sets of processes together, for
           use with process control subsystems such as Cpusets, CFS, memory
           controls or device isolation.
           See
--              - Documentation/cpusets.txt     (Cpusets)
                 - Documentation/scheduler/sched-design-CFS.txt  (CFS)
--              - Documentation/cgroups/ (features for grouping, isolation)
--              - Documentation/controllers/ (features for resource control)
++              - Documentation/cgroups/ (features for grouping, isolation
++                                        and resource control)
    
           Say N if unsure.
    
++ if CGROUPS
++ 
    config CGROUP_DEBUG
         bool "Example debug cgroup subsystem"
         depends on CGROUPS
@@@@ -350,24 -350,24 -442,24 +442,24 @@@@
         help
           This option enables a simple cgroup subsystem that
           exports useful debugging information about the cgroups
--        framework
++        framework.
    
--        Say N if unsure
++        Say N if unsure.
    
    config CGROUP_NS
--         bool "Namespace cgroup subsystem"
--         depends on CGROUPS
--         help
--           Provides a simple namespace cgroup subsystem to
--           provide hierarchical naming of sets of namespaces,
--           for instance virtual servers and checkpoint/restart
--           jobs.
++      bool "Namespace cgroup subsystem"
++      depends on CGROUPS
++      help
++        Provides a simple namespace cgroup subsystem to
++        provide hierarchical naming of sets of namespaces,
++        for instance virtual servers and checkpoint/restart
++        jobs.
    
    config CGROUP_FREEZER
--         bool "control group freezer subsystem"
--         depends on CGROUPS
--         help
--           Provides a way to freeze and unfreeze all tasks in a
++      bool "Freezer cgroup subsystem"
++      depends on CGROUPS
++      help
++        Provides a way to freeze and unfreeze all tasks in a
           cgroup.
    
    config CGROUP_DEVICE
@@@@ -388,18 -388,18 -480,23 +480,23 @@@@ config CPUSET
    
           Say N if unsure.
    
++ config PROC_PID_CPUSET
++      bool "Include legacy /proc/<pid>/cpuset file"
++      depends on CPUSETS
++      default y
++ 
    config CGROUP_CPUACCT
         bool "Simple CPU accounting cgroup subsystem"
         depends on CGROUPS
         help
           Provides a simple Resource Controller for monitoring the
--        total CPU consumed by the tasks in a cgroup
++        total CPU consumed by the tasks in a cgroup.
    
    config RESOURCE_COUNTERS
         bool "Resource counters"
         help
           This option enables controller independent resource accounting
--           infrastructure that works with cgroups
++        infrastructure that works with cgroups.
         depends on CGROUPS
    
    config CGROUP_MEM_RES_CTLR
@@@@ -425,9 -425,9 -522,6 +522,6 @@@@
           This config option also selects MM_OWNER config option, which
           could in turn add some fork/exit overhead.
    
-- config MM_OWNER
--      bool
-- 
    config CGROUP_MEM_RES_CTLR_SWAP
         bool "Memory Resource Controller Swap Extension(EXPERIMENTAL)"
         depends on CGROUP_MEM_RES_CTLR && SWAP && EXPERIMENTAL
@@@@ -444,8 -444,8 -538,10 +538,10 @@@@
           there will be no overhead from this. Even when you set this config=y,
           if boot option "noswapaccount" is set, swap will not be accounted.
    
++ endif # CGROUPS
    
-- endmenu
++ config MM_OWNER
++      bool
    
    config SYSFS_DEPRECATED
         bool
@@@@ -483,11 -483,11 -579,6 +579,6 @@@@ config SYSFS_DEPRECATED_V
           if the original kernel, that came with your distribution, has
           this option set to N.
    
-- config PROC_PID_CPUSET
--      bool "Include legacy /proc/<pid>/cpuset file"
--      depends on CPUSETS
--      default y
-- 
    config RELAY
         bool "Kernel->user space relay support (formerly relayfs)"
         help
@@@@ -542,6 -542,6 -633,14 +633,14 @@@@ config PID_N
           Unless you want to work with an experimental feature
           say N here.
    
++ config NET_NS
++      bool "Network namespace"
++      default n
++      depends on NAMESPACES && EXPERIMENTAL && NET
++      help
++        Allow user space to create what appear to be multiple instances
++        of the network stack.
++ 
    config BLK_DEV_INITRD
         bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
         depends on BROKEN || !FRV
@@@@ -626,13 -626,13 -725,6 +725,6 @@@@ config KALLSYMS_AL
    
            Say N.
    
-- config KALLSYMS_STRIP_GENERATED
--      bool "Strip machine generated symbols from kallsyms"
--      depends on KALLSYMS_ALL
--      default y
--      help
--        Say N if you want kallsyms to retain even machine generated symbols.
-- 
    config KALLSYMS_EXTRA_PASS
         bool "Do an extra kallsyms pass"
         depends on KALLSYMS
@@@@ -874,7 -874,6 -966,7 +966,6 @@@@ config SLABINF
    
    config RT_MUTEXES
         boolean
- -     select PLIST
    
    config BASE_SMALL
         int
@@@@ -973,90 -972,90 -1065,3 +1064,3 @@@@ source "block/Kconfig
    config PREEMPT_NOTIFIERS
         bool
    
-- choice
--      prompt "RCU Implementation"
--      default CLASSIC_RCU
-- 
-- config CLASSIC_RCU
--      bool "Classic RCU"
--      help
--        This option selects the classic RCU implementation that is
--        designed for best read-side performance on non-realtime
--        systems.
-- 
--        Select this option if you are unsure.
-- 
-- config TREE_RCU
--      bool "Tree-based hierarchical RCU"
--      help
--        This option selects the RCU implementation that is
--        designed for very large SMP system with hundreds or
--        thousands of CPUs.
-- 
-- config PREEMPT_RCU
--      bool "Preemptible RCU"
--      depends on PREEMPT
--      help
--        This option reduces the latency of the kernel by making certain
--        RCU sections preemptible. Normally RCU code is non-preemptible, if
--        this option is selected then read-only RCU sections become
--        preemptible. This helps latency, but may expose bugs due to
--        now-naive assumptions about each RCU read-side critical section
--        remaining on a given CPU through its execution.
-- 
-- endchoice
-- 
-- config RCU_TRACE
--      bool "Enable tracing for RCU"
--      depends on TREE_RCU || PREEMPT_RCU
--      help
--        This option provides tracing in RCU which presents stats
--        in debugfs for debugging RCU implementation.
-- 
--        Say Y here if you want to enable RCU tracing
--        Say N if you are unsure.
-- 
-- config RCU_FANOUT
--      int "Tree-based hierarchical RCU fanout value"
--      range 2 64 if 64BIT
--      range 2 32 if !64BIT
--      depends on TREE_RCU
--      default 64 if 64BIT
--      default 32 if !64BIT
--      help
--        This option controls the fanout of hierarchical implementations
--        of RCU, allowing RCU to work efficiently on machines with
--        large numbers of CPUs.  This value must be at least the cube
--        root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
--        systems and up to 262,144 for 64-bit systems.
-- 
--        Select a specific number if testing RCU itself.
--        Take the default if unsure.
-- 
-- config RCU_FANOUT_EXACT
--      bool "Disable tree-based hierarchical RCU auto-balancing"
--      depends on TREE_RCU
--      default n
--      help
--        This option forces use of the exact RCU_FANOUT value specified,
--        regardless of imbalances in the hierarchy.  This is useful for
--        testing RCU itself, and might one day be useful on systems with
--        strong NUMA behavior.
-- 
--        Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
-- 
--        Say N if unsure.
-- 
-- config TREE_RCU_TRACE
--      def_bool RCU_TRACE && TREE_RCU
--      select DEBUG_FS
--      help
--        This option provides tracing for the TREE_RCU implementation,
--        permitting Makefile to trivially select kernel/rcutree_trace.c.
-- 
-- config PREEMPT_RCU_TRACE
--      def_bool RCU_TRACE && PREEMPT_RCU
--      select DEBUG_FS
--      help
--        This option provides tracing for the PREEMPT_RCU implementation,
--        permitting Makefile to trivially select kernel/rcupreempt_trace.c.
diff --combined kernel/sched.c

index 86f5a063f0b929ab351eadfcdf7a762f473f8e8d,2b703f1fac3a1de8cefd93509cf87ae3813c22c5,e1fc67d0674c0882788dd95ec6ddbdd7e8d25c34..1dae85a1221ad7b2214a12bb939b07cd1a14576c
--- 1/kernel/sched.c
--- 2/kernel/sched.c
--- 3/kernel/sched.c
+++ b/kernel/sched.c
@@@@ -125,9 -125,6 -125,9 +125,9 @@@@ DEFINE_TRACE(sched_switch)
    DEFINE_TRACE(sched_migrate_task);
    
    #ifdef CONFIG_SMP
+ + 
+ + static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+ + 
    /*
     * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
     * Since cpu_power is a 'constant', we can use a reciprocal divide.
@@@@ -467,11 -464,17 -467,11 +467,17 @@@@ struct rt_rq 
         struct rt_prio_array active;
         unsigned long rt_nr_running;
    #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- -     int highest_prio; /* highest queued rt task prio */
+ +     struct {
+ +             int curr; /* highest queued rt task prio */
+ +#ifdef CONFIG_SMP
+ +             int next; /* next highest */
+ +#endif
+ +     } highest_prio;
    #endif
    #ifdef CONFIG_SMP
         unsigned long rt_nr_migratory;
         int overloaded;
+ +     struct plist_head pushable_tasks;
    #endif
         int rt_throttled;
         u64 rt_time;
@@@@ -1323,8 -1326,8 -1323,8 +1329,8 @@@@ static inline void update_load_sub(stru
     * slice expiry etc.
     */
    
-- #define WEIGHT_IDLEPRIO              2
-- #define WMULT_IDLEPRIO               (1 << 31)
++ #define WEIGHT_IDLEPRIO                3
++ #define WMULT_IDLEPRIO         1431655765
    
    /*
     * Nice levels are multiplicative, with a gentle 10% change for every
@@@@ -1610,21 -1613,42 -1610,21 +1616,42 @@@@ static inline void update_shares_locked
    
    #endif
    
+ +#ifdef CONFIG_PREEMPT
+ +
    /*
- - * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ + * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ + * way at the expense of forcing extra atomic operations in all
+ + * invocations.  This assures that the double_lock is acquired using the
+ + * same underlying policy as the spinlock_t on this architecture, which
+ + * reduces latency compared to the unfair variant below.  However, it
+ + * also adds more overhead and therefore may reduce throughput.
     */
- -static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ +     __releases(this_rq->lock)
+ +     __acquires(busiest->lock)
+ +     __acquires(this_rq->lock)
+ +{
+ +     spin_unlock(&this_rq->lock);
+ +     double_rq_lock(this_rq, busiest);
+ +
+ +     return 1;
+ +}
+ +
+ +#else
+ +/*
+ + * Unfair double_lock_balance: Optimizes throughput at the expense of
+ + * latency by eliminating extra atomic operations when the locks are
+ + * already in proper order on entry.  This favors lower cpu-ids and will
+ + * grant the double lock to lower cpus over higher ids under contention,
+ + * regardless of entry order into the function.
+ + */
+ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(this_rq->lock)
         __acquires(busiest->lock)
         __acquires(this_rq->lock)
    {
         int ret = 0;
    
- -     if (unlikely(!irqs_disabled())) {
- -             /* printk() doesn't work good under rq->lock */
- -             spin_unlock(&this_rq->lock);
- -             BUG_ON(1);
- -     }
         if (unlikely(!spin_trylock(&busiest->lock))) {
                 if (busiest < this_rq) {
                         spin_unlock(&this_rq->lock);
@@@@ -1637,6 -1661,22 -1637,6 +1664,22 @@@@
         return ret;
    }
    
+ +#endif /* CONFIG_PREEMPT */
+ +
+ +/*
+ + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ + */
+ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ +{
+ +     if (unlikely(!irqs_disabled())) {
+ +             /* printk() doesn't work good under rq->lock */
+ +             spin_unlock(&this_rq->lock);
+ +             BUG_ON(1);
+ +     }
+ +
+ +     return _double_lock_balance(this_rq, busiest);
+ +}
+ +
    static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(busiest->lock)
    {
@@@@ -1705,9 -1745,6 -1705,6 +1748,9 @@@@ static void update_avg(u64 *avg, u64 sa
    
    static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
    {
+ ++     if (wakeup)
+ ++             p->se.start_runtime = p->se.sum_exec_runtime;
+ ++
         sched_info_queued(p);
         p->sched_class->enqueue_task(rq, p, wakeup);
         p->se.on_rq = 1;
@@@@ -1715,15 -1752,10 -1712,10 +1758,15 @@@@
    
    static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
    {
- --     if (sleep && p->se.last_wakeup) {
- --             update_avg(&p->se.avg_overlap,
- --                        p->se.sum_exec_runtime - p->se.last_wakeup);
- --             p->se.last_wakeup = 0;
+ ++     if (sleep) {
+ ++             if (p->se.last_wakeup) {
+ ++                     update_avg(&p->se.avg_overlap,
+ ++                             p->se.sum_exec_runtime - p->se.last_wakeup);
+ ++                     p->se.last_wakeup = 0;
+ ++             } else {
+ ++                     update_avg(&p->se.avg_wakeup,
+ ++                             sysctl_sched_wakeup_granularity);
+ ++             }
         }
    
         sched_info_dequeued(p);
@@@@ -2274,6 -2306,6 -2266,16 +2317,16 @@@@ static int try_to_wake_up(struct task_s
         if (!sched_feat(SYNC_WAKEUPS))
                 sync = 0;
    
++      if (!sync) {
++              if (current->se.avg_overlap < sysctl_sched_migration_cost &&
++                        p->se.avg_overlap < sysctl_sched_migration_cost)
++                      sync = 1;
++      } else {
++              if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
++                        p->se.avg_overlap >= sysctl_sched_migration_cost)
++                      sync = 0;
++      }
++ 
    #ifdef CONFIG_SMP
         if (sched_feat(LB_WAKEUP_UPDATE)) {
                 struct sched_domain *sd;
@@@@ -2353,22 -2385,6 -2355,6 +2406,22 @@@@ out_activate
         activate_task(rq, p, 1);
         success = 1;
    
+ ++     /*
+ ++      * Only attribute actual wakeups done by this task.
+ ++      */
+ ++     if (!in_interrupt()) {
+ ++             struct sched_entity *se = &current->se;
+ ++             u64 sample = se->sum_exec_runtime;
+ ++
+ ++             if (se->last_wakeup)
+ ++                     sample -= se->last_wakeup;
+ ++             else
+ ++                     sample -= se->start_runtime;
+ ++             update_avg(&se->avg_wakeup, sample);
+ ++
+ ++             se->last_wakeup = se->sum_exec_runtime;
+ ++     }
+ ++
    out_running:
         trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
@@@@ -2379,6 -2395,8 -2365,8 +2432,6 @@@@
                 p->sched_class->task_wake_up(rq, p);
    #endif
    out:
- --     current->se.last_wakeup = current->se.sum_exec_runtime;
- --
         task_rq_unlock(rq, &flags);
    
         return success;
@@@@ -2408,8 -2426,6 -2396,6 +2461,8 @@@@ static void __sched_fork(struct task_st
         p->se.prev_sum_exec_runtime     = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
+ ++     p->se.start_runtime             = 0;
+ ++     p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
    
    #ifdef CONFIG_SCHEDSTATS
         p->se.wait_start                = 0;
@@@@ -2472,6 -2488,8 -2458,6 +2525,8 @@@@ void sched_fork(struct task_struct *p, 
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
    #endif
+ +     plist_node_init(&p->pushable_tasks, MAX_PRIO);
+ +
         put_cpu();
    }
    
@@@@ -2612,6 -2630,12 -2598,6 +2667,12 @@@@ static void finish_task_switch(struct r
    {
         struct mm_struct *mm = rq->prev_mm;
         long prev_state;
+ +#ifdef CONFIG_SMP
+ +     int post_schedule = 0;
+ +
+ +     if (current->sched_class->needs_post_schedule)
+ +             post_schedule = current->sched_class->needs_post_schedule(rq);
+ +#endif
    
         rq->prev_mm = NULL;
    
@@@@ -2630,7 -2654,7 -2616,7 +2691,7 @@@@
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
    #ifdef CONFIG_SMP
- -     if (current->sched_class->post_schedule)
+ +     if (post_schedule)
                 current->sched_class->post_schedule(rq);
    #endif
    
@@@@ -3011,6 -3035,16 -2997,6 +3072,16 @@@@ next
         pulled++;
         rem_load_move -= p->se.load.weight;
    
+ +#ifdef CONFIG_PREEMPT
+ +     /*
+ +      * NEWIDLE balancing is a source of latency, so preemptible kernels
+ +      * will stop after the first task is pulled to minimize the critical
+ +      * section.
+ +      */
+ +     if (idle == CPU_NEWLY_IDLE)
+ +             goto out;
+ +#endif
+ +
         /*
          * We only want to steal up to the prescribed amount of weighted load.
          */
@@@@ -3057,9 -3091,15 -3043,9 +3128,15 @@@@ static int move_tasks(struct rq *this_r
                                 sd, idle, all_pinned, &this_best_prio);
                 class = class->next;
    
+ +#ifdef CONFIG_PREEMPT
+ +             /*
+ +              * NEWIDLE balancing is a source of latency, so preemptible
+ +              * kernels will stop after the first task is pulled to minimize
+ +              * the critical section.
+ +              */
                 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                         break;
- -
+ +#endif
         } while (class && max_load_move > total_load_moved);
    
         return total_load_moved > 0;
@@@@ -3904,19 -3944,19 -3890,24 +3981,24 @@@@ int select_nohz_load_balancer(int stop_
         int cpu = smp_processor_id();
    
         if (stop_tick) {
--              cpumask_set_cpu(cpu, nohz.cpu_mask);
                 cpu_rq(cpu)->in_nohz_recently = 1;
    
--              /*
--               * If we are going offline and still the leader, give up!
--               */
--              if (!cpu_active(cpu) &&
--                  atomic_read(&nohz.load_balancer) == cpu) {
++              if (!cpu_active(cpu)) {
++                      if (atomic_read(&nohz.load_balancer) != cpu)
++                              return 0;
++ 
++                      /*
++                       * If we are going offline and still the leader,
++                       * give up!
++                       */
                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                 BUG();
++ 
                         return 0;
                 }
    
++              cpumask_set_cpu(cpu, nohz.cpu_mask);
++ 
                 /* time for ilb owner also to sleep */
                 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                         if (atomic_read(&nohz.load_balancer) == cpu)
@@@@ -4464,7 -4504,7 -4455,7 +4546,7 @@@@ void __kprobes sub_preempt_count(int va
         /*
          * Underflow?
          */
--        if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
++      if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
                 return;
         /*
          * Is the spinlock portion underflowing?
@@@@ -5150,7 -5190,7 -5141,7 +5232,7 @@@@ int can_nice(const struct task_struct *
     * sys_setpriority is a more generic, but much slower function that
     * does similar things.
     */
-- asmlinkage long sys_nice(int increment)
++ SYSCALL_DEFINE1(nice, int, increment)
    {
         long nice, retval;
    
@@@@ -5457,8 -5497,8 -5448,8 +5539,8 @@@@ do_sched_setscheduler(pid_t pid, int po
     * @policy: new policy.
     * @param: structure containing the new RT priority.
     */
-- asmlinkage long
-- sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
++ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
++              struct sched_param __user *, param)
    {
         /* negative values for policy are not valid */
         if (policy < 0)
@@@@ -5472,7 -5512,7 -5463,7 +5554,7 @@@@
     * @pid: the pid in question.
     * @param: structure containing the new RT priority.
     */
-- asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
++ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
    {
         return do_sched_setscheduler(pid, -1, param);
    }
@@@@ -5481,7 -5521,7 -5472,7 +5563,7 @@@@
     * sys_sched_getscheduler - get the policy (scheduling class) of a thread
     * @pid: the pid in question.
     */
-- asmlinkage long sys_sched_getscheduler(pid_t pid)
++ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
    {
         struct task_struct *p;
         int retval;
@@@@ -5506,7 -5546,7 -5497,7 +5588,7 @@@@
     * @pid: the pid in question.
     * @param: structure containing the RT priority.
     */
-- asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
++ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
    {
         struct sched_param lp;
         struct task_struct *p;
@@@@ -5624,8 -5664,8 -5615,8 +5706,8 @@@@ static int get_user_cpu_mask(unsigned l
     * @len: length in bytes of the bitmask pointed to by user_mask_ptr
     * @user_mask_ptr: user-space pointer to the new cpu mask
     */
-- asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
--                                    unsigned long __user *user_mask_ptr)
++ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
++              unsigned long __user *, user_mask_ptr)
    {
         cpumask_var_t new_mask;
         int retval;
@@@@ -5672,8 -5712,8 -5663,8 +5754,8 @@@@ out_unlock
     * @len: length in bytes of the bitmask pointed to by user_mask_ptr
     * @user_mask_ptr: user-space pointer to hold the current cpu mask
     */
-- asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
--                                    unsigned long __user *user_mask_ptr)
++ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
++              unsigned long __user *, user_mask_ptr)
    {
         int ret;
         cpumask_var_t mask;
@@@@ -5702,7 -5742,7 -5693,7 +5784,7 @@@@
     * This function yields the current CPU to other tasks. If there are no
     * other threads running on this CPU then this function will return.
     */
-- asmlinkage long sys_sched_yield(void)
++ SYSCALL_DEFINE0(sched_yield)
    {
         struct rq *rq = this_rq_lock();
    
@@@@ -5843,7 -5883,7 -5834,7 +5925,7 @@@@ long __sched io_schedule_timeout(long t
     * this syscall returns the maximum rt_priority that can be used
     * by a given scheduling class.
     */
-- asmlinkage long sys_sched_get_priority_max(int policy)
++ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
    {
         int ret = -EINVAL;
    
@@@@ -5868,7 -5908,7 -5859,7 +5950,7 @@@@
     * this syscall returns the minimum rt_priority that can be used
     * by a given scheduling class.
     */
-- asmlinkage long sys_sched_get_priority_min(int policy)
++ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
    {
         int ret = -EINVAL;
    
@@@@ -5893,8 -5933,8 -5884,8 +5975,8 @@@@
     * this syscall writes the default timeslice value of a given process
     * into the user-space timespec buffer. A value of '0' means infinity.
     */
-- asmlinkage
-- long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
++ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
++              struct timespec __user *, interval)
    {
         struct task_struct *p;
         unsigned int time_slice;
@@@@ -7309,10 -7349,10 -7300,10 +7391,10 @@@@ cpu_to_phys_group(int cpu, const struc
     * groups, so roll our own. Now each node has its own list of groups which
     * gets dynamically allocated.
     */
- - static DEFINE_PER_CPU(struct sched_domain, node_domains);
+ + static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
    static struct sched_group ***sched_group_nodes_bycpu;
    
- - static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+ + static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
    static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
    
    static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
@@@@ -7587,7 -7627,7 -7578,7 +7669,7 @@@@ static int __build_sched_domains(const 
    #ifdef CONFIG_NUMA
                 if (cpumask_weight(cpu_map) >
                                 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
- -                      sd = &per_cpu(allnodes_domains, i);
+ +                      sd = &per_cpu(allnodes_domains, i).sd;
                         SD_INIT(sd, ALLNODES);
                         set_domain_attribute(sd, attr);
                         cpumask_copy(sched_domain_span(sd), cpu_map);
@@@@ -7597,7 -7637,7 -7588,7 +7679,7 @@@@
                 } else
                         p = NULL;
    
- -              sd = &per_cpu(node_domains, i);
+ +              sd = &per_cpu(node_domains, i).sd;
                 SD_INIT(sd, NODE);
                 set_domain_attribute(sd, attr);
                 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
@@@@ -7715,7 -7755,7 -7706,7 +7797,7 @@@@
                 for_each_cpu(j, nodemask) {
                         struct sched_domain *sd;
    
- -                      sd = &per_cpu(node_domains, j);
+ +                      sd = &per_cpu(node_domains, j).sd;
                         sd->groups = sg;
                 }
                 sg->__cpu_power = 0;
@@@@ -8228,11 -8268,15 -8219,11 +8310,15 @@@@ static void init_rt_rq(struct rt_rq *rt
         __set_bit(MAX_RT_PRIO, array->bitmap);
    
    #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- -     rt_rq->highest_prio = MAX_RT_PRIO;
+ +     rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ +#ifdef CONFIG_SMP
+ +     rt_rq->highest_prio.next = MAX_RT_PRIO;
+ +#endif
    #endif
    #ifdef CONFIG_SMP
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
+ +     plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
    #endif
    
         rt_rq->rt_time = 0;
@@@@ -9074,6 -9118,6 -9065,13 +9160,13 @@@@ static int tg_schedulable(struct task_g
                 runtime = d->rt_runtime;
         }
    
++ #ifdef CONFIG_USER_SCHED
++      if (tg == &root_task_group) {
++              period = global_rt_period();
++              runtime = global_rt_runtime();
++      }
++ #endif
++ 
         /*
          * Cannot have more runtime than the period.
          */
diff --combined kernel/sched_fair.c

index bdf64346b4d15b97f213cedf04b5632d659c68b8,8e1352c75557308bf1e892bb260b39e9feffd10f,a7e50ba185ac0a843ac16604e13a721ca8e4cb8e..bc1563e7a248d421932f60ce0e4ebabd68a59145
--- 1/kernel/sched_fair.c
--- 2/kernel/sched_fair.c
--- 3/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@@ -283,7 -283,7 -283,7 +283,7 @@@@ static void update_min_vruntime(struct 
                                                    struct sched_entity,
                                                    run_node);
    
--              if (vruntime == cfs_rq->min_vruntime)
++              if (!cfs_rq->curr)
                         vruntime = se->vruntime;
                 else
                         vruntime = min_vruntime(vruntime, se->vruntime);
@@@@ -429,7 -429,7 -429,10 +429,10 @@@@ static u64 sched_slice(struct cfs_rq *c
         u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
    
         for_each_sched_entity(se) {
--              struct load_weight *load = &cfs_rq->load;
++              struct load_weight *load;
++ 
++              cfs_rq = cfs_rq_of(se);
++              load = &cfs_rq->load;
    
                 if (unlikely(!se->on_rq)) {
                         struct load_weight lw = cfs_rq->load;
@@@@ -677,9 -677,9 -680,13 +680,13 @@@@ place_entity(struct cfs_rq *cfs_rq, str
                         unsigned long thresh = sysctl_sched_latency;
    
                         /*
--                       * convert the sleeper threshold into virtual time
++                       * Convert the sleeper threshold into virtual time.
++                       * SCHED_IDLE is a special sub-class.  We care about
++                       * fairness only relative to other SCHED_IDLE tasks,
++                       * all of which have the same weight.
                          */
--                      if (sched_feat(NORMALIZED_SLEEPER))
++                      if (sched_feat(NORMALIZED_SLEEPER) &&
++                                      task_of(se)->policy != SCHED_IDLE)
                                 thresh = calc_delta_fair(thresh, se);
    
                         vruntime -= thresh;
@@@@ -712,7 -712,7 -719,7 +719,7 @@@@ enqueue_entity(struct cfs_rq *cfs_rq, s
                 __enqueue_entity(cfs_rq, se);
    }
    
-- static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
++ static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
    {
         if (cfs_rq->last == se)
                 cfs_rq->last = NULL;
@@@@ -721,6 -721,6 -728,12 +728,12 @@@@
                 cfs_rq->next = NULL;
    }
    
++ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
++ {
++      for_each_sched_entity(se)
++              __clear_buddies(cfs_rq_of(se), se);
++ }
++ 
    static void
    dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
    {
@@@@ -761,8 -761,8 -774,14 +774,14 @@@@ check_preempt_tick(struct cfs_rq *cfs_r
    
         ideal_runtime = sched_slice(cfs_rq, curr);
         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
--      if (delta_exec > ideal_runtime)
++      if (delta_exec > ideal_runtime) {
                 resched_task(rq_of(cfs_rq)->curr);
++              /*
++               * The current task ran long enough, ensure it doesn't get
++               * re-elected due to buddy favours.
++               */
++              clear_buddies(cfs_rq, curr);
++      }
    }
    
    static void
@@@@ -1172,20 -1172,20 -1191,15 +1191,15 @@@@ wake_affine(struct sched_domain *this_s
             int idx, unsigned long load, unsigned long this_load,
             unsigned int imbalance)
    {
--      struct task_struct *curr = this_rq->curr;
--      struct task_group *tg;
         unsigned long tl = this_load;
         unsigned long tl_per_task;
++      struct task_group *tg;
         unsigned long weight;
         int balanced;
    
         if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
                 return 0;
    
--      if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
--                      p->se.avg_overlap > sysctl_sched_migration_cost))
--              sync = 0;
-- 
         /*
          * If sync wakeup then subtract the (maximum possible)
          * effect of the currently running task from the load
@@@@ -1295,63 -1295,16 -1309,16 +1309,63 @@@@ out
    }
    #endif /* CONFIG_SMP */
    
- --static unsigned long wakeup_gran(struct sched_entity *se)
+ ++/*
+ ++ * Adaptive granularity
+ ++ *
+ ++ * se->avg_wakeup gives the average time a task runs until it does a wakeup,
+ ++ * with the limit of wakeup_gran -- when it never does a wakeup.
+ ++ *
+ ++ * So the smaller avg_wakeup is the faster we want this task to preempt,
+ ++ * but we don't want to treat the preemptee unfairly and therefore allow it
+ ++ * to run for at least the amount of time we'd like to run.
+ ++ *
+ ++ * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
+ ++ *
+ ++ * NOTE: we use *nr_running to scale with load, this nicely matches the
+ ++ *       degrading latency on load.
+ ++ */
+ ++static unsigned long
+ ++adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
+ ++{
+ ++     u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+ ++     u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
+ ++     u64 gran = 0;
+ ++
+ ++     if (this_run < expected_wakeup)
+ ++             gran = expected_wakeup - this_run;
+ ++
+ ++     return min_t(s64, gran, sysctl_sched_wakeup_granularity);
+ ++}
+ ++
+ ++static unsigned long
+ ++wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
    {
         unsigned long gran = sysctl_sched_wakeup_granularity;
    
+ ++     if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
+ ++             gran = adaptive_gran(curr, se);
+ ++
         /*
- --      * More easily preempt - nice tasks, while not making it harder for
- --      * + nice tasks.
+ ++      * Since its curr running now, convert the gran from real-time
+ ++      * to virtual-time in his units.
          */
- --     if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
- --             gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+ ++     if (sched_feat(ASYM_GRAN)) {
+ ++             /*
+ ++              * By using 'se' instead of 'curr' we penalize light tasks, so
+ ++              * they get preempted easier. That is, if 'se' < 'curr' then
+ ++              * the resulting gran will be larger, therefore penalizing the
+ ++              * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ ++              * be smaller, again penalizing the lighter task.
+ ++              *
+ ++              * This is especially important for buddies when the leftmost
+ ++              * task is higher priority than the buddy.
+ ++              */
+ ++             if (unlikely(se->load.weight != NICE_0_LOAD))
+ ++                     gran = calc_delta_fair(gran, se);
+ ++     } else {
+ ++             if (unlikely(curr->load.weight != NICE_0_LOAD))
+ ++                     gran = calc_delta_fair(gran, curr);
+ ++     }
    
         return gran;
    }
@@@@ -1378,7 -1331,7 -1345,7 +1392,7 @@@@ wakeup_preempt_entity(struct sched_enti
         if (vdiff <= 0)
                 return -1;
    
- --     gran = wakeup_gran(curr);
+ ++     gran = wakeup_gran(curr, se);
         if (vdiff > gran)
                 return 1;
    
@@@@ -1387,14 -1340,14 -1354,18 +1401,18 @@@@
    
    static void set_last_buddy(struct sched_entity *se)
    {
--      for_each_sched_entity(se)
--              cfs_rq_of(se)->last = se;
++      if (likely(task_of(se)->policy != SCHED_IDLE)) {
++              for_each_sched_entity(se)
++                      cfs_rq_of(se)->last = se;
++      }
    }
    
    static void set_next_buddy(struct sched_entity *se)
    {
--      for_each_sched_entity(se)
--              cfs_rq_of(se)->next = se;
++      if (likely(task_of(se)->policy != SCHED_IDLE)) {
++              for_each_sched_entity(se)
++                      cfs_rq_of(se)->next = se;
++      }
    }
    
    /*
@@@@ -1440,18 -1393,18 -1411,22 +1458,22 @@@@ static void check_preempt_wakeup(struc
                 return;
    
         /*
--       * Batch tasks do not preempt (their preemption is driven by
++       * Batch and idle tasks do not preempt (their preemption is driven by
          * the tick):
          */
--      if (unlikely(p->policy == SCHED_BATCH))
++      if (unlikely(p->policy != SCHED_NORMAL))
+ +              return;
+ + 
++      /* Idle tasks are by definition preempted by everybody. */
++      if (unlikely(curr->policy == SCHED_IDLE)) {
++              resched_task(curr);
+               return;
++      }
+  
         if (!sched_feat(WAKEUP_PREEMPT))
                 return;
    
--      if (sched_feat(WAKEUP_OVERLAP) && (sync ||
--                      (se->avg_overlap < sysctl_sched_migration_cost &&
--                       pse->avg_overlap < sysctl_sched_migration_cost))) {
++      if (sched_feat(WAKEUP_OVERLAP) && sync) {
                 resched_task(curr);
                 return;
         }
@@@@ -1482,6 -1435,6 -1457,11 +1504,11 @@@@ static struct task_struct *pick_next_ta
    
         do {
                 se = pick_next_entity(cfs_rq);
++              /*
++               * If se was a buddy, clear it so that it will have to earn
++               * the favour again.
++               */
++              __clear_buddies(cfs_rq, se);
                 set_next_entity(cfs_rq, se);
                 cfs_rq = group_cfs_rq(se);
         } while (cfs_rq);
diff --combined kernel/sched_rt.c

index 954e1a81b7967567b617b918b254deaa12a848a4,48d1f6e8497ae60a082962e91a47ae3d535e6d16,bac1061cea2f4ab67f7c6d5b81300d02b717d5ba..c79dc7844012d7b69926298673bdffc6fa3ff79b
--- 1/kernel/sched_rt.c
--- 2/kernel/sched_rt.c
--- 3/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@@@ -3,6 -3,40 -3,6 +3,40 @@@@
     * policies)
     */
    
+ +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+ +{
+ +     return container_of(rt_se, struct task_struct, rt);
+ +}
+ +
+ +#ifdef CONFIG_RT_GROUP_SCHED
+ +
+ +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+ +{
+ +     return rt_rq->rq;
+ +}
+ +
+ +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+ +{
+ +     return rt_se->rt_rq;
+ +}
+ +
+ +#else /* CONFIG_RT_GROUP_SCHED */
+ +
+ +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+ +{
+ +     return container_of(rt_rq, struct rq, rt);
+ +}
+ +
+ +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+ +{
+ +     struct task_struct *p = rt_task_of(rt_se);
+ +     struct rq *rq = task_rq(p);
+ +
+ +     return &rq->rt;
+ +}
+ +
+ +#endif /* CONFIG_RT_GROUP_SCHED */
+ +
    #ifdef CONFIG_SMP
    
    static inline int rt_overloaded(struct rq *rq)
@@@@ -37,25 -71,69 -37,25 +71,69 @@@@ static inline void rt_clear_overload(st
         cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
    }
    
- -static void update_rt_migration(struct rq *rq)
+ +static void update_rt_migration(struct rt_rq *rt_rq)
    {
- -     if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
- -             if (!rq->rt.overloaded) {
- -                     rt_set_overload(rq);
- -                     rq->rt.overloaded = 1;
+ +     if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
+ +             if (!rt_rq->overloaded) {
+ +                     rt_set_overload(rq_of_rt_rq(rt_rq));
+ +                     rt_rq->overloaded = 1;
                 }
- -     } else if (rq->rt.overloaded) {
- -             rt_clear_overload(rq);
- -             rq->rt.overloaded = 0;
+ +     } else if (rt_rq->overloaded) {
+ +             rt_clear_overload(rq_of_rt_rq(rt_rq));
+ +             rt_rq->overloaded = 0;
         }
    }
- -#endif /* CONFIG_SMP */
    
- -static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+ +static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +     if (rt_se->nr_cpus_allowed > 1)
+ +             rt_rq->rt_nr_migratory++;
+ +
+ +     update_rt_migration(rt_rq);
+ +}
+ +
+ +static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +     if (rt_se->nr_cpus_allowed > 1)
+ +             rt_rq->rt_nr_migratory--;
+ +
+ +     update_rt_migration(rt_rq);
+ +}
+ +
+ +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+ +{
+ +     plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ +     plist_node_init(&p->pushable_tasks, p->prio);
+ +     plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ +}
+ +
+ +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+ +{
+ +     plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ +}
+ +
+ +#else
+ +
+ +static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
    {
- -     return container_of(rt_se, struct task_struct, rt);
    }
    
+ +static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+ +{
+ +}
+ +
+ +static inline
+ +void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +}
+ +
+ +static inline
+ +void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +}
+ +
+ +#endif /* CONFIG_SMP */
+ +
    static inline int on_rt_rq(struct sched_rt_entity *rt_se)
    {
         return !list_empty(&rt_se->run_list);
@@@@ -79,16 -157,6 -79,16 +157,6 @@@@ static inline u64 sched_rt_period(struc
    #define for_each_leaf_rt_rq(rt_rq, rq) \
         list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
    
- -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
- -{
- -     return rt_rq->rq;
- -}
- -
- -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
- -{
- -     return rt_se->rt_rq;
- -}
- -
    #define for_each_sched_rt_entity(rt_se) \
         for (; rt_se; rt_se = rt_se->parent)
    
@@@@ -108,7 -176,7 -108,7 +176,7 @@@@ static void sched_rt_rq_enqueue(struct 
         if (rt_rq->rt_nr_running) {
                 if (rt_se && !on_rt_rq(rt_se))
                         enqueue_rt_entity(rt_se);
- -             if (rt_rq->highest_prio < curr->prio)
+ +             if (rt_rq->highest_prio.curr < curr->prio)
                         resched_task(curr);
         }
    }
@@@@ -176,19 -244,6 -176,19 +244,6 @@@@ static inline u64 sched_rt_period(struc
    #define for_each_leaf_rt_rq(rt_rq, rq) \
         for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
    
- -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
- -{
- -     return container_of(rt_rq, struct rq, rt);
- -}
- -
- -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
- -{
- -     struct task_struct *p = rt_task_of(rt_se);
- -     struct rq *rq = task_rq(p);
- -
- -     return &rq->rt;
- -}
- -
    #define for_each_sched_rt_entity(rt_se) \
         for (; rt_se; rt_se = NULL)
    
@@@@ -473,7 -528,7 -473,7 +528,7 @@@@ static inline int rt_se_prio(struct sch
         struct rt_rq *rt_rq = group_rt_rq(rt_se);
    
         if (rt_rq)
- -             return rt_rq->highest_prio;
+ +             return rt_rq->highest_prio.curr;
    #endif
    
         return rt_task_of(rt_se)->prio;
@@@@ -547,91 -602,174 -547,91 +602,174 @@@@ static void update_curr_rt(struct rq *r
         }
    }
    
- -static inline
- -void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +#if defined CONFIG_SMP
+ +
+ +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+ +
+ +static inline int next_prio(struct rq *rq)
    {
- -     WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- -     rt_rq->rt_nr_running++;
- -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- -     if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
- -#ifdef CONFIG_SMP
- -             struct rq *rq = rq_of_rt_rq(rt_rq);
- -#endif
+ +     struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+ +
+ +     if (next && rt_prio(next->prio))
+ +             return next->prio;
+ +     else
+ +             return MAX_RT_PRIO;
+ +}
+ +
+ +static void
+ +inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+ +{
+ +     struct rq *rq = rq_of_rt_rq(rt_rq);
+ +
+ +     if (prio < prev_prio) {
+ +
+ +             /*
+ +              * If the new task is higher in priority than anything on the
+ +              * run-queue, we know that the previous high becomes our
+ +              * next-highest.
+ +              */
+ +             rt_rq->highest_prio.next = prev_prio;
    
- -             rt_rq->highest_prio = rt_se_prio(rt_se);
- -#ifdef CONFIG_SMP
                 if (rq->online)
- -                     cpupri_set(&rq->rd->cpupri, rq->cpu,
- -                                rt_se_prio(rt_se));
- -#endif
- -     }
- -#endif
- -#ifdef CONFIG_SMP
- -     if (rt_se->nr_cpus_allowed > 1) {
- -             struct rq *rq = rq_of_rt_rq(rt_rq);
+ +                     cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
    
- -             rq->rt.rt_nr_migratory++;
- -     }
+ +     } else if (prio == rt_rq->highest_prio.curr)
+ +             /*
+ +              * If the next task is equal in priority to the highest on
+ +              * the run-queue, then we implicitly know that the next highest
+ +              * task cannot be any lower than current
+ +              */
+ +             rt_rq->highest_prio.next = prio;
+ +     else if (prio < rt_rq->highest_prio.next)
+ +             /*
+ +              * Otherwise, we need to recompute next-highest
+ +              */
+ +             rt_rq->highest_prio.next = next_prio(rq);
+ +}
    
- -     update_rt_migration(rq_of_rt_rq(rt_rq));
- -#endif
- -#ifdef CONFIG_RT_GROUP_SCHED
- -     if (rt_se_boosted(rt_se))
- -             rt_rq->rt_nr_boosted++;
+ +static void
+ +dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+ +{
+ +     struct rq *rq = rq_of_rt_rq(rt_rq);
    
- -     if (rt_rq->tg)
- -             start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
- -#else
- -     start_rt_bandwidth(&def_rt_bandwidth);
- -#endif
+ +     if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
+ +             rt_rq->highest_prio.next = next_prio(rq);
+ +
+ +     if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+ +             cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
    }
    
+ +#else /* CONFIG_SMP */
+ +
    static inline
- -void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
- -{
- -#ifdef CONFIG_SMP
- -     int highest_prio = rt_rq->highest_prio;
- -#endif
+ +void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+ +static inline
+ +void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+ +
+ +#endif /* CONFIG_SMP */
    
- -     WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- -     WARN_ON(!rt_rq->rt_nr_running);
- -     rt_rq->rt_nr_running--;
    #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ +static void
+ +inc_rt_prio(struct rt_rq *rt_rq, int prio)
+ +{
+ +     int prev_prio = rt_rq->highest_prio.curr;
+ +
+ +     if (prio < prev_prio)
+ +             rt_rq->highest_prio.curr = prio;
+ +
+ +     inc_rt_prio_smp(rt_rq, prio, prev_prio);
+ +}
+ +
+ +static void
+ +dec_rt_prio(struct rt_rq *rt_rq, int prio)
+ +{
+ +     int prev_prio = rt_rq->highest_prio.curr;
+ +
         if (rt_rq->rt_nr_running) {
- -             struct rt_prio_array *array;
    
- -             WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
- -             if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
- -                     /* recalculate */
- -                     array = &rt_rq->active;
- -                     rt_rq->highest_prio =
+ +             WARN_ON(prio < prev_prio);
+ +
+ +             /*
+ +              * This may have been our highest task, and therefore
+ +              * we may have some recomputation to do
+ +              */
+ +             if (prio == prev_prio) {
+ +                     struct rt_prio_array *array = &rt_rq->active;
+ +
+ +                     rt_rq->highest_prio.curr =
                                 sched_find_first_bit(array->bitmap);
- -             } /* otherwise leave rq->highest prio alone */
+ +             }
+ +
         } else
- -             rt_rq->highest_prio = MAX_RT_PRIO;
- -#endif
- -#ifdef CONFIG_SMP
- -     if (rt_se->nr_cpus_allowed > 1) {
- -             struct rq *rq = rq_of_rt_rq(rt_rq);
- -             rq->rt.rt_nr_migratory--;
- -     }
+ +             rt_rq->highest_prio.curr = MAX_RT_PRIO;
    
- -     if (rt_rq->highest_prio != highest_prio) {
- -             struct rq *rq = rq_of_rt_rq(rt_rq);
+ +     dec_rt_prio_smp(rt_rq, prio, prev_prio);
+ +}
    
- -             if (rq->online)
- -                     cpupri_set(&rq->rd->cpupri, rq->cpu,
- -                                rt_rq->highest_prio);
- -     }
+ +#else
+ +
+ +static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+ +static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+ +
+ +#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
    
- -     update_rt_migration(rq_of_rt_rq(rt_rq));
- -#endif /* CONFIG_SMP */
    #ifdef CONFIG_RT_GROUP_SCHED
+ +
+ +static void
+ +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +     if (rt_se_boosted(rt_se))
+ +             rt_rq->rt_nr_boosted++;
+ +
+ +     if (rt_rq->tg)
+ +             start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+ +}
+ +
+ +static void
+ +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
         if (rt_se_boosted(rt_se))
                 rt_rq->rt_nr_boosted--;
    
         WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
- -#endif
+ +}
+ +
+ +#else /* CONFIG_RT_GROUP_SCHED */
+ +
+ +static void
+ +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +     start_rt_bandwidth(&def_rt_bandwidth);
+ +}
+ +
+ +static inline
+ +void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+ +
+ +#endif /* CONFIG_RT_GROUP_SCHED */
+ +
+ +static inline
+ +void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +     int prio = rt_se_prio(rt_se);
+ +
+ +     WARN_ON(!rt_prio(prio));
+ +     rt_rq->rt_nr_running++;
+ +
+ +     inc_rt_prio(rt_rq, prio);
+ +     inc_rt_migration(rt_se, rt_rq);
+ +     inc_rt_group(rt_se, rt_rq);
+ +}
+ +
+ +static inline
+ +void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +     WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ +     WARN_ON(!rt_rq->rt_nr_running);
+ +     rt_rq->rt_nr_running--;
+ +
+ +     dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ +     dec_rt_migration(rt_se, rt_rq);
+ +     dec_rt_group(rt_se, rt_rq);
    }
    
    static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@@@ -718,6 -856,9 -718,6 +856,9 @@@@ static void enqueue_task_rt(struct rq *
    
         enqueue_rt_entity(rt_se);
    
+ +     if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ +             enqueue_pushable_task(rq, p);
+ +
         inc_cpu_load(rq, p->se.load.weight);
    }
    
@@@@ -728,6 -869,8 -728,6 +869,8 @@@@ static void dequeue_task_rt(struct rq *
         update_curr_rt(rq);
         dequeue_rt_entity(rt_se);
    
+ +     dequeue_pushable_task(rq, p);
+ +
         dec_cpu_load(rq, p->se.load.weight);
    }
    
@@@@ -878,7 -1021,7 -878,7 +1021,7 @@@@ static struct sched_rt_entity *pick_nex
         return next;
    }
    
- -static struct task_struct *pick_next_task_rt(struct rq *rq)
+ +static struct task_struct *_pick_next_task_rt(struct rq *rq)
    {
         struct sched_rt_entity *rt_se;
         struct task_struct *p;
@@@@ -900,6 -1043,18 -900,6 +1043,18 @@@@
    
         p = rt_task_of(rt_se);
         p->se.exec_start = rq->clock;
+ +
+ +     return p;
+ +}
+ +
+ +static struct task_struct *pick_next_task_rt(struct rq *rq)
+ +{
+ +     struct task_struct *p = _pick_next_task_rt(rq);
+ +
+ +     /* The running task is never eligible for pushing */
+ +     if (p)
+ +             dequeue_pushable_task(rq, p);
+ +
         return p;
    }
    
@@@@ -907,6 -1062,13 -907,6 +1062,13 @@@@ static void put_prev_task_rt(struct rq 
    {
         update_curr_rt(rq);
         p->se.exec_start = 0;
+ +
+ +     /*
+ +      * The previous task needs to be made eligible for pushing
+ +      * if it is still active
+ +      */
+ +     if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+ +             enqueue_pushable_task(rq, p);
    }
    
    #ifdef CONFIG_SMP
@@@@ -968,8 -1130,8 -968,8 +1130,8 @@@@ static inline int pick_optimal_cpu(int 
         if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
                 return this_cpu;
    
--      first = first_cpu(*mask);
--      if (first != NR_CPUS)
++      first = cpumask_first(mask);
++      if (first < nr_cpu_ids)
                 return first;
    
         return -1;
@@@@ -1072,7 -1234,7 -1072,7 +1234,7 @@@@ static struct rq *find_lock_lowest_rq(s
                 }
    
                 /* If this rq is still suitable use it. */
- -             if (lowest_rq->rt.highest_prio > task->prio)
+ +             if (lowest_rq->rt.highest_prio.curr > task->prio)
                         break;
    
                 /* try again */
@@@@ -1083,6 -1245,31 -1083,6 +1245,31 @@@@
         return lowest_rq;
    }
    
+ +static inline int has_pushable_tasks(struct rq *rq)
+ +{
+ +     return !plist_head_empty(&rq->rt.pushable_tasks);
+ +}
+ +
+ +static struct task_struct *pick_next_pushable_task(struct rq *rq)
+ +{
+ +     struct task_struct *p;
+ +
+ +     if (!has_pushable_tasks(rq))
+ +             return NULL;
+ +
+ +     p = plist_first_entry(&rq->rt.pushable_tasks,
+ +                           struct task_struct, pushable_tasks);
+ +
+ +     BUG_ON(rq->cpu != task_cpu(p));
+ +     BUG_ON(task_current(rq, p));
+ +     BUG_ON(p->rt.nr_cpus_allowed <= 1);
+ +
+ +     BUG_ON(!p->se.on_rq);
+ +     BUG_ON(!rt_task(p));
+ +
+ +     return p;
+ +}
+ +
    /*
     * If the current CPU has more than one RT task, see if the non
     * running task can migrate over to a CPU that is running a task
@@@@ -1092,13 -1279,11 -1092,13 +1279,11 @@@@ static int push_rt_task(struct rq *rq
    {
         struct task_struct *next_task;
         struct rq *lowest_rq;
- -     int ret = 0;
- -     int paranoid = RT_MAX_TRIES;
    
         if (!rq->rt.overloaded)
                 return 0;
    
- -     next_task = pick_next_highest_task_rt(rq, -1);
+ +     next_task = pick_next_pushable_task(rq);
         if (!next_task)
                 return 0;
    
@@@@ -1127,16 -1312,34 -1127,16 +1312,34 @@@@
                 struct task_struct *task;
                 /*
                  * find lock_lowest_rq releases rq->lock
- -              * so it is possible that next_task has changed.
- -              * If it has, then try again.
+ +              * so it is possible that next_task has migrated.
+ +              *
+ +              * We need to make sure that the task is still on the same
+ +              * run-queue and is also still the next task eligible for
+ +              * pushing.
                  */
- -             task = pick_next_highest_task_rt(rq, -1);
- -             if (unlikely(task != next_task) && task && paranoid--) {
- -                     put_task_struct(next_task);
- -                     next_task = task;
- -                     goto retry;
+ +             task = pick_next_pushable_task(rq);
+ +             if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ +                     /*
+ +                      * If we get here, the task hasn't moved at all, but
+ +                      * it has failed to push.  We will not try again,
+ +                      * since the other cpus will pull from us when they
+ +                      * are ready.
+ +                      */
+ +                     dequeue_pushable_task(rq, next_task);
+ +                     goto out;
                 }
- -             goto out;
+ +
+ +             if (!task)
+ +                     /* No more tasks, just exit */
+ +                     goto out;
+ +
+ +             /*
+ +              * Something has shifted, try again.
+ +              */
+ +             put_task_struct(next_task);
+ +             next_task = task;
+ +             goto retry;
         }
    
         deactivate_task(rq, next_task, 0);
@@@@ -1147,23 -1350,12 -1147,23 +1350,12 @@@@
    
         double_unlock_balance(rq, lowest_rq);
    
- -     ret = 1;
    out:
         put_task_struct(next_task);
    
- -     return ret;
+ +     return 1;
    }
    
- -/*
- - * TODO: Currently we just use the second highest prio task on
- - *       the queue, and stop when it can't migrate (or there's
- - *       no more RT tasks).  There may be a case where a lower
- - *       priority RT task has a different affinity than the
- - *       higher RT task. In this case the lower RT task could
- - *       possibly be able to migrate where as the higher priority
- - *       RT task could not.  We currently ignore this issue.
- - *       Enhancements are welcome!
- - */
    static void push_rt_tasks(struct rq *rq)
    {
         /* push_rt_task will return true if it moved an RT */
@@@@ -1174,33 -1366,35 -1174,33 +1366,35 @@@@
    static int pull_rt_task(struct rq *this_rq)
    {
         int this_cpu = this_rq->cpu, ret = 0, cpu;
- -     struct task_struct *p, *next;
+ +     struct task_struct *p;
         struct rq *src_rq;
    
         if (likely(!rt_overloaded(this_rq)))
                 return 0;
    
- -     next = pick_next_task_rt(this_rq);
- -
         for_each_cpu(cpu, this_rq->rd->rto_mask) {
                 if (this_cpu == cpu)
                         continue;
    
                 src_rq = cpu_rq(cpu);
+ +
+ +             /*
+ +              * Don't bother taking the src_rq->lock if the next highest
+ +              * task is known to be lower-priority than our current task.
+ +              * This may look racy, but if this value is about to go
+ +              * logically higher, the src_rq will push this task away.
+ +              * And if it's going logically lower, we do not care
+ +              */
+ +             if (src_rq->rt.highest_prio.next >=
+ +                 this_rq->rt.highest_prio.curr)
+ +                     continue;
+ +
                 /*
                  * We can potentially drop this_rq's lock in
                  * double_lock_balance, and another CPU could
- -              * steal our next task - hence we must cause
- -              * the caller to recalculate the next task
- -              * in that case:
+ +              * alter this_rq
                  */
- -             if (double_lock_balance(this_rq, src_rq)) {
- -                     struct task_struct *old_next = next;
- -
- -                     next = pick_next_task_rt(this_rq);
- -                     if (next != old_next)
- -                             ret = 1;
- -             }
+ +             double_lock_balance(this_rq, src_rq);
    
                 /*
                  * Are there still pullable RT tasks?
@@@@ -1214,7 -1408,7 -1214,7 +1408,7 @@@@
                  * Do we have an RT task that preempts
                  * the to-be-scheduled task?
                  */
- -             if (p && (!next || (p->prio < next->prio))) {
+ +             if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
                         WARN_ON(p == src_rq->curr);
                         WARN_ON(!p->se.on_rq);
    
@@@@ -1224,12 -1418,9 -1224,12 +1418,9 @@@@
                          * This is just that p is wakeing up and hasn't
                          * had a chance to schedule. We only pull
                          * p if it is lower in priority than the
- -                      * current task on the run queue or
- -                      * this_rq next task is lower in prio than
- -                      * the current task on that rq.
+ +                      * current task on the run queue
                          */
- -                     if (p->prio < src_rq->curr->prio ||
- -                         (next && next->prio < src_rq->curr->prio))
+ +                     if (p->prio < src_rq->curr->prio)
                                 goto skip;
    
                         ret = 1;
@@@@ -1242,13 -1433,7 -1242,13 +1433,7 @@@@
                          * case there's an even higher prio task
                          * in another runqueue. (low likelyhood
                          * but possible)
- -                      *
- -                      * Update next so that we won't pick a task
- -                      * on another cpu with a priority lower (or equal)
- -                      * than the one we just picked.
                          */
- -                     next = p;
- -
                 }
     skip:
                 double_unlock_balance(this_rq, src_rq);
@@@@ -1260,24 -1445,27 -1260,24 +1445,27 @@@@
    static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
    {
         /* Try to pull RT tasks here if we lower this rq's prio */
- -     if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+ +     if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
                 pull_rt_task(rq);
    }
    
+ +/*
+ + * assumes rq->lock is held
+ + */
+ +static int needs_post_schedule_rt(struct rq *rq)
+ +{
+ +     return has_pushable_tasks(rq);
+ +}
+ +
    static void post_schedule_rt(struct rq *rq)
    {
         /*
- -      * If we have more than one rt_task queued, then
- -      * see if we can push the other rt_tasks off to other CPUS.
- -      * Note we may release the rq lock, and since
- -      * the lock was owned by prev, we need to release it
- -      * first via finish_lock_switch and then reaquire it here.
+ +      * This is only called if needs_post_schedule_rt() indicates that
+ +      * we need to push tasks away
          */
- -     if (unlikely(rq->rt.overloaded)) {
- -             spin_lock_irq(&rq->lock);
- -             push_rt_tasks(rq);
- -             spin_unlock_irq(&rq->lock);
- -     }
+ +     spin_lock_irq(&rq->lock);
+ +     push_rt_tasks(rq);
+ +     spin_unlock_irq(&rq->lock);
    }
    
    /*
@@@@ -1288,7 -1476,8 -1288,7 +1476,8 @@@@ static void task_wake_up_rt(struct rq *
    {
         if (!task_running(rq, p) &&
             !test_tsk_need_resched(rq->curr) &&
- -         rq->rt.overloaded)
+ +         has_pushable_tasks(rq) &&
+ +         p->rt.nr_cpus_allowed > 1)
                 push_rt_tasks(rq);
    }
    
@@@@ -1324,6 -1513,24 -1324,6 +1513,24 @@@@ static void set_cpus_allowed_rt(struct 
         if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
                 struct rq *rq = task_rq(p);
    
+ +             if (!task_current(rq, p)) {
+ +                     /*
+ +                      * Make sure we dequeue this task from the pushable list
+ +                      * before going further.  It will either remain off of
+ +                      * the list because we are no longer pushable, or it
+ +                      * will be requeued.
+ +                      */
+ +                     if (p->rt.nr_cpus_allowed > 1)
+ +                             dequeue_pushable_task(rq, p);
+ +
+ +                     /*
+ +                      * Requeue if our weight is changing and still > 1
+ +                      */
+ +                     if (weight > 1)
+ +                             enqueue_pushable_task(rq, p);
+ +
+ +             }
+ +
                 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
                         rq->rt.rt_nr_migratory++;
                 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@@@ -1331,7 -1538,7 -1331,7 +1538,7 @@@@
                         rq->rt.rt_nr_migratory--;
                 }
    
- -             update_rt_migration(rq);
+ +             update_rt_migration(&rq->rt);
         }
    
         cpumask_copy(&p->cpus_allowed, new_mask);
@@@@ -1346,7 -1553,7 -1346,7 +1553,7 @@@@ static void rq_online_rt(struct rq *rq
    
         __enable_runtime(rq);
    
- -     cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+ +     cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
    }
    
    /* Assumes rq->lock is held */
@@@@ -1438,7 -1645,7 -1438,7 +1645,7 @@@@ static void prio_changed_rt(struct rq *
                  * can release the rq lock and p could migrate.
                  * Only reschedule if p is still on the same runqueue.
                  */
- -             if (p->prio > rq->rt.highest_prio && rq->curr == p)
+ +             if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
                         resched_task(p);
    #else
                 /* For UP simply resched on drop of prio */
@@@@ -1509,6 -1716,9 -1509,6 +1716,9 @@@@ static void set_curr_task_rt(struct rq 
         struct task_struct *p = rq->curr;
    
         p->se.exec_start = rq->clock;
+ +
+ +     /* The running task is never eligible for pushing */
+ +     dequeue_pushable_task(rq, p);
    }
    
    static const struct sched_class rt_sched_class = {
@@@@ -1531,6 -1741,7 -1531,6 +1741,7 @@@@
         .rq_online              = rq_online_rt,
         .rq_offline             = rq_offline_rt,
         .pre_schedule           = pre_schedule_rt,
+ +     .needs_post_schedule    = needs_post_schedule_rt,
         .post_schedule          = post_schedule_rt,
         .task_wake_up           = task_wake_up_rt,
         .switched_from          = switched_from_rt,
author	Ingo Molnar <mingo@elte.hu>
	Sun, 8 Feb 2009 19:12:46 +0000 (20:12 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Sun, 8 Feb 2009 19:12:46 +0000 (20:12 +0100)
		1	2	3
include/linux/init_task.h	patch \|	diff1 \|	diff2 \|	diff3 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	diff3 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	diff3 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	diff3 \|	blob \| history
kernel/sched_fair.c	patch \|	diff1 \|	diff2 \|	diff3 \|	blob \| history
kernel/sched_rt.c	patch \|	diff1 \|	diff2 \|	diff3 \|	blob \| history