Merge tag 'v3.16-rc5' into sched/core, to refresh the branch before applying bigger...

author Ingo Molnar <mingo@kernel.org>
Wed, 16 Jul 2014 13:10:07 +0000 (15:10 +0200)
committer Ingo Molnar <mingo@kernel.org>
Wed, 16 Jul 2014 13:10:07 +0000 (15:10 +0200)
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt

index 2479b2a0c77cdfb63a6c7f117d0ed6b8555eba8d..4da42616939f05002062117be457699bb8543d69 100644 (file)
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1515,7 +1515,7 @@ Doing the same with chrt -r 5 and function-trace set.
    <idle>-0       3d.h4    1us+:      0:120:R   + [003]  2448: 94:R sleep
    <idle>-0       3d.h4    2us : ttwu_do_activate.constprop.87 <-try_to_wake_up
    <idle>-0       3d.h3    3us : check_preempt_curr <-ttwu_do_wakeup
-  <idle>-0       3d.h3    3us : resched_task <-check_preempt_curr
+  <idle>-0       3d.h3    3us : resched_curr <-check_preempt_curr
    <idle>-0       3dNh3    4us : task_woken_rt <-ttwu_do_wakeup
    <idle>-0       3dNh3    4us : _raw_spin_unlock <-try_to_wake_up
    <idle>-0       3dNh3    4us : sub_preempt_count <-_raw_spin_unlock
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h

index 19ae05d4b8ec26b6dc6fa5f568d2b25b698e3417..bf9422c3aefe22ddd7efa63b3f88502c0659b0b5 100644 (file)
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -33,6 +33,11 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
  #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }
  
  bool irq_work_queue(struct irq_work *work);
+
+#ifdef CONFIG_SMP
+bool irq_work_queue_on(struct irq_work *work, int cpu);
+#endif
+
  void irq_work_run(void);
  void irq_work_sync(struct irq_work *work);
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 306f4f0c987a006f43f520413f7de3a780f98a23..41a19538508162319e0e8b503841d7a1a9eb3c94 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1440,8 +1440,6 @@ struct task_struct {
         struct rb_node *pi_waiters_leftmost;
         /* Deadlock detection and priority inheritance handling */
         struct rt_mutex_waiter *pi_blocked_on;
-       /* Top pi_waiters task */
-       struct task_struct *pi_top_task;
  #endif
  
  #ifdef CONFIG_DEBUG_MUTEXES
@@ -2788,7 +2786,7 @@ static inline bool __must_check current_set_polling_and_test(void)
  
         /*
          * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
+        * paired by resched_curr()
          */
         smp_mb__after_atomic();
  
@@ -2806,7 +2804,7 @@ static inline bool __must_check current_clr_polling_and_test(void)
  
         /*
          * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
+        * paired by resched_curr()
          */
         smp_mb__after_atomic();
  
@@ -2838,7 +2836,7 @@ static inline void current_clr_polling(void)
          * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
          * fold.
          */
-       smp_mb(); /* paired with resched_task() */
+       smp_mb(); /* paired with resched_curr() */
  
         preempt_fold_need_resched();
  }
diff --git a/include/linux/tick.h b/include/linux/tick.h

index b84773cb9f4c1f3fc480a460ba3c37a2121e1557..8a4987f2294a204b4e7c6768f2ae89a9bf2ee069 100644 (file)
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -181,7 +181,13 @@ static inline bool tick_nohz_full_cpu(int cpu)
  
  extern void tick_nohz_init(void);
  extern void __tick_nohz_full_check(void);
-extern void tick_nohz_full_kick(void);
+extern void tick_nohz_full_kick_cpu(int cpu);
+
+static inline void tick_nohz_full_kick(void)
+{
+       tick_nohz_full_kick_cpu(smp_processor_id());
+}
+
  extern void tick_nohz_full_kick_all(void);
  extern void __tick_nohz_task_switch(struct task_struct *tsk);
  #else
@@ -189,6 +195,7 @@ static inline void tick_nohz_init(void) { }
  static inline bool tick_nohz_full_enabled(void) { return false; }
  static inline bool tick_nohz_full_cpu(int cpu) { return false; }
  static inline void __tick_nohz_full_check(void) { }
+static inline void tick_nohz_full_kick_cpu(int cpu) { }
  static inline void tick_nohz_full_kick(void) { }
  static inline void tick_nohz_full_kick_all(void) { }
  static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
diff --git a/kernel/cpu.c b/kernel/cpu.c

index a343bde710b1025d6519dd6efc7d890bdff86118..81e2a388a0f687eeb472bf33a4e24dd763ef34cf 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
         rcu_read_unlock();
  }
  
-static inline void check_for_tasks(int cpu)
+static inline void check_for_tasks(int dead_cpu)
  {
-       struct task_struct *p;
-       cputime_t utime, stime;
+       struct task_struct *g, *p;
  
-       write_lock_irq(&tasklist_lock);
-       for_each_process(p) {
-               task_cputime(p, &utime, &stime);
-               if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
-                   (utime || stime))
-                       pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
-                               p->comm, task_pid_nr(p), cpu,
-                               p->state, p->flags);
-       }
-       write_unlock_irq(&tasklist_lock);
+       read_lock_irq(&tasklist_lock);
+       do_each_thread(g, p) {
+               if (!p->on_rq)
+                       continue;
+               /*
+                * We do the check with unlocked task_rq(p)->lock.
+                * Order the reading to do not warn about a task,
+                * which was running on this cpu in the past, and
+                * it's just been woken on another cpu.
+                */
+               rmb();
+               if (task_cpu(p) != dead_cpu)
+                       continue;
+
+               pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
+                       p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
+       } while_each_thread(g, p);
+       read_unlock_irq(&tasklist_lock);
  }
  
  struct take_cpu_down_param {
diff --git a/kernel/fork.c b/kernel/fork.c

index 6a13c46cd87dbe72bc830bf109256fa458f22ad1..962885edbe535a5e0aebefc8e830d483167c197f 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1095,7 +1095,6 @@ static void rt_mutex_init_task(struct task_struct *p)
         p->pi_waiters = RB_ROOT;
         p->pi_waiters_leftmost = NULL;
         p->pi_blocked_on = NULL;
-       p->pi_top_task = NULL;
  #endif
  }
  
diff --git a/kernel/irq_work.c b/kernel/irq_work.c

index a82170e2fa78e50fbefa29c6cf71b1eccadae175..e6bcbe756663abd64adf9c416eddee7a91c9a2c9 100644 (file)
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@
  #include <linux/tick.h>
  #include <linux/cpu.h>
  #include <linux/notifier.h>
+#include <linux/smp.h>
  #include <asm/processor.h>
  
  
-static DEFINE_PER_CPU(struct llist_head, irq_work_list);
-static DEFINE_PER_CPU(int, irq_work_raised);
+static DEFINE_PER_CPU(struct llist_head, raised_list);
+static DEFINE_PER_CPU(struct llist_head, lazy_list);
  
  /*
   * Claim the entry so that no one else will poke at it.
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
          */
  }
  
+#ifdef CONFIG_SMP
  /*
- * Enqueue the irq_work @entry unless it's already pending
+ * Enqueue the irq_work @work on @cpu unless it's already pending
   * somewhere.
   *
   * Can be re-enqueued while the callback is still in progress.
   */
+bool irq_work_queue_on(struct irq_work *work, int cpu)
+{
+       /* All work should have been flushed before going offline */
+       WARN_ON_ONCE(cpu_is_offline(cpu));
+
+       /* Arch remote IPI send/receive backend aren't NMI safe */
+       WARN_ON_ONCE(in_nmi());
+
+       /* Only queue if not already pending */
+       if (!irq_work_claim(work))
+               return false;
+
+       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+               arch_send_call_function_single_ipi(cpu);
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue_on);
+#endif
+
+/* Enqueue the irq work @work on the current CPU */
  bool irq_work_queue(struct irq_work *work)
  {
         /* Only queue if not already pending */
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)
         /* Queue the entry and raise the IPI if needed. */
         preempt_disable();
  
-       llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
-
-       /*
-        * If the work is not "lazy" or the tick is stopped, raise the irq
-        * work interrupt (if supported by the arch), otherwise, just wait
-        * for the next tick.
-        */
-       if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
-               if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+       /* If the work is "lazy", handle it from next tick if any */
+       if (work->flags & IRQ_WORK_LAZY) {
+               if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
+                   tick_nohz_tick_stopped())
+                       arch_irq_work_raise();
+       } else {
+               if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
                         arch_irq_work_raise();
         }
  
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
  
  bool irq_work_needs_cpu(void)
  {
-       struct llist_head *this_list;
+       struct llist_head *raised, *lazy;
  
-       this_list = &__get_cpu_var(irq_work_list);
-       if (llist_empty(this_list))
+       raised = &__get_cpu_var(raised_list);
+       lazy = &__get_cpu_var(lazy_list);
+       if (llist_empty(raised) && llist_empty(lazy))
                 return false;
  
         /* All work should have been flushed before going offline */
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void)
         return true;
  }
  
-static void __irq_work_run(void)
+static void irq_work_run_list(struct llist_head *list)
  {
         unsigned long flags;
         struct irq_work *work;
-       struct llist_head *this_list;
         struct llist_node *llnode;
  
+       BUG_ON(!irqs_disabled());
  
-       /*
-        * Reset the "raised" state right before we check the list because
-        * an NMI may enqueue after we find the list empty from the runner.
-        */
-       __this_cpu_write(irq_work_raised, 0);
-       barrier();
-
-       this_list = &__get_cpu_var(irq_work_list);
-       if (llist_empty(this_list))
+       if (llist_empty(list))
                 return;
  
-       BUG_ON(!irqs_disabled());
-
-       llnode = llist_del_all(this_list);
+       llnode = llist_del_all(list);
         while (llnode != NULL) {
                 work = llist_entry(llnode, struct irq_work, llnode);
  
@@ -149,13 +161,13 @@ static void __irq_work_run(void)
  }
  
  /*
- * Run the irq_work entries on this cpu. Requires to be ran from hardirq
- * context with local IRQs disabled.
+ * hotplug calls this through:
+ *  hotplug_cfd() -> flush_smp_call_function_queue()
   */
  void irq_work_run(void)
  {
-       BUG_ON(!in_irq());
-       __irq_work_run();
+       irq_work_run_list(&__get_cpu_var(raised_list));
+       irq_work_run_list(&__get_cpu_var(lazy_list));
  }
  EXPORT_SYMBOL_GPL(irq_work_run);
  
@@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work)
                 cpu_relax();
  }
  EXPORT_SYMBOL_GPL(irq_work_sync);
-
-#ifdef CONFIG_HOTPLUG_CPU
-static int irq_work_cpu_notify(struct notifier_block *self,
-                              unsigned long action, void *hcpu)
-{
-       long cpu = (long)hcpu;
-
-       switch (action) {
-       case CPU_DYING:
-               /* Called from stop_machine */
-               if (WARN_ON_ONCE(cpu != smp_processor_id()))
-                       break;
-               __irq_work_run();
-               break;
-       default:
-               break;
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_notify;
-
-static __init int irq_work_init_cpu_notifier(void)
-{
-       cpu_notify.notifier_call = irq_work_cpu_notify;
-       cpu_notify.priority = 0;
-       register_cpu_notifier(&cpu_notify);
-       return 0;
-}
-device_initcall(irq_work_init_cpu_notifier);
-
-#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 3bdf01b494fe29c267a0abe73828b02a799a737d..8705125bb9b1c989cd611f2803fd7e3ab5552460 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
                 return;
  
         delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+       if (delta < 0)
+               return;
         rq->clock += delta;
         update_rq_clock_task(rq, delta);
  }
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
         char buf[64];
         char *cmp;
         int i;
+       struct inode *inode;
  
         if (cnt > 63)
                 cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
         buf[cnt] = 0;
         cmp = strstrip(buf);
  
+       /* Ensure the static_key remains in a consistent state */
+       inode = file_inode(filp);
+       mutex_lock(&inode->i_mutex);
         i = sched_feat_set(cmp);
+       mutex_unlock(&inode->i_mutex);
         if (i == __SCHED_FEAT_NR)
                 return -EINVAL;
  
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
  #endif
  
  /*
- * resched_task - mark a task 'to be rescheduled now'.
+ * resched_curr - mark rq's current task 'to be rescheduled now'.
   *
   * On UP this means the setting of the need_resched flag, on SMP it
   * might also involve a cross-CPU call to trigger the scheduler on
   * the target CPU.
   */
-void resched_task(struct task_struct *p)
+void resched_curr(struct rq *rq)
  {
+       struct task_struct *curr = rq->curr;
         int cpu;
  
-       lockdep_assert_held(&task_rq(p)->lock);
+       lockdep_assert_held(&rq->lock);
  
-       if (test_tsk_need_resched(p))
+       if (test_tsk_need_resched(curr))
                 return;
  
-       cpu = task_cpu(p);
+       cpu = cpu_of(rq);
  
         if (cpu == smp_processor_id()) {
-               set_tsk_need_resched(p);
+               set_tsk_need_resched(curr);
                 set_preempt_need_resched();
                 return;
         }
  
-       if (set_nr_and_not_polling(p))
+       if (set_nr_and_not_polling(curr))
                 smp_send_reschedule(cpu);
         else
                 trace_sched_wake_idle_without_ipi(cpu);
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
  
         if (!raw_spin_trylock_irqsave(&rq->lock, flags))
                 return;
-       resched_task(cpu_curr(cpu));
+       resched_curr(rq);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
  
  static bool wake_up_full_nohz_cpu(int cpu)
  {
+       /*
+        * We just need the target to call irq_exit() and re-evaluate
+        * the next tick. The nohz full kick at least implies that.
+        * If needed we can still optimize that later with an
+        * empty IRQ.
+        */
         if (tick_nohz_full_cpu(cpu)) {
                 if (cpu != smp_processor_id() ||
                     tick_nohz_tick_stopped())
-                       smp_send_reschedule(cpu);
+                       tick_nohz_full_kick_cpu(cpu);
                 return true;
         }
  
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
  #ifdef CONFIG_NO_HZ_FULL
  bool sched_can_stop_tick(void)
  {
-       struct rq *rq;
-
-       rq = this_rq();
-
-       /* Make sure rq->nr_running update is visible after the IPI */
-       smp_rmb();
-
-       /* More than one running task need preemption */
-       if (rq->nr_running > 1)
-               return false;
+       /*
+        * More than one running task need preemption.
+        * nr_running update is assumed to be visible
+        * after IPI is sent from wakers.
+        */
+       if (this_rq()->nr_running > 1)
+               return false;
  
-       return true;
+       return true;
  }
  #endif /* CONFIG_NO_HZ_FULL */
  
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
                         if (class == rq->curr->sched_class)
                                 break;
                         if (class == p->sched_class) {
-                               resched_task(rq->curr);
+                               resched_curr(rq);
                                 break;
                         }
                 }
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
          */
         preempt_fold_need_resched();
  
-       if (llist_empty(&this_rq()->wake_list)
-                       && !tick_nohz_full_cpu(smp_processor_id())
-                       && !got_nohz_idle_kick())
+       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
                 return;
  
         /*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
          * somewhat pessimize the simple resched case.
          */
         irq_enter();
-       tick_nohz_full_check();
         sched_ttwu_pending();
  
         /*
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
  {
         u64 ns = 0;
  
-       if (task_current(rq, p)) {
+       /*
+        * Must be ->curr _and_ ->on_rq.  If dequeued, we would
+        * project cycles that may never be accounted to this
+        * thread, breaking clock_gettime().
+        */
+       if (task_current(rq, p) && p->on_rq) {
                 update_rq_clock(rq);
                 ns = rq_clock_task(rq) - p->se.exec_start;
                 if ((s64)ns < 0)
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
          * If we race with it leaving cpu, we'll take a lock. So we're correct.
          * If we race with it entering cpu, unaccounted time is 0. This is
          * indistinguishable from the read occurring a few cycles earlier.
+        * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+        * been accounted, so we're correct here as well.
          */
-       if (!p->on_cpu)
+       if (!p->on_cpu || !p->on_rq)
                 return p->se.sum_exec_runtime;
  #endif
  
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         }
  
         trace_sched_pi_setprio(p, prio);
-       p->pi_top_task = rt_mutex_get_top_task(p);
         oldprio = p->prio;
         prev_class = p->sched_class;
         on_rq = p->on_rq;
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
          *          running task
          */
         if (dl_prio(prio)) {
-               if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
-                       dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+               struct task_struct *pi_task = rt_mutex_get_top_task(p);
+               if (!dl_prio(p->normal_prio) ||
+                   (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
                         p->dl.dl_throttled = 0;
                         enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)
                  * lowered its priority, then reschedule its CPU:
                  */
                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
-                       resched_task(rq->curr);
+                       resched_curr(rq);
         }
  out_unlock:
         task_rq_unlock(rq, p, &flags);
@@ -4290,7 +4305,7 @@ again:
                  * fairness.
                  */
                 if (preempt && rq != p_rq)
-                       resched_task(p_rq->curr);
+                       resched_curr(p_rq);
         }
  
  out_unlock:
@@ -7097,7 +7112,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
         __setscheduler(rq, p, &attr);
         if (on_rq) {
                 enqueue_task(rq, p, 0);
-               resched_task(rq->curr);
+               resched_curr(rq);
         }
  
         check_class_changed(rq, p, prev_class, old_prio);
@@ -7808,6 +7823,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
         if (period > max_cfs_quota_period)
                 return -EINVAL;
  
+       /*
+        * Prevent race between setting of cfs_rq->runtime_enabled and
+        * unthrottle_offline_cfs_rqs().
+        */
+       get_online_cpus();
         mutex_lock(&cfs_constraints_mutex);
         ret = __cfs_schedulable(tg, period, quota);
         if (ret)
@@ -7833,7 +7853,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
         }
         raw_spin_unlock_irq(&cfs_b->lock);
  
-       for_each_possible_cpu(i) {
+       for_each_online_cpu(i) {
                 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                 struct rq *rq = cfs_rq->rq;
  
@@ -7849,6 +7869,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
                 cfs_bandwidth_usage_dec();
  out_unlock:
         mutex_unlock(&cfs_constraints_mutex);
+       put_online_cpus();
  
         return ret;
  }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index fc4f98b1258f66cbbf3cf1fc1082cb909c0f1144..255ce138b65232dabcc553c54340f45440ca2792 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
   * the overrunning entity can't interfere with other entity in the system and
   * can't make them miss their deadlines. Reasons why this kind of overruns
   * could happen are, typically, a entity voluntarily trying to overcome its
- * runtime, or it just underestimated it during sched_setscheduler_ex().
+ * runtime, or it just underestimated it during sched_setattr().
   */
  static void replenish_dl_entity(struct sched_dl_entity *dl_se,
                                 struct sched_dl_entity *pi_se)
@@ -535,7 +535,7 @@ again:
                 if (task_has_dl_policy(rq->curr))
                         check_preempt_curr_dl(rq, p, 0);
                 else
-                       resched_task(rq->curr);
+                       resched_curr(rq);
  #ifdef CONFIG_SMP
                 /*
                  * Queueing this task back might have overloaded rq,
@@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq)
                         enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
  
                 if (!is_leftmost(curr, &rq->dl))
-                       resched_task(curr);
+                       resched_curr(rq);
         }
  
         /*
@@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
             cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
                 return;
  
-       resched_task(rq->curr);
+       resched_curr(rq);
  }
  
  static int pull_dl_task(struct rq *this_rq);
@@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
                                   int flags)
  {
         if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
-               resched_task(rq->curr);
+               resched_curr(rq);
                 return;
         }
  
@@ -1333,7 +1333,7 @@ retry:
         if (dl_task(rq->curr) &&
             dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
             rq->curr->nr_cpus_allowed > 1) {
-               resched_task(rq->curr);
+               resched_curr(rq);
                 return 0;
         }
  
@@ -1373,7 +1373,7 @@ retry:
         set_task_cpu(next_task, later_rq->cpu);
         activate_task(later_rq, next_task, 0);
  
-       resched_task(later_rq->curr);
+       resched_curr(later_rq);
  
         double_unlock_balance(rq, later_rq);
  
@@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
                  */
                 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
                     rq->curr == p)
-                       resched_task(p);
+                       resched_curr(rq);
  #else
                 /*
                  * Again, we don't know if p has a earlier
                  * or later deadline, so let's blindly set a
                  * (maybe not needed) rescheduling point.
                  */
-               resched_task(p);
+               resched_curr(rq);
  #endif /* CONFIG_SMP */
         } else
                 switched_to_dl(rq, p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index fea7d3335e1fdf3502fc72f5d64b9181bc7e4243..45943b2fa82b9c978128263ff32574b108ece180 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
         if (!cpus)
                 return;
  
-       ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
         ns->task_capacity =
                 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
         ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
         env->best_cpu = env->dst_cpu;
  }
  
-static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
-                               long src_load, long dst_load,
+static bool load_too_imbalanced(long src_load, long dst_load,
                                 struct task_numa_env *env)
  {
         long imb, old_imb;
+       long orig_src_load, orig_dst_load;
+       long src_capacity, dst_capacity;
+
+       /*
+        * The load is corrected for the CPU capacity available on each node.
+        *
+        * src_load        dst_load
+        * ------------ vs ---------
+        * src_capacity    dst_capacity
+        */
+       src_capacity = env->src_stats.compute_capacity;
+       dst_capacity = env->dst_stats.compute_capacity;
  
         /* We care about the slope of the imbalance, not the direction. */
         if (dst_load < src_load)
                 swap(dst_load, src_load);
  
         /* Is the difference below the threshold? */
-       imb = dst_load * 100 - src_load * env->imbalance_pct;
+       imb = dst_load * src_capacity * 100 -
+             src_load * dst_capacity * env->imbalance_pct;
         if (imb <= 0)
                 return false;
  
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
          * The imbalance is above the allowed threshold.
          * Compare it with the old imbalance.
          */
+       orig_src_load = env->src_stats.load;
+       orig_dst_load = env->dst_stats.load;
+
         if (orig_dst_load < orig_src_load)
                 swap(orig_dst_load, orig_src_load);
  
-       old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+       old_imb = orig_dst_load * src_capacity * 100 -
+                 orig_src_load * dst_capacity * env->imbalance_pct;
  
         /* Would this change make things worse? */
         return (imb > old_imb);
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
         struct rq *src_rq = cpu_rq(env->src_cpu);
         struct rq *dst_rq = cpu_rq(env->dst_cpu);
         struct task_struct *cur;
-       long orig_src_load, src_load;
-       long orig_dst_load, dst_load;
+       long src_load, dst_load;
         long load;
-       long imp = (groupimp > 0) ? groupimp : taskimp;
+       long imp = env->p->numa_group ? groupimp : taskimp;
+       long moveimp = imp;
  
         rcu_read_lock();
         cur = ACCESS_ONCE(dst_rq->curr);
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
                          * itself (not part of a group), use the task weight
                          * instead.
                          */
-                       if (env->p->numa_group)
-                               imp = groupimp;
-                       else
-                               imp = taskimp;
-
                         if (cur->numa_group)
                                 imp += group_weight(cur, env->src_nid) -
                                        group_weight(cur, env->dst_nid);
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,
                 }
         }
  
-       if (imp < env->best_imp)
+       if (imp <= env->best_imp && moveimp <= env->best_imp)
                 goto unlock;
  
         if (!cur) {
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,
         }
  
         /* Balance doesn't matter much if we're running a task per cpu */
-       if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+       if (imp > env->best_imp && src_rq->nr_running == 1 &&
+                       dst_rq->nr_running == 1)
                 goto assign;
  
         /*
          * In the overloaded case, try and keep the load balanced.
          */
  balance:
-       orig_dst_load = env->dst_stats.load;
-       orig_src_load = env->src_stats.load;
-
-       /* XXX missing capacity terms */
         load = task_h_load(env->p);
-       dst_load = orig_dst_load + load;
-       src_load = orig_src_load - load;
+       dst_load = env->dst_stats.load + load;
+       src_load = env->src_stats.load - load;
+
+       if (moveimp > imp && moveimp > env->best_imp) {
+               /*
+                * If the improvement from just moving env->p direction is
+                * better than swapping tasks around, check if a move is
+                * possible. Store a slightly smaller score than moveimp,
+                * so an actually idle CPU will win.
+                */
+               if (!load_too_imbalanced(src_load, dst_load, env)) {
+                       imp = moveimp - 1;
+                       cur = NULL;
+                       goto assign;
+               }
+       }
+
+       if (imp <= env->best_imp)
+               goto unlock;
  
         if (cur) {
                 load = task_h_load(cur);
@@ -1225,8 +1249,7 @@ balance:
                 src_load += load;
         }
  
-       if (load_too_imbalanced(orig_src_load, orig_dst_load,
-                               src_load, dst_load, env))
+       if (load_too_imbalanced(src_load, dst_load, env))
                 goto unlock;
  
  assign:
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
         groupimp = group_weight(p, env.dst_nid) - groupweight;
         update_numa_stats(&env.dst_stats, env.dst_nid);
  
-       /* If the preferred nid has free capacity, try to use it. */
-       if (env.dst_stats.has_free_capacity)
-               task_numa_find_cpu(&env, taskimp, groupimp);
+       /* Try to find a spot on the preferred nid. */
+       task_numa_find_cpu(&env, taskimp, groupimp);
  
         /* No space available on the preferred nid. Look elsewhere. */
         if (env.best_cpu == -1) {
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)
                 }
         }
  
-       /* No better CPU than the current one was found. */
-       if (env.best_cpu == -1)
-               return -EAGAIN;
-
         /*
          * If the task is part of a workload that spans multiple NUMA nodes,
          * and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)
          * A task that migrated to a second choice node will be better off
          * trying for a better one later. Do not set the preferred node here.
          */
-       if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
-               sched_setnuma(p, env.dst_nid);
+       if (p->numa_group) {
+               if (env.best_cpu == -1)
+                       nid = env.src_nid;
+               else
+                       nid = env.dst_nid;
+
+               if (node_isset(nid, p->numa_group->active_nodes))
+                       sched_setnuma(p, env.dst_nid);
+       }
+
+       /* No better CPU than the current one was found. */
+       if (env.best_cpu == -1)
+               return -EAGAIN;
  
         /*
          * Reset the scan period if the task is being rescheduled on an
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
  /*
   * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
   * increments. The more local the fault statistics are, the higher the scan
- * period will be for the next scan window. If local/remote ratio is below
- * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
- * scan period will decrease
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
   */
  #define NUMA_PERIOD_SLOTS 10
-#define NUMA_PERIOD_THRESHOLD 3
+#define NUMA_PERIOD_THRESHOLD 7
  
  /*
   * Increase the scan period (slow down scanning) if the majority of
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)
  
         if (p->numa_group) {
                 update_numa_active_node_mask(p->numa_group);
-               /*
-                * If the preferred task and group nids are different,
-                * iterate over the nodes again to find the best place.
-                */
-               if (max_nid != max_group_nid) {
-                       unsigned long weight, max_weight = 0;
-
-                       for_each_online_node(nid) {
-                               weight = task_weight(p, nid) + group_weight(p, nid);
-                               if (weight > max_weight) {
-                                       max_weight = weight;
-                                       max_nid = nid;
-                               }
-                       }
-               }
-
                 spin_unlock_irq(group_lock);
+               max_nid = max_group_nid;
         }
  
-       /* Preferred node as the node with the most faults */
-       if (max_faults && max_nid != p->numa_preferred_nid) {
-               /* Update the preferred nid and migrate task if possible */
-               sched_setnuma(p, max_nid);
-               numa_migrate_preferred(p);
+       if (max_faults) {
+               /* Set the new preferred node */
+               if (max_nid != p->numa_preferred_nid)
+                       sched_setnuma(p, max_nid);
+
+               if (task_node(p) != p->numa_preferred_nid)
+                       numa_migrate_preferred(p);
         }
  }
  
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
         ideal_runtime = sched_slice(cfs_rq, curr);
         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
         if (delta_exec > ideal_runtime) {
-               resched_task(rq_of(cfs_rq)->curr);
+               resched_curr(rq_of(cfs_rq));
                 /*
                  * The current task ran long enough, ensure it doesn't get
                  * re-elected due to buddy favours.
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
                 return;
  
         if (delta > ideal_runtime)
-               resched_task(rq_of(cfs_rq)->curr);
+               resched_curr(rq_of(cfs_rq));
  }
  
  static void
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
          * validating it and just reschedule.
          */
         if (queued) {
-               resched_task(rq_of(cfs_rq)->curr);
+               resched_curr(rq_of(cfs_rq));
                 return;
         }
         /*
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
          * hierarchy can be throttled
          */
         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-               resched_task(rq_of(cfs_rq)->curr);
+               resched_curr(rq_of(cfs_rq));
  }
  
  static __always_inline
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
         cfs_rq->throttled = 1;
         cfs_rq->throttled_clock = rq_clock(rq);
         raw_spin_lock(&cfs_b->lock);
-       list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       /*
+        * Add to the _head_ of the list, so that an already-started
+        * distribute_cfs_runtime will not see us
+        */
+       list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
         if (!cfs_b->timer_active)
                 __start_cfs_bandwidth(cfs_b, false);
         raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
  
         /* determine whether we need to wake up potentially idle cpu */
         if (rq->curr == rq->idle && rq->cfs.nr_running)
-               resched_task(rq->curr);
+               resched_curr(rq);
  }
  
  static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
                 u64 remaining, u64 expires)
  {
         struct cfs_rq *cfs_rq;
-       u64 runtime = remaining;
+       u64 runtime;
+       u64 starting_runtime = remaining;
  
         rcu_read_lock();
         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3469,7 @@ next:
         }
         rcu_read_unlock();
  
-       return remaining;
+       return starting_runtime - remaining;
  }
  
  /*
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
         /* account preceding periods in which throttling occurred */
         cfs_b->nr_throttled += overrun;
  
-       /*
-        * There are throttled entities so we must first use the new bandwidth
-        * to unthrottle them before making it generally available.  This
-        * ensures that all existing debts will be paid before a new cfs_rq is
-        * allowed to run.
-        */
-       runtime = cfs_b->runtime;
         runtime_expires = cfs_b->runtime_expires;
-       cfs_b->runtime = 0;
  
         /*
-        * This check is repeated as we are holding onto the new bandwidth
-        * while we unthrottle.  This can potentially race with an unthrottled
-        * group trying to acquire new bandwidth from the global pool.
+        * This check is repeated as we are holding onto the new bandwidth while
+        * we unthrottle. This can potentially race with an unthrottled group
+        * trying to acquire new bandwidth from the global pool. This can result
+        * in us over-using our runtime if it is all used during this loop, but
+        * only by limited amounts in that extreme case.
          */
-       while (throttled && runtime > 0) {
+       while (throttled && cfs_b->runtime > 0) {
+               runtime = cfs_b->runtime;
                 raw_spin_unlock(&cfs_b->lock);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
                 runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
                 raw_spin_lock(&cfs_b->lock);
  
                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+               cfs_b->runtime -= min(runtime, cfs_b->runtime);
         }
  
-       /* return (any) remaining runtime */
-       cfs_b->runtime = runtime;
         /*
          * While we are ensured activity in the period following an
          * unthrottle, this also covers the case in which the new bandwidth is
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
                 return;
         }
  
-       if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+       if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                 runtime = cfs_b->runtime;
-               cfs_b->runtime = 0;
-       }
+
         expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
  
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  
         raw_spin_lock(&cfs_b->lock);
         if (expires == cfs_b->runtime_expires)
-               cfs_b->runtime = runtime;
+               cfs_b->runtime -= min(runtime, cfs_b->runtime);
         raw_spin_unlock(&cfs_b->lock);
  }
  
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         hrtimer_cancel(&cfs_b->slack_timer);
  }
  
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
+{
+       struct cfs_rq *cfs_rq;
+
+       for_each_leaf_cfs_rq(rq, cfs_rq) {
+               struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+
+               raw_spin_lock(&cfs_b->lock);
+               cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+               raw_spin_unlock(&cfs_b->lock);
+       }
+}
+
  static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
  {
         struct cfs_rq *cfs_rq;
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
                  * there's some valid quota amount
                  */
                 cfs_rq->runtime_remaining = 1;
+               /*
+                * Offline rq is schedulable till cpu is completely disabled
+                * in take_cpu_down(), so we prevent new cfs throttling here.
+                */
+               cfs_rq->runtime_enabled = 0;
+
                 if (cfs_rq_throttled(cfs_rq))
                         unthrottle_cfs_rq(cfs_rq);
         }
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
         return NULL;
  }
  static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
  static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
  
  #endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
  
                 if (delta < 0) {
                         if (rq->curr == p)
-                               resched_task(p);
+                               resched_curr(rq);
                         return;
                 }
  
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
         return;
  
  preempt:
-       resched_task(curr);
+       resched_curr(rq);
         /*
          * Only set the backward buddy when the current task is still
          * on the rq. This can happen when a wakeup gets interleaved
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
  /*
   * Is this task likely cache-hot:
   */
-static int
-task_hot(struct task_struct *p, u64 now)
+static int task_hot(struct task_struct *p, struct lb_env *env)
  {
         s64 delta;
  
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)
         /*
          * Buddy candidates are cache hot:
          */
-       if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+       if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
                         (&p->se == cfs_rq_of(&p->se)->next ||
                          &p->se == cfs_rq_of(&p->se)->last))
                 return 1;
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)
         if (sysctl_sched_migration_cost == 0)
                 return 0;
  
-       delta = now - p->se.exec_start;
+       delta = rq_clock_task(env->src_rq) - p->se.exec_start;
  
         return delta < (s64)sysctl_sched_migration_cost;
  }
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
          * 2) task is cache cold, or
          * 3) too many balance attempts have failed.
          */
-       tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
+       tsk_cache_hot = task_hot(p, env);
         if (!tsk_cache_hot)
                 tsk_cache_hot = migrate_degrades_locality(p, env);
  
@@ -5867,7 +5901,8 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
   */
  static inline void update_sg_lb_stats(struct lb_env *env,
                         struct sched_group *group, int load_idx,
-                       int local_group, struct sg_lb_stats *sgs)
+                       int local_group, struct sg_lb_stats *sgs,
+                       bool *overload)
  {
         unsigned long load;
         int i;
@@ -5885,6 +5920,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
  
                 sgs->group_load += load;
                 sgs->sum_nr_running += rq->nr_running;
+
+               if (rq->nr_running > 1)
+                       *overload = true;
+
  #ifdef CONFIG_NUMA_BALANCING
                 sgs->nr_numa_running += rq->nr_numa_running;
                 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6034,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats tmp_sgs;
         int load_idx, prefer_sibling = 0;
+       bool overload = false;
  
         if (child && child->flags & SD_PREFER_SIBLING)
                 prefer_sibling = 1;
@@ -6015,7 +6055,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                                 update_group_capacity(env->sd, env->dst_cpu);
                 }
  
-               update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+               update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+                                               &overload);
  
                 if (local_group)
                         goto next_group;
@@ -6049,6 +6090,13 @@ next_group:
  
         if (env->sd->flags & SD_NUMA)
                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+       if (!env->sd->parent) {
+               /* update overload indicator if we are at root domain */
+               if (env->dst_rq->rd->overload != overload)
+                       env->dst_rq->rd->overload = overload;
+       }
+
  }
  
  /**
@@ -6767,7 +6815,8 @@ static int idle_balance(struct rq *this_rq)
          */
         this_rq->idle_stamp = rq_clock(this_rq);
  
-       if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+       if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+           !this_rq->rd->overload) {
                 rcu_read_lock();
                 sd = rcu_dereference_check_sched_domain(this_rq->sd);
                 if (sd)
@@ -7325,6 +7374,8 @@ void trigger_load_balance(struct rq *rq)
  static void rq_online_fair(struct rq *rq)
  {
         update_sysctl();
+
+       update_runtime_enabled(rq);
  }
  
  static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7449,7 @@ static void task_fork_fair(struct task_struct *p)
                  * 'current' within the tree based on its new key value.
                  */
                 swap(curr->vruntime, se->vruntime);
-               resched_task(rq->curr);
+               resched_curr(rq);
         }
  
         se->vruntime -= cfs_rq->min_vruntime;
@@ -7423,7 +7474,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
          */
         if (rq->curr == p) {
                 if (p->prio > oldprio)
-                       resched_task(rq->curr);
+                       resched_curr(rq);
         } else
                 check_preempt_curr(rq, p, 0);
  }
@@ -7486,7 +7537,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
          * if we can still preempt the current task.
          */
         if (rq->curr == p)
-               resched_task(rq->curr);
+               resched_curr(rq);
         else
                 check_preempt_curr(rq, p, 0);
  }
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c

index cf009fb0bc25b1427683d614a3a40ad98ad0c958..9f1608f998196cae04fd5a97eb162d3c0db6034c 100644 (file)
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -79,7 +79,7 @@ static void cpuidle_idle_call(void)
         struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
         struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
         int next_state, entered_state;
-       bool broadcast;
+       unsigned int broadcast;
  
         /*
          * Check if the idle task must be rescheduled. If it is the
@@ -135,7 +135,7 @@ use_default:
                 goto exit_idle;
         }
  
-       broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
+       broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
  
         /*
          * Tell the time framework to switch to a broadcast timer
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c

index 879f2b75266a9933823a2056541af469cdc9c3dd..67ad4e7f506a2509a0493138662c0fec7dc4d7fa 100644 (file)
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
   */
  static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
  {
-       resched_task(rq->idle);
+       resched_curr(rq);
  }
  
  static struct task_struct *
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index a49083192c64c306952c752ec6b3a05a0df7205d..5f6edca4fafd85b59838a048e711b361861f39a6 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
  static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
  {
         struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+       struct rq *rq = rq_of_rt_rq(rt_rq);
         struct sched_rt_entity *rt_se;
  
-       int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+       int cpu = cpu_of(rq);
  
         rt_se = rt_rq->tg->rt_se[cpu];
  
@@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
                         enqueue_rt_entity(rt_se, false);
  
                 if (rt_rq->highest_prio.curr < curr->prio)
-                       resched_task(curr);
+                       resched_curr(rq);
         }
  }
  
@@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
                 return;
  
         enqueue_top_rt_rq(rt_rq);
-       resched_task(rq->curr);
+       resched_curr(rq);
  }
  
  static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -740,6 +741,9 @@ balanced:
                 rt_rq->rt_throttled = 0;
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
                 raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+               /* Make rt_rq available for pick_next_task() */
+               sched_rt_rq_enqueue(rt_rq);
         }
  }
  
@@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq)
                         raw_spin_lock(&rt_rq->rt_runtime_lock);
                         rt_rq->rt_time += delta_exec;
                         if (sched_rt_runtime_exceeded(rt_rq))
-                               resched_task(curr);
+                               resched_curr(rq);
                         raw_spin_unlock(&rt_rq->rt_runtime_lock);
                 }
         }
@@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
          * to try and push current away:
          */
         requeue_task_rt(rq, p, 1);
-       resched_task(rq->curr);
+       resched_curr(rq);
  }
  
  #endif /* CONFIG_SMP */
@@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
  static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
  {
         if (p->prio < rq->curr->prio) {
-               resched_task(rq->curr);
+               resched_curr(rq);
                 return;
         }
  
@@ -1690,7 +1694,7 @@ retry:
          * just reschedule current.
          */
         if (unlikely(next_task->prio < rq->curr->prio)) {
-               resched_task(rq->curr);
+               resched_curr(rq);
                 return 0;
         }
  
@@ -1737,7 +1741,7 @@ retry:
         activate_task(lowest_rq, next_task, 0);
         ret = 1;
  
-       resched_task(lowest_rq->curr);
+       resched_curr(lowest_rq);
  
         double_unlock_balance(rq, lowest_rq);
  
@@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
                 return;
  
         if (pull_rt_task(rq))
-               resched_task(rq->curr);
+               resched_curr(rq);
  }
  
  void __init init_sched_rt_class(void)
@@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
                         check_resched = 0;
  #endif /* CONFIG_SMP */
                 if (check_resched && p->prio < rq->curr->prio)
-                       resched_task(rq->curr);
+                       resched_curr(rq);
         }
  }
  
@@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
                  * Only reschedule if p is still on the same runqueue.
                  */
                 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
-                       resched_task(p);
+                       resched_curr(rq);
  #else
                 /* For UP simply resched on drop of prio */
                 if (oldprio < p->prio)
-                       resched_task(p);
+                       resched_curr(rq);
  #endif /* CONFIG_SMP */
         } else {
                 /*
@@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
                  * then reschedule.
                  */
                 if (p->prio < rq->curr->prio)
-                       resched_task(rq->curr);
+                       resched_curr(rq);
         }
  }
  
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 31cc02ebc54ed82f5bf3f62fae879a1c0343a97d..579712f4e9d56535e9d981e489dfcc66165457ff 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
         cpumask_var_t span;
         cpumask_var_t online;
  
+       /* Indicates that at least one CPU has more than one runnable task */
+       bool overload;
+
         /*
          * The bit corresponding to a CPU gets set here if such CPU has more
          * than one runnable -deadline task (as it is below for RT tasks).
@@ -884,20 +887,10 @@ enum {
  #undef SCHED_FEAT
  
  #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
-static __always_inline bool static_branch__true(struct static_key *key)
-{
-       return static_key_true(key); /* Not out of line branch. */
-}
-
-static __always_inline bool static_branch__false(struct static_key *key)
-{
-       return static_key_false(key); /* Out of line branch. */
-}
-
  #define SCHED_FEAT(name, enabled)                                      \
  static __always_inline bool static_branch_##name(struct static_key *key) \
  {                                                                      \
-       return static_branch__##enabled(key);                           \
+       return static_key_##enabled(key);                               \
  }
  
  #include "features.h"
@@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void);
  extern void init_sched_fair_class(void);
  extern void init_sched_dl_class(void);
  
-extern void resched_task(struct task_struct *p);
+extern void resched_curr(struct rq *rq);
  extern void resched_cpu(int cpu);
  
  extern struct rt_bandwidth def_rt_bandwidth;
@@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
  
         rq->nr_running = prev_nr + count;
  
-#ifdef CONFIG_NO_HZ_FULL
         if (prev_nr < 2 && rq->nr_running >= 2) {
+#ifdef CONFIG_SMP
+               if (!rq->rd->overload)
+                       rq->rd->overload = true;
+#endif
+
+#ifdef CONFIG_NO_HZ_FULL
                 if (tick_nohz_full_cpu(rq->cpu)) {
-                       /* Order rq->nr_running write against the IPI */
-                       smp_wmb();
-                       smp_send_reschedule(rq->cpu);
+                       /*
+                        * Tick is needed if more than one task runs on a CPU.
+                        * Send the target an IPI to kick it out of nohz mode.
+                        *
+                        * We assume that an IPI implies a full memory barrier
+                        * and that the new value of rq->nr_running is visible to
+                        * the target when it receives the IPI.
+                        */
+                       tick_nohz_full_kick_cpu(rq->cpu);
                 }
-       }
  #endif
+       }
  }
  
  static inline void sub_nr_running(struct rq *rq, unsigned count)
diff --git a/kernel/smp.c b/kernel/smp.c

index 80c33f8de14ffbdb83aaf6be0bc5c31c5d3e6351..487653b5844f92c722ae781210064824310452a7 100644 (file)
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
   *
   * (C) Jens Axboe <jens.axboe@oracle.com> 2008
   */
+#include <linux/irq_work.h>
  #include <linux/rcupdate.h>
  #include <linux/rculist.h>
  #include <linux/kernel.h>
@@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
                 csd->func(csd->info);
                 csd_unlock(csd);
         }
+
+       /*
+        * Handle irq works queued remotely by irq_work_queue_on().
+        * Smp functions above are typically synchronous so they
+        * better run first since some other CPUs may be busy waiting
+        * for them.
+        */
+       irq_work_run();
  }
  
  /*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c

index 6558b7ac112d2e1293886721a22d020ea78a3e4d..3d63944a3eca6d81b3c4b185febc6372876dffdb 100644 (file)
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -224,13 +224,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
  };
  
  /*
- * Kick the current CPU if it's full dynticks in order to force it to
+ * Kick the CPU if it's full dynticks in order to force it to
   * re-evaluate its dependency on the tick and restart it if necessary.
   */
-void tick_nohz_full_kick(void)
+void tick_nohz_full_kick_cpu(int cpu)
  {
-       if (tick_nohz_full_cpu(smp_processor_id()))
-               irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+       if (!tick_nohz_full_cpu(cpu))
+               return;
+
+       irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
  }
  
  static void nohz_full_kick_ipi(void *info)
author	Ingo Molnar <mingo@kernel.org>
	Wed, 16 Jul 2014 13:10:07 +0000 (15:10 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Wed, 16 Jul 2014 13:10:07 +0000 (15:10 +0200)
Documentation/trace/ftrace.txt		patch \| blob \| history
include/linux/irq_work.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/linux/tick.h		patch \| blob \| history
kernel/cpu.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/irq_work.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/deadline.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/idle.c		patch \| blob \| history
kernel/sched/idle_task.c		patch \| blob \| history
kernel/sched/rt.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history
kernel/smp.c		patch \| blob \| history
kernel/time/tick-sched.c		patch \| blob \| history