Merge tag 'for-linus-20150724' of git://git.infradead.org/linux-mtd

[karo-tx-linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 123673291ffbb160734ed889b934d557611a1cf1..78b4bad10081c6b23894ac1d5d7b6900ab32362a 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,26 +90,6 @@
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
  
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
-       unsigned long delta;
-       ktime_t soft, hard, now;
-
-       for (;;) {
-               if (hrtimer_active(period_timer))
-                       break;
-
-               now = hrtimer_cb_get_time(period_timer);
-               hrtimer_forward(period_timer, now, period);
-
-               soft = hrtimer_get_softexpires(period_timer);
-               hard = hrtimer_get_expires(period_timer);
-               delta = ktime_to_ns(ktime_sub(hard, soft));
-               __hrtimer_start_range_ns(period_timer, soft, delta,
-                                        HRTIMER_MODE_ABS_PINNED, 0);
-       }
-}
-
  DEFINE_MUTEX(sched_domains_mutex);
  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  
@@ -355,12 +335,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
  
  #ifdef CONFIG_SMP
  
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
  {
         struct hrtimer *timer = &rq->hrtick_timer;
-       ktime_t time = hrtimer_get_softexpires(timer);
  
-       return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
  }
  
  /*
@@ -440,8 +419,8 @@ void hrtick_start(struct rq *rq, u64 delay)
          * doesn't make sense. Rely on vruntime for fairness.
          */
         delay = max_t(u64, delay, 10000LL);
-       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                       HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+                     HRTIMER_MODE_REL_PINNED);
  }
  
  static inline void init_hrtick(void)
@@ -511,7 +490,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
  static bool set_nr_if_polling(struct task_struct *p)
  {
         struct thread_info *ti = task_thread_info(p);
-       typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+       typeof(ti->flags) old, val = READ_ONCE(ti->flags);
  
         for (;;) {
                 if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +520,52 @@ static bool set_nr_if_polling(struct task_struct *p)
  #endif
  #endif
  
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+       struct wake_q_node *node = &task->wake_q;
+
+       /*
+        * Atomically grab the task, if ->wake_q is !nil already it means
+        * it's already queued (either by us or someone else) and will get the
+        * wakeup due to that.
+        *
+        * This cmpxchg() implies a full barrier, which pairs with the write
+        * barrier implied by the wakeup in wake_up_q().
+        */
+       if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+               return;
+
+       get_task_struct(task);
+
+       /*
+        * The head is context local, there can be no concurrency.
+        */
+       *head->lastp = node;
+       head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+       struct wake_q_node *node = head->first;
+
+       while (node != WAKE_Q_TAIL) {
+               struct task_struct *task;
+
+               task = container_of(node, struct task_struct, wake_q);
+               BUG_ON(!task);
+               /* task can safely be re-inserted now */
+               node = node->next;
+               task->wake_q.next = NULL;
+
+               /*
+                * wake_up_process() implies a wmb() to pair with the queueing
+                * in wake_q_add() so as not to miss wakeups.
+                */
+               wake_up_process(task);
+               put_task_struct(task);
+       }
+}
+
  /*
   * resched_curr - mark rq's current task 'to be rescheduled now'.
   *
@@ -593,13 +618,12 @@ void resched_cpu(int cpu)
   * selecting an idle cpu will add more delays to the timers than intended
   * (as that cpu's timer base may not be uptodate wrt jiffies etc).
   */
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
  {
-       int cpu = smp_processor_id();
-       int i;
+       int i, cpu = smp_processor_id();
         struct sched_domain *sd;
  
-       if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+       if (!idle_cpu(cpu))
                 return cpu;
  
         rcu_read_lock();
@@ -976,7 +1000,11 @@ inline int task_curr(const struct task_struct *p)
  }
  
  /*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
   */
  static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                        const struct sched_class *prev_class,
@@ -985,7 +1013,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
         if (prev_class != p->sched_class) {
                 if (prev_class->switched_from)
                         prev_class->switched_from(rq, p);
-               /* Possble rq->lock 'hole'.  */
+
                 p->sched_class->switched_to(rq, p);
         } else if (oldprio != p->prio || dl_task(p))
                 p->sched_class->prio_changed(rq, p, oldprio);
@@ -1017,6 +1045,177 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
  }
  
  #ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+       lockdep_assert_held(&rq->lock);
+
+       dequeue_task(rq, p, 0);
+       p->on_rq = TASK_ON_RQ_MIGRATING;
+       set_task_cpu(p, new_cpu);
+       raw_spin_unlock(&rq->lock);
+
+       rq = cpu_rq(new_cpu);
+
+       raw_spin_lock(&rq->lock);
+       BUG_ON(task_cpu(p) != new_cpu);
+       p->on_rq = TASK_ON_RQ_QUEUED;
+       enqueue_task(rq, p, 0);
+       check_preempt_curr(rq, p, 0);
+
+       return rq;
+}
+
+struct migration_arg {
+       struct task_struct *task;
+       int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+{
+       if (unlikely(!cpu_active(dest_cpu)))
+               return rq;
+
+       /* Affinity changed (again). */
+       if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+               return rq;
+
+       rq = move_queued_task(rq, p, dest_cpu);
+
+       return rq;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+       struct migration_arg *arg = data;
+       struct task_struct *p = arg->task;
+       struct rq *rq = this_rq();
+
+       /*
+        * The original target cpu might have gone down and we might
+        * be on another cpu but it doesn't matter.
+        */
+       local_irq_disable();
+       /*
+        * We need to explicitly wake pending tasks before running
+        * __migrate_task() such that we will not miss enforcing cpus_allowed
+        * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+        */
+       sched_ttwu_pending();
+
+       raw_spin_lock(&p->pi_lock);
+       raw_spin_lock(&rq->lock);
+       /*
+        * If task_rq(p) != rq, it cannot be migrated here, because we're
+        * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+        * we're holding p->pi_lock.
+        */
+       if (task_rq(p) == rq && task_on_rq_queued(p))
+               rq = __migrate_task(rq, p, arg->dest_cpu);
+       raw_spin_unlock(&rq->lock);
+       raw_spin_unlock(&p->pi_lock);
+
+       local_irq_enable();
+       return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+       if (p->sched_class->set_cpus_allowed)
+               p->sched_class->set_cpus_allowed(p, new_mask);
+
+       cpumask_copy(&p->cpus_allowed, new_mask);
+       p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+       unsigned long flags;
+       struct rq *rq;
+       unsigned int dest_cpu;
+       int ret = 0;
+
+       rq = task_rq_lock(p, &flags);
+
+       if (cpumask_equal(&p->cpus_allowed, new_mask))
+               goto out;
+
+       if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       do_set_cpus_allowed(p, new_mask);
+
+       /* Can the task run on the task's current CPU? If so, we're done */
+       if (cpumask_test_cpu(task_cpu(p), new_mask))
+               goto out;
+
+       dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+       if (task_running(rq, p) || p->state == TASK_WAKING) {
+               struct migration_arg arg = { p, dest_cpu };
+               /* Need help from migration thread: drop lock and wait. */
+               task_rq_unlock(rq, p, &flags);
+               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+               tlb_migrate_finish(p->mm);
+               return 0;
+       } else if (task_on_rq_queued(p)) {
+               /*
+                * OK, since we're going to drop the lock immediately
+                * afterwards anyway.
+                */
+               lockdep_unpin_lock(&rq->lock);
+               rq = move_queued_task(rq, p, dest_cpu);
+               lockdep_pin_lock(&rq->lock);
+       }
+out:
+       task_rq_unlock(rq, p, &flags);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
  #ifdef CONFIG_SCHED_DEBUG
@@ -1049,7 +1248,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 if (p->sched_class->migrate_task_rq)
                         p->sched_class->migrate_task_rq(p, new_cpu);
                 p->se.nr_migrations++;
-               perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+               perf_event_task_migrate(p);
         }
  
         __set_task_cpu(p, new_cpu);
@@ -1157,13 +1356,6 @@ out:
         return ret;
  }
  
-struct migration_arg {
-       struct task_struct *task;
-       int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
  /*
   * wait_task_inactive - wait for a thread to unschedule.
   *
@@ -1296,9 +1488,7 @@ void kick_process(struct task_struct *p)
         preempt_enable();
  }
  EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
  
-#ifdef CONFIG_SMP
  /*
   * ->cpus_allowed is protected by both rq->lock and p->pi_lock
   */
@@ -1378,6 +1568,8 @@ out:
  static inline
  int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
  {
+       lockdep_assert_held(&p->pi_lock);
+
         if (p->nr_cpus_allowed > 1)
                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
  
@@ -1403,7 +1595,7 @@ static void update_avg(u64 *avg, u64 sample)
         s64 diff = sample - *avg;
         *avg += diff >> 3;
  }
-#endif
+#endif /* CONFIG_SMP */
  
  static void
  ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -1466,8 +1658,15 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
  
         p->state = TASK_RUNNING;
  #ifdef CONFIG_SMP
-       if (p->sched_class->task_woken)
+       if (p->sched_class->task_woken) {
+               /*
+                * Our task @p is fully woken up and running; so it's safe to
+                * drop the rq->lock, hereafter rq is only used for statistics.
+                */
+               lockdep_unpin_lock(&rq->lock);
                 p->sched_class->task_woken(rq, p);
+               lockdep_pin_lock(&rq->lock);
+       }
  
         if (rq->idle_stamp) {
                 u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -1486,6 +1685,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
  static void
  ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
  {
+       lockdep_assert_held(&rq->lock);
+
  #ifdef CONFIG_SMP
         if (p->sched_contributes_to_load)
                 rq->nr_uninterruptible--;
@@ -1530,6 +1731,7 @@ void sched_ttwu_pending(void)
                 return;
  
         raw_spin_lock_irqsave(&rq->lock, flags);
+       lockdep_pin_lock(&rq->lock);
  
         while (llist) {
                 p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1537,6 +1739,7 @@ void sched_ttwu_pending(void)
                 ttwu_do_activate(rq, p, 0);
         }
  
+       lockdep_unpin_lock(&rq->lock);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
@@ -1633,7 +1836,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
  #endif
  
         raw_spin_lock(&rq->lock);
+       lockdep_pin_lock(&rq->lock);
         ttwu_do_activate(rq, p, 0);
+       lockdep_unpin_lock(&rq->lock);
         raw_spin_unlock(&rq->lock);
  }
  
@@ -1728,9 +1933,17 @@ static void try_to_wake_up_local(struct task_struct *p)
         lockdep_assert_held(&rq->lock);
  
         if (!raw_spin_trylock(&p->pi_lock)) {
+               /*
+                * This is OK, because current is on_cpu, which avoids it being
+                * picked for load-balance and preemption/IRQs are still
+                * disabled avoiding further scheduler activity on it and we've
+                * not yet picked a replacement task.
+                */
+               lockdep_unpin_lock(&rq->lock);
                 raw_spin_unlock(&rq->lock);
                 raw_spin_lock(&p->pi_lock);
                 raw_spin_lock(&rq->lock);
+               lockdep_pin_lock(&rq->lock);
         }
  
         if (!(p->state & TASK_NORMAL))
@@ -1951,7 +2164,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
         set_task_cpu(p, cpu);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
         if (likely(sched_info_on()))
                 memset(&p->sched_info, 0, sizeof(p->sched_info));
  #endif
@@ -2105,12 +2318,29 @@ void wake_up_new_task(struct task_struct *p)
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
+void preempt_notifier_inc(void)
+{
+       static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+       static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
  /**
   * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
  {
+       if (!static_key_false(&preempt_notifier_key))
+               WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
         hlist_add_head(&notifier->link, &current->preempt_notifiers);
  }
  EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2119,7 +2349,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
   * preempt_notifier_unregister - no longer interested in preemption notifications
   * @notifier: notifier struct to unregister
   *
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
   */
  void preempt_notifier_unregister(struct preempt_notifier *notifier)
  {
@@ -2127,7 +2357,7 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier)
  }
  EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
  
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
         struct preempt_notifier *notifier;
  
@@ -2135,9 +2365,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
  }
  
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+       if (static_key_false(&preempt_notifier_key))
+               __fire_sched_in_preempt_notifiers(curr);
+}
+
  static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
-                                struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                  struct task_struct *next)
  {
         struct preempt_notifier *notifier;
  
@@ -2145,13 +2381,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
                 notifier->ops->sched_out(notifier, next);
  }
  
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                struct task_struct *next)
+{
+       if (static_key_false(&preempt_notifier_key))
+               __fire_sched_out_preempt_notifiers(curr, next);
+}
+
  #else /* !CONFIG_PREEMPT_NOTIFIERS */
  
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
  {
  }
  
-static void
+static inline void
  fire_sched_out_preempt_notifiers(struct task_struct *curr,
                                  struct task_struct *next)
  {
@@ -2252,23 +2496,35 @@ static struct rq *finish_task_switch(struct task_struct *prev)
  #ifdef CONFIG_SMP
  
  /* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
  {
-       if (rq->post_schedule) {
-               unsigned long flags;
+       struct callback_head *head, *next;
+       void (*func)(struct rq *rq);
+       unsigned long flags;
  
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (rq->curr->sched_class->post_schedule)
-                       rq->curr->sched_class->post_schedule(rq);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       head = rq->balance_callback;
+       rq->balance_callback = NULL;
+       while (head) {
+               func = (void (*)(struct rq *))head->func;
+               next = head->next;
+               head->next = NULL;
+               head = next;
  
-               rq->post_schedule = 0;
+               func(rq);
         }
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+       if (unlikely(rq->balance_callback))
+               __balance_callback(rq);
  }
  
  #else
  
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
  {
  }
  
@@ -2286,7 +2542,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
         /* finish_task_switch() drops rq->lock and enables preemtion */
         preempt_disable();
         rq = finish_task_switch(prev);
-       post_schedule(rq);
+       balance_callback(rq);
         preempt_enable();
  
         if (current->set_child_tid)
@@ -2330,9 +2586,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
          * of the scheduler it's an obvious special-case), so we
          * do an early lockdep release here:
          */
+       lockdep_unpin_lock(&rq->lock);
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
  
-       context_tracking_task_switch(prev, next);
         /* Here we just switch the register state and the stack. */
         switch_to(prev, next, prev);
         barrier();
@@ -2397,9 +2653,9 @@ unsigned long nr_iowait_cpu(int cpu)
  
  void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
  {
-       struct rq *this = this_rq();
-       *nr_waiters = atomic_read(&this->nr_iowait);
-       *load = this->cpu_load[0];
+       struct rq *rq = this_rq();
+       *nr_waiters = atomic_read(&rq->nr_iowait);
+       *load = rq->load.weight;
  }
  
  #ifdef CONFIG_SMP
@@ -2497,6 +2753,7 @@ void scheduler_tick(void)
         update_rq_clock(rq);
         curr->sched_class->task_tick(rq, curr, 0);
         update_cpu_load_active(rq);
+       calc_global_load_tick(rq);
         raw_spin_unlock(&rq->lock);
  
         perf_event_task_tick();
@@ -2525,7 +2782,7 @@ void scheduler_tick(void)
  u64 scheduler_tick_max_deferment(void)
  {
         struct rq *rq = this_rq();
-       unsigned long next, now = ACCESS_ONCE(jiffies);
+       unsigned long next, now = READ_ONCE(jiffies);
  
         next = rq->last_sched_tick + HZ;
  
@@ -2726,9 +2983,7 @@ again:
   *          - return from syscall or exception to user-space
   *          - return from interrupt-handler to user-space
   *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
   */
  static void __sched __schedule(void)
  {
@@ -2737,7 +2992,6 @@ static void __sched __schedule(void)
         struct rq *rq;
         int cpu;
  
-       preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_note_context_switch();
@@ -2755,6 +3009,7 @@ static void __sched __schedule(void)
          */
         smp_mb__before_spinlock();
         raw_spin_lock_irq(&rq->lock);
+       lockdep_pin_lock(&rq->lock);
  
         rq->clock_skip_update <<= 1; /* promote REQ to ACT */
  
@@ -2797,12 +3052,12 @@ static void __sched __schedule(void)
  
                 rq = context_switch(rq, prev, next); /* unlocks the rq */
                 cpu = cpu_of(rq);
-       } else
+       } else {
+               lockdep_unpin_lock(&rq->lock);
                 raw_spin_unlock_irq(&rq->lock);
+       }
  
-       post_schedule(rq);
-
-       sched_preempt_enable_no_resched();
+       balance_callback(rq);
  }
  
  static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +3078,9 @@ asmlinkage __visible void __sched schedule(void)
  
         sched_submit_work(tsk);
         do {
+               preempt_disable();
                 __schedule();
+               sched_preempt_enable_no_resched();
         } while (need_resched());
  }
  EXPORT_SYMBOL(schedule);
@@ -2862,15 +3119,14 @@ void __sched schedule_preempt_disabled(void)
  static void __sched notrace preempt_schedule_common(void)
  {
         do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               preempt_active_enter();
                 __schedule();
-               __preempt_count_sub(PREEMPT_ACTIVE);
+               preempt_active_exit();
  
                 /*
                  * Check again in case we missed a preemption opportunity
                  * between schedule and now.
                  */
-               barrier();
         } while (need_resched());
  }
  
@@ -2894,9 +3150,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
  NOKPROBE_SYMBOL(preempt_schedule);
  EXPORT_SYMBOL(preempt_schedule);
  
-#ifdef CONFIG_CONTEXT_TRACKING
  /**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
   *
   * The tracing infrastructure uses preempt_enable_notrace to prevent
   * recursion and tracing preempt enabling caused by the tracing
@@ -2909,7 +3164,7 @@ EXPORT_SYMBOL(preempt_schedule);
   * instead of preempt_schedule() to exit user context if needed before
   * calling the scheduler.
   */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
  {
         enum ctx_state prev_ctx;
  
@@ -2917,7 +3172,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
                 return;
  
         do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               /*
+                * Use raw __preempt_count() ops that don't call functions.
+                * We can't call functions before disabling preemption which
+                * disarm preemption tracing recursions.
+                */
+               __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+               barrier();
                 /*
                  * Needs preempt disabled in case user_exit() is traced
                  * and the tracer calls preempt_enable_notrace() causing
@@ -2927,12 +3188,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
                 __schedule();
                 exception_exit(prev_ctx);
  
-               __preempt_count_sub(PREEMPT_ACTIVE);
                 barrier();
+               __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
         } while (need_resched());
  }
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
  #endif /* CONFIG_PREEMPT */
  
@@ -2952,17 +3212,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
         prev_state = exception_enter();
  
         do {
-               __preempt_count_add(PREEMPT_ACTIVE);
+               preempt_active_enter();
                 local_irq_enable();
                 __schedule();
                 local_irq_disable();
-               __preempt_count_sub(PREEMPT_ACTIVE);
-
-               /*
-                * Check again in case we missed a preemption opportunity
-                * between schedule and now.
-                */
-               barrier();
+               preempt_active_exit();
         } while (need_resched());
  
         exception_exit(prev_state);
@@ -3040,7 +3294,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (!dl_prio(p->normal_prio) ||
                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
-                       p->dl.dl_throttled = 0;
                         enqueue_flag = ENQUEUE_REPLENISH;
                 } else
                         p->dl.dl_boosted = 0;
@@ -3068,7 +3321,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
  
         check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
+       preempt_disable(); /* avoid rq from going away on us */
         __task_rq_unlock(rq);
+
+       balance_callback(rq);
+       preempt_enable();
  }
  #endif
  
@@ -3406,7 +3663,7 @@ static bool dl_param_changed(struct task_struct *p,
  
  static int __sched_setscheduler(struct task_struct *p,
                                 const struct sched_attr *attr,
-                               bool user)
+                               bool user, bool pi)
  {
         int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
                       MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3592,18 +3849,20 @@ change:
         p->sched_reset_on_fork = reset_on_fork;
         oldprio = p->prio;
  
-       /*
-        * Take priority boosted tasks into account. If the new
-        * effective priority is unchanged, we just store the new
-        * normal parameters and do not touch the scheduler class and
-        * the runqueue. This will be done when the task deboost
-        * itself.
-        */
-       new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-       if (new_effective_prio == oldprio) {
-               __setscheduler_params(p, attr);
-               task_rq_unlock(rq, p, &flags);
-               return 0;
+       if (pi) {
+               /*
+                * Take priority boosted tasks into account. If the new
+                * effective priority is unchanged, we just store the new
+                * normal parameters and do not touch the scheduler class and
+                * the runqueue. This will be done when the task deboost
+                * itself.
+                */
+               new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+               if (new_effective_prio == oldprio) {
+                       __setscheduler_params(p, attr);
+                       task_rq_unlock(rq, p, &flags);
+                       return 0;
+               }
         }
  
         queued = task_on_rq_queued(p);
@@ -3614,7 +3873,7 @@ change:
                 put_prev_task(rq, p);
  
         prev_class = p->sched_class;
-       __setscheduler(rq, p, attr, true);
+       __setscheduler(rq, p, attr, pi);
  
         if (running)
                 p->sched_class->set_curr_task(rq);
@@ -3627,9 +3886,17 @@ change:
         }
  
         check_class_changed(rq, p, prev_class, oldprio);
+       preempt_disable(); /* avoid rq from going away on us */
         task_rq_unlock(rq, p, &flags);
  
-       rt_mutex_adjust_pi(p);
+       if (pi)
+               rt_mutex_adjust_pi(p);
+
+       /*
+        * Run balance callbacks after we've adjusted the PI chain.
+        */
+       balance_callback(rq);
+       preempt_enable();
  
         return 0;
  }
@@ -3650,7 +3917,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
                 attr.sched_policy = policy;
         }
  
-       return __sched_setscheduler(p, &attr, check);
+       return __sched_setscheduler(p, &attr, check, true);
  }
  /**
   * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3671,7 +3938,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
  
  int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
  {
-       return __sched_setscheduler(p, attr, true);
+       return __sched_setscheduler(p, attr, true, true);
  }
  EXPORT_SYMBOL_GPL(sched_setattr);
  
@@ -4719,149 +4986,6 @@ out:
  }
  
  #ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
-       struct rq *rq = task_rq(p);
-
-       lockdep_assert_held(&rq->lock);
-
-       dequeue_task(rq, p, 0);
-       p->on_rq = TASK_ON_RQ_MIGRATING;
-       set_task_cpu(p, new_cpu);
-       raw_spin_unlock(&rq->lock);
-
-       rq = cpu_rq(new_cpu);
-
-       raw_spin_lock(&rq->lock);
-       BUG_ON(task_cpu(p) != new_cpu);
-       p->on_rq = TASK_ON_RQ_QUEUED;
-       enqueue_task(rq, p, 0);
-       check_preempt_curr(rq, p, 0);
-
-       return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-       if (p->sched_class->set_cpus_allowed)
-               p->sched_class->set_cpus_allowed(p, new_mask);
-
-       cpumask_copy(&p->cpus_allowed, new_mask);
-       p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- *    stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- *    off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- *    it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- *    is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-       unsigned long flags;
-       struct rq *rq;
-       unsigned int dest_cpu;
-       int ret = 0;
-
-       rq = task_rq_lock(p, &flags);
-
-       if (cpumask_equal(&p->cpus_allowed, new_mask))
-               goto out;
-
-       if (!cpumask_intersects(new_mask, cpu_active_mask)) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       do_set_cpus_allowed(p, new_mask);
-
-       /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpumask_test_cpu(task_cpu(p), new_mask))
-               goto out;
-
-       dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (task_running(rq, p) || p->state == TASK_WAKING) {
-               struct migration_arg arg = { p, dest_cpu };
-               /* Need help from migration thread: drop lock and wait. */
-               task_rq_unlock(rq, p, &flags);
-               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-               tlb_migrate_finish(p->mm);
-               return 0;
-       } else if (task_on_rq_queued(p))
-               rq = move_queued_task(p, dest_cpu);
-out:
-       task_rq_unlock(rq, p, &flags);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-       struct rq *rq;
-       int ret = 0;
-
-       if (unlikely(!cpu_active(dest_cpu)))
-               return ret;
-
-       rq = cpu_rq(src_cpu);
-
-       raw_spin_lock(&p->pi_lock);
-       raw_spin_lock(&rq->lock);
-       /* Already moved. */
-       if (task_cpu(p) != src_cpu)
-               goto done;
-
-       /* Affinity changed (again). */
-       if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-               goto fail;
-
-       /*
-        * If we're not on a rq, the next wake-up will ensure we're
-        * placed properly.
-        */
-       if (task_on_rq_queued(p))
-               rq = move_queued_task(p, dest_cpu);
-done:
-       ret = 1;
-fail:
-       raw_spin_unlock(&rq->lock);
-       raw_spin_unlock(&p->pi_lock);
-       return ret;
-}
  
  #ifdef CONFIG_NUMA_BALANCING
  /* Migrate current task p to target_cpu */
@@ -4909,35 +5033,9 @@ void sched_setnuma(struct task_struct *p, int nid)
                 enqueue_task(rq, p, 0);
         task_rq_unlock(rq, p, &flags);
  }
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
-       struct migration_arg *arg = data;
-
-       /*
-        * The original target cpu might have gone down and we might
-        * be on another cpu but it doesn't matter.
-        */
-       local_irq_disable();
-       /*
-        * We need to explicitly wake pending tasks before running
-        * __migrate_task() such that we will not miss enforcing cpus_allowed
-        * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
-        */
-       sched_ttwu_pending();
-       __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
-       local_irq_enable();
-       return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
  
  #ifdef CONFIG_HOTPLUG_CPU
-
  /*
   * Ensures that the idle task is using init_mm right before its cpu goes
   * offline.
@@ -4993,9 +5091,9 @@ static struct task_struct fake_task = {
   * there's no concurrency possible, we hold the required locks anyway
   * because of lock validation efforts.
   */
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
  {
-       struct rq *rq = cpu_rq(dead_cpu);
+       struct rq *rq = dead_rq;
         struct task_struct *next, *stop = rq->stop;
         int dest_cpu;
  
@@ -5017,7 +5115,7 @@ static void migrate_tasks(unsigned int dead_cpu)
          */
         update_rq_clock(rq);
  
-       for ( ; ; ) {
+       for (;;) {
                 /*
                  * There's this thread running, bail when that's the only
                  * remaining thread.
@@ -5025,22 +5123,29 @@ static void migrate_tasks(unsigned int dead_cpu)
                 if (rq->nr_running == 1)
                         break;
  
+               /*
+                * Ensure rq->lock covers the entire task selection
+                * until the migration.
+                */
+               lockdep_pin_lock(&rq->lock);
                 next = pick_next_task(rq, &fake_task);
                 BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
  
                 /* Find suitable destination for @next, with force if needed. */
-               dest_cpu = select_fallback_rq(dead_cpu, next);
-               raw_spin_unlock(&rq->lock);
-
-               __migrate_task(next, dead_cpu, dest_cpu);
-
-               raw_spin_lock(&rq->lock);
+               dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+
+               lockdep_unpin_lock(&rq->lock);
+               rq = __migrate_task(rq, next, dest_cpu);
+               if (rq != dead_rq) {
+                       raw_spin_unlock(&rq->lock);
+                       rq = dead_rq;
+                       raw_spin_lock(&rq->lock);
+               }
         }
  
         rq->stop = stop;
  }
-
  #endif /* CONFIG_HOTPLUG_CPU */
  
  #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5219,7 +5324,7 @@ static void register_sched_domain_sysctl(void)
  static void unregister_sched_domain_sysctl(void)
  {
  }
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
  
  static void set_rq_online(struct rq *rq)
  {
@@ -5288,7 +5393,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                         set_rq_offline(rq);
                 }
-               migrate_tasks(cpu);
+               migrate_tasks(rq);
                 BUG_ON(rq->nr_running != 1); /* the migration thread */
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
                 break;
@@ -5314,7 +5419,7 @@ static struct notifier_block migration_notifier = {
         .priority = CPU_PRI_MIGRATION,
  };
  
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
  {
         int cpu = smp_processor_id();
         struct rq *rq = cpu_rq(cpu);
@@ -5366,9 +5471,6 @@ static int __init migration_init(void)
         return 0;
  }
  early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
  
  static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
  
@@ -6594,7 +6696,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
                         struct sched_group *sg;
                         struct sched_group_capacity *sgc;
  
-                       sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+                       sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
                                         GFP_KERNEL, cpu_to_node(j));
                         if (!sd)
                                 return -ENOMEM;
@@ -7032,6 +7134,9 @@ void __init sched_init_smp(void)
         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
         alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  
+       /* nohz_full won't take effect without isolating the cpus. */
+       tick_nohz_full_add_cpus_to(cpu_isolated_map);
+
         sched_init_numa();
  
         /*
@@ -7068,8 +7173,6 @@ void __init sched_init_smp(void)
  }
  #endif /* CONFIG_SMP */
  
-const_debug unsigned int sysctl_timer_migration = 1;
-
  int in_sched_functions(unsigned long addr)
  {
         return in_lock_functions(addr) ||
@@ -7199,7 +7302,7 @@ void __init sched_init(void)
                 rq->sd = NULL;
                 rq->rd = NULL;
                 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-               rq->post_schedule = 0;
+               rq->balance_callback = NULL;
                 rq->active_balance = 0;
                 rq->next_balance = jiffies;
                 rq->push_cpu = 0;
@@ -7329,32 +7432,12 @@ EXPORT_SYMBOL(___might_sleep);
  #endif
  
  #ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
  {
-       const struct sched_class *prev_class = p->sched_class;
+       struct task_struct *g, *p;
         struct sched_attr attr = {
                 .sched_policy = SCHED_NORMAL,
         };
-       int old_prio = p->prio;
-       int queued;
-
-       queued = task_on_rq_queued(p);
-       if (queued)
-               dequeue_task(rq, p, 0);
-       __setscheduler(rq, p, &attr, false);
-       if (queued) {
-               enqueue_task(rq, p, 0);
-               resched_curr(rq);
-       }
-
-       check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-       struct task_struct *g, *p;
-       unsigned long flags;
-       struct rq *rq;
  
         read_lock(&tasklist_lock);
         for_each_process_thread(g, p) {
@@ -7381,9 +7464,7 @@ void normalize_rt_tasks(void)
                         continue;
                 }
  
-               rq = task_rq_lock(p, &flags);
-               normalize_task(rq, p);
-               task_rq_unlock(rq, p, &flags);
+               __sched_setscheduler(p, &attr, false, false);
         }
         read_unlock(&tasklist_lock);
  }
@@ -7734,11 +7815,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
         return rt_runtime_us;
  }
  
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
  {
         u64 rt_runtime, rt_period;
  
-       rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+       rt_period = rt_period_us * NSEC_PER_USEC;
         rt_runtime = tg->rt_bandwidth.rt_runtime;
  
         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@ -8105,10 +8186,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
  
         __refill_cfs_bandwidth_runtime(cfs_b);
         /* restart the period timer (if active) to handle new period expiry */
-       if (runtime_enabled && cfs_b->timer_active) {
-               /* force a reprogram */
-               __start_cfs_bandwidth(cfs_b, true);
-       }
+       if (runtime_enabled)
+               start_cfs_bandwidth(cfs_b);
         raw_spin_unlock_irq(&cfs_b->lock);
  
         for_each_online_cpu(i) {