Merge branch 'rcu/next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck...

[karo-tx-linux.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index da14302a9857124c713bbbdf8841961c033afb94..e6f8f1254319b730933764fddab98e7804fd59ce 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -636,23 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)
  
  #endif /* CONFIG_CGROUP_SCHED */
  
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
  
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
  {
-       int cpu = cpu_of(rq);
-       u64 irq_time;
+       s64 delta;
  
         if (rq->skip_clock_update)
                 return;
  
-       rq->clock = sched_clock_cpu(cpu);
-       irq_time = irq_time_cpu(cpu);
-       if (rq->clock - irq_time > rq->clock_task)
-               rq->clock_task = rq->clock - irq_time;
-
-       sched_irq_time_avg_update(rq, irq_time);
+       delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+       rq->clock += delta;
+       update_rq_clock_task(rq, delta);
  }
  
  /*
@@ -1925,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
   * They are read and saved off onto struct rq in update_rq_clock().
   * This may result in other CPU reading this CPU's irq time and can
   * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
   */
  static DEFINE_PER_CPU(u64, cpu_hardirq_time);
  static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1946,19 +1940,58 @@ void disable_sched_clock_irqtime(void)
         sched_clock_irqtime = 0;
  }
  
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
  {
-       if (!sched_clock_irqtime)
-               return 0;
+       __this_cpu_inc(irq_time_seq.sequence);
+       smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+       smp_wmb();
+       __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+       u64 irq_time;
+       unsigned seq;
+
+       do {
+               seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+               irq_time = per_cpu(cpu_softirq_time, cpu) +
+                          per_cpu(cpu_hardirq_time, cpu);
+       } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
  
+       return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
         return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
  }
+#endif /* CONFIG_64BIT */
  
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
  void account_system_vtime(struct task_struct *curr)
  {
         unsigned long flags;
+       s64 delta;
         int cpu;
-       u64 now, delta;
  
         if (!sched_clock_irqtime)
                 return;
@@ -1966,9 +1999,10 @@ void account_system_vtime(struct task_struct *curr)
         local_irq_save(flags);
  
         cpu = smp_processor_id();
-       now = sched_clock_cpu(cpu);
-       delta = now - per_cpu(irq_start_time, cpu);
-       per_cpu(irq_start_time, cpu) = now;
+       delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+       __this_cpu_add(irq_start_time, delta);
+
+       irq_time_write_begin();
         /*
          * We do not account for softirq time from ksoftirqd here.
          * We want to continue accounting softirq time to ksoftirqd thread
@@ -1976,33 +2010,55 @@ void account_system_vtime(struct task_struct *curr)
          * that do not consume any time, but still wants to run.
          */
         if (hardirq_count())
-               per_cpu(cpu_hardirq_time, cpu) += delta;
+               __this_cpu_add(cpu_hardirq_time, delta);
         else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
-               per_cpu(cpu_softirq_time, cpu) += delta;
+               __this_cpu_add(cpu_softirq_time, delta);
  
+       irq_time_write_end();
         local_irq_restore(flags);
  }
  EXPORT_SYMBOL_GPL(account_system_vtime);
  
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
  {
-       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
-               u64 delta_irq = curr_irq_time - rq->prev_irq_time;
-               rq->prev_irq_time = curr_irq_time;
-               sched_rt_avg_update(rq, delta_irq);
-       }
+       s64 irq_delta;
+
+       irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+       /*
+        * Since irq_time is only updated on {soft,}irq_exit, we might run into
+        * this case when a previous update_rq_clock() happened inside a
+        * {soft,}irq region.
+        *
+        * When this happens, we stop ->clock_task and only update the
+        * prev_irq_time stamp to account for the part that fit, so that a next
+        * update will consume the rest. This ensures ->clock_task is
+        * monotonic.
+        *
+        * It does however cause some slight miss-attribution of {soft,}irq
+        * time, a more accurate solution would be to update the irq_time using
+        * the current rq->clock timestamp, except that would require using
+        * atomic ops.
+        */
+       if (irq_delta > delta)
+               irq_delta = delta;
+
+       rq->prev_irq_time += irq_delta;
+       delta -= irq_delta;
+       rq->clock_task += delta;
+
+       if (irq_delta && sched_feat(NONIRQ_POWER))
+               sched_rt_avg_update(rq, irq_delta);
  }
  
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
  
-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
  {
-       return 0;
+       rq->clock_task += delta;
  }
  
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
-
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
  
  #include "sched_idletask.c"
  #include "sched_fair.c"
@@ -4072,7 +4128,6 @@ need_resched_nonpreemptible:
                 rq->nr_switches++;
                 rq->curr = next;
                 ++*switch_count;
-               WARN_ON_ONCE(test_tsk_need_resched(next));
  
                 context_switch(rq, prev, next); /* unlocks the rq */
                 /*
@@ -9479,72 +9534,3 @@ struct cgroup_subsys cpuacct_subsys = {
  };
  #endif /* CONFIG_CGROUP_CPUACCT */
  
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
-       barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
-       /*
-        * There must be a full memory barrier on each affected CPU
-        * between the time that try_stop_cpus() is called and the
-        * time that it returns.
-        *
-        * In the current initial implementation of cpu_stop, the
-        * above condition is already met when the control reaches
-        * this point and the following smp_mb() is not strictly
-        * necessary.  Do smp_mb() anyway for documentation and
-        * robustness against future implementation changes.
-        */
-       smp_mb(); /* See above comment block. */
-       return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly.  This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier.  Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
-       int snap, trycount = 0;
-
-       smp_mb();  /* ensure prior mod happens before capturing snap. */
-       snap = atomic_read(&synchronize_sched_expedited_count) + 1;
-       get_online_cpus();
-       while (try_stop_cpus(cpu_online_mask,
-                            synchronize_sched_expedited_cpu_stop,
-                            NULL) == -EAGAIN) {
-               put_online_cpus();
-               if (trycount++ < 10)
-                       udelay(trycount * num_online_cpus());
-               else {
-                       synchronize_sched();
-                       return;
-               }
-               if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
-                       smp_mb(); /* ensure test happens before caller kfree */
-                       return;
-               }
-               get_online_cpus();
-       }
-       atomic_inc(&synchronize_sched_expedited_count);
-       smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
-       put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */