cpu hotplug: Make cpu_active_map synchronization dependency clear

[mv-sheeva.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 591d5e7f757ad7438e9696bd3735f4b098dfda92..c237624a8a042be35bfe49eeb17e6b04f7915e22 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,6 +70,7 @@
  #include <linux/bootmem.h>
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
+#include <linux/ftrace.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
@@ -645,6 +646,24 @@ static inline void update_rq_clock(struct rq *rq)
  # define const_debug static const
  #endif
  
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface lets printk, when called with the runqueue lock
+ * held, find out whether or not it is OK to wake up klogd.
+ */
+int runqueue_is_locked(void)
+{
+       int cpu = get_cpu();
+       struct rq *rq = cpu_rq(cpu);
+       int ret;
+
+       ret = spin_is_locked(&rq->lock);
+       put_cpu();
+       return ret;
+}
+
  /*
   * Debugging: various feature bits
   */
@@ -2318,6 +2337,9 @@ out_activate:
         success = 1;
  
  out_running:
+       trace_mark(kernel_sched_wakeup,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
         check_preempt_curr(rq, p);
  
         p->state = TASK_RUNNING;
@@ -2450,6 +2472,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 p->sched_class->task_new(rq, p);
                 inc_nr_running(rq);
         }
+       trace_mark(kernel_sched_wakeup_new,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
         check_preempt_curr(rq, p);
  #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
@@ -2622,6 +2647,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
         struct mm_struct *mm, *oldmm;
  
         prepare_task_switch(rq, prev, next);
+       trace_mark(kernel_sched_schedule,
+               "prev_pid %d next_pid %d prev_state %ld "
+               "## rq %p prev %p next %p",
+               prev->pid, next->pid, prev->state,
+               rq, prev, next);
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@ -2851,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
  
         rq = task_rq_lock(p, &flags);
         if (!cpu_isset(dest_cpu, p->cpus_allowed)
-           || unlikely(cpu_is_offline(dest_cpu)))
+           || unlikely(!cpu_active(dest_cpu)))
                 goto out;
  
         /* force the process onto the specified CPU */
@@ -3819,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick)
                 /*
                  * If we are going offline and still the leader, give up!
                  */
-               if (cpu_is_offline(cpu) &&
+               if (!cpu_active(cpu) &&
                     atomic_read(&nohz.load_balancer) == cpu) {
                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                 BUG();
@@ -4221,26 +4251,44 @@ void scheduler_tick(void)
  #endif
  }
  
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+       if (in_lock_functions(addr)) {
+               addr = CALLER_ADDR2;
+               if (in_lock_functions(addr))
+                       addr = CALLER_ADDR3;
+       }
+       return addr;
+}
  
  void __kprobes add_preempt_count(int val)
  {
+#ifdef CONFIG_DEBUG_PREEMPT
         /*
          * Underflow?
          */
         if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                 return;
+#endif
         preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
         /*
          * Spinlock count overflowing soon?
          */
         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                 PREEMPT_MASK - 10);
+#endif
+       if (preempt_count() == val)
+               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
  }
  EXPORT_SYMBOL(add_preempt_count);
  
  void __kprobes sub_preempt_count(int val)
  {
+#ifdef CONFIG_DEBUG_PREEMPT
         /*
          * Underflow?
          */
@@ -4252,7 +4300,10 @@ void __kprobes sub_preempt_count(int val)
         if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
                         !(preempt_count() & PREEMPT_MASK)))
                 return;
+#endif
  
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
         preempt_count() -= val;
  }
  EXPORT_SYMBOL(sub_preempt_count);
@@ -4944,16 +4995,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
         set_load_weight(p);
  }
  
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
-                      struct sched_param *param)
+static int __sched_setscheduler(struct task_struct *p, int policy,
+                               struct sched_param *param, bool user)
  {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         unsigned long flags;
@@ -4985,7 +5028,7 @@ recheck:
         /*
          * Allow unprivileged RT tasks to decrease priority:
          */
-       if (!capable(CAP_SYS_NICE)) {
+       if (user && !capable(CAP_SYS_NICE)) {
                 if (rt_policy(policy)) {
                         unsigned long rlim_rtprio;
  
@@ -5021,7 +5064,8 @@ recheck:
          * Do not allow realtime tasks into groups that have no runtime
          * assigned.
          */
-       if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+       if (user
+           && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
                 return -EPERM;
  #endif
  
@@ -5070,8 +5114,39 @@ recheck:
  
         return 0;
  }
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+                      struct sched_param *param)
+{
+       return __sched_setscheduler(p, policy, param, true);
+}
  EXPORT_SYMBOL_GPL(sched_setscheduler);
  
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+                              struct sched_param *param)
+{
+       return __sched_setscheduler(p, policy, param, false);
+}
+
  static int
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  {
@@ -5566,7 +5641,7 @@ out_unlock:
         return retval;
  }
  
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
  
  void sched_show_task(struct task_struct *p)
  {
@@ -5801,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         struct rq *rq_dest, *rq_src;
         int ret = 0, on_rq;
  
-       if (unlikely(cpu_is_offline(dest_cpu)))
+       if (unlikely(!cpu_active(dest_cpu)))
                 return ret;
  
         rq_src = cpu_rq(src_cpu);
@@ -6693,7 +6768,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
  /* Setup the mask of cpus configured for isolated domains */
  static int __init isolated_cpu_setup(char *str)
  {
-       int ints[NR_CPUS], i;
+       static int __initdata ints[NR_CPUS];
+       int i;
  
         str = get_options(str, ARRAY_SIZE(ints), ints);
         cpus_clear(cpu_isolated_map);
@@ -7477,18 +7553,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
  {
  }
  
-/*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-       ndoms_cur = 0;
-       if (doms_cur != &fallback_doms)
-               kfree(doms_cur);
-       doms_cur = &fallback_doms;
-}
-
  /*
   * Set up scheduler domains and groups. Callers must hold the hotplug lock.
   * For now this just excludes isolated cpus, but could be used to
@@ -7567,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
   * ownership of it and will kfree it when done with it. If the caller
   * failed the kmalloc call, then it can pass in doms_new == NULL,
   * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms'; this also forces the domains to be rebuilt.
   *
   * Call with hotplug lock held
   */
@@ -7581,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
         /* always unregister in case we don't destroy any domains */
         unregister_sched_domain_sysctl();
  
-       if (doms_new == NULL) {
-               ndoms_new = 1;
-               doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-               dattr_new = NULL;
-       }
+       if (doms_new == NULL)
+               ndoms_new = 0;
  
         /* Destroy deleted domains */
         for (i = 0; i < ndoms_cur; i++) {
@@ -7601,6 +7661,14 @@ match1:
                 ;
         }
  
+       if (doms_new == NULL) {
+               ndoms_cur = 0;
+               ndoms_new = 1;
+               doms_new = &fallback_doms;
+               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+               dattr_new = NULL;
+       }
+
         /* Build new domains */
         for (i = 0; i < ndoms_new; i++) {
                 for (j = 0; j < ndoms_cur; j++) {
@@ -7631,17 +7699,10 @@ match2:
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  int arch_reinit_sched_domains(void)
  {
-       int err;
-
         get_online_cpus();
-       mutex_lock(&sched_domains_mutex);
-       detach_destroy_domains(&cpu_online_map);
-       free_sched_domains();
-       err = arch_init_sched_domains(&cpu_online_map);
-       mutex_unlock(&sched_domains_mutex);
+       rebuild_sched_domains();
         put_online_cpus();
-
-       return err;
+       return 0;
  }
  
  static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7707,14 +7768,30 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
  }
  #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
+#ifndef CONFIG_CPUSETS
  /*
- * Force a reinitialization of the sched domains hierarchy. The domains
- * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
+ * Add online CPUs to, and remove offline CPUs from, the scheduler domains.
+ * When cpusets are enabled they take over this function.
   */
  static int update_sched_domains(struct notifier_block *nfb,
                                 unsigned long action, void *hcpu)
+{
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               partition_sched_domains(0, NULL, NULL);
+               return NOTIFY_OK;
+
+       default:
+               return NOTIFY_DONE;
+       }
+}
+#endif
+
+static int update_runtime(struct notifier_block *nfb,
+                               unsigned long action, void *hcpu)
  {
         int cpu = (int)(long)hcpu;
  
@@ -7722,44 +7799,18 @@ static int update_sched_domains(struct notifier_block *nfb,
         case CPU_DOWN_PREPARE:
         case CPU_DOWN_PREPARE_FROZEN:
                 disable_runtime(cpu_rq(cpu));
-               /* fall-through */
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               detach_destroy_domains(&cpu_online_map);
-               free_sched_domains();
                 return NOTIFY_OK;
  
-
         case CPU_DOWN_FAILED:
         case CPU_DOWN_FAILED_FROZEN:
         case CPU_ONLINE:
         case CPU_ONLINE_FROZEN:
                 enable_runtime(cpu_rq(cpu));
-               /* fall-through */
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               /*
-                * Fall through and re-initialise the domains.
-                */
-               break;
+               return NOTIFY_OK;
+
         default:
                 return NOTIFY_DONE;
         }
-
-#ifndef CONFIG_CPUSETS
-       /*
-        * Create default domain partitioning if cpusets are disabled.
-        * Otherwise we let cpusets rebuild the domains based on the
-        * current setup.
-        */
-
-       /* The hotplug lock is already held by cpu_up/cpu_down */
-       arch_init_sched_domains(&cpu_online_map);
-#endif
-
-       return NOTIFY_OK;
  }
  
  void __init sched_init_smp(void)
@@ -7779,8 +7830,15 @@ void __init sched_init_smp(void)
                 cpu_set(smp_processor_id(), non_isolated_cpus);
         mutex_unlock(&sched_domains_mutex);
         put_online_cpus();
+
+#ifndef CONFIG_CPUSETS
         /* XXX: Theoretical race here - CPU may be hotplugged now */
         hotcpu_notifier(update_sched_domains, 0);
+#endif
+
+       /* RT runtime code needs to handle some hotplug events */
+       hotcpu_notifier(update_runtime, 0);
+
         init_hrtick();
  
         /* Move init over to a non-isolated CPU */
@@ -8077,7 +8135,7 @@ void __init sched_init(void)
  #endif
  
  #ifdef CONFIG_SMP
-       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
  #endif
  
  #ifdef CONFIG_RT_MUTEXES