Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[linux-beck.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 9b1f2e533b95cf2532ffcadfc62476b18d9e27f5..05c39f030314698730349de8eb9346c1437d9aba 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void)
  #ifdef CONFIG_SCHED_HRTICK
  /*
   * Use HR-timers to deliver accurate preemption points.
- *
- * Its all a bit involved since we cannot program an hrt while holding the
- * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
- * reschedule event.
- *
- * When we get rescheduled we reprogram the hrtick_timer outside of the
- * rq->lock.
   */
  
  static void hrtick_clear(struct rq *rq)
@@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
  }
  
  #ifdef CONFIG_SMP
+/* Restart rq->hrtick_timer at its stored soft-expiry time (ABS_PINNED mode). */
+static int __hrtick_restart(struct rq *rq)
+{
+       struct hrtimer *timer = &rq->hrtick_timer;
+       ktime_t time = hrtimer_get_softexpires(timer);
+
+       return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+}
+
  /*
   * called from hardirq (IPI) context
   */
@@ -412,7 +414,7 @@ static void __hrtick_start(void *arg)
         struct rq *rq = arg;
  
         raw_spin_lock(&rq->lock);
-       hrtimer_restart(&rq->hrtick_timer);
+       __hrtick_restart(rq);
         rq->hrtick_csd_pending = 0;
         raw_spin_unlock(&rq->lock);
  }
@@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay)
         hrtimer_set_expires(timer, time);
  
         if (rq == this_rq()) {
-               hrtimer_restart(timer);
+               __hrtick_restart(rq);
         } else if (!rq->hrtick_csd_pending) {
                 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                 rq->hrtick_csd_pending = 1;
@@ -931,6 +933,8 @@ static int effective_prio(struct task_struct *p)
  /**
   * task_curr - is this task currently executing on a CPU?
   * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
   */
  inline int task_curr(const struct task_struct *p)
  {
@@ -1480,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
   * the simpler "current->state = TASK_RUNNING" to mark yourself
   * runnable without the overhead of this.
   *
- * Returns %true if @p was woken up, %false if it was already running
+ * Return: %true if @p was woken up, %false if it was already running
   * or @state didn't match @p's state.
   */
  static int
@@ -1489,7 +1493,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         unsigned long flags;
         int cpu, success = 0;
  
-       smp_wmb();
+       /*
+        * If we are going to wake up a thread waiting for CONDITION we
+        * need to ensure that CONDITION=1 done by the caller can not be
+        * reordered with p->state check below. This pairs with mb() in
+        * set_current_state() the waiting thread does.
+        */
+       smp_mb__before_spinlock();
         raw_spin_lock_irqsave(&p->pi_lock, flags);
         if (!(p->state & state))
                 goto out;
@@ -1575,8 +1585,9 @@ out:
   * @p: The process to be woken up.
   *
   * Attempt to wake up the nominated process and move it to the set of runnable
- * processes.  Returns 1 if the process was woken up, 0 if it was already
- * running.
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
   *
   * It may be assumed that this function implies a write memory barrier before
   * changing the task state if and only if any tasks are woken up.
@@ -2189,6 +2200,8 @@ void scheduler_tick(void)
   * This makes sure that uptime, CFS vruntime, load
   * balancing, etc... continue to move forward, even
   * with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
   */
  u64 scheduler_tick_max_deferment(void)
  {
@@ -2392,6 +2405,12 @@ need_resched:
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
  
+       /*
+        * Make sure that signal_pending_state()->signal_pending() below
+        * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+        * done by the caller to avoid the race with signal_wake_up().
+        */
+       smp_mb__before_spinlock();
         raw_spin_lock_irq(&rq->lock);
  
         switch_count = &prev->nivcsw;
@@ -2794,8 +2813,8 @@ EXPORT_SYMBOL(wait_for_completion);
   * specified timeout to expire. The timeout is in jiffies. It is not
   * interruptible.
   *
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
   */
  unsigned long __sched
  wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -2827,8 +2846,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
   * specified timeout to expire. The timeout is in jiffies. It is not
   * interruptible. The caller is accounted as waiting for IO.
   *
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
+ * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
+ * till timeout) if completed.
   */
  unsigned long __sched
  wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
@@ -2844,7 +2863,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
   * This waits for completion of a specific task to be signaled. It is
   * interruptible.
   *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
   */
  int __sched wait_for_completion_interruptible(struct completion *x)
  {
@@ -2863,8 +2882,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
   * This waits for either a completion of a specific task to be signaled or for a
   * specified timeout to expire. It is interruptible. The timeout is in jiffies.
   *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
   */
  long __sched
  wait_for_completion_interruptible_timeout(struct completion *x,
@@ -2881,7 +2900,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
   * This waits to be signaled for completion of a specific task. It can be
   * interrupted by a kill signal.
   *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if completed.
   */
  int __sched wait_for_completion_killable(struct completion *x)
  {
@@ -2901,8 +2920,8 @@ EXPORT_SYMBOL(wait_for_completion_killable);
   * signaled or for a specified timeout to expire. It can be
   * interrupted by a kill signal. The timeout is in jiffies.
   *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
+ * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
+ * or number of jiffies left till timeout) if completed.
   */
  long __sched
  wait_for_completion_killable_timeout(struct completion *x,
@@ -2916,7 +2935,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
   *     try_wait_for_completion - try to decrement a completion without blocking
   *     @x:     completion structure
   *
- *     Returns: 0 if a decrement cannot be done without blocking
+ *     Return: 0 if a decrement cannot be done without blocking
   *              1 if a decrement succeeded.
   *
   *     If a completion is being used as a counting completion,
@@ -2943,7 +2962,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
   *     completion_done - Test to see if a completion has any waiters
   *     @x:     completion structure
   *
- *     Returns: 0 if there are waiters (wait_for_completion() in progress)
+ *     Return: 0 if there are waiters (wait_for_completion() in progress)
   *              1 if there are no waiters.
   *
   */
@@ -3180,7 +3199,7 @@ SYSCALL_DEFINE1(nice, int, increment)
   * task_prio - return the priority value of a given task.
   * @p: the task in question.
   *
- * This is the priority value as seen by users in /proc.
+ * Return: The priority value as seen by users in /proc.
   * RT tasks are offset by -200. Normal tasks are centered
   * around 0, value goes from -16 to +15.
   */
@@ -3192,6 +3211,8 @@ int task_prio(const struct task_struct *p)
  /**
   * task_nice - return the nice value of a given task.
   * @p: the task in question.
+ *
+ * Return: The nice value [ -20 ... 0 ... 19 ].
   */
  int task_nice(const struct task_struct *p)
  {
@@ -3202,6 +3223,8 @@ EXPORT_SYMBOL(task_nice);
  /**
   * idle_cpu - is a given cpu idle currently?
   * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
   */
  int idle_cpu(int cpu)
  {
@@ -3224,6 +3247,8 @@ int idle_cpu(int cpu)
  /**
   * idle_task - return the idle task for a given cpu.
   * @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
   */
  struct task_struct *idle_task(int cpu)
  {
@@ -3233,6 +3258,8 @@ struct task_struct *idle_task(int cpu)
  /**
   * find_process_by_pid - find a process with a matching PID value.
   * @pid: the pid in question.
+ *
+ * Return: The task of @pid, if found. %NULL otherwise.
   */
  static struct task_struct *find_process_by_pid(pid_t pid)
  {
@@ -3430,6 +3457,8 @@ recheck:
   * @policy: new policy.
   * @param: structure containing the new RT priority.
   *
+ * Return: 0 on success. An error code otherwise.
+ *
   * NOTE that the task may be already dead.
   */
  int sched_setscheduler(struct task_struct *p, int policy,
@@ -3449,6 +3478,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
   * current context has permission.  For example, this is needed in
   * stop_machine(): we create temporary high priority worker threads,
   * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
   */
  int sched_setscheduler_nocheck(struct task_struct *p, int policy,
                                const struct sched_param *param)
@@ -3483,6 +3514,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
   * @pid: the pid in question.
   * @policy: new policy.
   * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
   */
  SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
                 struct sched_param __user *, param)
@@ -3498,6 +3531,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
   * sys_sched_setparam - set/change the RT priority of a thread
   * @pid: the pid in question.
   * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
   */
  SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
  {
@@ -3507,6 +3542,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
  /**
   * sys_sched_getscheduler - get the policy (scheduling class) of a thread
   * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
   */
  SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
  {
@@ -3533,6 +3571,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
   * sys_sched_getparam - get the RT priority of a thread
   * @pid: the pid in question.
   * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
   */
  SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
  {
@@ -3657,6 +3698,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
   * @pid: pid of the process
   * @len: length in bytes of the bitmask pointed to by user_mask_ptr
   * @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
   */
  SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
                 unsigned long __user *, user_mask_ptr)
@@ -3708,6 +3751,8 @@ out_unlock:
   * @pid: pid of the process
   * @len: length in bytes of the bitmask pointed to by user_mask_ptr
   * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
   */
  SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
                 unsigned long __user *, user_mask_ptr)
@@ -3742,6 +3787,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
   *
   * This function yields the current CPU to other tasks. If there are no
   * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
   */
  SYSCALL_DEFINE0(sched_yield)
  {
@@ -3867,7 +3914,7 @@ EXPORT_SYMBOL(yield);
   * It's the caller's job to ensure that the target task struct
   * can't go away on us before we can do any checks.
   *
- * Returns:
+ * Return:
   *     true (>0) if we indeed boosted the target task.
   *     false (0) if we failed to boost the target.
   *     -ESRCH if there's no task to yield to.
@@ -3970,8 +4017,9 @@ long __sched io_schedule_timeout(long timeout)
   * sys_sched_get_priority_max - return maximum RT priority.
   * @policy: scheduling class.
   *
- * this syscall returns the maximum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
   */
  SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
  {
@@ -3995,8 +4043,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
   * sys_sched_get_priority_min - return minimum RT priority.
   * @policy: scheduling class.
   *
- * this syscall returns the minimum rt_priority that can be used
- * by a given scheduling class.
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
   */
  SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  {
@@ -4022,6 +4071,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
   *
   * this syscall writes the default timeslice value of a given process
   * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
   */
  SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
                 struct timespec __user *, interval)
@@ -4131,7 +4183,7 @@ void show_state_filter(unsigned long state_filter)
                 debug_show_all_locks();
  }
  
-void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+void init_idle_bootup_task(struct task_struct *idle)
  {
         idle->sched_class = &idle_sched_class;
  }
@@ -4144,7 +4196,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
   * NOTE: this function does not set the idle thread's NEED_RESCHED
   * flag, to make booting more robust.
   */
-void __cpuinit init_idle(struct task_struct *idle, int cpu)
+void init_idle(struct task_struct *idle, int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
@@ -4628,7 +4680,7 @@ static void set_rq_offline(struct rq *rq)
   * migration_call - callback that gets triggered when a CPU is added.
   * Here we can start up the necessary migration thread for the new CPU.
   */
-static int __cpuinit
+static int
  migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  {
         int cpu = (long)hcpu;
@@ -4682,12 +4734,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
   * happens before everything else.  This has to be lower priority than
   * the notifier in the perf_event subsystem, though.
   */
-static struct notifier_block __cpuinitdata migration_notifier = {
+static struct notifier_block migration_notifier = {
         .notifier_call = migration_call,
         .priority = CPU_PRI_MIGRATION,
  };
  
-static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+static int sched_cpu_active(struct notifier_block *nfb,
                                       unsigned long action, void *hcpu)
  {
         switch (action & ~CPU_TASKS_FROZEN) {
@@ -4700,7 +4752,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
         }
  }
  
-static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+static int sched_cpu_inactive(struct notifier_block *nfb,
                                         unsigned long action, void *hcpu)
  {
         switch (action & ~CPU_TASKS_FROZEN) {
@@ -6630,6 +6682,8 @@ void normalize_rt_tasks(void)
   * @cpu: the processor in question.
   *
   * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
   */
  struct task_struct *curr_task(int cpu)
  {