sched: Simplify the reacquire_kernel_lock() logic
diff --git a/kernel/sched.c b/kernel/sched.c
index edd5a54b95da66813b617c83fecb06fd6410b084..f37a9618fac38e6c23679efa51168f9800390886 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -307,52 +307,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
  */
 struct task_group init_task_group;
 
-/* return group to which a task belongs */
-static inline struct task_group *task_group(struct task_struct *p)
-{
-       struct task_group *tg;
-
-#ifdef CONFIG_CGROUP_SCHED
-       tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
-                               struct task_group, css);
-#else
-       tg = &init_task_group;
-#endif
-       return tg;
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
-       /*
-        * Strictly speaking this rcu_read_lock() is not needed since the
-        * task_group is tied to the cgroup, which in turn can never go away
-        * as long as there are tasks attached to it.
-        *
-        * However since task_group() uses task_subsys_state() which is an
-        * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
-        */
-       rcu_read_lock();
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
-       p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
-       p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
-       rcu_read_unlock();
-}
-
-#else
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
-       return NULL;
-}
-
 #endif /* CONFIG_CGROUP_SCHED */
 
 /* CFS-related fields in a runqueue */
@@ -645,6 +599,49 @@ static inline int cpu_of(struct rq *rq)
 #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
 #define raw_rq()               (&__raw_get_cpu_var(runqueues))
 
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this task belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification
+ * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * holds that lock for each task it moves into the cgroup. Therefore
+ * by holding that lock, we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+       struct cgroup_subsys_state *css;
+
+       css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+                       lockdep_is_held(&task_rq(p)->lock));
+       return container_of(css, struct task_group, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+       p->se.parent = task_group(p)->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
+       p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+       return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
 inline void update_rq_clock(struct rq *rq)
 {
        if (!rq->skip_clock_update)
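
A note on the hunk above: the re-added task_group() recovers the enclosing
struct task_group from its embedded cgroup_subsys_state ("css") member via
container_of(). The sketch below shows the same container_of() pattern in
plain userspace C; the struct and field names (css_like, my_group, weight)
are invented for illustration and are not part of this patch.

    #include <stddef.h>
    #include <stdio.h>

    /* Same idea as the kernel's container_of(): map a pointer to an
     * embedded member back to the structure that contains it. */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct css_like { int refcnt; };        /* stands in for cgroup_subsys_state */

    struct my_group {                       /* stands in for struct task_group */
            int weight;
            struct css_like css;            /* embedded member, like tg->css */
    };

    int main(void)
    {
            struct my_group g = { .weight = 1024 };
            struct css_like *css = &g.css;  /* what task_subsys_state() would hand back */

            /* recover the enclosing group from the embedded css member */
            struct my_group *tg = container_of(css, struct my_group, css);

            printf("weight = %d\n", tg->weight);    /* prints 1024 */
            return 0;
    }
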
@@ -1650,7 +1647,7 @@ static void update_shares(struct sched_domain *sd)
        if (root_task_group_empty())
                return;
 
-       now = cpu_clock(raw_smp_processor_id());
+       now = local_clock();
        elapsed = now - sd->last_update;
 
        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
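
The hunk above swaps the open-coded cpu_clock(raw_smp_processor_id()) for the
local_clock() helper; the surrounding code then rate-limits update_shares() by
comparing the elapsed nanoseconds against sysctl_sched_shares_ratelimit. Below
is a minimal userspace sketch of that elapsed/ratelimit pattern only; now_ns(),
should_update() and ratelimit_ns are made-up names, with clock_gettime()
standing in for the scheduler clock.

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    static uint64_t now_ns(void)            /* stands in for local_clock() */
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    }

    /* Only act when at least ratelimit_ns have passed since *last_update,
     * the same shape as the elapsed/ratelimit check above. */
    static int should_update(uint64_t *last_update, uint64_t ratelimit_ns)
    {
            uint64_t now = now_ns();

            if (now - *last_update < ratelimit_ns)
                    return 0;               /* too soon, skip this round */
            *last_update = now;             /* window expired: record and go */
            return 1;
    }

    int main(void)
    {
            uint64_t last = 0;              /* first check always fires */

            printf("update now? %d\n", should_update(&last, 250000ULL));
            return 0;
    }
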
@@ -3639,7 +3636,6 @@ need_resched:
        rq = cpu_rq(cpu);
        rcu_note_context_switch(cpu);
        prev = rq->curr;
-       switch_count = &prev->nivcsw;
 
        release_kernel_lock(prev);
 need_resched_nonpreemptible:
@@ -3652,6 +3648,7 @@ need_resched_nonpreemptible:
        raw_spin_lock_irq(&rq->lock);
        clear_tsk_need_resched(prev);
 
+       switch_count = &prev->nivcsw;
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
@@ -3692,8 +3689,10 @@ need_resched_nonpreemptible:
 
                context_switch(rq, prev, next); /* unlocks the rq */
                /*
-                * the context switch might have flipped the stack from under
-                * us, hence refresh the local variables.
+                * The context switch has flipped the stack from under us
+                * and restored the local variables which were saved when
+                * this task called schedule() in the past.  prev == current
+                * is still correct, but it may have been moved to another cpu/rq.
                 */
                cpu = smp_processor_id();
                rq = cpu_rq(cpu);
@@ -3702,11 +3701,8 @@ need_resched_nonpreemptible:
 
        post_schedule(rq);
 
-       if (unlikely(reacquire_kernel_lock(current) < 0)) {
-               prev = rq->curr;
-               switch_count = &prev->nivcsw;
+       if (unlikely(reacquire_kernel_lock(prev)))
                goto need_resched_nonpreemptible;
-       }
 
        preempt_enable_no_resched();
        if (need_resched())
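
The last two hunks are the commit's main point: switch_count is now picked up
after the need_resched_nonpreemptible label, and reacquire_kernel_lock() is
called with prev (which still equals current here), so the retry path no
longer has to reset prev and switch_count by hand. The fragment below is only
a control-flow sketch of that retry shape, built from userspace stubs; every
name in it is a placeholder, none of it is the real scheduler code.

    #include <stdio.h>

    static int bkl_retries;                 /* pretend state for the demo */

    static void release_big_lock(void)      { puts("release BKL"); }
    static void lock_runqueue(void)         { puts("lock rq"); }
    static void unlock_runqueue(void)       { puts("unlock rq"); }
    static void pick_next_and_switch(void)  { puts("switch tasks"); }

    /* Nonzero means "re-taking the big lock had to drop the rq lock,
     * redo the scheduling decision" - the case the goto handles above. */
    static int reacquire_big_lock(void)     { return bkl_retries-- > 0; }

    static void schedule_sketch(void)
    {
            release_big_lock();
    retry:                                  /* ~ need_resched_nonpreemptible */
            lock_runqueue();
            /* switch_count is (re)derived after this point on every pass,
             * so the retry below needs no prev/switch_count fixup. */
            pick_next_and_switch();
            unlock_runqueue();

            if (reacquire_big_lock())
                    goto retry;             /* prev == current still holds */
    }

    int main(void)
    {
            bkl_retries = 1;                /* force one retry for the demo */
            schedule_sketch();
            return 0;
    }
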
@@ -4529,16 +4525,6 @@ recheck:
        }
 
        if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
-               /*
-                * Do not allow realtime tasks into groups that have no runtime
-                * assigned.
-                */
-               if (rt_bandwidth_enabled() && rt_policy(policy) &&
-                               task_group(p)->rt_bandwidth.rt_runtime == 0)
-                       return -EPERM;
-#endif
-
                retval = security_task_setscheduler(p, policy, param);
                if (retval)
                        return retval;
@@ -4554,6 +4540,22 @@ recheck:
         * runqueue lock must be held.
         */
        rq = __task_rq_lock(p);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (user) {
+               /*
+                * Do not allow realtime tasks into groups that have no runtime
+                * assigned.
+                */
+               if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                               task_group(p)->rt_bandwidth.rt_runtime == 0) {
+                       __task_rq_unlock(rq);
+                       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+                       return -EPERM;
+               }
+       }
+#endif
+
        /* recheck policy now with rq lock held */
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
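
The RT-group runtime check moves from before the lock to after
__task_rq_lock(p): per the new task_group() comment earlier in this patch,
holding task_rq(p)->lock is what pins the task to its current cgroup, so
dereferencing task_group(p) is only stable once that lock is held, and the
failure path now drops the locks itself before returning -EPERM. The sketch
below shows the same check-under-the-lock shape in userspace C; struct
task_state, group_runtime and set_rt_policy() are invented names, not the
kernel API.

    #include <pthread.h>
    #include <stdio.h>

    struct task_state {
            pthread_mutex_t lock;           /* plays the role of the rq lock */
            int group_runtime;              /* plays the role of rt_runtime */
            int policy;
    };

    static int set_rt_policy(struct task_state *t, int new_policy)
    {
            pthread_mutex_lock(&t->lock);

            /* Validate while holding the lock that pins the state being
             * validated; a check done before taking the lock could race
             * with the group (and its runtime) changing underneath us. */
            if (t->group_runtime == 0) {
                    pthread_mutex_unlock(&t->lock);
                    return -1;              /* ~ the -EPERM path above */
            }

            t->policy = new_policy;
            pthread_mutex_unlock(&t->lock);
            return 0;
    }

    int main(void)
    {
            struct task_state t = {
                    .lock          = PTHREAD_MUTEX_INITIALIZER,
                    .group_runtime = 0,     /* no runtime assigned */
                    .policy        = 0,
            };

            printf("setscheduler: %d\n", set_rt_policy(&t, 1));  /* -1: refused */
            return 0;
    }
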