Merge commit 'v2.6.31-rc8' into sched/core

author Ingo Molnar <mingo@elte.hu>

Wed, 2 Sep 2009 06:20:32 +0000 (08:20 +0200)

committer Ingo Molnar <mingo@elte.hu>

Wed, 2 Sep 2009 06:20:35 +0000 (08:20 +0200)
author Ingo Molnar <mingo@elte.hu>
Wed, 2 Sep 2009 06:20:32 +0000 (08:20 +0200)
committer Ingo Molnar <mingo@elte.hu>
Wed, 2 Sep 2009 06:20:35 +0000 (08:20 +0200)
diff --git a/fs/dcache.c b/fs/dcache.c

index 9e5cd3c3a6ba76e8bc95aec9f4dc6c250fb7b2eb..a100fa35a48f3702fbe5522e2636501dc8eb93f3 100644 (file)
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
  #include <linux/swap.h>
  #include <linux/bootmem.h>
  #include <linux/fs_struct.h>
+#include <linux/hardirq.h>
  #include "internal.h"
  
  int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/locks.c b/fs/locks.c

index b6440f52178fad125f3d0101d7a97f82ad2cf3ee..2eb81975c99c7ee2c247cf6c7aa11e348bb1e3fc 100644 (file)
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
          * give it the opportunity to lock the file.
          */
         if (found)
-               cond_resched_bkl();
+               cond_resched();
  
  find_conflict:
         for_each_lock(inode, before) {
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h

index 8246c697863d72cec6bae27889d0149a88ab830f..0d885fd75111048f1e8359b1fd49572dd385ba5e 100644 (file)
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,12 @@
  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
  #define NMI_OFFSET     (1UL << NMI_SHIFT)
  
+#ifndef PREEMPT_ACTIVE /* fallback definition, used only if nothing defined it earlier */
+#define PREEMPT_ACTIVE_BITS    1
+#define PREEMPT_ACTIVE_SHIFT   (NMI_SHIFT + NMI_BITS) /* first bit above the NMI field */
+#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
+#endif /* !PREEMPT_ACTIVE */
+
  #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS))
  #error PREEMPT_ACTIVE is too low!
  #endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h

index d6320a3e8def0fc22a9595d8ff03f52c08983404..2b5b1e0899a81e93a4970e8f01924958423c968c 100644 (file)
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -125,7 +125,7 @@ extern int _cond_resched(void);
  #endif
  
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-  void __might_sleep(char *file, int line);
+  void __might_sleep(char *file, int line, int preempt_offset);
  /**
   * might_sleep - annotation for functions that can sleep
   *
@@ -137,8 +137,9 @@ extern int _cond_resched(void);
   * supposed to.
   */
  # define might_sleep() \
-       do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
+       do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
  #else
+  static inline void __might_sleep(char *file, int line, int preempt_offset) { }
  # define might_sleep() do { might_resched(); } while (0)
  #endif
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 0f1ea4a6695763debe7a6e87bf586c36efcc0c76..e209ae0e7a8a6238701abdf0a972fe8fb5ade3c9 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -38,6 +38,8 @@
  #define SCHED_BATCH            3
  /* SCHED_ISO: reserved but not implemented yet */
  #define SCHED_IDLE             5
+/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
+#define SCHED_RESET_ON_FORK     0x40000000
  
  #ifdef __KERNEL__
  
@@ -1045,7 +1047,6 @@ struct sched_class {
                               struct rq *busiest, struct sched_domain *sd,
                               enum cpu_idle_type idle);
         void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
-       int (*needs_post_schedule) (struct rq *this_rq);
         void (*post_schedule) (struct rq *this_rq);
         void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
@@ -1230,11 +1231,17 @@ struct task_struct {
         unsigned did_exec:1;
         unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
                                  * execve */
+
+       /* Revert to default priority/policy when forking */
+       unsigned sched_reset_on_fork:1;
+
         pid_t pid;
         pid_t tgid;
  
+#ifdef CONFIG_CC_STACKPROTECTOR
         /* Canary value for the -fstack-protector gcc feature */
         unsigned long stack_canary;
+#endif
  
         /* 
          * pointers to (original) parent process, youngest child, younger sibling,
@@ -2281,23 +2288,31 @@ static inline int need_resched(void)
   * cond_resched_softirq() will enable bhs before scheduling.
   */
  extern int _cond_resched(void);
-#ifdef CONFIG_PREEMPT_BKL
-static inline int cond_resched(void)
-{
-       return 0;
-}
+/* cond_resched(): run the __might_sleep() debug check with a preempt offset
+ * of 0, then call _cond_resched(). The statement expression evaluates to
+ * the value returned by _cond_resched(). */
+#define cond_resched() ({                      \
+       __might_sleep(__FILE__, __LINE__, 0);   \
+       _cond_resched();                        \
+})
+
+extern int __cond_resched_lock(spinlock_t *lock);
+
+#ifdef CONFIG_PREEMPT
+#define PREEMPT_LOCK_OFFSET    PREEMPT_OFFSET
  #else
-static inline int cond_resched(void)
-{
-       return _cond_resched();
-}
+#define PREEMPT_LOCK_OFFSET    0
  #endif
-extern int cond_resched_lock(spinlock_t * lock);
-extern int cond_resched_softirq(void);
-static inline int cond_resched_bkl(void)
-{
-       return _cond_resched();
-}
+/* cond_resched_lock(lock): run the __might_sleep() debug check, passing
+ * PREEMPT_LOCK_OFFSET as the preempt offset, then call
+ * __cond_resched_lock(lock) and evaluate to its return value. */
+#define cond_resched_lock(lock) ({                             \
+       __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
+       __cond_resched_lock(lock);                              \
+})
+
+extern int __cond_resched_softirq(void);
+/* cond_resched_softirq(): run the __might_sleep() debug check, passing
+ * SOFTIRQ_OFFSET as the preempt offset, then call __cond_resched_softirq()
+ * and evaluate to its return value. */
+#define cond_resched_softirq() ({                              \
+       __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET);      \
+       __cond_resched_softirq();                               \
+})
  
  /*
   * Does a critical section need to be broken due to another
diff --git a/kernel/sched.c b/kernel/sched.c

index 1b59e265273b032d6aac2baec9b059646ca563bf..6244d24cafc1966b91df750944ce5a32b63e28bf 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,8 +309,8 @@ void set_tg_uid(struct user_struct *user)
  
  /*
   * Root task group.
- *     Every UID task group (including init_task_group aka UID-0) will
- *     be a child to this group.
+ *     Every UID task group (including init_task_group aka UID-0) will
+ *     be a child to this group.
   */
  struct task_group root_task_group;
  
@@ -318,7 +318,7 @@ struct task_group root_task_group;
  /* Default task group's sched entity on each cpu */
  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
  /* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -616,6 +616,7 @@ struct rq {
  
         unsigned char idle_at_tick;
         /* For active balancing */
+       int post_schedule;
         int active_balance;
         int push_cpu;
         /* cpu of this runqueue: */
@@ -693,6 +694,7 @@ static inline int cpu_of(struct rq *rq)
  #define this_rq()              (&__get_cpu_var(runqueues))
  #define task_rq(p)             cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
+#define raw_rq()               (&__raw_get_cpu_var(runqueues))
  
  inline void update_rq_clock(struct rq *rq)
  {
@@ -1513,28 +1515,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
+struct update_shares_data {
+       unsigned long rq_weight[NR_CPUS];       /* indexed by cpu; written in tg_shares_up(), read in update_group_shares_cpu() */
+};
+
+static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+
  static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  
  /*
   * Calculate and set the cpu's group shares.
   */
-static void
-update_group_shares_cpu(struct task_group *tg, int cpu,
-                       unsigned long sd_shares, unsigned long sd_rq_weight)
+static void update_group_shares_cpu(struct task_group *tg, int cpu,
+                                   unsigned long sd_shares,
+                                   unsigned long sd_rq_weight,
+                                   struct update_shares_data *usd)
  {
-       unsigned long shares;
-       unsigned long rq_weight;
-
-       if (!tg->se[cpu])
-               return;
+       unsigned long shares, rq_weight;
+       int boost = 0;
  
-       rq_weight = tg->cfs_rq[cpu]->rq_weight;
+       rq_weight = usd->rq_weight[cpu];
+       if (!rq_weight) {
+               boost = 1;
+               rq_weight = NICE_0_LOAD;
+       }
  
         /*
-        *           \Sum shares * rq_weight
-        * shares =  -----------------------
-        *               \Sum rq_weight
-        *
+        *             \Sum_j shares_j * rq_weight_i
+        * shares_i =  -----------------------------
+        *                  \Sum_j rq_weight_j
          */
         shares = (sd_shares * rq_weight) / sd_rq_weight;
         shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1554,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                 unsigned long flags;
  
                 spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->shares = shares;
-
+               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
                 __set_se_shares(tg->se[cpu], shares);
                 spin_unlock_irqrestore(&rq->lock, flags);
         }
@@ -1559,22 +1568,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
   */
  static int tg_shares_up(struct task_group *tg, void *data)
  {
-       unsigned long weight, rq_weight = 0;
-       unsigned long shares = 0;
+       unsigned long weight, rq_weight = 0, shares = 0;
+       struct update_shares_data *usd;
         struct sched_domain *sd = data;
+       unsigned long flags;
         int i;
  
+       if (!tg->se[0])
+               return 0;
+
+       local_irq_save(flags);
+       usd = &__get_cpu_var(update_shares_data);
+
         for_each_cpu(i, sched_domain_span(sd)) {
+               weight = tg->cfs_rq[i]->load.weight;
+               usd->rq_weight[i] = weight;
+
                 /*
                  * If there are currently no tasks on the cpu pretend there
                  * is one of average load so that when a new task gets to
                  * run here it will not get delayed by group starvation.
                  */
-               weight = tg->cfs_rq[i]->load.weight;
                 if (!weight)
                         weight = NICE_0_LOAD;
  
-               tg->cfs_rq[i]->rq_weight = weight;
                 rq_weight += weight;
                 shares += tg->cfs_rq[i]->shares;
         }
@@ -1586,7 +1603,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
                 shares = tg->shares;
  
         for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight);
+               update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+
+       local_irq_restore(flags);
  
         return 0;
  }
@@ -1616,8 +1635,14 @@ static int tg_load_down(struct task_group *tg, void *data)
  
  static void update_shares(struct sched_domain *sd)
  {
-       u64 now = cpu_clock(raw_smp_processor_id());
-       s64 elapsed = now - sd->last_update;
+       s64 elapsed;
+       u64 now;
+
+       if (root_task_group_empty())
+               return;
+
+       now = cpu_clock(raw_smp_processor_id());
+       elapsed = now - sd->last_update;
  
         if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                 sd->last_update = now;
@@ -1627,6 +1652,9 @@ static void update_shares(struct sched_domain *sd)
  
  static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
  {
+       if (root_task_group_empty())
+               return;
+
         spin_unlock(&rq->lock);
         update_shares(sd);
         spin_lock(&rq->lock);
@@ -1634,6 +1662,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
  
  static void update_h_load(long cpu)
  {
+       if (root_task_group_empty())
+               return;
+
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
@@ -2637,9 +2668,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
         set_task_cpu(p, cpu);
  
         /*
-        * Make sure we do not leak PI boosting priority to the child:
+        * Make sure we do not leak PI boosting priority to the child.
          */
         p->prio = current->normal_prio;
+
+       /*
+        * Revert to default priority/policy on fork if requested.
+        */
+       if (unlikely(p->sched_reset_on_fork)) {
+               if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+                       p->policy = SCHED_NORMAL;
+
+               if (p->normal_prio < DEFAULT_PRIO)
+                       p->prio = DEFAULT_PRIO;
+
+               if (PRIO_TO_NICE(p->static_prio) < 0) {
+                       p->static_prio = NICE_TO_PRIO(0);
+                       set_load_weight(p);
+               }
+
+               /*
+                * We don't need the reset flag anymore after the fork. It has
+                * fulfilled its duty:
+                */
+               p->sched_reset_on_fork = 0;
+       }
+
         if (!rt_prio(p->prio))
                 p->sched_class = &fair_sched_class;
  
@@ -2796,12 +2850,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
  {
         struct mm_struct *mm = rq->prev_mm;
         long prev_state;
-#ifdef CONFIG_SMP
-       int post_schedule = 0;
-
-       if (current->sched_class->needs_post_schedule)
-               post_schedule = current->sched_class->needs_post_schedule(rq);
-#endif
  
         rq->prev_mm = NULL;
  
@@ -2820,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         finish_arch_switch(prev);
         perf_counter_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
-       if (post_schedule)
-               current->sched_class->post_schedule(rq);
-#endif
  
         fire_sched_in_preempt_notifiers(current);
         if (mm)
@@ -2838,6 +2882,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         }
  }
  
+#ifdef CONFIG_SMP
+
+/* Call the pre_schedule hook of @prev's scheduling class, if the class
+ * provides one. Assumes rq->lock is held. */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+       if (prev->sched_class->pre_schedule)
+               prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* Does nothing unless rq->post_schedule is set. Otherwise takes rq->lock
+ * (irqsave), calls the post_schedule hook of rq->curr's scheduling class if
+ * it has one, drops the lock and clears rq->post_schedule.
+ * rq->lock is NOT held on entry, but preemption is disabled. */
+static inline void post_schedule(struct rq *rq)
+{
+       if (rq->post_schedule) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&rq->lock, flags);
+               if (rq->curr->sched_class->post_schedule)
+                       rq->curr->sched_class->post_schedule(rq);
+               spin_unlock_irqrestore(&rq->lock, flags);
+               /* flag is cleared after the lock has been released */
+               rq->post_schedule = 0;
+       }
+}
+
+#else
+/* !CONFIG_SMP: both helpers are empty stubs */
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif /* CONFIG_SMP */
+
  /**
   * schedule_tail - first thing a freshly forked thread must call.
   * @prev: the thread we just switched away from.
@@ -2848,6 +2928,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
         struct rq *rq = this_rq();
  
         finish_task_switch(rq, prev);
+
+       /*
+        * FIXME: do we need to worry about rq being invalidated by the
+        * task_switch?
+        */
+       post_schedule(rq);
+
  #ifdef __ARCH_WANT_UNLOCKED_CTXSW
         /* In this case, finish_task_switch does not reenable preemption */
         preempt_enable();
@@ -3379,9 +3466,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
  {
         const struct sched_class *class;
  
-       for (class = sched_class_highest; class; class = class->next)
+       for_each_class(class) {
                 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
                         return 1;
+       }
  
         return 0;
  }
@@ -5349,10 +5437,7 @@ need_resched_nonpreemptible:
                 switch_count = &prev->nvcsw;
         }
  
-#ifdef CONFIG_SMP
-       if (prev->sched_class->pre_schedule)
-               prev->sched_class->pre_schedule(rq, prev);
-#endif
+       pre_schedule(rq, prev);
  
         if (unlikely(!rq->nr_running))
                 idle_balance(cpu, rq);
@@ -5378,6 +5463,8 @@ need_resched_nonpreemptible:
         } else
                 spin_unlock_irq(&rq->lock);
  
+       post_schedule(rq);
+
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
  
@@ -6123,17 +6210,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
         unsigned long flags;
         const struct sched_class *prev_class = p->sched_class;
         struct rq *rq;
+       int reset_on_fork;
  
         /* may grab non-irq protected spin_locks */
         BUG_ON(in_interrupt());
  recheck:
         /* double check policy once rq lock held */
-       if (policy < 0)
+       if (policy < 0) {
+               reset_on_fork = p->sched_reset_on_fork;
                 policy = oldpolicy = p->policy;
-       else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-                       policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                       policy != SCHED_IDLE)
-               return -EINVAL;
+       } else {
+               reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+               policy &= ~SCHED_RESET_ON_FORK;
+
+               if (policy != SCHED_FIFO && policy != SCHED_RR &&
+                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+                               policy != SCHED_IDLE)
+                       return -EINVAL;
+       }
+
         /*
          * Valid priorities for SCHED_FIFO and SCHED_RR are
          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6272,10 @@ recheck:
                 /* can't change other user's priorities */
                 if (!check_same_owner(p))
                         return -EPERM;
+
+               /* Normal users shall not reset the sched_reset_on_fork flag */
+               if (p->sched_reset_on_fork && !reset_on_fork)
+                       return -EPERM;
         }
  
         if (user) {
@@ -6220,6 +6319,8 @@ recheck:
         if (running)
                 p->sched_class->put_prev_task(rq, p);
  
+       p->sched_reset_on_fork = reset_on_fork;
+
         oldprio = p->prio;
         __setscheduler(rq, p, policy, param->sched_priority);
  
@@ -6336,14 +6437,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
         if (p) {
                 retval = security_task_getscheduler(p);
                 if (!retval)
-                       retval = p->policy;
+                       retval = p->policy
+                               | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
         }
         read_unlock(&tasklist_lock);
         return retval;
  }
  
  /**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
   * @pid: the pid in question.
   * @param: structure containing the RT priority.
   */
@@ -6571,19 +6673,9 @@ static inline int should_resched(void)
  
  static void __cond_resched(void)
  {
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-       __might_sleep(__FILE__, __LINE__);
-#endif
-       /*
-        * The BKS might be reacquired before we have dropped
-        * PREEMPT_ACTIVE, which could trigger a second
-        * cond_resched() call.
-        */
-       do {
-               add_preempt_count(PREEMPT_ACTIVE);
-               schedule();
-               sub_preempt_count(PREEMPT_ACTIVE);
-       } while (need_resched());
+       add_preempt_count(PREEMPT_ACTIVE);
+       schedule();
+       sub_preempt_count(PREEMPT_ACTIVE);
  }
  
  int __sched _cond_resched(void)
@@ -6597,14 +6689,14 @@ int __sched _cond_resched(void)
  EXPORT_SYMBOL(_cond_resched);
  
  /*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
   * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
  {
         int resched = should_resched();
         int ret = 0;
@@ -6620,9 +6712,9 @@ int cond_resched_lock(spinlock_t *lock)
         }
         return ret;
  }
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
  
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
  {
         BUG_ON(!in_softirq());
  
@@ -6634,7 +6726,7 @@ int __sched cond_resched_softirq(void)
         }
         return 0;
  }
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
  
  /**
   * yield - yield the current processor to other threads.
@@ -6658,7 +6750,7 @@ EXPORT_SYMBOL(yield);
   */
  void __sched io_schedule(void)
  {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
  
         delayacct_blkio_start();
         atomic_inc(&rq->nr_iowait);
@@ -6670,7 +6762,7 @@ EXPORT_SYMBOL(io_schedule);
  
  long __sched io_schedule_timeout(long timeout)
  {
-       struct rq *rq = &__raw_get_cpu_var(runqueues);
+       struct rq *rq = raw_rq();
         long ret;
  
         delayacct_blkio_start();
@@ -6992,8 +7084,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  
         if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                 /* Need help from migration thread: drop lock and wait. */
+               struct task_struct *mt = rq->migration_thread;
+
+               get_task_struct(mt);
                 task_rq_unlock(rq, &flags);
                 wake_up_process(rq->migration_thread);
+               put_task_struct(mt);
                 wait_for_completion(&req.done);
                 tlb_migrate_finish(p->mm);
                 return 0;
@@ -7625,7 +7721,7 @@ static int __init migration_init(void)
         migration_call(&migration_notifier, CPU_ONLINE, cpu);
         register_cpu_notifier(&migration_notifier);
  
-       return err;
+       return 0;
  }
  early_initcall(migration_init);
  #endif
@@ -7841,7 +7937,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
         rq->rd = rd;
  
         cpumask_set_cpu(rq->cpu, rd->span);
-       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+       if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
                 set_rq_online(rq);
  
         spin_unlock_irqrestore(&rq->lock, flags);
@@ -9304,11 +9400,11 @@ void __init sched_init(void)
                  * system cpu resource, based on the weight assigned to root
                  * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
                  * by letting tasks of init_task_group sit in a separate cfs_rq
-                * (init_cfs_rq) and having one entity represent this group of
+                * (init_tg_cfs_rq) and having one entity represent this group of
                  * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
                  */
                 init_tg_cfs_entry(&init_task_group,
-                               &per_cpu(init_cfs_rq, i),
+                               &per_cpu(init_tg_cfs_rq, i),
                                 &per_cpu(init_sched_entity, i), i, 1,
                                 root_task_group.se[i]);
  
@@ -9334,6 +9430,7 @@ void __init sched_init(void)
  #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
+               rq->post_schedule = 0;
                 rq->active_balance = 0;
                 rq->next_balance = jiffies;
                 rq->push_cpu = 0;
@@ -9398,13 +9495,20 @@ void __init sched_init(void)
  }
  
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+       int nested = preempt_count() & ~PREEMPT_ACTIVE;
+       /* Nonzero iff the preempt count, with the PREEMPT_ACTIVE bit masked
+        * off, is exactly PREEMPT_INATOMIC_BASE + preempt_offset. */
+       return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
  {
  #ifdef in_atomic
         static unsigned long prev_jiffy;        /* ratelimiting */
  
-       if ((!in_atomic() && !irqs_disabled()) ||
-                   system_state != SYSTEM_RUNNING || oops_in_progress)
+       if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+           system_state != SYSTEM_RUNNING || oops_in_progress)
                 return;
         if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                 return;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c

index d014efbf947a26a9eb241df6c462893c0d00d9cf..0f052fc674d5a0c49c4086c03fb950539edcfa48 100644 (file)
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
  
         /*
          * If the cpu was currently mapped to a different value, we
-        * first need to unmap the old value
+        * need to map it to the new value then remove the old value.
+        * Note, we must add the new value first, otherwise we risk the
+        * cpu being cleared from pri_active, and this cpu could be
+        * missed for a push or pull.
          */
-       if (likely(oldpri != CPUPRI_INVALID)) {
-               struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
-
-               spin_lock_irqsave(&vec->lock, flags);
-
-               vec->count--;
-               if (!vec->count)
-                       clear_bit(oldpri, cp->pri_active);
-               cpumask_clear_cpu(cpu, vec->mask);
-
-               spin_unlock_irqrestore(&vec->lock, flags);
-       }
-
         if (likely(newpri != CPUPRI_INVALID)) {
                 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
  
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
  
                 spin_unlock_irqrestore(&vec->lock, flags);
         }
+       if (likely(oldpri != CPUPRI_INVALID)) {
+               struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
+
+               spin_lock_irqsave(&vec->lock, flags);
+
+               vec->count--;
+               if (!vec->count)
+                       clear_bit(oldpri, cp->pri_active);
+               cpumask_clear_cpu(cpu, vec->mask);
+
+               spin_unlock_irqrestore(&vec->lock, flags);
+       }
  
         *currpri = newpri;
  }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 652e8bdef9aadb294b2e094a5d9620d95c26d873..342000b31ad65a89607ef1b783ac2f6057dca040 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
   * CFS operations on generic schedulable entities:
   */
  
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-       return container_of(se, struct task_struct, se);
-}
-
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
  /* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  /* An entity is a task if it doesn't "own" a runqueue */
  #define entity_is_task(se)     (!se->my_q)
  
+static inline struct task_struct *task_of(struct sched_entity *se)
+{      /* returns the task_struct whose 'se' member is *se */
+#ifdef CONFIG_SCHED_DEBUG
+       WARN_ON_ONCE(!entity_is_task(se));      /* warn once if se has a my_q, i.e. is not a task */
+#endif /* CONFIG_SCHED_DEBUG */
+       return container_of(se, struct task_struct, se);
+}
+
  /* Walk up scheduling entities hierarchy */
  #define for_each_sched_entity(se) \
                 for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
         }
  }
  
-#else  /* CONFIG_FAIR_GROUP_SCHED */
+#else  /* !CONFIG_FAIR_GROUP_SCHED */
+/* !CONFIG_FAIR_GROUP_SCHED variant: returns the task_struct whose 'se'
+ * member is *se, with no entity_is_task() debug check. */
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+       return container_of(se, struct task_struct, se);
+}
  
  static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  {
@@ -1046,17 +1054,21 @@ static void yield_task_fair(struct rq *rq)
   * search starts with cpus closest then further out as needed,
   * so we always favor a closer, idle cpu.
   * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_mask)
+ * hence we need to mask them out (rq->rd->online)
   *
   * Returns the CPU we should wake onto.
   */
  #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+/* Nonzero if @cpu is set in rq->rd->online, the 'online' cpumask of @rq's rd. */
+#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
+
  static int wake_idle(int cpu, struct task_struct *p)
  {
         struct sched_domain *sd;
         int i;
         unsigned int chosen_wakeup_cpu;
         int this_cpu;
+       struct rq *task_rq = task_rq(p);
  
         /*
          * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
@@ -1089,10 +1101,10 @@ static int wake_idle(int cpu, struct task_struct *p)
         for_each_domain(cpu, sd) {
                 if ((sd->flags & SD_WAKE_IDLE)
                     || ((sd->flags & SD_WAKE_IDLE_FAR)
-                       && !task_hot(p, task_rq(p)->clock, sd))) {
+                       && !task_hot(p, task_rq->clock, sd))) {
                         for_each_cpu_and(i, sched_domain_span(sd),
                                          &p->cpus_allowed) {
-                               if (cpu_active(i) && idle_cpu(i)) {
+                               if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
                                         if (i != task_cpu(p)) {
                                                 schedstat_inc(p,
                                                        se.nr_wakeups_idle);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c

index 3918e01994e0a92bd734d22eebef7047e0a17a57..3d4020a9ba1b37e8eda2aaefba75c86db2919fdf 100644 (file)
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
   * policies)
   */
  
+#ifdef CONFIG_RT_GROUP_SCHED
+
+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
+
  static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  {
+#ifdef CONFIG_SCHED_DEBUG
+       WARN_ON_ONCE(!rt_entity_is_task(rt_se));
+#endif
         return container_of(rt_se, struct task_struct, rt);
  }
  
-#ifdef CONFIG_RT_GROUP_SCHED
-
-#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
-
  static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  {
         return rt_rq->rq;
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  
  #define rt_entity_is_task(rt_se) (1)
  
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{      /* returns the task_struct whose 'rt' member is *rt_se; no debug check in this variant */
+       return container_of(rt_se, struct task_struct, rt);
+}
+
  static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  {
         return container_of(rt_rq, struct rq, rt);
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
  }
  
+static inline int has_pushable_tasks(struct rq *rq)
+{      /* nonzero when the rq->rt.pushable_tasks plist is not empty */
+       return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
  #else
  
  static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -1064,6 +1077,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
         if (p)
                 dequeue_pushable_task(rq, p);
  
+#ifdef CONFIG_SMP
+       /*
+        * We detect this state here so that we can avoid taking the RQ
+        * lock again later if there is no need to push
+        */
+       rq->post_schedule = has_pushable_tasks(rq);
+#endif
+
         return p;
  }
  
@@ -1161,13 +1182,6 @@ static int find_lowest_rq(struct task_struct *task)
         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
                 return -1; /* No targets found */
  
-       /*
-        * Only consider CPUs that are usable for migration.
-        * I guess we might want to change cpupri_find() to ignore those
-        * in the first place.
-        */
-       cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
-
         /*
          * At this point we have built a mask of cpus representing the
          * lowest priority tasks in the system.  Now we want to elect
@@ -1262,11 +1276,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
         return lowest_rq;
  }
  
-static inline int has_pushable_tasks(struct rq *rq)
-{
-       return !plist_head_empty(&rq->rt.pushable_tasks);
-}
-
  static struct task_struct *pick_next_pushable_task(struct rq *rq)
  {
         struct task_struct *p;
@@ -1466,23 +1475,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
                 pull_rt_task(rq);
  }
  
-/*
- * assumes rq->lock is held
- */
-static int needs_post_schedule_rt(struct rq *rq)
-{
-       return has_pushable_tasks(rq);
-}
-
  static void post_schedule_rt(struct rq *rq)
  {
-       /*
-        * This is only called if needs_post_schedule_rt() indicates that
-        * we need to push tasks away
-        */
-       spin_lock_irq(&rq->lock);
         push_rt_tasks(rq);
-       spin_unlock_irq(&rq->lock);
  }
  
  /*
@@ -1758,7 +1753,6 @@ static const struct sched_class rt_sched_class = {
         .rq_online              = rq_online_rt,
         .rq_offline             = rq_offline_rt,
         .pre_schedule           = pre_schedule_rt,
-       .needs_post_schedule    = needs_post_schedule_rt,
         .post_schedule          = post_schedule_rt,
         .task_wake_up           = task_wake_up_rt,
         .switched_from          = switched_from_rt,
author	Ingo Molnar <mingo@elte.hu>
	Wed, 2 Sep 2009 06:20:32 +0000 (08:20 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Wed, 2 Sep 2009 06:20:35 +0000 (08:20 +0200)
fs/dcache.c		patch \| blob \| history
fs/locks.c		patch \| blob \| history
include/linux/hardirq.h		patch \| blob \| history
include/linux/kernel.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/sched.c		patch \| blob \| history
kernel/sched_cpupri.c		patch \| blob \| history
kernel/sched_fair.c		patch \| blob \| history
kernel/sched_rt.c		patch \| blob \| history