rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
}
++ static inline int rt_bandwidth_enabled(void)
++ {
++ return sysctl_sched_rt_runtime >= 0;
++ }
++
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
-- if (rt_b->rt_runtime == RUNTIME_INF)
++ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */
-- #else /* !CONFIG_FAIR_GROUP_SCHED */
++ #else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
-- #endif /* CONFIG_FAIR_GROUP_SCHED */
++ #endif /* CONFIG_USER_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
--static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
{
-- rq->curr->sched_class->check_preempt_curr(rq, p);
++ rq->curr->sched_class->check_preempt_curr(rq, p, sync);
}
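The new third argument threads the wakeup's sync hint into each scheduling class: a sync wakeup tells the policy code that the waker is about to sleep, so preempting it on behalf of the wakee is usually wasted work. A minimal sketch of how a caller ends up passing sync=1, assuming a hypothetical wait queue; wake_up_interruptible_sync() is the stock helper that feeds this path:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(reply_wq);	/* hypothetical wait queue */

static void producer_finished(void)
{
	/*
	 * The _sync variant sets the flag that eventually reaches
	 * check_preempt_curr(): "I am about to block, don't preempt
	 * me just to run the thread I woke."
	 */
	wake_up_interruptible_sync(&reply_wq);
}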
static inline int cpu_of(struct rq *rq)
return NOTIFY_DONE;
}
--static void init_hrtick(void)
++static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
}
--static void init_hrtick(void)
++static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */
rq->hrtick_timer.function = hrtick;
rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
}
--#else
++#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void init_hrtick(void)
{
}
--#endif
++#endif /* CONFIG_SCHED_HRTICK */
/*
* resched_task - mark a task 'to be rescheduled now'.
update_load_sub(&rq->load, load);
}
-- #ifdef CONFIG_SMP
-- static unsigned long source_load(int cpu, int type);
-- static unsigned long target_load(int cpu, int type);
-- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
--
-- static unsigned long cpu_avg_load_per_task(int cpu)
-- {
-- struct rq *rq = cpu_rq(cpu);
--
-- if (rq->nr_running)
-- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
--
-- return rq->avg_load_per_task;
-- }
--
-- #ifdef CONFIG_FAIR_GROUP_SCHED
--
-- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
++ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
++ typedef int (*tg_visitor)(struct task_group *, void *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
-- static void
-- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
++ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
++ int ret;
rcu_read_lock();
parent = &root_task_group;
down:
-- (*down)(parent, cpu, sd);
++ ret = (*down)(parent, data);
++ if (ret)
++ goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
-- (*up)(parent, cpu, sd);
++ ret = (*up)(parent, data);
++ if (ret)
++ goto out_unlock;
child = parent;
parent = parent->parent;
if (parent)
goto up;
++ out_unlock:
rcu_read_unlock();
++
++ return ret;
++ }
++
++ static int tg_nop(struct task_group *tg, void *data)
++ {
++ return 0;
++ }
++ #endif
++
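With the old cpu/sd pair replaced by an opaque data pointer and an int return, visitors can both carry arbitrary state and abort the walk early. A minimal sketch of a visitor pair under those rules (tg_count_one and count_task_groups are hypothetical, not part of this patch):

/* Counts task groups on the way down; tg_nop serves as the up pass. */
static int tg_count_one(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;	/* nonzero would abort the walk and be returned */
}

static int count_task_groups(void)
{
	int n = 0;

	walk_tg_tree(tg_count_one, tg_nop, &n);
	return n;
}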
++ #ifdef CONFIG_SMP
++ static unsigned long source_load(int cpu, int type);
++ static unsigned long target_load(int cpu, int type);
++ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
++
++ static unsigned long cpu_avg_load_per_task(int cpu)
++ {
++ struct rq *rq = cpu_rq(cpu);
++
++ if (rq->nr_running)
++ rq->avg_load_per_task = rq->load.weight / rq->nr_running;
++
++ return rq->avg_load_per_task;
++ }
++
++ #ifdef CONFIG_FAIR_GROUP_SCHED
++
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*/
-- static void
-- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
++ static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long rq_weight = 0;
unsigned long shares = 0;
++ struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
__update_group_shares_cpu(tg, i, shares, rq_weight);
spin_unlock_irqrestore(&rq->lock, flags);
}
++
++ return 0;
}
/*
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parent's load.
*/
-- static void
-- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
++ static int tg_load_down(struct task_group *tg, void *data)
{
unsigned long load;
++ long cpu = (long)data;
if (!tg->parent) {
load = cpu_rq(cpu)->load.weight;
}
tg->cfs_rq[cpu]->h_load = load;
-- }
-- static void
-- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-- {
++ return 0;
}
static void update_shares(struct sched_domain *sd)
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
-- walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
++ walk_tg_tree(tg_nop, tg_shares_up, sd);
}
}
spin_lock(&rq->lock);
}
-- static void update_h_load(int cpu)
++ static void update_h_load(long cpu)
{
-- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
++ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
#else
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
-- if (!match_state || p->state == match_state) {
-- ncsw = p->nivcsw + p->nvcsw;
-- if (unlikely(!ncsw))
-- ncsw = 1;
-- }
++ if (!match_state || p->state == match_state)
++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, &flags);
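The `| LONG_MIN` trick deserves a note: the function must return nonzero whenever the task's state matched, yet p->nvcsw itself can legitimately be zero. Forcing the sign bit on guarantees a nonzero result while keeping the low bits comparable against a later sample. A self-contained illustration of the encoding (sample_ncsw is hypothetical):

#include <linux/kernel.h>	/* LONG_MIN */

/* Returns 0 for "state never matched", else nvcsw with the MSB set. */
static long sample_ncsw(unsigned long nvcsw, int state_matched)
{
	if (!state_matched)
		return 0;

	return nvcsw | LONG_MIN;	/* nonzero even when nvcsw == 0 */
}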
/*
trace_mark(kernel_sched_wakeup,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
-- check_preempt_curr(rq, p);
++ check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
trace_mark(kernel_sched_wakeup_new,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
-- check_preempt_curr(rq, p);
++ check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
-- check_preempt_curr(this_rq, p);
++ check_preempt_curr(this_rq, p, 0);
}
/*
cpustat->steal = cputime64_add(cpustat->steal, tmp);
}
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+cputime_t task_utime(struct task_struct *p)
+{
+ return p->utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+ clock_t utime = cputime_to_clock_t(p->utime),
+ total = utime + cputime_to_clock_t(p->stime);
+ u64 temp;
+
+ /*
+ * Use CFS's precise accounting:
+ */
+ temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+
+ if (total) {
+ temp *= utime;
+ do_div(temp, total);
+ }
+ utime = (clock_t)temp;
+
+ p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+ return p->prev_utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ clock_t stime;
+
+ /*
+ * Use CFS's precise accounting. (we subtract utime from
+ * the total, to make sure the total observed by userspace
+ * grows monotonically - apps rely on that):
+ */
+ stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+ cputime_to_clock_t(task_utime(p));
+
+ if (stime >= 0)
+ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
+ return p->prev_stime;
+}
+#endif
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+ return p->gtime;
+}
+
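A quick worked example of the scaling above, with invented numbers: if the tick-based sample says utime = 300 and stime = 100 ticks, but p->se.sum_exec_runtime converts to 1000 ticks, then:

/*
 * Illustrative numbers only:
 *
 *	utime = 1000 * 300 / (300 + 100) = 750 ticks
 *	stime = 1000 - 750               = 250 ticks
 *
 * The precise CFS runtime is redistributed in the 3:1 ratio the tick
 * samples suggested, and the max() against prev_utime/prev_stime keeps
 * each value monotonic across successive readings.
 */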
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
++/**
++ * complete: - signals a single thread waiting on this completion
++ * @x: holds the state of this particular completion
++ *
++ * This will wake up a single thread waiting on this completion. Threads will be
++ * awakened in the same order in which they were queued.
++ *
++ * See also complete_all(), wait_for_completion() and related routines.
++ */
void complete(struct completion *x)
{
unsigned long flags;
}
EXPORT_SYMBOL(complete);
++/**
++ * complete_all: - signals all threads waiting on this completion
++ * @x: holds the state of this particular completion
++ *
++ * This will wake up all threads waiting on this particular completion event.
++ */
void complete_all(struct completion *x)
{
unsigned long flags;
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
-- if ((state == TASK_INTERRUPTIBLE &&
-- signal_pending(current)) ||
-- (state == TASK_KILLABLE &&
-- fatal_signal_pending(current))) {
++ if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
return timeout;
}
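For reference, this is the helper the open-coded pair collapses into, as it appeared in include/linux/sched.h around the same time (quoted from memory, so treat it as a sketch). TASK_KILLABLE is TASK_WAKEKILL | TASK_UNINTERRUPTIBLE, so the TASK_WAKEKILL bit selects the fatal-signals-only behaviour:

static inline int signal_pending_state(long state, struct task_struct *p)
{
	if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
		return 0;
	if (!signal_pending(p))
		return 0;

	return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
}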
++/**
++ * wait_for_completion: - waits for completion of a task
++ * @x: holds the state of this particular completion
++ *
++ * This waits to be signaled for completion of a specific task. It is NOT
++ * interruptible and there is no timeout.
++ *
++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
++ * and interrupt capability. Also see complete().
++ */
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
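A minimal usage sketch pairing the two sides documented above (the completion and both functions are hypothetical):

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);	/* hypothetical completion */

static int consumer(void *unused)
{
	/* Blocks uninterruptibly until someone calls complete(). */
	wait_for_completion(&setup_done);
	return 0;
}

static void producer(void)
{
	complete(&setup_done);	/* wakes exactly one queued waiter, FIFO */
}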
++/**
++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
++ * @x: holds the state of this particular completion
++ * @timeout: timeout value in jiffies
++ *
++ * This waits for either a completion of a specific task to be signaled or for a
++ * specified timeout to expire. The timeout is in jiffies. It is not
++ * interruptible.
++ */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
}
EXPORT_SYMBOL(wait_for_completion_timeout);
++/**
++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
++ * @x: holds the state of this particular completion
++ *
++ * This waits for completion of a specific task to be signaled. It is
++ * interruptible.
++ */
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
++/**
++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
++ * @x: holds the state of this particular completion
++ * @timeout: timeout value in jiffies
++ *
++ * This waits for either a completion of a specific task to be signaled or for a
++ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
++ */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
++/**
++ * wait_for_completion_killable: - waits for completion of a task (killable)
++ * @x: holds the state of this particular completion
++ *
++ * This waits to be signaled for completion of a specific task. It can be
++ * interrupted by a kill signal.
++ */
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
-- if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
++ if (rt_bandwidth_enabled() && rt_policy(policy) &&
++ task_group(p)->rt_bandwidth.rt_runtime == 0)
return -EPERM;
#endif
set_task_cpu(p, dest_cpu);
if (on_rq) {
activate_task(rq_dest, p, 0);
-- check_preempt_curr(rq_dest, p);
++ check_preempt_curr(rq_dest, p, 0);
}
done:
ret = 1;
* and partition_sched_domains() will fallback to the single partition
* 'fallback_doms', it also forces the domains to be rebuilt.
*
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
* Call with hotplug lock held
*/
void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
struct sched_domain_attr *dattr_new)
{
- int i, j;
+ int i, j, n;
mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl();
- if (doms_new == NULL)
- ndoms_new = 0;
+ n = doms_new ? ndoms_new : 0;
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < ndoms_new; j++) {
+ for (j = 0; j < n; j++) {
if (cpus_equal(doms_cur[i], doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
if (doms_new == NULL) {
ndoms_cur = 0;
- ndoms_new = 1;
doms_new = &fallback_doms;
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
dattr_new = NULL;
int arch_reinit_sched_domains(void)
{
get_online_cpus();
+
+ /* Destroy domains first to force the rebuild */
+ partition_sched_domains(0, NULL, NULL);
+
rebuild_sched_domains();
put_online_cpus();
+
return 0;
}
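Summarizing the calling convention after this change (illustrative, restating the comment above):

/*
 *	partition_sched_domains(0, NULL, NULL);
 *		destroy all current domains, build nothing (the new
 *		ndoms_new==0 special case used above to force a rebuild)
 *
 *	partition_sched_domains(1, NULL, NULL);
 *		fall back to one default domain spanning
 *		cpu_online_map minus cpu_isolated_map
 *
 *	partition_sched_domains(n, doms, attrs);
 *		install n explicitly-described partitions
 */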
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- partition_sched_domains(0, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
return NOTIFY_OK;
default:
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
-- if ((in_atomic() || irqs_disabled()) &&
-- system_state == SYSTEM_RUNNING && !oops_in_progress) {
-- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-- return;
-- prev_jiffy = jiffies;
-- printk(KERN_ERR "BUG: sleeping function called from invalid"
-- " context at %s:%d\n", file, line);
-- printk("in_atomic():%d, irqs_disabled():%d\n",
-- in_atomic(), irqs_disabled());
-- debug_show_held_locks(current);
-- if (irqs_disabled())
-- print_irqtrace_events(current);
-- dump_stack();
-- }
++ if ((!in_atomic() && !irqs_disabled()) ||
++ system_state != SYSTEM_RUNNING || oops_in_progress)
++ return;
++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
++ return;
++ prev_jiffy = jiffies;
++
++ printk(KERN_ERR
++ "BUG: sleeping function called from invalid context at %s:%d\n",
++ file, line);
++ printk(KERN_ERR
++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
++ in_atomic(), irqs_disabled(),
++ current->pid, current->comm);
++
++ debug_show_held_locks(current);
++ if (irqs_disabled())
++ print_irqtrace_events(current);
++ dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
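A sketch of the kind of caller the tidied check catches (bad_example is hypothetical): kmalloc(GFP_KERNEL) may sleep, and its might_sleep_if() funnels into __might_sleep(), so on a preemptible kernel with CONFIG_DEBUG_SPINLOCK_SLEEP this prints the report above:

#include <linux/slab.h>
#include <linux/spinlock.h>

static void bad_example(spinlock_t *lock)
{
	spin_lock(lock);			/* enters atomic context */
	(void)kmalloc(64, GFP_KERNEL);		/* may sleep -> BUG report */
	spin_unlock(lock);
}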
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
-- return 1ULL << 16;
++ return 1ULL << 20;
-- return div64_u64(runtime << 16, period);
++ return div64_u64(runtime << 20, period);
}
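A worked example of the new fixed-point scale, using the scheduler's default 1s period and 0.95s runtime:

/*
 *	to_ratio(1000000000, 950000000)
 *		= (950000000 << 20) / 1000000000
 *		= 996147			(~ 0.95 * 2^20)
 *
 * Four extra fractional bits over the old << 16 scale reduce rounding
 * loss when many small per-group ratios are summed and compared.
 */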
-- #ifdef CONFIG_CGROUP_SCHED
-- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
++ /* Must be called with tasklist_lock held */
++ static inline int tg_has_rt_tasks(struct task_group *tg)
{
-- struct task_group *tgi, *parent = tg->parent;
-- unsigned long total = 0;
++ struct task_struct *g, *p;
-- if (!parent) {
-- if (global_rt_period() < period)
-- return 0;
++ do_each_thread(g, p) {
++ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
++ return 1;
++ } while_each_thread(g, p);
-- return to_ratio(period, runtime) <
-- to_ratio(global_rt_period(), global_rt_runtime());
-- }
++ return 0;
++ }
-- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-- return 0;
++ struct rt_schedulable_data {
++ struct task_group *tg;
++ u64 rt_period;
++ u64 rt_runtime;
++ };
-- rcu_read_lock();
-- list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-- if (tgi == tg)
-- continue;
++ static int tg_schedulable(struct task_group *tg, void *data)
++ {
++ struct rt_schedulable_data *d = data;
++ struct task_group *child;
++ unsigned long total, sum = 0;
++ u64 period, runtime;
++
++ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
++ runtime = tg->rt_bandwidth.rt_runtime;
-- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-- tgi->rt_bandwidth.rt_runtime);
++ if (tg == d->tg) {
++ period = d->rt_period;
++ runtime = d->rt_runtime;
}
-- rcu_read_unlock();
-- return total + to_ratio(period, runtime) <=
-- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-- parent->rt_bandwidth.rt_runtime);
-- }
-- #elif defined CONFIG_USER_SCHED
-- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-- {
-- struct task_group *tgi;
-- unsigned long total = 0;
-- unsigned long global_ratio =
-- to_ratio(global_rt_period(), global_rt_runtime());
++ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
++ return -EBUSY;
-- rcu_read_lock();
-- list_for_each_entry_rcu(tgi, &task_groups, list) {
-- if (tgi == tg)
-- continue;
++ total = to_ratio(period, runtime);
++
-- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-- tgi->rt_bandwidth.rt_runtime);
++ list_for_each_entry_rcu(child, &tg->children, siblings) {
++ period = ktime_to_ns(child->rt_bandwidth.rt_period);
++ runtime = child->rt_bandwidth.rt_runtime;
++
++ if (child == d->tg) {
++ period = d->rt_period;
++ runtime = d->rt_runtime;
++ }
++
++ sum += to_ratio(period, runtime);
}
-- rcu_read_unlock();
-- return total + to_ratio(period, runtime) < global_ratio;
++ if (sum > total)
++ return -EINVAL;
++
++ return 0;
}
-- #endif
-- /* Must be called with tasklist_lock held */
-- static inline int tg_has_rt_tasks(struct task_group *tg)
++ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
-- struct task_struct *g, *p;
-- do_each_thread(g, p) {
-- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-- return 1;
-- } while_each_thread(g, p);
-- return 0;
++ struct rt_schedulable_data data = {
++ .tg = tg,
++ .rt_period = period,
++ .rt_runtime = runtime,
++ };
++
++ return walk_tg_tree(tg_schedulable, tg_nop, &data);
}
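A worked example of the check __rt_schedulable() now delegates to walk_tg_tree(), with illustrative numbers:

/*
 * Parent with runtime/period = 0.5:
 *	total = to_ratio(...) = 0.5 * 2^20 = 524288
 * Two children each asking for 0.3:
 *	sum   = 2 * (0.3 * 2^20) ~= 629145  >  total  =>  -EINVAL
 * Children summing to at most 0.5 pass, and walk_tg_tree() repeats
 * the same check at every level of the hierarchy.
 */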
static int tg_set_bandwidth(struct task_group *tg,
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
-- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-- err = -EBUSY;
++ err = __rt_schedulable(tg, rt_period, rt_runtime);
++ if (err)
goto unlock;
-- }
-- if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-- err = -EINVAL;
-- goto unlock;
-- }
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
u64 rt_runtime, rt_period;
int ret = 0;
++ if (sysctl_sched_rt_period <= 0)
++ return -EINVAL;
++
rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
rt_runtime = tg->rt_bandwidth.rt_runtime;
mutex_lock(&rt_constraints_mutex);
-- if (!__rt_schedulable(tg, rt_period, rt_runtime))
-- ret = -EINVAL;
++ read_lock(&tasklist_lock);
++ ret = __rt_schedulable(tg, rt_period, rt_runtime);
++ read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
unsigned long flags;
int i;
++ if (sysctl_sched_rt_period <= 0)
++ return -EINVAL;
++
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;