rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
}
++ static inline int rt_bandwidth_enabled(void)
++ {
++ return sysctl_sched_rt_runtime >= 0;
++ }
++
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
-- if (rt_b->rt_runtime == RUNTIME_INF)
++ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */
-- #else /* !CONFIG_FAIR_GROUP_SCHED */
++ #else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
-- #endif /* CONFIG_FAIR_GROUP_SCHED */
++ #endif /* CONFIG_USER_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
--static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
{
-- rq->curr->sched_class->check_preempt_curr(rq, p);
++ rq->curr->sched_class->check_preempt_curr(rq, p, sync);
}
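The new third argument threads the wakeup's sync hint into each scheduling class: a sync wakeup tells the policy code that the waker is about to sleep, so preempting it on behalf of the wakee is usually wasted work. A minimal sketch of how a caller ends up passing sync=1, assuming a hypothetical wait queue; wake_up_interruptible_sync() is the stock helper that feeds this path:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(reply_wq);	/* hypothetical wait queue */

static void producer_finished(void)
{
	/*
	 * The _sync variant sets the flag that eventually reaches
	 * check_preempt_curr(): "I am about to block, don't preempt
	 * me just to run the thread I woke."
	 */
	wake_up_interruptible_sync(&reply_wq);
}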
static inline int cpu_of(struct rq *rq)
return NOTIFY_DONE;
}
--static void init_hrtick(void)
++static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
}
--static void init_hrtick(void)
++static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */
rq->hrtick_timer.function = hrtick;
rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
}
--#else
++#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void init_hrtick(void)
{
}
--#endif
++#endif /* CONFIG_SCHED_HRTICK */
/*
* resched_task - mark a task 'to be rescheduled now'.
update_load_sub(&rq->load, load);
}
-- #ifdef CONFIG_SMP
-- static unsigned long source_load(int cpu, int type);
-- static unsigned long target_load(int cpu, int type);
-- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
--
-- static unsigned long cpu_avg_load_per_task(int cpu)
-- {
-- struct rq *rq = cpu_rq(cpu);
--
-- if (rq->nr_running)
-- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
--
-- return rq->avg_load_per_task;
-- }
--
-- #ifdef CONFIG_FAIR_GROUP_SCHED
--
-- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
++ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
++ typedef int (*tg_visitor)(struct task_group *, void *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
-- static void
-- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
++ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
++ int ret;
rcu_read_lock();
parent = &root_task_group;
down:
-- (*down)(parent, cpu, sd);
++ ret = (*down)(parent, data);
++ if (ret)
++ goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
-- (*up)(parent, cpu, sd);
++ ret = (*up)(parent, data);
++ if (ret)
++ goto out_unlock;
child = parent;
parent = parent->parent;
if (parent)
goto up;
++ out_unlock:
rcu_read_unlock();
++
++ return ret;
++ }
++
++ static int tg_nop(struct task_group *tg, void *data)
++ {
++ return 0;
++ }
++ #endif
++
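With the old cpu/sd pair replaced by an opaque data pointer and an int return, visitors can both carry arbitrary state and abort the walk early. A minimal sketch of a visitor pair under those rules (tg_count_one and count_task_groups are hypothetical, not part of this patch):

/* Counts task groups on the way down; tg_nop serves as the up pass. */
static int tg_count_one(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;	/* nonzero would abort the walk and be returned */
}

static int count_task_groups(void)
{
	int n = 0;

	walk_tg_tree(tg_count_one, tg_nop, &n);
	return n;
}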
++ #ifdef CONFIG_SMP
++ static unsigned long source_load(int cpu, int type);
++ static unsigned long target_load(int cpu, int type);
++ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
++
++ static unsigned long cpu_avg_load_per_task(int cpu)
++ {
++ struct rq *rq = cpu_rq(cpu);
++
++ if (rq->nr_running)
++ rq->avg_load_per_task = rq->load.weight / rq->nr_running;
++
++ return rq->avg_load_per_task;
++ }
++
++ #ifdef CONFIG_FAIR_GROUP_SCHED
++
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*/
-- static void
-- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
++ static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long rq_weight = 0;
unsigned long shares = 0;
++ struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
__update_group_shares_cpu(tg, i, shares, rq_weight);
spin_unlock_irqrestore(&rq->lock, flags);
}
++
++ return 0;
}
/*
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parent's load.
*/
-- static void
-- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
++ static int tg_load_down(struct task_group *tg, void *data)
{
unsigned long load;
++ long cpu = (long)data;
if (!tg->parent) {
load = cpu_rq(cpu)->load.weight;
}
tg->cfs_rq[cpu]->h_load = load;
-- }
-- static void
-- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-- {
++ return 0;
}
static void update_shares(struct sched_domain *sd)
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
-- walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
++ walk_tg_tree(tg_nop, tg_shares_up, sd);
}
}
spin_lock(&rq->lock);
}
-- static void update_h_load(int cpu)
++ static void update_h_load(long cpu)
{
-- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
++ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
#else
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
-- if (!match_state || p->state == match_state) {
-- ncsw = p->nivcsw + p->nvcsw;
-- if (unlikely(!ncsw))
-- ncsw = 1;
-- }
++ if (!match_state || p->state == match_state)
++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, &flags);
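The `| LONG_MIN` trick deserves a note: the function must return nonzero whenever the task's state matched, yet p->nvcsw itself can legitimately be zero. Forcing the sign bit on guarantees a nonzero result while keeping the low bits comparable against a later sample. A self-contained illustration of the encoding (sample_ncsw is hypothetical):

#include <linux/kernel.h>	/* LONG_MIN */

/* Returns 0 for "state never matched", else nvcsw with the MSB set. */
static long sample_ncsw(unsigned long nvcsw, int state_matched)
{
	if (!state_matched)
		return 0;

	return nvcsw | LONG_MIN;	/* nonzero even when nvcsw == 0 */
}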
/*
trace_mark(kernel_sched_wakeup,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
-- check_preempt_curr(rq, p);
++ check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
trace_mark(kernel_sched_wakeup_new,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
-- check_preempt_curr(rq, p);
++ check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
-- check_preempt_curr(this_rq, p);
++ check_preempt_curr(this_rq, p, 0);
}
/*
cpustat->steal = cputime64_add(cpustat->steal, tmp);
}
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+cputime_t task_utime(struct task_struct *p)
+{
+ return p->utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+ clock_t utime = cputime_to_clock_t(p->utime),
+ total = utime + cputime_to_clock_t(p->stime);
+ u64 temp;
+
+ /*
+ * Use CFS's precise accounting:
+ */
+ temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+
+ if (total) {
+ temp *= utime;
+ do_div(temp, total);
+ }
+ utime = (clock_t)temp;
+
+ p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+ return p->prev_utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ clock_t stime;
+
+ /*
+ * Use CFS's precise accounting. (we subtract utime from
+ * the total, to make sure the total observed by userspace
+ * grows monotonically - apps rely on that):
+ */
+ stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+ cputime_to_clock_t(task_utime(p));
+
+ if (stime >= 0)
+ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
+ return p->prev_stime;
+}
+#endif
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+ return p->gtime;
+}
+
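A quick worked example of the scaling above, with invented numbers: if the tick-based sample says utime = 300 and stime = 100 ticks, but p->se.sum_exec_runtime converts to 1000 ticks, then:

/*
 * Illustrative numbers only:
 *
 *	utime = 1000 * 300 / (300 + 100) = 750 ticks
 *	stime = 1000 - 750               = 250 ticks
 *
 * The precise CFS runtime is redistributed in the 3:1 ratio the tick
 * samples suggested, and the max() against prev_utime/prev_stime keeps
 * each value monotonic across successive readings.
 */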
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
++/**
++ * complete: - signals a single thread waiting on this completion
++ * @x: holds the state of this particular completion
++ *
++ * This will wake up a single thread waiting on this completion. Threads will be
++ * awakened in the same order in which they were queued.
++ *
++ * See also complete_all(), wait_for_completion() and related routines.
++ */
void complete(struct completion *x)
{
unsigned long flags;
}
EXPORT_SYMBOL(complete);
++/**
++ * complete_all: - signals all threads waiting on this completion
++ * @x: holds the state of this particular completion
++ *
++ * This will wake up all threads waiting on this particular completion event.
++ */
void complete_all(struct completion *x)
{
unsigned long flags;
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
-- if ((state == TASK_INTERRUPTIBLE &&
-- signal_pending(current)) ||
-- (state == TASK_KILLABLE &&
-- fatal_signal_pending(current))) {
++ if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
return timeout;
}
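For reference, this is the helper the open-coded pair collapses into, as it appeared in include/linux/sched.h around the same time (quoted from memory, so treat it as a sketch). TASK_KILLABLE is TASK_WAKEKILL | TASK_UNINTERRUPTIBLE, so the TASK_WAKEKILL bit selects the fatal-signals-only behaviour:

static inline int signal_pending_state(long state, struct task_struct *p)
{
	if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
		return 0;
	if (!signal_pending(p))
		return 0;

	return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
}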
++/**
++ * wait_for_completion: - waits for completion of a task
++ * @x: holds the state of this particular completion
++ *
++ * This waits to be signaled for completion of a specific task. It is NOT
++ * interruptible and there is no timeout.
++ *
++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
++ * and interrupt capability. Also see complete().
++ */
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
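A minimal usage sketch pairing the two sides documented above (the completion and both functions are hypothetical):

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);	/* hypothetical completion */

static int consumer(void *unused)
{
	/* Blocks uninterruptibly until someone calls complete(). */
	wait_for_completion(&setup_done);
	return 0;
}

static void producer(void)
{
	complete(&setup_done);	/* wakes exactly one queued waiter, FIFO */
}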
++/**
++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
++ * @x: holds the state of this particular completion
++ * @timeout: timeout value in jiffies
++ *
++ * This waits for either a completion of a specific task to be signaled or for a
++ * specified timeout to expire. The timeout is in jiffies. It is not
++ * interruptible.
++ */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
}
EXPORT_SYMBOL(wait_for_completion_timeout);
++/**
++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
++ * @x: holds the state of this particular completion
++ *
++ * This waits for completion of a specific task to be signaled. It is
++ * interruptible.
++ */
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
++/**
++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
++ * @x: holds the state of this particular completion
++ * @timeout: timeout value in jiffies
++ *
++ * This waits for either a completion of a specific task to be signaled or for a
++ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
++ */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
++/**
++ * wait_for_completion_killable: - waits for completion of a task (killable)
++ * @x: holds the state of this particular completion
++ *
++ * This waits to be signaled for completion of a specific task. It can be
++ * interrupted by a kill signal.
++ */
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
-- if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
++ if (rt_bandwidth_enabled() && rt_policy(policy) &&
++ task_group(p)->rt_bandwidth.rt_runtime == 0)
return -EPERM;
#endif
set_task_cpu(p, dest_cpu);
if (on_rq) {
activate_task(rq_dest, p, 0);
-- check_preempt_curr(rq_dest, p);
++ check_preempt_curr(rq_dest, p, 0);
}
done:
ret = 1;
* and partition_sched_domains() will fallback to the single partition
* 'fallback_doms', it also forces the domains to be rebuilt.
*
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
* Call with hotplug lock held
*/
void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
struct sched_domain_attr *dattr_new)
{
- int i, j;
+ int i, j, n;
mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl();
- if (doms_new == NULL)
- ndoms_new = 0;
+ n = doms_new ? ndoms_new : 0;
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < ndoms_new; j++) {
+ for (j = 0; j < n; j++) {
if (cpus_equal(doms_cur[i], doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
if (doms_new == NULL) {
ndoms_cur = 0;
- ndoms_new = 1;
doms_new = &fallback_doms;
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
dattr_new = NULL;
int arch_reinit_sched_domains(void)
{
get_online_cpus();
+
+ /* Destroy domains first to force the rebuild */
+ partition_sched_domains(0, NULL, NULL);
+
rebuild_sched_domains();
put_online_cpus();
+
return 0;
}
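Summarizing the calling convention after this change (illustrative, restating the comment above):

/*
 *	partition_sched_domains(0, NULL, NULL);
 *		destroy all current domains, build nothing (the new
 *		ndoms_new==0 special case used above to force a rebuild)
 *
 *	partition_sched_domains(1, NULL, NULL);
 *		fall back to one default domain spanning
 *		cpu_online_map minus cpu_isolated_map
 *
 *	partition_sched_domains(n, doms, attrs);
 *		install n explicitly-described partitions
 */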
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- partition_sched_domains(0, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
return NOTIFY_OK;
default:
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
-- if ((in_atomic() || irqs_disabled()) &&
-- system_state == SYSTEM_RUNNING && !oops_in_progress) {
-- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-- return;
-- prev_jiffy = jiffies;
-- printk(KERN_ERR "BUG: sleeping function called from invalid"
-- " context at %s:%d\n", file, line);
-- printk("in_atomic():%d, irqs_disabled():%d\n",
-- in_atomic(), irqs_disabled());
-- debug_show_held_locks(current);
-- if (irqs_disabled())
-- print_irqtrace_events(current);
-- dump_stack();
-- }
++ if ((!in_atomic() && !irqs_disabled()) ||
++ system_state != SYSTEM_RUNNING || oops_in_progress)
++ return;
++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
++ return;
++ prev_jiffy = jiffies;
++
++ printk(KERN_ERR
++ "BUG: sleeping function called from invalid context at %s:%d\n",
++ file, line);
++ printk(KERN_ERR
++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
++ in_atomic(), irqs_disabled(),
++ current->pid, current->comm);
++
++ debug_show_held_locks(current);
++ if (irqs_disabled())
++ print_irqtrace_events(current);
++ dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
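A sketch of the kind of caller the tidied check catches (bad_example is hypothetical): kmalloc(GFP_KERNEL) may sleep, and its might_sleep_if() funnels into __might_sleep(), so on a preemptible kernel with CONFIG_DEBUG_SPINLOCK_SLEEP this prints the report above:

#include <linux/slab.h>
#include <linux/spinlock.h>

static void bad_example(spinlock_t *lock)
{
	spin_lock(lock);			/* enters atomic context */
	(void)kmalloc(64, GFP_KERNEL);		/* may sleep -> BUG report */
	spin_unlock(lock);
}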
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
-- return 1ULL << 16;
++ return 1ULL << 20;
-- return div64_u64(runtime << 16, period);
++ return div64_u64(runtime << 20, period);
}
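A worked example of the new fixed-point scale, using the scheduler's default 1s period and 0.95s runtime:

/*
 *	to_ratio(1000000000, 950000000)
 *		= (950000000 << 20) / 1000000000
 *		= 996147			(~ 0.95 * 2^20)
 *
 * Four extra fractional bits over the old << 16 scale reduce rounding
 * loss when many small per-group ratios are summed and compared.
 */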
-- #ifdef CONFIG_CGROUP_SCHED
-- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
++ /* Must be called with tasklist_lock held */
++ static inline int tg_has_rt_tasks(struct task_group *tg)
{
-- struct task_group *tgi, *parent = tg->parent;
-- unsigned long total = 0;
++ struct task_struct *g, *p;
-- if (!parent) {
-- if (global_rt_period() < period)
-- return 0;
++ do_each_thread(g, p) {
++ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
++ return 1;
++ } while_each_thread(g, p);
-- return to_ratio(period, runtime) <
-- to_ratio(global_rt_period(), global_rt_runtime());
-- }
++ return 0;
++ }
-- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-- return 0;
++ struct rt_schedulable_data {
++ struct task_group *tg;
++ u64 rt_period;
++ u64 rt_runtime;
++ };
-- rcu_read_lock();
-- list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-- if (tgi == tg)
-- continue;
++ static int tg_schedulable(struct task_group *tg, void *data)
++ {
++ struct rt_schedulable_data *d = data;
++ struct task_group *child;
++ unsigned long total, sum = 0;
++ u64 period, runtime;
++
++ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
++ runtime = tg->rt_bandwidth.rt_runtime;
-- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-- tgi->rt_bandwidth.rt_runtime);
++ if (tg == d->tg) {
++ period = d->rt_period;
++ runtime = d->rt_runtime;
}
-- rcu_read_unlock();
-- return total + to_ratio(period, runtime) <=
-- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-- parent->rt_bandwidth.rt_runtime);
-- }
-- #elif defined CONFIG_USER_SCHED
-- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-- {
-- struct task_group *tgi;
-- unsigned long total = 0;
-- unsigned long global_ratio =
-- to_ratio(global_rt_period(), global_rt_runtime());
++ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
++ return -EBUSY;
-- rcu_read_lock();
-- list_for_each_entry_rcu(tgi, &task_groups, list) {
-- if (tgi == tg)
-- continue;
++ total = to_ratio(period, runtime);
++
-- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-- tgi->rt_bandwidth.rt_runtime);
++ list_for_each_entry_rcu(child, &tg->children, siblings) {
++ period = ktime_to_ns(child->rt_bandwidth.rt_period);
++ runtime = child->rt_bandwidth.rt_runtime;
++
++ if (child == d->tg) {
++ period = d->rt_period;
++ runtime = d->rt_runtime;
++ }
++
++ sum += to_ratio(period, runtime);
}
-- rcu_read_unlock();
-- return total + to_ratio(period, runtime) < global_ratio;
++ if (sum > total)
++ return -EINVAL;
++
++ return 0;
}
-- #endif
-- /* Must be called with tasklist_lock held */
-- static inline int tg_has_rt_tasks(struct task_group *tg)
++ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
-- struct task_struct *g, *p;
-- do_each_thread(g, p) {
-- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-- return 1;
-- } while_each_thread(g, p);
-- return 0;
++ struct rt_schedulable_data data = {
++ .tg = tg,
++ .rt_period = period,
++ .rt_runtime = runtime,
++ };
++
++ return walk_tg_tree(tg_schedulable, tg_nop, &data);
}
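A worked example of the check __rt_schedulable() now delegates to walk_tg_tree(), with illustrative numbers:

/*
 * Parent with runtime/period = 0.5:
 *	total = to_ratio(...) = 0.5 * 2^20 = 524288
 * Two children each asking for 0.3:
 *	sum   = 2 * (0.3 * 2^20) ~= 629145  >  total  =>  -EINVAL
 * Children summing to at most 0.5 pass, and walk_tg_tree() repeats
 * the same check at every level of the hierarchy.
 */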
static int tg_set_bandwidth(struct task_group *tg,
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
-- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-- err = -EBUSY;
++ err = __rt_schedulable(tg, rt_period, rt_runtime);
++ if (err)
goto unlock;
-- }
-- if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-- err = -EINVAL;
-- goto unlock;
-- }
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
u64 rt_runtime, rt_period;
int ret = 0;
++ if (sysctl_sched_rt_period <= 0)
++ return -EINVAL;
++
rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
rt_runtime = tg->rt_bandwidth.rt_runtime;
mutex_lock(&rt_constraints_mutex);
-- if (!__rt_schedulable(tg, rt_period, rt_runtime))
-- ret = -EINVAL;
++ read_lock(&tasklist_lock);
++ ret = __rt_schedulable(tg, rt_period, rt_runtime);
++ read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
unsigned long flags;
int i;
++ if (sysctl_sched_rt_period <= 0)
++ return -EINVAL;
++
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;