/*
* Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock;
+ u64 now = rq_of(cfs_rq)->clock_task;
unsigned long delta_exec;
if (unlikely(!curr))
/*
* We are starting a new run period:
*/
- se->exec_start = rq_of(cfs_rq)->clock;
+ se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
- if (unlikely(rt_prio(p->prio)))
- goto preempt;
-
- if (unlikely(p->sched_class != &fair_sched_class))
- return;
-
if (unlikely(se == pse))
return;
* 2) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq->clock, sd);
+ tsk_cache_hot = task_hot(p, rq->clock_task, sd);
if (!tsk_cache_hot ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
unsigned long this_load;
unsigned long this_load_per_task;
unsigned long this_nr_running;
+ unsigned long this_has_capacity;
+ unsigned int this_idle_cpus;
/* Statistics of the busiest group */
+ unsigned int busiest_idle_cpus;
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
unsigned long busiest_group_capacity;
+ unsigned long busiest_has_capacity;
+ unsigned int busiest_group_weight;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
+ unsigned long idle_cpus;
+ unsigned long group_weight;
int group_imb; /* Is there an imbalance in the group ? */
+ int group_has_capacity; /* Is there extra capacity in the group? */
};
/**
u64 total, available;
total = sched_avg_period() + (rq->clock - rq->age_stamp);
- available = total - rq->rt_avg;
+
+ if (unlikely(total < rq->rt_avg)) {
+ /* Ensures that power won't end up being negative */
+ available = 0;
+ } else {
+ available = total - rq->rt_avg;
+ }
if (unlikely((s64)total < SCHED_LOAD_SCALE))
total = SCHED_LOAD_SCALE;
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
- unsigned long load, max_cpu_load, min_cpu_load;
+ unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
min_cpu_load = ~0UL;
+ max_nr_running = 0;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
- if (load > max_cpu_load)
+ if (load > max_cpu_load) {
max_cpu_load = load;
+ max_nr_running = rq->nr_running;
+ }
if (min_cpu_load > load)
min_cpu_load = load;
}
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
sgs->sum_weighted_load += weighted_cpuload(i);
-
+ if (idle_cpu(i))
+ sgs->idle_cpus++;
}
/*
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity =
- DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
+ sgs->group_weight = group->group_weight;
+
+ if (sgs->group_capacity > sgs->sum_nr_running)
+ sgs->group_has_capacity = 1;
}
/**
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity to one so that we'll try
- * and move all the excess tasks away.
+ * and move all the excess tasks away. We lower the capacity
+ * of a group only if the local group has the capacity to fit
+ * these excess tasks, i.e. nr_running < group_capacity. The
+ * extra check prevents the case where you always pull from the
+ * heaviest group when it is already under-utilized (possible
+ * with a large weight task outweighs the tasks on the system).
*/
- if (prefer_sibling)
+ if (prefer_sibling && !local_group && sds->this_has_capacity)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
+ sds->this_has_capacity = sgs.group_has_capacity;
+ sds->this_idle_cpus = sgs.idle_cpus;
} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
+ sds->busiest_idle_cpus = sgs.idle_cpus;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->busiest_has_capacity = sgs.group_has_capacity;
+ sds->busiest_group_weight = sgs.group_weight;
sds->group_imb = sgs.group_imb;
}
return fix_small_imbalance(sds, this_cpu, imbalance);
}
+
/******* find_busiest_group() helpers end here *********************/
/**
* 4) This group is more busy than the avg busieness at this
* sched_domain.
* 5) The imbalance is within the specified limit.
+ *
+ * Note: when doing newidle balance, if the local group has excess
+ * capacity (i.e. nr_running < group_capacity) and the busiest group
+ * does not have any capacity, we force a load balance to pull tasks
+ * to the local group. In this case, we skip past checks 3, 4 and 5.
*/
if (!(*balance))
goto ret;
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
+ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+ !sds.busiest_has_capacity)
+ goto force_balance;
+
if (sds.this_load >= sds.max_load)
goto out_balanced;
if (sds.this_load >= sds.avg_load)
goto out_balanced;
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
- goto out_balanced;
+ /*
+ * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
+ * And to check for busy balance use !idle_cpu instead of
+ * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
+ * even when they are idle.
+ */
+ if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ goto out_balanced;
+ } else {
+ /*
+ * This cpu is idle. If the busiest group load doesn't
+ * have more tasks than the number of available cpu's and
+ * there is no imbalance between this and busiest group
+ * wrt to idle cpu's, it is balanced.
+ */
+ if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
+ sds.busiest_nr_running <= sds.busiest_group_weight)
+ goto out_balanced;
+ }
+force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
- sd->nr_balance_failed++;
+ /*
+ * Increment the failure counter only on periodic balance.
+ * We do not want newidle balance, which can be very
+ * frequent, pollute the failure counter causing
+ * excessive cache_hot migrations and active balances.
+ */
+ if (idle != CPU_NEWLY_IDLE)
+ sd->nr_balance_failed++;
if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
this_cpu)) {
update_rq_clock(rq);
- if (unlikely(task_cpu(p) != this_cpu))
+ if (unlikely(task_cpu(p) != this_cpu)) {
+ rcu_read_lock();
__set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
+ }
update_curr(cfs_rq);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
- update_curr(cfs_rq);
+ /*
+ * If the task was not on the rq at the time of this cgroup movement
+ * it must have been asleep, sleeping tasks keep their ->vruntime
+ * absolute on their old rq until wakeup (needed for the fair sleeper
+ * bonus in place_entity()).
+ *
+ * If it was on the rq, we've just 'preempted' it, which does convert
+ * ->vruntime to a relative base.
+ *
+ * Make sure both cases convert their relative position when migrating
+ * to another cgroup's rq. This does somewhat interfere with the
+ * fair sleeper stuff for the first placement, but who cares.
+ */
+ if (!on_rq)
+ p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ set_task_rq(p, task_cpu(p));
if (!on_rq)
- place_entity(cfs_rq, &p->se, 1);
+ p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
#endif
.get_rr_interval = get_rr_interval_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .moved_group = moved_group_fair,
+ .task_move_group = task_move_group_fair,
#endif
};