/*
 * CPUFreq governor based on scheduler-provided CPU utilization data.
 *
 * Copyright (C) 2016, Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cpufreq.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
#include <trace/events/power.h>

#include "sched.h"

#define SUGOV_KTHREAD_PRIORITY	50
struct sugov_tunables {
	struct gov_attr_set attr_set;
	unsigned int rate_limit_us;
};
struct sugov_policy {
	struct cpufreq_policy *policy;

	struct sugov_tunables *tunables;
	struct list_head tunables_hook;

	raw_spinlock_t update_lock;	/* For shared policies */
	u64 last_freq_update_time;
	s64 freq_update_delay_ns;
	unsigned int next_freq;

	/* The next fields are only needed if fast switch cannot be used. */
	struct irq_work irq_work;
	struct kthread_work work;
	struct mutex work_lock;
	struct kthread_worker worker;
	struct task_struct *thread;
	bool work_in_progress;

	bool need_freq_update;
};
struct sugov_cpu {
	struct update_util_data update_util;
	struct sugov_policy *sg_policy;

	unsigned int cached_raw_freq;
	unsigned long iowait_boost;
	unsigned long iowait_boost_max;
	u64 last_update;

	/* The fields below are only needed when sharing a policy. */
	unsigned long util;
	unsigned long max;
	unsigned int flags;
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);

/************************ Governor internals ***********************/
static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
	s64 delta_ns;

	if (sg_policy->work_in_progress)
		return false;

	if (unlikely(sg_policy->need_freq_update)) {
		sg_policy->need_freq_update = false;
		/*
		 * This happens when limits change, so forget the previous
		 * next_freq value and force an update.
		 */
		sg_policy->next_freq = UINT_MAX;
		return true;
	}

	delta_ns = time - sg_policy->last_freq_update_time;
	return delta_ns >= sg_policy->freq_update_delay_ns;
}
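
/*
 * Illustration with hypothetical numbers: with rate_limit_us = 10000, a
 * utilization update that arrives less than 10 ms after the previous
 * frequency change is ignored by the check above, unless
 * need_freq_update was set by a limits change in the meantime.
 */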
static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
				unsigned int next_freq)
{
	struct cpufreq_policy *policy = sg_policy->policy;

	sg_policy->last_freq_update_time = time;

	if (policy->fast_switch_enabled) {
		if (sg_policy->next_freq == next_freq) {
			trace_cpu_frequency(policy->cur, smp_processor_id());
			return;
		}
		sg_policy->next_freq = next_freq;
		next_freq = cpufreq_driver_fast_switch(policy, next_freq);
		if (next_freq == CPUFREQ_ENTRY_INVALID)
			return;

		policy->cur = next_freq;
		trace_cpu_frequency(next_freq, smp_processor_id());
	} else if (sg_policy->next_freq != next_freq) {
		sg_policy->next_freq = next_freq;
		sg_policy->work_in_progress = true;
		irq_work_queue(&sg_policy->irq_work);
	}
}
/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_cpu: schedutil cpu object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal or greater than the raw
 * next_freq (as calculated above) is returned, subject to policy min/max and
 * cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
				  unsigned long max)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq = arch_scale_freq_invariant() ?
				policy->cpuinfo.max_freq : policy->cur;

	freq = (freq + (freq >> 2)) * util / max;

	if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
		return sg_policy->next_freq;
	sg_cpu->cached_raw_freq = freq;
	return cpufreq_driver_resolve_freq(policy, freq);
}
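
/*
 * Worked example with illustrative values (not from any real platform):
 * assuming frequency-invariant utilization, max_freq = 2000000 kHz,
 * util = 819 and max = 1024, the raw frequency computed above is
 * (2000000 + 500000) * 819 / 1024 = 1999511 kHz, i.e. just under
 * max_freq at the 80% tipping point; cpufreq_driver_resolve_freq() then
 * rounds it up to the nearest driver-supported frequency.
 */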
static void sugov_get_util(unsigned long *util, unsigned long *max)
{
	struct rq *rq = this_rq();
	unsigned long cfs_max;

	cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());

	*util = min(rq->cfs.avg.util_avg, cfs_max);
	*max = cfs_max;
}
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
				   unsigned int flags)
{
	if (flags & SCHED_CPUFREQ_IOWAIT) {
		sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
	} else if (sg_cpu->iowait_boost) {
		s64 delta_ns = time - sg_cpu->last_update;

		/* Clear iowait_boost if the CPU appears to have been idle. */
		if (delta_ns > TICK_NSEC)
			sg_cpu->iowait_boost = 0;
	}
}
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
			       unsigned long *max)
{
	unsigned long boost_util = sg_cpu->iowait_boost;
	unsigned long boost_max = sg_cpu->iowait_boost_max;

	if (!boost_util)
		return;

	if (*util * boost_max < *max * boost_util) {
		*util = boost_util;
		*max = boost_max;
	}
	sg_cpu->iowait_boost >>= 1;
}
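
/*
 * Illustration of the decay above: after an iowait wakeup the boost
 * starts at iowait_boost_max and is halved on every subsequent
 * utilization update, so it fades within a few updates unless it is
 * refreshed by another SCHED_CPUFREQ_IOWAIT event.
 */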
static void sugov_update_single(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned long util, max;
	unsigned int next_f;

	sugov_set_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	if (!sugov_should_update_freq(sg_policy, time))
		return;

	if (flags & SCHED_CPUFREQ_RT_DL) {
		next_f = policy->cpuinfo.max_freq;
	} else {
		sugov_get_util(&util, &max);
		sugov_iowait_boost(sg_cpu, &util, &max);
		next_f = get_next_freq(sg_cpu, util, max);
	}
	sugov_update_commit(sg_policy, time, next_f);
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
					   unsigned long util, unsigned long max,
					   unsigned int flags)
{
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int max_f = policy->cpuinfo.max_freq;
	u64 last_freq_update_time = sg_policy->last_freq_update_time;
	unsigned int j;

	if (flags & SCHED_CPUFREQ_RT_DL)
		return max_f;

	sugov_iowait_boost(sg_cpu, &util, &max);

	for_each_cpu(j, policy->cpus) {
		struct sugov_cpu *j_sg_cpu;
		unsigned long j_util, j_max;
		s64 delta_ns;

		if (j == smp_processor_id())
			continue;

		j_sg_cpu = &per_cpu(sugov_cpu, j);
		/*
		 * If the CPU utilization was last updated before the previous
		 * frequency update and the time elapsed between the last update
		 * of the CPU utilization and the last frequency update is long
		 * enough, don't take the CPU into account as it probably is
		 * idle now (and clear iowait_boost for it).
		 */
		delta_ns = last_freq_update_time - j_sg_cpu->last_update;
		if (delta_ns > TICK_NSEC) {
			j_sg_cpu->iowait_boost = 0;
			continue;
		}
		if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
			return max_f;

		j_util = j_sg_cpu->util;
		j_max = j_sg_cpu->max;
		if (j_util * max > j_max * util) {
			util = j_util;
			max = j_max;
		}

		sugov_iowait_boost(j_sg_cpu, &util, &max);
	}

	return get_next_freq(sg_cpu, util, max);
}
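
/*
 * Example with illustrative numbers: on a two-CPU shared policy where
 * the current CPU reports util/max = 300/1024 and the other CPU
 * recently reported 700/1024, the cross-multiplied comparison in the
 * loop above (700 * 1024 > 1024 * 300) selects the second CPU's values,
 * so the shared frequency always follows the busiest non-idle CPU.
 */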
static void sugov_update_shared(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
	unsigned long util, max;
	unsigned int next_f;

	sugov_get_util(&util, &max);

	raw_spin_lock(&sg_policy->update_lock);

	sg_cpu->util = util;
	sg_cpu->max = max;
	sg_cpu->flags = flags;

	sugov_set_iowait_boost(sg_cpu, time, flags);
	sg_cpu->last_update = time;

	if (sugov_should_update_freq(sg_policy, time)) {
		next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
		sugov_update_commit(sg_policy, time, next_f);
	}

	raw_spin_unlock(&sg_policy->update_lock);
}
static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
				CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);

	sg_policy->work_in_progress = false;
}
static void sugov_irq_work(struct irq_work *irq_work)
{
	struct sugov_policy *sg_policy;

	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);

	/*
	 * For RT and deadline tasks, the schedutil governor shoots the
	 * frequency to maximum. Special care must be taken to ensure that this
	 * kthread doesn't result in the same behavior.
	 *
	 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
	 * updated only at the end of the sugov_work() function and before that
	 * the schedutil governor rejects all other frequency scaling requests.
	 *
	 * There is a very rare case though, where the RT thread yields right
	 * after the work_in_progress flag is cleared. The effects of that are
	 * neglected for now.
	 */
	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
/************************** sysfs interface ************************/

static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);
static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
	return container_of(attr_set, struct sugov_tunables, attr_set);
}

static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);

	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}
static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
				   size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
	unsigned int rate_limit_us;

	if (kstrtouint(buf, 10, &rate_limit_us))
		return -EINVAL;

	tunables->rate_limit_us = rate_limit_us;

	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;

	return count;
}
static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);

static struct attribute *sugov_attributes[] = {
	&rate_limit_us.attr,
	NULL
};

static struct kobj_type sugov_tunables_ktype = {
	.default_attrs = sugov_attributes,
	.sysfs_ops = &governor_sysfs_ops,
};
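
/*
 * Usage sketch (assuming schedutil is active on a policy exposed as
 * policy0; the exact path depends on the platform):
 *
 *   # cat /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *   # echo 2000 > /sys/devices/system/cpu/cpufreq/policy0/schedutil/rate_limit_us
 *
 * The store handler above converts the new value to nanoseconds for
 * every policy attached to these tunables.
 */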
/********************** cpufreq governor interface *********************/

static struct cpufreq_governor schedutil_gov;

static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;

	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
	if (!sg_policy)
		return NULL;

	sg_policy->policy = policy;
	raw_spin_lock_init(&sg_policy->update_lock);
	return sg_policy;
}

static void sugov_policy_free(struct sugov_policy *sg_policy)
{
	kfree(sg_policy);
}
static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
	struct cpufreq_policy *policy = sg_policy->policy;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&sg_policy->work, sugov_work);
	kthread_init_worker(&sg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
				"sugov:%d",
				cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
		return ret;
	}

	sg_policy->thread = thread;
	kthread_bind_mask(thread, policy->related_cpus);
	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
	mutex_init(&sg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}
static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
	/* kthread only required for slow path */
	if (sg_policy->policy->fast_switch_enabled)
		return;

	kthread_flush_worker(&sg_policy->worker);
	kthread_stop(sg_policy->thread);
	mutex_destroy(&sg_policy->work_lock);
}
static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
	struct sugov_tunables *tunables;

	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
	if (tunables) {
		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
		if (!have_governor_per_policy())
			global_tunables = tunables;
	}
	return tunables;
}

static void sugov_tunables_free(struct sugov_tunables *tunables)
{
	if (!have_governor_per_policy())
		global_tunables = NULL;

	kfree(tunables);
}
static int sugov_init(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy;
	struct sugov_tunables *tunables;
	unsigned int lat;
	int ret = 0;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	sg_policy = sugov_policy_alloc(policy);
	if (!sg_policy) {
		ret = -ENOMEM;
		goto disable_fast_switch;
	}

	ret = sugov_kthread_create(sg_policy);
	if (ret)
		goto free_sg_policy;

	mutex_lock(&global_tunables_lock);

	if (global_tunables) {
		if (WARN_ON(have_governor_per_policy())) {
			ret = -EINVAL;
			goto stop_kthread;
		}
		policy->governor_data = sg_policy;
		sg_policy->tunables = global_tunables;

		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
		goto out;
	}

	tunables = sugov_tunables_alloc(sg_policy);
	if (!tunables) {
		ret = -ENOMEM;
		goto stop_kthread;
	}

	tunables->rate_limit_us = LATENCY_MULTIPLIER;
	lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
	if (lat)
		tunables->rate_limit_us *= lat;
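
	/*
	 * Example with illustrative numbers: LATENCY_MULTIPLIER is 1000, so
	 * a driver transition latency of 10000 ns gives lat = 10 and a
	 * default rate_limit_us of 10000, i.e. at most one frequency update
	 * every 10 ms until userspace overrides the tunable.
	 */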
	policy->governor_data = sg_policy;
	sg_policy->tunables = tunables;

	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
				   get_governor_parent_kobj(policy), "%s",
				   schedutil_gov.name);
	if (ret)
		goto fail;

out:
	mutex_unlock(&global_tunables_lock);
	return 0;

fail:
	policy->governor_data = NULL;
	sugov_tunables_free(tunables);

stop_kthread:
	sugov_kthread_stop(sg_policy);

free_sg_policy:
	mutex_unlock(&global_tunables_lock);
	sugov_policy_free(sg_policy);

disable_fast_switch:
	cpufreq_disable_fast_switch(policy);

	pr_err("initialization failed (error %d)\n", ret);
	return ret;
}
static void sugov_exit(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	struct sugov_tunables *tunables = sg_policy->tunables;
	unsigned int count;

	mutex_lock(&global_tunables_lock);

	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
	policy->governor_data = NULL;
	if (!count)
		sugov_tunables_free(tunables);

	mutex_unlock(&global_tunables_lock);

	sugov_kthread_stop(sg_policy);
	sugov_policy_free(sg_policy);
	cpufreq_disable_fast_switch(policy);
}
static int sugov_start(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq = UINT_MAX;
	sg_policy->work_in_progress = false;
	sg_policy->need_freq_update = false;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		sg_cpu->sg_policy = sg_policy;
		if (policy_is_shared(policy)) {
			sg_cpu->util = 0;
			sg_cpu->max = 0;
			sg_cpu->flags = SCHED_CPUFREQ_RT;
			sg_cpu->last_update = 0;
			sg_cpu->cached_raw_freq = 0;
			sg_cpu->iowait_boost = 0;
			sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
						     sugov_update_shared);
		} else {
			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
						     sugov_update_single);
		}
	}
	return 0;
}
static void sugov_stop(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_sched();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&sg_policy->irq_work);
		kthread_cancel_work_sync(&sg_policy->work);
	}
}
static void sugov_limits(struct cpufreq_policy *policy)
{
	struct sugov_policy *sg_policy = policy->governor_data;

	if (!policy->fast_switch_enabled) {
		mutex_lock(&sg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);
		mutex_unlock(&sg_policy->work_lock);
	}

	sg_policy->need_freq_update = true;
}
static struct cpufreq_governor schedutil_gov = {
	.name = "schedutil",
	.owner = THIS_MODULE,
	.init = sugov_init,
	.exit = sugov_exit,
	.start = sugov_start,
	.stop = sugov_stop,
	.limits = sugov_limits,
};
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &schedutil_gov;
}
#endif
static int __init sugov_register(void)
{
	return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);