2 * intel_powerclamp.c - package c-state idle injection
4 * Copyright (c) 2012, Intel Corporation.
7 * Arjan van de Ven <arjan@linux.intel.com>
8 * Jacob Pan <jacob.jun.pan@linux.intel.com>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms and conditions of the GNU General Public License,
12 * version 2, as published by the Free Software Foundation.
14 * This program is distributed in the hope it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
19 * You should have received a copy of the GNU General Public License along with
20 * this program; if not, write to the Free Software Foundation, Inc.,
21 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
25 * 1. better handle wakeup from external interrupts, currently a fixed
26 * compensation is added to clamping duration when excessive amount
27 * of wakeups are observed during idle time. the reason is that in
28 * case of external interrupts without need for ack, clamping down
29 * cpu in non-irq context does not reduce irq. for majority of the
30 * cases, clamping down cpu does help reduce irq as well, we should
31 * be able to differenciate the two cases and give a quantitative
32 * solution for the irqs that we can control. perhaps based on
33 * get_cpu_iowait_time_us()
35 * 2. synchronization with other hw blocks
40 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
42 #include <linux/module.h>
43 #include <linux/kernel.h>
44 #include <linux/delay.h>
45 #include <linux/kthread.h>
46 #include <linux/cpu.h>
47 #include <linux/thermal.h>
48 #include <linux/slab.h>
49 #include <linux/tick.h>
50 #include <linux/debugfs.h>
51 #include <linux/seq_file.h>
52 #include <linux/sched/rt.h>
56 #include <asm/mwait.h>
57 #include <asm/cpu_device_id.h>
58 #include <asm/hardirq.h>
#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * considered trustworthy.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration, driver adjust sleep time to meet target
 * idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;	/* deepest MWAIT hint found at probe time */
static struct dentry *debug_dir;	/* debugfs dir holding calibration data */

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;	/* measured pkg C-state vs TSC ratio, percent */
static bool should_skip;
static bool reduce_irq;			/* set when excessive wakeups seen in last window */
static atomic_t idle_wakeup_counter;	/* wakeups observed during injected idle */
static unsigned int control_cpu; /* The cpu assigned to collect stat and update
				  * control parameters. default to BSP but BSP
				  * can be offlined, then control moves.
				  */

/* RT priority used for the per-cpu idle injection kthread workers */
static const struct sched_param sparam = {
	.sched_priority = MAX_USER_RT_PRIO / 2,
/*
 * Per-cpu state for the idle injection kthread pair: a balancing work
 * that computes the timing for the next round and a delayed work that
 * performs the actual forced idle.
 */
struct powerclamp_worker_data {
	struct kthread_worker *worker;		/* pinned kthread worker for this cpu */
	struct kthread_work balancing_work;	/* computes next round's parameters */
	struct kthread_delayed_work idle_injection_work; /* does the forced idle */
	unsigned int window_size_now;	/* snapshot of window_size for this round */
	unsigned int target_ratio;	/* snapshot of set_target_ratio */
	unsigned int duration_jiffies;	/* forced idle time per attempt */
103 static struct powerclamp_worker_data * __percpu worker_data;
104 static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu
					  * clamping kthread worker
					  */

static unsigned int duration;		/* forced idle time per attempt, in msec */
static unsigned int pkg_cstate_ratio_cur; /* last measured pkg C-state ratio (1 Hz poll) */
static unsigned int window_size;	/* sliding window, in clamping cycles */
/*
 * Module-parameter setter for "duration": parse the value and enforce
 * the recommended 6-25 ms range before updating the global.
 */
static int duration_set(const char *arg, const struct kernel_param *kp)
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	/* reject values outside the recommended range */
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
	duration = clamp(new_duration, 6ul, 25ul);

static const struct kernel_param_ops duration_ops = {
	/* NOTE(review): duration is unsigned int; param_get_uint would match better */
	.get = param_get_int,

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
/* Per-target-ratio runtime calibration record. */
struct powerclamp_calibration_data {
	unsigned long confidence;	/* used for calibration, basically a counter
					 * gets incremented each time a clamping
					 * period is completed without extra wakeups
					 * once that counter is reached given level,
					 * compensation is deemed usable.
					 */
	unsigned long steady_comp;	/* steady state compensation used when
					 * no extra wakeups occurred.
					 */
	unsigned long dynamic_comp;	/* compensate excessive wakeup from idle
					 * mostly from external interrupts.
					 */

/* one calibration slot per integer target ratio */
static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];
/*
 * Module-parameter setter for "window_size": parse the value and enforce
 * the recommended 2-10 cycle range before updating the global.
 */
static int window_size_set(const char *arg, const struct kernel_param *kp)
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	/* reject values outside the recommended range */
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
	window_size = clamp(new_window_size, 2ul, 10ul);

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but more smooth\n"
	"\tclamping results. default to 2.");
/*
 * Query the CPUID MWAIT leaf and record the deepest supported C-state
 * hint (cstate/substate encoded MWAIT value) in target_mwait.
 */
static void find_target_mwait(void)
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;

	/* bail out if the MWAIT leaf is not available */
	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	/* need extensions + interrupt-break for mwait based injection */
	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))

	/* edx holds the sub-state counts, MWAIT_SUBSTATE_SIZE bits each */
	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;

	/* encode deepest cstate/substate pair as an MWAIT hint */
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
/* descriptor mapping a package C-state to its residency counter MSR */
struct pkg_cstate_info {

/* build a pkg_cstate_info entry for package C-state <id> */
#define PKG_CSTATE_INIT(id) { \
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \

/* table of candidate package C-state residency counters */
static struct pkg_cstate_info pkg_cstates[] = {

/* true if at least one package C-state residency MSR is readable */
static bool has_pkg_state_counter(void)
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))

/* sum of all readable package C-state residency counters */
static u64 pkg_state_counter(void)
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
/*
 * Look up the steady-state compensation (extra idle percentage) for a
 * target ratio, averaging over adjacent calibrated ratios when their
 * confidence has reached CONFIDENCE_OK.  Returns 0 when no trustworthy
 * calibration data exists yet; the result is capped so that
 * ratio + compensation stays below MAX_TARGET_RATIO.
 */
static unsigned int get_compensation(int ratio)
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	    cal_data[ratio].confidence >= CONFIDENCE_OK &&
	    cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
	    cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		/* low edge: average with the two ratios above */
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		   cal_data[ratio].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		/* high edge: average with the two ratios below */
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		/* interior: average with the immediate neighbors */
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;

	/* REVISIT: simple penalty of double idle injection */

	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;
/*
 * Fold the observed target-vs-actual delta for @target_ratio into its
 * calibration record (running average of steady_comp) after a window
 * that had no excessive wakeups.
 */
static void adjust_compensation(int target_ratio, unsigned int win)
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensations if confidence level has not been reached or
	 * there are too many wakeups during the last idle injection period, we
	 * cannot trust the data for compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
	    atomic_read(&idle_wakeup_counter) >
	    win * num_online_cpus())

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
			/* running average toward the newly observed delta */
			roundup(delta+d->steady_comp, 2)/2;
		d->steady_comp = delta;
/*
 * Evaluate the last window: compute the achieved package C-state vs TSC
 * ratio, refresh calibration data, and latch the reduce_irq flag when
 * wakeups were excessive.  Returns true when current_ratio already meets
 * or exceeds target + guard, so the next injection can be skipped.
 */
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
	/* counters from the previous invocation, for delta computation */
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;

	/* check result for the last window */
	msr_now = pkg_state_counter();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set flag such
	 * that we can take measure later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
/*
 * kthread work: snapshot the per-round control parameters (target,
 * guard, window, duration), derive the injection interval from the
 * compensated ratio, and schedule the delayed idle-injection work
 * aligned to that interval.
 */
static void clamp_balancing_func(struct kthread_work *work)
	struct powerclamp_worker_data *w_data;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,

	/*
	 * make sure user selected ratio does not take effect until
	 * the next round. adjust target_ratio if user has changed
	 * target such that we can converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);

	/*
	 * systems may have different ability to enter package level
	 * c-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	/* NOTE(review): compensated_ratio is unsigned, so <= 0 only catches 0 */
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;

	/* re-arm only while clamping is still enabled and the cpu is up */
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
/*
 * kthread delayed work: on the control cpu, once per window, evaluate
 * the last window and adjust controls; then inject the forced idle
 * period on this cpu and re-queue the balancing work.
 */
static void clamp_idle_injection_func(struct kthread_work *work)
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only elected controlling cpu can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		powerclamp_adjust_controls(w_data->target_ratio,
					   w_data->window_size_now);

	/* inject the forced idle period on this cpu */
	play_idle(jiffies_to_msecs(w_data->duration_jiffies));

	/* re-arm only while clamping is still enabled and the cpu is up */
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
	msr_now = pkg_state_counter();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		pkg_cstate_ratio_cur = val64;

	/* re-arm for the next second while clamping remains active */
	if (true == clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
/*
 * Create and start the idle-injection kthread worker for @cpu: pin it
 * to the cpu, mark it clamping, raise it to SCHED_FIFO, initialize both
 * works and kick the balancing work.
 */
static void start_power_clamp_worker(unsigned long cpu)
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);

	w_data->worker = worker;
	w_data->clamping = true;
	/* record this cpu as having a live clamping worker */
	set_bit(cpu, cpu_clamping_mask);
	sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
/*
 * Stop the per-cpu clamping worker: clear its clamping flag, cancel
 * both works, and destroy the kthread worker.
 */
static void stop_power_clamp_worker(unsigned long cpu)
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	w_data->clamping = false;
	/*
	 * Make sure that all works that get queued after this point see
	 * the clamping disabled. The counter part is not needed because
	 * there is an implicit memory barrier when the queued work
	 * is proceed.
	 */
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work still might be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and destroy kthread
	 * worker will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
/*
 * Begin clamping: bound the target ratio, ensure the control cpu is
 * online, start the 1 Hz ratio polling, and launch one worker per
 * online cpu.
 */
static int start_power_clamp(void)
	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */

	/* fall back to the current cpu if the chosen control cpu is down */
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
/*
 * Stop every per-cpu clamping worker still marked alive in
 * cpu_clamping_mask.
 */
static void end_power_clamp(void)
	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping worker for cpu %d alive, destroy\n",
			stop_power_clamp_worker(i);
/* CPU hotplug callback: start a clamping worker on a newly onlined cpu */
static int powerclamp_cpu_online(unsigned int cpu)
	/* nothing to do while clamping is disabled */
	if (clamping == false)
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
/*
 * CPU hotplug callback: stop the worker on a cpu going down, and move
 * the control role to another online cpu if the departing cpu held it.
 */
static int powerclamp_cpu_predown(unsigned int cpu)
	/* nothing to do while clamping is disabled */
	if (clamping == false)

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)

	control_cpu = cpumask_first(cpu_online_mask);
	/* first online cpu is the one leaving; pick the next one */
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
/* thermal cooling op: max state is the maximum idle injection ratio */
static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
	*state = MAX_TARGET_RATIO;
/* thermal cooling op: report the last measured pkg C-state ratio */
static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
	if (true == clamping)
		*state = pkg_cstate_ratio_cur;
	/* to save power, do not poll idle ratio while not clamping */
	*state = -1; /* indicates invalid state */
/*
 * thermal cooling op: set the target idle ratio.  A 0 -> N transition
 * starts clamping, N -> 0 stops it, and any other change adjusts the
 * currently running target.
 */
static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
	} else /* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,

/* match any Intel CPU that advertises MWAIT */
static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },

MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
/*
 * Verify CPU prerequisites: MWAIT support and at least one readable
 * package C-state residency counter.
 */
static int __init powerclamp_probe(void)
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		/* NOTE(review): printk message lacks a trailing "\n" */
		pr_err("CPU does not support MWAIT");

	/* The goal for idle time alignment is to achieve package cstate. */
	if (!has_pkg_state_counter()) {
		/* NOTE(review): printk message lacks a trailing "\n" */
		pr_info("No package C-state available");

	/* find the deepest mwait value */
/* debugfs show: dump the control cpu and per-ratio calibration table */
static int powerclamp_debug_show(struct seq_file *m, void *unused)
	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			   cal_data[i].confidence,
			   cal_data[i].steady_comp,
			   cal_data[i].dynamic_comp);
/* debugfs open: wire the file to powerclamp_debug_show via single_open */
static int powerclamp_debug_open(struct inode *inode,
	return single_open(file, powerclamp_debug_show, inode->i_private);

static const struct file_operations powerclamp_debug_fops = {
	.open = powerclamp_debug_open,
	.release = single_release,
	.owner = THIS_MODULE,
/*
 * Create the debugfs directory and the calibration file; remove the
 * directory again if file creation fails.
 */
static inline void powerclamp_create_debug_files(void)
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
				cal_data, &powerclamp_debug_fops))

	debugfs_remove_recursive(debug_dir);

/* dynamic cpuhp state returned by cpuhp_setup_state_nocalls() */
static enum cpuhp_state hp_state;
/*
 * Module init: allocate the per-cpu tracking bitmap, probe CPU
 * features, register hotplug callbacks, allocate per-cpu worker data
 * and register the thermal cooling device.  Unwinds in reverse order
 * on failure.
 */
static int __init powerclamp_init(void)
	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)

	/* probe cpu features and ids here */
	retval = powerclamp_probe();

	/* set default limit, maybe adjusted during runtime based on feedback */
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);

	worker_data = alloc_percpu(struct powerclamp_worker_data);
		goto exit_unregister;

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						      &powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		goto exit_free_thread;

	/* default forced-idle duration derived from the jiffies budget */
	duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	/* error unwind */
	free_percpu(worker_data);
	cpuhp_remove_state_nocalls(hp_state);
	kfree(cpu_clamping_mask);

module_init(powerclamp_init);
/*
 * Module exit: remove the hotplug state, free per-cpu data, unregister
 * the cooling device, cancel the polling work and remove debugfs files.
 */
static void __exit powerclamp_exit(void)
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);

module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");