/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *      TODO:
 *           1. better handle wakeups from external interrupts. currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. the
 *              reason is that for external interrupts that need no ack,
 *              clamping down a cpu in non-irq context does not reduce the irq
 *              rate. in the majority of cases clamping down a cpu does help
 *              reduce irqs as well; we should be able to differentiate the
 *              two cases and give a quantitative solution for the irqs we
 *              can control, perhaps based on get_cpu_iowait_time_us()
 *
 *           2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to
 * meet the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
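/*
 * Note: "duration" itself is kept in milliseconds; the default works out
 * to jiffies_to_msecs(6), e.g. 6 ms with CONFIG_HZ=1000 or 24 ms with
 * CONFIG_HZ=250, inside the 6-25 ms range enforced by duration_set().
 */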

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
                                  * update control parameters. Defaults to
                                  * the BSP, but the BSP can be offlined.
                                  */
static bool clamping;

static const struct sched_param sparam = {
        .sched_priority = MAX_USER_RT_PRIO / 2,
};
struct powerclamp_worker_data {
        struct kthread_worker *worker;
        struct kthread_work balancing_work;
        struct kthread_delayed_work idle_injection_work;
        unsigned int cpu;
        unsigned int count;
        unsigned int guard;
        unsigned int window_size_now;
        unsigned int target_ratio;
        unsigned int duration_jiffies;
        bool clamping;
};

static struct powerclamp_worker_data * __percpu worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
                                           * clamping kthread worker
                                           */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_duration;

        ret = kstrtoul(arg, 10, &new_duration);
        if (ret)
                goto exit;
        if (new_duration > 25 || new_duration < 6) {
                pr_err("Out of recommended range %lu, between 6-25ms\n",
                        new_duration);
                ret = -EINVAL;
        }

        duration = clamp(new_duration, 6ul, 25ul);
        smp_mb();

exit:
        return ret;
}

static const struct kernel_param_ops duration_ops = {
        .set = duration_set,
        .get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

struct powerclamp_calibration_data {
        unsigned long confidence;  /* used for calibration: a counter that
                                    * gets incremented each time a clamping
                                    * period is completed without extra
                                    * wakeups; once that counter reaches a
                                    * given level, the compensation is
                                    * deemed usable.
                                    */
        unsigned long steady_comp; /* steady state compensation used when
                                    * no extra wakeups occurred.
                                    */
        unsigned long dynamic_comp; /* compensates for excessive wakeups
                                     * from idle, mostly due to external
                                     * interrupts.
                                     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_window_size;

        ret = kstrtoul(arg, 10, &new_window_size);
        if (ret)
                goto exit_win;
        if (new_window_size > 10 || new_window_size < 2) {
                pr_err("Out of recommended window size %lu, between 2-10\n",
                        new_window_size);
                ret = -EINVAL;
        }

        window_size = clamp(new_window_size, 2ul, 10ul);
        smp_mb();

exit_win:
        return ret;
}

static const struct kernel_param_ops window_size_ops = {
        .set = window_size_set,
        .get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
        "\tpowerclamp controls the idle ratio within this window. a larger\n"
        "\twindow size results in slower response time but smoother\n"
        "\tclamping results. defaults to 2.");
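/*
 * Both parameters are also writable at runtime through the standard
 * module parameter files, e.g.:
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 *   echo 5 > /sys/module/intel_powerclamp/parameters/window_size
 */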

static void find_target_mwait(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int highest_cstate = 0;
        unsigned int highest_subcstate = 0;
        int i;

        if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
                return;

        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
            !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
                return;

        edx >>= MWAIT_SUBSTATE_SIZE;
        for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
                if (edx & MWAIT_SUBSTATE_MASK) {
                        highest_cstate = i;
                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
                }
        }
        target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
                (highest_subcstate - 1);
}

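/*
 * Example: with MWAIT_SUBSTATE_SIZE == 4, the hint computed above packs
 * the deepest C-state index into the upper nibble and (sub-state - 1)
 * into the lower one, so a CPU whose deepest state is C6 sub-state 2
 * ends up with target_mwait == 0x61.
 */
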
struct pkg_cstate_info {
        bool skip;
        int msr_index;
        int cstate_id;
};

#define PKG_CSTATE_INIT(id) {                           \
                .msr_index = MSR_PKG_C##id##_RESIDENCY, \
                .cstate_id = id                         \
                        }

static struct pkg_cstate_info pkg_cstates[] = {
        PKG_CSTATE_INIT(2),
        PKG_CSTATE_INIT(3),
        PKG_CSTATE_INIT(6),
        PKG_CSTATE_INIT(7),
        PKG_CSTATE_INIT(8),
        PKG_CSTATE_INIT(9),
        PKG_CSTATE_INIT(10),
        {NULL},
};

static bool has_pkg_state_counter(void)
{
        u64 val;
        struct pkg_cstate_info *info = pkg_cstates;

        /* check if any one of the counter msrs exists */
        while (info->msr_index) {
                if (!rdmsrl_safe(info->msr_index, &val))
                        return true;
                info++;
        }

        return false;
}

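/*
 * pkg_state_counter() below sums every package C-state residency MSR
 * that is still readable. An MSR that faults once is flagged ->skip and
 * never read again, so subsequent deltas are computed over a stable set
 * of counters.
 */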
static u64 pkg_state_counter(void)
{
        u64 val;
        u64 count = 0;
        struct pkg_cstate_info *info = pkg_cstates;

        while (info->msr_index) {
                if (!info->skip) {
                        if (!rdmsrl_safe(info->msr_index, &val))
                                count += val;
                        else
                                info->skip = true;
                }
                info++;
        }

        return count;
}

static unsigned int get_compensation(int ratio)
{
        unsigned int comp = 0;

        /* we only use compensation if all adjacent ones are good */
        if (ratio == 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio + 1].steady_comp +
                        cal_data[ratio + 2].steady_comp) / 3;
        } else if (ratio == MAX_TARGET_RATIO - 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio - 2].steady_comp) / 3;
        } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio + 1].steady_comp) / 3;
        }

        /* REVISIT: simple penalty of double idle injection */
        if (reduce_irq)
                comp = ratio;
        /* do not exceed limit */
        if (comp + ratio >= MAX_TARGET_RATIO)
                comp = MAX_TARGET_RATIO - ratio - 1;

        return comp;
}
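/*
 * Worked example with illustrative numbers: for a target ratio of 20
 * whose neighbors at 19 and 21 all reached CONFIDENCE_OK with
 * steady_comp values 3, 4 and 5, comp = (4 + 3 + 5) / 3 = 4, so the
 * injected ratio becomes 20 + 4 = 24 (always kept below
 * MAX_TARGET_RATIO by the cap above).
 */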

static void adjust_compensation(int target_ratio, unsigned int win)
{
        int delta;
        struct powerclamp_calibration_data *d = &cal_data[target_ratio];

        /*
         * Skip the adjustment if the confidence level has already been
         * reached, or if there were too many wakeups during the last idle
         * injection period; in that case the data cannot be trusted for
         * compensation.
         */
        if (d->confidence >= CONFIDENCE_OK ||
                atomic_read(&idle_wakeup_counter) >
                win * num_online_cpus())
                return;

        delta = set_target_ratio - current_ratio;
        /* filter out bad data */
        if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
                if (d->steady_comp)
                        d->steady_comp =
                                roundup(delta + d->steady_comp, 2) / 2;
                else
                        d->steady_comp = delta;
                d->confidence++;
        }
}
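/*
 * The steady_comp update above is a two-point running average rounded
 * up: e.g. a stored steady_comp of 3 and a new delta of 4 give
 * roundup(4 + 3, 2) / 2 = 8 / 2 = 4.
 */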

static bool powerclamp_adjust_controls(unsigned int target_ratio,
                                unsigned int guard, unsigned int win)
{
        static u64 msr_last, tsc_last;
        u64 msr_now, tsc_now;
        u64 val64;

        /* check result for the last window */
        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                current_ratio = 1;
        else if (tsc_now - tsc_last) {
                val64 = 100 * (msr_now - msr_last);
                do_div(val64, (tsc_now - tsc_last));
                current_ratio = val64;
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        adjust_compensation(target_ratio, win);
        /*
         * too many external interrupts, set flag such
         * that we can take measures later.
         */
        reduce_irq = atomic_read(&idle_wakeup_counter) >=
                2 * win * num_online_cpus();

        atomic_set(&idle_wakeup_counter, 0);
        /* if we are above target+guard, skip */
        return set_target_ratio + guard <= current_ratio;
}
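/*
 * current_ratio is 100 * delta(pkg residency) / delta(TSC), i.e. the
 * percentage of wall time the package spent in C-states over the last
 * window. The math assumes the residency MSRs tick at TSC frequency.
 */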

static void clamp_balancing_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;
        int sleeptime;
        unsigned long target_jiffies;
        unsigned int compensated_ratio;
        int interval; /* jiffies to sleep for each attempt */

        w_data = container_of(work, struct powerclamp_worker_data,
                              balancing_work);

        /*
         * make sure a user-selected ratio does not take effect until
         * the next round. adjust target_ratio if the user has changed
         * the target, so that we can converge quickly.
         */
        w_data->target_ratio = READ_ONCE(set_target_ratio);
        w_data->guard = 1 + w_data->target_ratio / 20;
        w_data->window_size_now = window_size;
        w_data->duration_jiffies = msecs_to_jiffies(duration);
        w_data->count++;

        /*
         * systems may differ in their ability to enter package-level
         * c-states, thus we need to compensate the injected idle ratio
         * to achieve the actual target reported by the HW.
         */
        compensated_ratio = w_data->target_ratio +
                get_compensation(w_data->target_ratio);
        if (compensated_ratio <= 0)
                compensated_ratio = 1;
        interval = w_data->duration_jiffies * 100 / compensated_ratio;

        /* align idle time */
        target_jiffies = roundup(jiffies, interval);
        sleeptime = target_jiffies - jiffies;
        if (sleeptime <= 0)
                sleeptime = 1;

        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_delayed_work(w_data->worker,
                                           &w_data->idle_injection_work,
                                           sleeptime);
}
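/*
 * Example: with duration_jiffies = 6 and a compensated ratio of 25,
 * interval = 6 * 100 / 25 = 24, so 6 idle jiffies are injected out of
 * every 24, i.e. 25% idle time. Rounding target_jiffies up to a
 * multiple of the interval keeps the injection windows of all CPUs
 * aligned on the global jiffies clock, which is what allows the whole
 * package to enter a C-state.
 */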

static void clamp_idle_injection_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;

        w_data = container_of(work, struct powerclamp_worker_data,
                              idle_injection_work.work);

        /*
         * only the elected controlling cpu can collect stats and update
         * control parameters.
         */
        if (w_data->cpu == control_cpu &&
            !(w_data->count % w_data->window_size_now)) {
                should_skip =
                        powerclamp_adjust_controls(w_data->target_ratio,
                                                   w_data->guard,
                                                   w_data->window_size_now);
                smp_mb();
        }

        if (should_skip)
                goto balance;

        play_idle(jiffies_to_msecs(w_data->duration_jiffies));

balance:
        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_work(w_data->worker, &w_data->balancing_work);
}
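/*
 * The two work items ping-pong: clamp_balancing_func() computes the
 * next injection parameters and queues clamp_idle_injection_func()
 * after "sleeptime"; that in turn injects the idle period and queues
 * the balancing work again, until "clamping" goes false or the CPU
 * goes offline.
 */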

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
        static u64 msr_last;
        static u64 tsc_last;

        u64 msr_now;
        u64 tsc_now;
        u64 val64;

        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                pkg_cstate_ratio_cur = 1;
        else {
                if (tsc_now - tsc_last) {
                        val64 = 100 * (msr_now - msr_last);
                        do_div(val64, (tsc_now - tsc_last));
                        pkg_cstate_ratio_cur = val64;
                }
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        if (clamping)
                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static void start_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
        struct kthread_worker *worker;

        worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);
        if (IS_ERR(worker))
                return;

        w_data->worker = worker;
        w_data->count = 0;
        w_data->cpu = cpu;
        w_data->clamping = true;
        set_bit(cpu, cpu_clamping_mask);
        sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
        kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
        kthread_init_delayed_work(&w_data->idle_injection_work,
                                  clamp_idle_injection_func);
        kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

        if (!w_data->worker)
                return;

        w_data->clamping = false;
        /*
         * Make sure that all work items queued after this point see
         * clamping disabled. The counterpart is not needed because
         * there is an implicit memory barrier when the queued work
         * is processed.
         */
        smp_wmb();
        kthread_cancel_work_sync(&w_data->balancing_work);
        kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
        /*
         * The balancing work might still get queued here because
         * the handling of the "clamping" variable, cancel, and queue
         * operations are not synchronized via a lock. But it is not
         * a big deal. The balancing work is fast and
         * kthread_destroy_worker() will wait for it.
         */
        clear_bit(w_data->cpu, cpu_clamping_mask);
        kthread_destroy_worker(w_data->worker);

        w_data->worker = NULL;
}

static int start_power_clamp(void)
{
        unsigned long cpu;

        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
        /* prevent cpu hotplug */
        get_online_cpus();

        /* prefer BSP */
        control_cpu = 0;
        if (!cpu_online(control_cpu))
                control_cpu = smp_processor_id();

        clamping = true;
        schedule_delayed_work(&poll_pkg_cstate_work, 0);

        /* start one kthread worker per online cpu */
        for_each_online_cpu(cpu) {
                start_power_clamp_worker(cpu);
        }
        put_online_cpus();

        return 0;
}

static void end_power_clamp(void)
{
        int i;

        /*
         * Block requeuing in all the kthread workers. They will flush and
         * stop faster.
         */
        clamping = false;
        if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
                for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
                        pr_debug("clamping worker for cpu %d alive, destroy\n",
                                 i);
                        stop_power_clamp_worker(i);
                }
        }
}

static int powerclamp_cpu_online(unsigned int cpu)
{
        if (!clamping)
                return 0;
        start_power_clamp_worker(cpu);
        /* prefer BSP as controlling CPU */
        if (cpu == 0) {
                control_cpu = 0;
                smp_mb();
        }
        return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
        if (!clamping)
                return 0;

        stop_power_clamp_worker(cpu);
        if (cpu != control_cpu)
                return 0;

        control_cpu = cpumask_first(cpu_online_mask);
        if (control_cpu == cpu)
                control_cpu = cpumask_next(cpu, cpu_online_mask);
        smp_mb();
        return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        *state = MAX_TARGET_RATIO;

        return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        if (clamping)
                *state = pkg_cstate_ratio_cur;
        else
                /* to save power, do not poll idle ratio while not clamping */
                *state = -1; /* indicates invalid state */

        return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long new_target_ratio)
{
        int ret = 0;

        new_target_ratio = clamp(new_target_ratio, 0UL,
                                (unsigned long) (MAX_TARGET_RATIO - 1));
        if (set_target_ratio == 0 && new_target_ratio > 0) {
                pr_info("Start idle injection to reduce power\n");
                set_target_ratio = new_target_ratio;
                ret = start_power_clamp();
                goto exit_set;
        } else if (set_target_ratio > 0 && new_target_ratio == 0) {
                pr_info("Stop forced idle injection\n");
                end_power_clamp();
                set_target_ratio = 0;
        } else { /* adjust the currently running clamping */
                set_target_ratio = new_target_ratio;
                /* make the new set_target_ratio visible to other cpus */
                smp_mb();
        }

exit_set:
        return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
        .get_max_state = powerclamp_get_max_state,
        .get_cur_state = powerclamp_get_cur_state,
        .set_cur_state = powerclamp_set_cur_state,
};
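/*
 * The cooling device registered with these ops appears as
 * /sys/class/thermal/cooling_deviceX (type "intel_powerclamp"), where X
 * is assigned at registration time. Writing an idle percentage to its
 * cur_state attribute, e.g.
 *   echo 30 > /sys/class/thermal/cooling_deviceX/cur_state
 * invokes powerclamp_set_cur_state() and starts idle injection.
 */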

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
        {}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
        if (!x86_match_cpu(intel_powerclamp_ids)) {
                pr_err("CPU does not support MWAIT\n");
                return -ENODEV;
        }

        /* The goal of idle time alignment is to achieve package cstates. */
        if (!has_pkg_state_counter()) {
                pr_info("No package C-state available\n");
                return -ENODEV;
        }

        /* find the deepest mwait value */
        find_target_mwait();

        return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
        int i = 0;

        seq_printf(m, "controlling cpu: %d\n", control_cpu);
        seq_printf(m, "pct confidence steady dynamic (compensation)\n");
        for (i = 0; i < MAX_TARGET_RATIO; i++) {
                seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
                        i,
                        cal_data[i].confidence,
                        cal_data[i].steady_comp,
                        cal_data[i].dynamic_comp);
        }

        return 0;
}

static int powerclamp_debug_open(struct inode *inode,
                        struct file *file)
{
        return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
        .open           = powerclamp_debug_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
        .owner          = THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
        debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
        if (!debug_dir)
                return;

        if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
                                        cal_data, &powerclamp_debug_fops))
                goto file_error;

        return;

file_error:
        debugfs_remove_recursive(debug_dir);
}

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
        int retval;
        int bitmap_size;

        bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
        cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
        if (!cpu_clamping_mask)
                return -ENOMEM;

        /* probe cpu features and ids here */
        retval = powerclamp_probe();
        if (retval)
                goto exit_free;

        /* set default limit, may be adjusted at runtime based on feedback */
        window_size = 2;
        retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                           "thermal/intel_powerclamp:online",
                                           powerclamp_cpu_online,
                                           powerclamp_cpu_predown);
        if (retval < 0)
                goto exit_free;

        hp_state = retval;

        worker_data = alloc_percpu(struct powerclamp_worker_data);
        if (!worker_data) {
                retval = -ENOMEM;
                goto exit_unregister;
        }

        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
                                                &powerclamp_cooling_ops);
        if (IS_ERR(cooling_dev)) {
                retval = -ENODEV;
                goto exit_free_thread;
        }

        if (!duration)
                duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

        powerclamp_create_debug_files();

        return 0;

exit_free_thread:
        free_percpu(worker_data);
exit_unregister:
        cpuhp_remove_state_nocalls(hp_state);
exit_free:
        kfree(cpu_clamping_mask);
        return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
        end_power_clamp();
        cpuhp_remove_state_nocalls(hp_state);
        free_percpu(worker_data);
        thermal_cooling_device_unregister(cooling_dev);
        kfree(cpu_clamping_mask);

        cancel_delayed_work_sync(&poll_pkg_cstate_work);
        debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");