/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *	TODO:
 *	     1. better handle wakeup from external interrupts. Currently a
 *		fixed compensation is added to the clamping duration when an
 *		excessive number of wakeups is observed during idle time. The
 *		reason is that for external interrupts that need no ack,
 *		clamping down the CPU in non-irq context does not reduce the
 *		irq rate. For the majority of cases, clamping down the CPU
 *		does help reduce irqs as well; we should be able to
 *		differentiate the two cases and give a quantitative solution
 *		for the irqs that we can control, perhaps based on
 *		get_cpu_iowait_time_us().
 *
 *	     2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/*
 * For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/*
 * Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters. Defaults to
				  * the BSP, but the BSP can be offlined.
				  */
static bool clamping;


static struct task_struct * __percpu *powerclamp_thread;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping threads
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

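/*
 * Validate the "duration" module parameter. Out-of-range values are
 * rejected with -EINVAL, but note that the stored value is still
 * clamped into the recommended 6-25 ms range before returning.
 */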
static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
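
/*
 * Usage sketch (assuming the module keeps its default name): writable
 * module parameters show up under sysfs, so e.g.
 *	echo 10 > /sys/module/intel_powerclamp/parameters/duration
 * requests a 10 ms forced-idle duration per injection attempt.
 */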

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * that gets incremented each time a clamping
				    * period is completed without extra wakeups;
				    * once the counter reaches a given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensates for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

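/*
 * Validate the "window_size" module parameter; as with duration_set()
 * above, out-of-range values return -EINVAL while the stored value is
 * still clamped to the recommended 2-10 range.
 */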
static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. a larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");

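/*
 * Query CPUID leaf 5 (MONITOR/MWAIT) and record in target_mwait the
 * hint for the deepest supported C-state/sub-state, which the clamping
 * threads later pass to __mwait().
 */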
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}

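/*
 * Sum the residency counters of the package C-states this CPU exposes.
 * MSRs that fault on the first read are remembered and skipped on
 * subsequent calls.
 */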
static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;

	static bool skip_c2;
	static bool skip_c3;
	static bool skip_c6;
	static bool skip_c7;

	if (!skip_c2) {
		if (!rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &val))
			count += val;
		else
			skip_c2 = true;
	}

	if (!skip_c3) {
		if (!rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &val))
			count += val;
		else
			skip_c3 = true;
	}

	if (!skip_c6) {
		if (!rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &val))
			count += val;
		else
			skip_c6 = true;
	}

	if (!skip_c7) {
		if (!rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &val))
			count += val;
		else
			skip_c7 = true;
	}

	return count;
}

static void noop_timer(unsigned long foo)
{
	/* empty... just the fact that we get the interrupt wakes us up */
}

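/*
 * Return the calibrated compensation (in percent) for a target ratio,
 * averaging the entry with its neighbors so that a single noisy
 * calibration point is not used on its own.
 */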
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

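/*
 * Feed the measured idle ratio back into the calibration table for the
 * current target, building up confidence over undisturbed clamping
 * windows.
 */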
static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Skip the adjustment if the confidence level has already been
	 * reached, or if there were too many wakeups during the last idle
	 * injection period, in which case we cannot trust the data for
	 * compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

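/*
 * Compare package C-state residency against TSC progress over the last
 * window to compute the achieved idle ratio, then report whether the
 * next injection period should be skipped because we are already above
 * target plus guard band.
 */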
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	rdtscll(tsc_now);

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set flag such
	 * that we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

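/*
 * Per-CPU clamping kthread: runs at SCHED_FIFO and repeatedly injects
 * idle time via mwait with the tick stopped, so the package can enter
 * deep C-states. A per-thread timer bounds each idle stint, and the
 * elected control CPU evaluates the result once per window.
 */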
static int clamp_thread(void *arg)
{
	int cpunr = (unsigned long)arg;
	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO/2,
	};
	unsigned int count = 0;
	unsigned int target_ratio;

	set_bit(cpunr, cpu_clamping_mask);
	set_freezable();
	init_timer_on_stack(&wakeup_timer);
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (clamping && !kthread_should_stop() &&
		cpu_online(cpunr)) {
		int sleeptime;
		unsigned long target_jiffies;
		unsigned int guard;
		unsigned int compensation = 0;
		int interval; /* jiffies to sleep for each attempt */
		unsigned int duration_jiffies = msecs_to_jiffies(duration);
		unsigned int window_size_now;

		try_to_freeze();
		/*
		 * make sure the user selected ratio does not take effect
		 * until the next round. adjust target_ratio if the user has
		 * changed the target, so that we can converge quickly.
		 */
		target_ratio = set_target_ratio;
		guard = 1 + target_ratio / 20;
		window_size_now = window_size;
		count++;

		/*
		 * systems may have different abilities to enter package level
		 * c-states, thus we need to compensate the injected idle
		 * ratio to achieve the actual target reported by the HW.
		 */
		compensation = get_compensation(target_ratio);
		interval = duration_jiffies * 100 /
			(target_ratio + compensation);

		/* align idle time */
		target_jiffies = roundup(jiffies, interval);
		sleeptime = target_jiffies - jiffies;
		if (sleeptime <= 0)
			sleeptime = 1;
		schedule_timeout_interruptible(sleeptime);
		/*
		 * only the elected controlling cpu can collect stats and
		 * update control parameters.
		 */
		if (cpunr == control_cpu && !(count % window_size_now)) {
			should_skip =
				powerclamp_adjust_controls(target_ratio,
							guard, window_size_now);
			smp_mb();
		}

		if (should_skip)
			continue;

		target_jiffies = jiffies + duration_jiffies;
		mod_timer(&wakeup_timer, target_jiffies);
		if (unlikely(local_softirq_pending()))
			continue;
		/*
		 * stop the tick sched during idle time; interrupts are still
		 * allowed, thus jiffies are updated properly.
		 */
		preempt_disable();
		tick_nohz_idle_enter();
		/* mwait until target jiffies is reached */
		while (time_before(jiffies, target_jiffies)) {
			unsigned long ecx = 1;
			unsigned long eax = target_mwait;

			/*
			 * REVISIT: may call enter_idle() to notify drivers who
			 * can save power during cpu idle. same for exit_idle()
			 */
			local_touch_nmi();
			stop_critical_timings();
			__monitor((void *)&current_thread_info()->flags, 0, 0);
			cpu_relax(); /* allow HT sibling to run */
			__mwait(eax, ecx);
			start_critical_timings();
			atomic_inc(&idle_wakeup_counter);
		}
		tick_nohz_idle_exit();
		preempt_enable_no_resched();
	}
	del_timer_sync(&wakeup_timer);
	clear_bit(cpunr, cpu_clamping_mask);

	return 0;
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;
	static unsigned long jiffies_last;

	u64 msr_now;
	unsigned long jiffies_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	rdtscll(tsc_now);
	jiffies_now = jiffies;

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	jiffies_last = jiffies_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

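/*
 * Start clamping: verify that the package C-state counters advance,
 * elect a control CPU, kick off the 1 HZ status poll, and spawn one
 * bound clamping thread per online CPU.
 */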
static int start_power_clamp(void)
{
	unsigned long cpu;
	struct task_struct *thread;

	/* check if pkg cstate counter is completely 0, abort in this case */
	if (!pkg_state_counter()) {
		pr_err("pkg cstate counter not functional, abort\n");
		return -EINVAL;
	}

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one thread per online cpu */
	for_each_online_cpu(cpu) {
		struct task_struct **p =
			per_cpu_ptr(powerclamp_thread, cpu);

		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%ld", cpu);
		/* bind to cpu here */
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*p = thread;
		}
	}
	put_online_cpus();

	return 0;
}

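/*
 * Stop clamping: clear the flag, give the per-CPU threads a moment to
 * notice and exit on their own, then kthread_stop() any stragglers
 * still recorded in cpu_clamping_mask.
 */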
static void end_power_clamp(void)
{
	int i;
	struct task_struct *thread;

	clamping = false;
	/*
	 * make clamping visible to other cpus and give the per cpu clamping
	 * threads some time to exit; any that remain are killed below.
	 */
	smp_mb();
	msleep(20);
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping thread for cpu %d alive, kill\n", i);
			thread = *per_cpu_ptr(powerclamp_thread, i);
			kthread_stop(thread);
		}
	}
}

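/*
 * CPU hotplug notifier: start a clamping thread on a newly onlined CPU
 * and reap the thread (and migrate the control CPU role if needed)
 * when a CPU dies while clamping is active.
 */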
static int powerclamp_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct task_struct *thread;
	struct task_struct **percpu_thread =
		per_cpu_ptr(powerclamp_thread, cpu);

	if (!clamping)
		goto exit_ok;

	switch (action) {
	case CPU_ONLINE:
		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%lu", cpu);
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*percpu_thread = thread;
		}
		/* prefer BSP as controlling CPU */
		if (cpu == 0) {
			control_cpu = 0;
			smp_mb();
		}
		break;
	case CPU_DEAD:
		if (test_bit(cpu, cpu_clamping_mask)) {
			pr_err("cpu %lu dead but powerclamping thread is not\n",
				cpu);
			kthread_stop(*percpu_thread);
		}
		if (cpu == control_cpu) {
			control_cpu = smp_processor_id();
			smp_mb();
		}
	}

exit_ok:
	return NOTIFY_OK;
}

static struct notifier_block powerclamp_cpu_notifier = {
	.notifier_call = powerclamp_cpu_callback,
};

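/*
 * Thermal cooling device callbacks: the cooling state corresponds to
 * the injected idle ratio in percent.
 */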
static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

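/*
 * Setting a nonzero target from zero starts clamping, dropping back to
 * zero stops it, and any other change adjusts the running target.
 */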
static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO - 1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
		end_power_clamp();
	} else	/* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

/* runs on Nehalem and later */
static const struct x86_cpu_id intel_powerclamp_ids[] = {
	{ X86_VENDOR_INTEL, 6, 0x1a},
	{ X86_VENDOR_INTEL, 6, 0x1c},
	{ X86_VENDOR_INTEL, 6, 0x1e},
	{ X86_VENDOR_INTEL, 6, 0x1f},
	{ X86_VENDOR_INTEL, 6, 0x25},
	{ X86_VENDOR_INTEL, 6, 0x26},
	{ X86_VENDOR_INTEL, 6, 0x2a},
	{ X86_VENDOR_INTEL, 6, 0x2c},
	{ X86_VENDOR_INTEL, 6, 0x2d},
	{ X86_VENDOR_INTEL, 6, 0x2e},
	{ X86_VENDOR_INTEL, 6, 0x2f},
	{ X86_VENDOR_INTEL, 6, 0x3a},
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

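/*
 * Check for a supported model and the CPU features idle injection
 * depends on: an invariant TSC (so the residency/TSC ratio math holds
 * across idle), MWAIT, and ARAT (the APIC timer keeps running in deep
 * C-states, so the wakeup timer can bound each injection period).
 */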
static int powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("Intel powerclamp does not run on family %d model %d\n",
				boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
		!boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		!boot_cpu_has(X86_FEATURE_MWAIT) ||
		!boot_cpu_has(X86_FEATURE_ARAT))
		return -ENODEV;

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

static int powerclamp_debug_open(struct inode *inode,
			struct file *file)
{
	return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
	.open		= powerclamp_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.owner		= THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
	if (!debug_dir)
		return;

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
					cal_data, &powerclamp_debug_fops))
		goto file_error;

	return;

file_error:
	debugfs_remove_recursive(debug_dir);
}

static int powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	register_hotcpu_notifier(&powerclamp_cpu_notifier);

	powerclamp_thread = alloc_percpu(struct task_struct *);
	if (!powerclamp_thread) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(powerclamp_thread);
exit_unregister:
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void powerclamp_exit(void)
{
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
	end_power_clamp();
	free_percpu(powerclamp_thread);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");