]> git.karo-electronics.de Git - karo-tx-linux.git/blob - drivers/cpufreq/powernv-cpufreq.c
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
[karo-tx-linux.git] / drivers / cpufreq / powernv-cpufreq.c
1 /*
2  * POWERNV cpufreq driver for the IBM POWER processors
3  *
4  * (C) Copyright IBM 2014
5  *
6  * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2, or (at your option)
11  * any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  */
19
20 #define pr_fmt(fmt)     "powernv-cpufreq: " fmt
21
22 #include <linux/kernel.h>
23 #include <linux/sysfs.h>
24 #include <linux/cpumask.h>
25 #include <linux/module.h>
26 #include <linux/cpufreq.h>
27 #include <linux/smp.h>
28 #include <linux/of.h>
29 #include <linux/reboot.h>
30 #include <linux/slab.h>
31 #include <linux/cpu.h>
32 #include <trace/events/power.h>
33
34 #include <asm/cputhreads.h>
35 #include <asm/firmware.h>
36 #include <asm/reg.h>
37 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
38 #include <asm/opal.h>
39 #include <linux/timer.h>
40
/* Capacity of the cpufreq table (one extra slot is reserved for the end marker) */
#define POWERNV_MAX_PSTATES	256

/* PMSR status bits tested by the throttle check */
#define PMSR_PSAFE_ENABLE	(1UL << 30)
#define PMSR_SPR_EM_DISABLE	(1UL << 31)

/*
 * Field extractors. Every macro argument is parenthesised so that compound
 * expressions (e.g. "a | b") are shifted as a whole.
 */
#define PMSR_MAX(x)		(((x) >> 32) & 0xFF)
#define LPSTATE_SHIFT		48
#define GPSTATE_SHIFT		56
#define GET_LPSTATE(x)		(((x) >> LPSTATE_SHIFT) & 0xFF)
#define GET_GPSTATE(x)		(((x) >> GPSTATE_SHIFT) & 0xFF)
49
/* Total ramp-down span in milliseconds */
#define MAX_RAMP_DOWN_TIME				5120
/*
 * On an idle system we want the global pstate to ramp down from the max value
 * to min over a span of ~5 secs. We also want it to ramp down slowly at first
 * and rapidly later on.
 *
 * This gives a percentage rampdown for time elapsed in milliseconds.
 * ramp_down_percentage = ((ms * ms) >> 18)
 *			~= 3.8 * (sec * sec)
 *
 * At 0 ms	ramp_down_percent = 0
 * At 5120 ms	ramp_down_percent = 100
 *
 * The argument is parenthesised so that compound expressions are squared as
 * a whole. Note that it is evaluated twice: do not pass an expression with
 * side effects.
 */
#define ramp_down_percent(time)		(((time) * (time)) >> 18)

/* Interval (ms) after which the timer is queued to bring down global pstate */
#define GPSTATE_TIMER_INTERVAL				2000
67
68 /**
69  * struct global_pstate_info -  Per policy data structure to maintain history of
70  *                              global pstates
71  * @highest_lpstate_idx:        The local pstate index from which we are
72  *                              ramping down
73  * @elapsed_time:               Time in ms spent in ramping down from
74  *                              highest_lpstate_idx
75  * @last_sampled_time:          Time from boot in ms when global pstates were
76  *                              last set
77  * @last_lpstate_idx,           Last set value of local pstate and global
78  * last_gpstate_idx             pstate in terms of cpufreq table index
79  * @timer:                      Is used for ramping down if cpu goes idle for
80  *                              a long time with global pstate held high
81  * @gpstate_lock:               A spinlock to maintain synchronization between
82  *                              routines called by the timer handler and
83  *                              governer's target_index calls
84  */
85 struct global_pstate_info {
86         int highest_lpstate_idx;
87         unsigned int elapsed_time;
88         unsigned int last_sampled_time;
89         int last_lpstate_idx;
90         int last_gpstate_idx;
91         spinlock_t gpstate_lock;
92         struct timer_list timer;
93 };
94
95 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
96 static bool rebooting, throttled, occ_reset;
97
/* Throttle causes; the values index both throttle_reason[] and chip->reason[] */
enum throttle_reason_type {
	NO_THROTTLE = 0,
	POWERCAP,
	CPU_OVERTEMP,
	POWER_SUPPLY_FAILURE,
	OVERCURRENT,
	OCC_RESET_THROTTLE,
	OCC_MAX_REASON
};

/* Human readable name for each throttle cause */
static const char * const throttle_reason[OCC_MAX_REASON] = {
	[NO_THROTTLE]		= "No throttling",
	[POWERCAP]		= "Power Cap",
	[CPU_OVERTEMP]		= "Processor Over Temperature",
	[POWER_SUPPLY_FAILURE]	= "Power Supply Failure",
	[OVERCURRENT]		= "Over Current",
	[OCC_RESET_THROTTLE]	= "OCC Reset",
};
116
117 static struct chip {
118         unsigned int id;
119         bool throttled;
120         bool restore;
121         u8 throttle_reason;
122         cpumask_t mask;
123         struct work_struct throttle;
124         int throttle_turbo;
125         int throttle_sub_turbo;
126         int reason[OCC_MAX_REASON];
127 } *chips;
128
129 static int nr_chips;
130 static DEFINE_PER_CPU(struct chip *, chip_info);
131
/*
 * Note:
 * The set of pstates consists of contiguous integers.
 *
 * powernv_pstate_info records, as indices into the frequency table, where
 * the max, min and nominal frequencies live, together with the number of
 * available frequencies.
 *
 * powernv_pstate_info.nominal is the index of the highest non-turbo
 * frequency.
 */
struct powernv_pstate_info {
	unsigned int min;		/* table index of the min pstate */
	unsigned int max;		/* table index of the max pstate */
	unsigned int nominal;		/* table index of the nominal pstate */
	unsigned int nr_pstates;	/* number of valid table entries */
};

static struct powernv_pstate_info powernv_pstate_info;
148
149 /* Use following macros for conversions between pstate_id and index */
150 static inline int idx_to_pstate(unsigned int i)
151 {
152         if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
153                 pr_warn_once("index %u is out of bound\n", i);
154                 return powernv_freqs[powernv_pstate_info.nominal].driver_data;
155         }
156
157         return powernv_freqs[i].driver_data;
158 }
159
160 static inline unsigned int pstate_to_idx(int pstate)
161 {
162         int min = powernv_freqs[powernv_pstate_info.min].driver_data;
163         int max = powernv_freqs[powernv_pstate_info.max].driver_data;
164
165         if (min > 0) {
166                 if (unlikely((pstate < max) || (pstate > min))) {
167                         pr_warn_once("pstate %d is out of bound\n", pstate);
168                         return powernv_pstate_info.nominal;
169                 }
170         } else {
171                 if (unlikely((pstate > max) || (pstate < min))) {
172                         pr_warn_once("pstate %d is out of bound\n", pstate);
173                         return powernv_pstate_info.nominal;
174                 }
175         }
176         /*
177          * abs() is deliberately used so that is works with
178          * both monotonically increasing and decreasing
179          * pstate values
180          */
181         return abs(pstate - idx_to_pstate(powernv_pstate_info.max));
182 }
183
184 static inline void reset_gpstates(struct cpufreq_policy *policy)
185 {
186         struct global_pstate_info *gpstates = policy->driver_data;
187
188         gpstates->highest_lpstate_idx = 0;
189         gpstates->elapsed_time = 0;
190         gpstates->last_sampled_time = 0;
191         gpstates->last_lpstate_idx = 0;
192         gpstates->last_gpstate_idx = 0;
193 }
194
195 /*
196  * Initialize the freq table based on data obtained
197  * from the firmware passed via device-tree
198  */
199 static int init_powernv_pstates(void)
200 {
201         struct device_node *power_mgt;
202         int i, nr_pstates = 0;
203         const __be32 *pstate_ids, *pstate_freqs;
204         u32 len_ids, len_freqs;
205         u32 pstate_min, pstate_max, pstate_nominal;
206
207         power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
208         if (!power_mgt) {
209                 pr_warn("power-mgt node not found\n");
210                 return -ENODEV;
211         }
212
213         if (of_property_read_u32(power_mgt, "ibm,pstate-min", &pstate_min)) {
214                 pr_warn("ibm,pstate-min node not found\n");
215                 return -ENODEV;
216         }
217
218         if (of_property_read_u32(power_mgt, "ibm,pstate-max", &pstate_max)) {
219                 pr_warn("ibm,pstate-max node not found\n");
220                 return -ENODEV;
221         }
222
223         if (of_property_read_u32(power_mgt, "ibm,pstate-nominal",
224                                  &pstate_nominal)) {
225                 pr_warn("ibm,pstate-nominal not found\n");
226                 return -ENODEV;
227         }
228         pr_info("cpufreq pstate min %d nominal %d max %d\n", pstate_min,
229                 pstate_nominal, pstate_max);
230
231         pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
232         if (!pstate_ids) {
233                 pr_warn("ibm,pstate-ids not found\n");
234                 return -ENODEV;
235         }
236
237         pstate_freqs = of_get_property(power_mgt, "ibm,pstate-frequencies-mhz",
238                                       &len_freqs);
239         if (!pstate_freqs) {
240                 pr_warn("ibm,pstate-frequencies-mhz not found\n");
241                 return -ENODEV;
242         }
243
244         if (len_ids != len_freqs) {
245                 pr_warn("Entries in ibm,pstate-ids and "
246                         "ibm,pstate-frequencies-mhz does not match\n");
247         }
248
249         nr_pstates = min(len_ids, len_freqs) / sizeof(u32);
250         if (!nr_pstates) {
251                 pr_warn("No PStates found\n");
252                 return -ENODEV;
253         }
254
255         powernv_pstate_info.nr_pstates = nr_pstates;
256         pr_debug("NR PStates %d\n", nr_pstates);
257         for (i = 0; i < nr_pstates; i++) {
258                 u32 id = be32_to_cpu(pstate_ids[i]);
259                 u32 freq = be32_to_cpu(pstate_freqs[i]);
260
261                 pr_debug("PState id %d freq %d MHz\n", id, freq);
262                 powernv_freqs[i].frequency = freq * 1000; /* kHz */
263                 powernv_freqs[i].driver_data = id;
264
265                 if (id == pstate_max)
266                         powernv_pstate_info.max = i;
267                 else if (id == pstate_nominal)
268                         powernv_pstate_info.nominal = i;
269                 else if (id == pstate_min)
270                         powernv_pstate_info.min = i;
271         }
272
273         /* End of list marker entry */
274         powernv_freqs[i].frequency = CPUFREQ_TABLE_END;
275         return 0;
276 }
277
278 /* Returns the CPU frequency corresponding to the pstate_id. */
279 static unsigned int pstate_id_to_freq(int pstate_id)
280 {
281         int i;
282
283         i = pstate_to_idx(pstate_id);
284         if (i >= powernv_pstate_info.nr_pstates || i < 0) {
285                 pr_warn("PState id %d outside of PState table, "
286                         "reporting nominal id %d instead\n",
287                         pstate_id, idx_to_pstate(powernv_pstate_info.nominal));
288                 i = powernv_pstate_info.nominal;
289         }
290
291         return powernv_freqs[i].frequency;
292 }
293
294 /*
295  * cpuinfo_nominal_freq_show - Show the nominal CPU frequency as indicated by
296  * the firmware
297  */
298 static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy,
299                                         char *buf)
300 {
301         return sprintf(buf, "%u\n",
302                 powernv_freqs[powernv_pstate_info.nominal].frequency);
303 }
304
305 struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq =
306         __ATTR_RO(cpuinfo_nominal_freq);
307
308 static struct freq_attr *powernv_cpu_freq_attr[] = {
309         &cpufreq_freq_attr_scaling_available_freqs,
310         &cpufreq_freq_attr_cpuinfo_nominal_freq,
311         NULL,
312 };
313
314 #define throttle_attr(name, member)                                     \
315 static ssize_t name##_show(struct cpufreq_policy *policy, char *buf)    \
316 {                                                                       \
317         struct chip *chip = per_cpu(chip_info, policy->cpu);            \
318                                                                         \
319         return sprintf(buf, "%u\n", chip->member);                      \
320 }                                                                       \
321                                                                         \
322 static struct freq_attr throttle_attr_##name = __ATTR_RO(name)          \
323
324 throttle_attr(unthrottle, reason[NO_THROTTLE]);
325 throttle_attr(powercap, reason[POWERCAP]);
326 throttle_attr(overtemp, reason[CPU_OVERTEMP]);
327 throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]);
328 throttle_attr(overcurrent, reason[OVERCURRENT]);
329 throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]);
330 throttle_attr(turbo_stat, throttle_turbo);
331 throttle_attr(sub_turbo_stat, throttle_sub_turbo);
332
333 static struct attribute *throttle_attrs[] = {
334         &throttle_attr_unthrottle.attr,
335         &throttle_attr_powercap.attr,
336         &throttle_attr_overtemp.attr,
337         &throttle_attr_supply_fault.attr,
338         &throttle_attr_overcurrent.attr,
339         &throttle_attr_occ_reset.attr,
340         &throttle_attr_turbo_stat.attr,
341         &throttle_attr_sub_turbo_stat.attr,
342         NULL,
343 };
344
345 static const struct attribute_group throttle_attr_grp = {
346         .name   = "throttle_stats",
347         .attrs  = throttle_attrs,
348 };
349
350 /* Helper routines */
351
352 /* Access helpers to power mgt SPR */
353
354 static inline unsigned long get_pmspr(unsigned long sprn)
355 {
356         switch (sprn) {
357         case SPRN_PMCR:
358                 return mfspr(SPRN_PMCR);
359
360         case SPRN_PMICR:
361                 return mfspr(SPRN_PMICR);
362
363         case SPRN_PMSR:
364                 return mfspr(SPRN_PMSR);
365         }
366         BUG();
367 }
368
369 static inline void set_pmspr(unsigned long sprn, unsigned long val)
370 {
371         switch (sprn) {
372         case SPRN_PMCR:
373                 mtspr(SPRN_PMCR, val);
374                 return;
375
376         case SPRN_PMICR:
377                 mtspr(SPRN_PMICR, val);
378                 return;
379         }
380         BUG();
381 }
382
/*
 * Argument block used to query/update pstates on a remote CPU via
 * smp_call_function.
 */
struct powernv_smp_call_data {
	unsigned int	freq;		/* frequency reported by the read callback */
	int		pstate_id;	/* local pstate id read or to be set */
	int		gpstate_id;	/* global pstate id to be set */
};
392
393 /*
394  * powernv_read_cpu_freq: Reads the current frequency on this CPU.
395  *
396  * Called via smp_call_function.
397  *
398  * Note: The caller of the smp_call_function should pass an argument of
399  * the type 'struct powernv_smp_call_data *' along with this function.
400  *
401  * The current frequency on this CPU will be returned via
402  * ((struct powernv_smp_call_data *)arg)->freq;
403  */
404 static void powernv_read_cpu_freq(void *arg)
405 {
406         unsigned long pmspr_val;
407         s8 local_pstate_id;
408         struct powernv_smp_call_data *freq_data = arg;
409
410         pmspr_val = get_pmspr(SPRN_PMSR);
411
412         /*
413          * The local pstate id corresponds bits 48..55 in the PMSR.
414          * Note: Watch out for the sign!
415          */
416         local_pstate_id = (pmspr_val >> 48) & 0xFF;
417         freq_data->pstate_id = local_pstate_id;
418         freq_data->freq = pstate_id_to_freq(freq_data->pstate_id);
419
420         pr_debug("cpu %d pmsr %016lX pstate_id %d frequency %d kHz\n",
421                 raw_smp_processor_id(), pmspr_val, freq_data->pstate_id,
422                 freq_data->freq);
423 }
424
425 /*
426  * powernv_cpufreq_get: Returns the CPU frequency as reported by the
427  * firmware for CPU 'cpu'. This value is reported through the sysfs
428  * file cpuinfo_cur_freq.
429  */
430 static unsigned int powernv_cpufreq_get(unsigned int cpu)
431 {
432         struct powernv_smp_call_data freq_data;
433
434         smp_call_function_any(cpu_sibling_mask(cpu), powernv_read_cpu_freq,
435                         &freq_data, 1);
436
437         return freq_data.freq;
438 }
439
440 /*
441  * set_pstate: Sets the pstate on this CPU.
442  *
443  * This is called via an smp_call_function.
444  *
445  * The caller must ensure that freq_data is of the type
446  * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
447  * on this CPU should be present in freq_data->pstate_id.
448  */
449 static void set_pstate(void *data)
450 {
451         unsigned long val;
452         struct powernv_smp_call_data *freq_data = data;
453         unsigned long pstate_ul = freq_data->pstate_id;
454         unsigned long gpstate_ul = freq_data->gpstate_id;
455
456         val = get_pmspr(SPRN_PMCR);
457         val = val & 0x0000FFFFFFFFFFFFULL;
458
459         pstate_ul = pstate_ul & 0xFF;
460         gpstate_ul = gpstate_ul & 0xFF;
461
462         /* Set both global(bits 56..63) and local(bits 48..55) PStates */
463         val = val | (gpstate_ul << 56) | (pstate_ul << 48);
464
465         pr_debug("Setting cpu %d pmcr to %016lX\n",
466                         raw_smp_processor_id(), val);
467         set_pmspr(SPRN_PMCR, val);
468 }
469
470 /*
471  * get_nominal_index: Returns the index corresponding to the nominal
472  * pstate in the cpufreq table
473  */
474 static inline unsigned int get_nominal_index(void)
475 {
476         return powernv_pstate_info.nominal;
477 }
478
479 static void powernv_cpufreq_throttle_check(void *data)
480 {
481         struct chip *chip;
482         unsigned int cpu = smp_processor_id();
483         unsigned long pmsr;
484         int pmsr_pmax;
485         unsigned int pmsr_pmax_idx;
486
487         pmsr = get_pmspr(SPRN_PMSR);
488         chip = this_cpu_read(chip_info);
489
490         /* Check for Pmax Capping */
491         pmsr_pmax = (s8)PMSR_MAX(pmsr);
492         pmsr_pmax_idx = pstate_to_idx(pmsr_pmax);
493         if (pmsr_pmax_idx != powernv_pstate_info.max) {
494                 if (chip->throttled)
495                         goto next;
496                 chip->throttled = true;
497                 if (pmsr_pmax_idx > powernv_pstate_info.nominal) {
498                         pr_warn_once("CPU %d on Chip %u has Pmax(%d) reduced below nominal frequency(%d)\n",
499                                      cpu, chip->id, pmsr_pmax,
500                                      idx_to_pstate(powernv_pstate_info.nominal));
501                         chip->throttle_sub_turbo++;
502                 } else {
503                         chip->throttle_turbo++;
504                 }
505                 trace_powernv_throttle(chip->id,
506                                       throttle_reason[chip->throttle_reason],
507                                       pmsr_pmax);
508         } else if (chip->throttled) {
509                 chip->throttled = false;
510                 trace_powernv_throttle(chip->id,
511                                       throttle_reason[chip->throttle_reason],
512                                       pmsr_pmax);
513         }
514
515         /* Check if Psafe_mode_active is set in PMSR. */
516 next:
517         if (pmsr & PMSR_PSAFE_ENABLE) {
518                 throttled = true;
519                 pr_info("Pstate set to safe frequency\n");
520         }
521
522         /* Check if SPR_EM_DISABLE is set in PMSR */
523         if (pmsr & PMSR_SPR_EM_DISABLE) {
524                 throttled = true;
525                 pr_info("Frequency Control disabled from OS\n");
526         }
527
528         if (throttled) {
529                 pr_info("PMSR = %16lx\n", pmsr);
530                 pr_warn("CPU Frequency could be throttled\n");
531         }
532 }
533
534 /**
535  * calc_global_pstate - Calculate global pstate
536  * @elapsed_time:               Elapsed time in milliseconds
537  * @local_pstate_idx:           New local pstate
538  * @highest_lpstate_idx:        pstate from which its ramping down
539  *
540  * Finds the appropriate global pstate based on the pstate from which its
541  * ramping down and the time elapsed in ramping down. It follows a quadratic
542  * equation which ensures that it reaches ramping down to pmin in 5sec.
543  */
544 static inline int calc_global_pstate(unsigned int elapsed_time,
545                                      int highest_lpstate_idx,
546                                      int local_pstate_idx)
547 {
548         int index_diff;
549
550         /*
551          * Using ramp_down_percent we get the percentage of rampdown
552          * that we are expecting to be dropping. Difference between
553          * highest_lpstate_idx and powernv_pstate_info.min will give a absolute
554          * number of how many pstates we will drop eventually by the end of
555          * 5 seconds, then just scale it get the number pstates to be dropped.
556          */
557         index_diff =  ((int)ramp_down_percent(elapsed_time) *
558                         (powernv_pstate_info.min - highest_lpstate_idx)) / 100;
559
560         /* Ensure that global pstate is >= to local pstate */
561         if (highest_lpstate_idx + index_diff >= local_pstate_idx)
562                 return local_pstate_idx;
563         else
564                 return highest_lpstate_idx + index_diff;
565 }
566
567 static inline void  queue_gpstate_timer(struct global_pstate_info *gpstates)
568 {
569         unsigned int timer_interval;
570
571         /*
572          * Setting up timer to fire after GPSTATE_TIMER_INTERVAL ms, But
573          * if it exceeds MAX_RAMP_DOWN_TIME ms for ramp down time.
574          * Set timer such that it fires exactly at MAX_RAMP_DOWN_TIME
575          * seconds of ramp down time.
576          */
577         if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
578              > MAX_RAMP_DOWN_TIME)
579                 timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
580         else
581                 timer_interval = GPSTATE_TIMER_INTERVAL;
582
583         mod_timer(&gpstates->timer, jiffies + msecs_to_jiffies(timer_interval));
584 }
585
586 /**
587  * gpstate_timer_handler
588  *
589  * @data: pointer to cpufreq_policy on which timer was queued
590  *
591  * This handler brings down the global pstate closer to the local pstate
592  * according quadratic equation. Queues a new timer if it is still not equal
593  * to local pstate
594  */
595 void gpstate_timer_handler(unsigned long data)
596 {
597         struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
598         struct global_pstate_info *gpstates = policy->driver_data;
599         int gpstate_idx, lpstate_idx;
600         unsigned long val;
601         unsigned int time_diff = jiffies_to_msecs(jiffies)
602                                         - gpstates->last_sampled_time;
603         struct powernv_smp_call_data freq_data;
604
605         if (!spin_trylock(&gpstates->gpstate_lock))
606                 return;
607
608         /*
609          * If PMCR was last updated was using fast_swtich then
610          * We may have wrong in gpstate->last_lpstate_idx
611          * value. Hence, read from PMCR to get correct data.
612          */
613         val = get_pmspr(SPRN_PMCR);
614         freq_data.gpstate_id = (s8)GET_GPSTATE(val);
615         freq_data.pstate_id = (s8)GET_LPSTATE(val);
616         if (freq_data.gpstate_id  == freq_data.pstate_id) {
617                 reset_gpstates(policy);
618                 spin_unlock(&gpstates->gpstate_lock);
619                 return;
620         }
621
622         gpstates->last_sampled_time += time_diff;
623         gpstates->elapsed_time += time_diff;
624
625         if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
626                 gpstate_idx = pstate_to_idx(freq_data.pstate_id);
627                 lpstate_idx = gpstate_idx;
628                 reset_gpstates(policy);
629                 gpstates->highest_lpstate_idx = gpstate_idx;
630         } else {
631                 lpstate_idx = pstate_to_idx(freq_data.pstate_id);
632                 gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
633                                                  gpstates->highest_lpstate_idx,
634                                                  lpstate_idx);
635         }
636         freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
637         gpstates->last_gpstate_idx = gpstate_idx;
638         gpstates->last_lpstate_idx = lpstate_idx;
639         /*
640          * If local pstate is equal to global pstate, rampdown is over
641          * So timer is not required to be queued.
642          */
643         if (gpstate_idx != gpstates->last_lpstate_idx)
644                 queue_gpstate_timer(gpstates);
645
646         spin_unlock(&gpstates->gpstate_lock);
647
648         /* Timer may get migrated to a different cpu on cpu hot unplug */
649         smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
650 }
651
652 /*
653  * powernv_cpufreq_target_index: Sets the frequency corresponding to
654  * the cpufreq table entry indexed by new_index on the cpus in the
655  * mask policy->cpus
656  */
657 static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
658                                         unsigned int new_index)
659 {
660         struct powernv_smp_call_data freq_data;
661         unsigned int cur_msec, gpstate_idx;
662         struct global_pstate_info *gpstates = policy->driver_data;
663
664         if (unlikely(rebooting) && new_index != get_nominal_index())
665                 return 0;
666
667         if (!throttled) {
668                 /* we don't want to be preempted while
669                  * checking if the CPU frequency has been throttled
670                  */
671                 preempt_disable();
672                 powernv_cpufreq_throttle_check(NULL);
673                 preempt_enable();
674         }
675
676         cur_msec = jiffies_to_msecs(get_jiffies_64());
677
678         spin_lock(&gpstates->gpstate_lock);
679         freq_data.pstate_id = idx_to_pstate(new_index);
680
681         if (!gpstates->last_sampled_time) {
682                 gpstate_idx = new_index;
683                 gpstates->highest_lpstate_idx = new_index;
684                 goto gpstates_done;
685         }
686
687         if (gpstates->last_gpstate_idx < new_index) {
688                 gpstates->elapsed_time += cur_msec -
689                                                  gpstates->last_sampled_time;
690
691                 /*
692                  * If its has been ramping down for more than MAX_RAMP_DOWN_TIME
693                  * we should be resetting all global pstate related data. Set it
694                  * equal to local pstate to start fresh.
695                  */
696                 if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
697                         reset_gpstates(policy);
698                         gpstates->highest_lpstate_idx = new_index;
699                         gpstate_idx = new_index;
700                 } else {
701                 /* Elaspsed_time is less than 5 seconds, continue to rampdown */
702                         gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
703                                                          gpstates->highest_lpstate_idx,
704                                                          new_index);
705                 }
706         } else {
707                 reset_gpstates(policy);
708                 gpstates->highest_lpstate_idx = new_index;
709                 gpstate_idx = new_index;
710         }
711
712         /*
713          * If local pstate is equal to global pstate, rampdown is over
714          * So timer is not required to be queued.
715          */
716         if (gpstate_idx != new_index)
717                 queue_gpstate_timer(gpstates);
718         else
719                 del_timer_sync(&gpstates->timer);
720
721 gpstates_done:
722         freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
723         gpstates->last_sampled_time = cur_msec;
724         gpstates->last_gpstate_idx = gpstate_idx;
725         gpstates->last_lpstate_idx = new_index;
726
727         spin_unlock(&gpstates->gpstate_lock);
728
729         /*
730          * Use smp_call_function to send IPI and execute the
731          * mtspr on target CPU.  We could do that without IPI
732          * if current CPU is within policy->cpus (core)
733          */
734         smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
735         return 0;
736 }
737
738 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
739 {
740         int base, i, ret;
741         struct kernfs_node *kn;
742         struct global_pstate_info *gpstates;
743
744         base = cpu_first_thread_sibling(policy->cpu);
745
746         for (i = 0; i < threads_per_core; i++)
747                 cpumask_set_cpu(base + i, policy->cpus);
748
749         kn = kernfs_find_and_get(policy->kobj.sd, throttle_attr_grp.name);
750         if (!kn) {
751                 int ret;
752
753                 ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp);
754                 if (ret) {
755                         pr_info("Failed to create throttle stats directory for cpu %d\n",
756                                 policy->cpu);
757                         return ret;
758                 }
759         } else {
760                 kernfs_put(kn);
761         }
762
763         gpstates =  kzalloc(sizeof(*gpstates), GFP_KERNEL);
764         if (!gpstates)
765                 return -ENOMEM;
766
767         policy->driver_data = gpstates;
768
769         /* initialize timer */
770         init_timer_pinned_deferrable(&gpstates->timer);
771         gpstates->timer.data = (unsigned long)policy;
772         gpstates->timer.function = gpstate_timer_handler;
773         gpstates->timer.expires = jiffies +
774                                 msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
775         spin_lock_init(&gpstates->gpstate_lock);
776         ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
777
778         if (ret < 0) {
779                 kfree(policy->driver_data);
780                 return ret;
781         }
782
783         policy->fast_switch_possible = true;
784         return ret;
785 }
786
787 static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
788 {
789         /* timer is deleted in cpufreq_cpu_stop() */
790         kfree(policy->driver_data);
791
792         return 0;
793 }
794
795 static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
796                                 unsigned long action, void *unused)
797 {
798         int cpu;
799         struct cpufreq_policy cpu_policy;
800
801         rebooting = true;
802         for_each_online_cpu(cpu) {
803                 cpufreq_get_policy(&cpu_policy, cpu);
804                 powernv_cpufreq_target_index(&cpu_policy, get_nominal_index());
805         }
806
807         return NOTIFY_DONE;
808 }
809
810 static struct notifier_block powernv_cpufreq_reboot_nb = {
811         .notifier_call = powernv_cpufreq_reboot_notifier,
812 };
813
814 void powernv_cpufreq_work_fn(struct work_struct *work)
815 {
816         struct chip *chip = container_of(work, struct chip, throttle);
817         unsigned int cpu;
818         cpumask_t mask;
819
820         get_online_cpus();
821         cpumask_and(&mask, &chip->mask, cpu_online_mask);
822         smp_call_function_any(&mask,
823                               powernv_cpufreq_throttle_check, NULL, 0);
824
825         if (!chip->restore)
826                 goto out;
827
828         chip->restore = false;
829         for_each_cpu(cpu, &mask) {
830                 int index;
831                 struct cpufreq_policy policy;
832
833                 cpufreq_get_policy(&policy, cpu);
834                 index = cpufreq_table_find_index_c(&policy, policy.cur);
835                 powernv_cpufreq_target_index(&policy, index);
836                 cpumask_andnot(&mask, &mask, policy.cpus);
837         }
838 out:
839         put_online_cpus();
840 }
841
842 static int powernv_cpufreq_occ_msg(struct notifier_block *nb,
843                                    unsigned long msg_type, void *_msg)
844 {
845         struct opal_msg *msg = _msg;
846         struct opal_occ_msg omsg;
847         int i;
848
849         if (msg_type != OPAL_MSG_OCC)
850                 return 0;
851
852         omsg.type = be64_to_cpu(msg->params[0]);
853
854         switch (omsg.type) {
855         case OCC_RESET:
856                 occ_reset = true;
857                 pr_info("OCC (On Chip Controller - enforces hard thermal/power limits) Resetting\n");
858                 /*
859                  * powernv_cpufreq_throttle_check() is called in
860                  * target() callback which can detect the throttle state
861                  * for governors like ondemand.
862                  * But static governors will not call target() often thus
863                  * report throttling here.
864                  */
865                 if (!throttled) {
866                         throttled = true;
867                         pr_warn("CPU frequency is throttled for duration\n");
868                 }
869
870                 break;
871         case OCC_LOAD:
872                 pr_info("OCC Loading, CPU frequency is throttled until OCC is started\n");
873                 break;
874         case OCC_THROTTLE:
875                 omsg.chip = be64_to_cpu(msg->params[1]);
876                 omsg.throttle_status = be64_to_cpu(msg->params[2]);
877
878                 if (occ_reset) {
879                         occ_reset = false;
880                         throttled = false;
881                         pr_info("OCC Active, CPU frequency is no longer throttled\n");
882
883                         for (i = 0; i < nr_chips; i++) {
884                                 chips[i].restore = true;
885                                 schedule_work(&chips[i].throttle);
886                         }
887
888                         return 0;
889                 }
890
891                 for (i = 0; i < nr_chips; i++)
892                         if (chips[i].id == omsg.chip)
893                                 break;
894
895                 if (omsg.throttle_status >= 0 &&
896                     omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) {
897                         chips[i].throttle_reason = omsg.throttle_status;
898                         chips[i].reason[omsg.throttle_status]++;
899                 }
900
901                 if (!omsg.throttle_status)
902                         chips[i].restore = true;
903
904                 schedule_work(&chips[i].throttle);
905         }
906         return 0;
907 }
908
909 static struct notifier_block powernv_cpufreq_opal_nb = {
910         .notifier_call  = powernv_cpufreq_occ_msg,
911         .next           = NULL,
912         .priority       = 0,
913 };
914
915 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
916 {
917         struct powernv_smp_call_data freq_data;
918         struct global_pstate_info *gpstates = policy->driver_data;
919
920         freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min);
921         freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min);
922         smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
923         del_timer_sync(&gpstates->timer);
924 }
925
926 static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
927                                         unsigned int target_freq)
928 {
929         int index;
930         struct powernv_smp_call_data freq_data;
931
932         index = cpufreq_table_find_index_dl(policy, target_freq);
933         freq_data.pstate_id = powernv_freqs[index].driver_data;
934         freq_data.gpstate_id = powernv_freqs[index].driver_data;
935         set_pstate(&freq_data);
936
937         return powernv_freqs[index].frequency;
938 }
939
940 static struct cpufreq_driver powernv_cpufreq_driver = {
941         .name           = "powernv-cpufreq",
942         .flags          = CPUFREQ_CONST_LOOPS,
943         .init           = powernv_cpufreq_cpu_init,
944         .exit           = powernv_cpufreq_cpu_exit,
945         .verify         = cpufreq_generic_frequency_table_verify,
946         .target_index   = powernv_cpufreq_target_index,
947         .fast_switch    = powernv_fast_switch,
948         .get            = powernv_cpufreq_get,
949         .stop_cpu       = powernv_cpufreq_stop_cpu,
950         .attr           = powernv_cpu_freq_attr,
951 };
952
953 static int init_chip_info(void)
954 {
955         unsigned int chip[256];
956         unsigned int cpu, i;
957         unsigned int prev_chip_id = UINT_MAX;
958
959         for_each_possible_cpu(cpu) {
960                 unsigned int id = cpu_to_chip_id(cpu);
961
962                 if (prev_chip_id != id) {
963                         prev_chip_id = id;
964                         chip[nr_chips++] = id;
965                 }
966         }
967
968         chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL);
969         if (!chips)
970                 return -ENOMEM;
971
972         for (i = 0; i < nr_chips; i++) {
973                 chips[i].id = chip[i];
974                 cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i]));
975                 INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
976                 for_each_cpu(cpu, &chips[i].mask)
977                         per_cpu(chip_info, cpu) =  &chips[i];
978         }
979
980         return 0;
981 }
982
983 static inline void clean_chip_info(void)
984 {
985         kfree(chips);
986 }
987
988 static inline void unregister_all_notifiers(void)
989 {
990         opal_message_notifier_unregister(OPAL_MSG_OCC,
991                                          &powernv_cpufreq_opal_nb);
992         unregister_reboot_notifier(&powernv_cpufreq_reboot_nb);
993 }
994
995 static int __init powernv_cpufreq_init(void)
996 {
997         int rc = 0;
998
999         /* Don't probe on pseries (guest) platforms */
1000         if (!firmware_has_feature(FW_FEATURE_OPAL))
1001                 return -ENODEV;
1002
1003         /* Discover pstates from device tree and init */
1004         rc = init_powernv_pstates();
1005         if (rc)
1006                 goto out;
1007
1008         /* Populate chip info */
1009         rc = init_chip_info();
1010         if (rc)
1011                 goto out;
1012
1013         register_reboot_notifier(&powernv_cpufreq_reboot_nb);
1014         opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb);
1015
1016         rc = cpufreq_register_driver(&powernv_cpufreq_driver);
1017         if (!rc)
1018                 return 0;
1019
1020         pr_info("Failed to register the cpufreq driver (%d)\n", rc);
1021         unregister_all_notifiers();
1022         clean_chip_info();
1023 out:
1024         pr_info("Platform driver disabled. System does not support PState control\n");
1025         return rc;
1026 }
1027 module_init(powernv_cpufreq_init);
1028
1029 static void __exit powernv_cpufreq_exit(void)
1030 {
1031         cpufreq_unregister_driver(&powernv_cpufreq_driver);
1032         unregister_all_notifiers();
1033         clean_chip_info();
1034 }
1035 module_exit(powernv_cpufreq_exit);
1036
1037 MODULE_LICENSE("GPL");
1038 MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");