/*
 * Copyright 2010-2017 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Disclaimer: The codes contained in these modules may be specific to
 * the Intel Software Development Platform codenamed Knights Ferry,
 * and the Intel product codenamed Knights Corner, and are not backward
 * compatible with other Intel products. Additionally, Intel will NOT
 * support the codes or instruction set in future products.
 *
 * Intel offers no warranty of any kind regarding the code. This code is
 * licensed on an "AS IS" basis and Intel is not obligated to provide
 * any support, assistance, installation, training, or other services
 * of any kind. Intel is also not obligated to provide any updates,
 * enhancements or extensions. Intel specifically disclaims any warranty
 * of merchantability, non-infringement, fitness for any particular
 * purpose, and any other warranty.
 *
 * Further, Intel disclaims all liability of any kind, including but
 * not limited to liability for infringement of any proprietary rights,
 * relating to the use of the code, even if Intel is notified of the
 * possibility of such liability. Except as expressly stated in an Intel
 * license agreement provided with this code and agreed upon with Intel,
 * no license, express or implied, by estoppel or otherwise, to any
 * intellectual property rights is granted herein.
 */

/*
 * RAS PM interface
 *
 * Contains code to handle interaction with the PM driver.
 * This includes the initial upload of core voltages and
 * frequencies, handling of 'turbo' mode, and accounting
 * for and reporting of card throttles.
 * This really is for KnC only.
 */

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/device.h>
#include <linux/sysfs.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/io.h>
#include <linux/cred.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/apic.h>
#include <asm/mic/mic_common.h>
#include <asm/mic/mic_knc/micsboxdefine.h>
#include <scif.h>
#include "micras.h"
#include "monahan.h"
#include <asm/mic/micpm_device.h>

#if USE_PM

static atomic_t pm_entry;	/* Active calls from PM */

/*
 * Local variables to keep track of throttle states
 *
 * onoff	Set to 1 if throttling is in effect, otherwise 0
 * count	Count of complete throttles (not counting current).
 * time		Time spent in complete throttles
 * start	Time when current throttle started (or 0)
 *
 * Time is measured in jiffies and converted to mSecs at the
 * end of a throttle period. Jiffies are lower resolution than
 * mSecs. If a throttle starts and ends within the same jiffy,
 * a standard penalty of 1/2 jiffy gets added.
 *
 *TBD: perhaps it's better simply to add 1/2 jiffy to every throttle
 *     period to compensate for round-down errors. That would be fair
 *     if the average throttle period is more than 1 jiffy long.
 *
 *TBD: Using atomics may be overkill. Calls from the RAS MT thread
 *     will be serialized (guaranteed), i.e. the report routine need
 *     not care about re-entrancy.
 */
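
/*
 * Worked example of the accounting above (illustrative only; it
 * assumes HZ=100, i.e. 1 jiffy = 10 mSec, which this file does not
 * guarantee):
 *  - a throttle spanning 3 jiffies is credited jiffies_to_msecs(3),
 *    i.e. 30 mSec;
 *  - a throttle that starts and ends within the same jiffy would
 *    otherwise be credited 0 mSec, so the standard penalty of
 *    jiffies_to_msecs(1) / 2 = 5 mSec is applied instead.
 */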

static atomic_t tmp_onoff;
static atomic_t tmp_count;
static atomic_long_t tmp_time;
static atomic_long_t tmp_start;

static atomic_t pwr_onoff;
static atomic_t pwr_count;
static atomic_long_t pwr_time;
static atomic_long_t pwr_start;

static atomic_t alrt_onoff;
static atomic_t alrt_count;
static atomic_long_t alrt_time;
static atomic_long_t alrt_start;

static void
mr_pwr_enter(void)
{
  if (atomic_xchg(&pwr_onoff, 1))
    return;

  atomic_long_set(&pwr_start, jiffies);
}

static void
mr_pwr_leave(void)
{
  unsigned long then;

  if (! atomic_xchg(&pwr_onoff, 0))
    return;

  then = atomic_long_xchg(&pwr_start, 0);
  atomic_inc(&pwr_count);

  if (jiffies == then)
    atomic_long_add(jiffies_to_msecs(1) / 2, &pwr_time);
  else
    atomic_long_add(jiffies_to_msecs(jiffies - then), &pwr_time);
}


static void
mr_tmp_enter(void)
{
  if (atomic_xchg(&tmp_onoff, 1))
    return;

  atomic_long_set(&tmp_start, jiffies);
}

static void
mr_tmp_leave(void)
{
  unsigned long then;

  if (! atomic_xchg(&tmp_onoff, 0))
    return;

  then = atomic_long_xchg(&tmp_start, 0);
  atomic_inc(&tmp_count);
  if (jiffies == then)
    atomic_long_add(jiffies_to_msecs(1) / 2, &tmp_time);
  else
    atomic_long_add(jiffies_to_msecs(jiffies - then), &tmp_time);
}


static void
mr_alrt_enter(void)
{
  if (atomic_xchg(&alrt_onoff, 1))
    return;

  atomic_long_set(&alrt_start, jiffies);
}

static void
mr_alrt_leave(void)
{
  unsigned long then;

  if (! atomic_xchg(&alrt_onoff, 0))
    return;

  then = atomic_long_xchg(&alrt_start, 0);
  atomic_inc(&alrt_count);
  if (jiffies == then)
    atomic_long_add(jiffies_to_msecs(1) / 2, &alrt_time);
  else
    atomic_long_add(jiffies_to_msecs(jiffies - then), &alrt_time);
}



/*
 * Report current throttle state(s) to MT.
 * Simple copy of local variables, except for the time
 * measurement, where the current throttle (if any) is included.
 * We don't want a lock to gate access to the local variables,
 * so the atomics need to be read in the correct order:
 * first the throttle state, then the adder if a throttle is in
 * progress, then the counters. If PM enters or leaves a throttle
 * while the stats are read, the worst case is that time for the
 * current throttle is not included until the next read.
 */

int
mr_pm_ttl(struct mr_rsp_ttl * rsp)
{
  unsigned long then;

  rsp->power.since = 0;
  rsp->power.active = (uint8_t) atomic_read(&pwr_onoff);
  if (rsp->power.active) {
    then = atomic_long_read(&pwr_start);
    if (then)
      rsp->power.since = jiffies_to_msecs(jiffies - then);
  }
  rsp->power.count = atomic_read(&pwr_count);
  rsp->power.time = atomic_long_read(&pwr_time);

  rsp->thermal.since = 0;
  rsp->thermal.active = (uint8_t) atomic_read(&tmp_onoff);
  if (rsp->thermal.active) {
    then = atomic_long_read(&tmp_start);
    if (then)
      rsp->thermal.since = jiffies_to_msecs(jiffies - then);
  }
  rsp->thermal.count = atomic_read(&tmp_count);
  rsp->thermal.time = atomic_long_read(&tmp_time);

  rsp->alert.since = 0;
  rsp->alert.active = (uint8_t) atomic_read(&alrt_onoff);
  if (rsp->alert.active) {
    then = atomic_long_read(&alrt_start);
    if (then)
      rsp->alert.since = jiffies_to_msecs(jiffies - then);
  }
  rsp->alert.count = atomic_read(&alrt_count);
  rsp->alert.time = atomic_long_read(&alrt_time);

  return 0;
}
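
/*
 * Minimal usage sketch (hypothetical caller, not part of this file;
 * the fields follow the assignments in mr_pm_ttl() above):
 *
 *	struct mr_rsp_ttl ttl;
 *
 *	if (! mr_pm_ttl(&ttl))
 *	  printk("thermal: active %u, count %u, %lu mSec (+ %u current)\n",
 *		ttl.thermal.active, ttl.thermal.count,
 *		(unsigned long) ttl.thermal.time, ttl.thermal.since);
 */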


/*
 * Throttle signaling function (call from PM)
 */

static int ttl_tcrit;

void
mr_throttle(int which, int state)
{
  struct ttl_info ttl;
  uint32_t tmp;

  atomic_inc(&pm_entry);

  tmp = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2);
  ttl.die = GET_BITS(19, 10, tmp);

  /*
   * PM is weird in its distinction between thermal and power throttle.
   * Power below PLim1 should be quiet. Power between PLim1 and PLim0
   * results in TTL_POWER events. Power above PLim0 results in both
   * TTL_POWER and TTL_THERMAL events, _even_ if temperature is well
   * below Tcrit. We handle this by maintaining 3 throttle related
   * event types: thermal throttles, power throttles and power alert.
   * The power alert is flagged on entry as TTL_POWER, no problems.
   * The two throttles both come in as TTL_THERMAL, so we use current
   * die temperature to determine whether it was a thermal threshold
   * or the power limit that was exceeded. The point is that power
   * throttles arriving while temperature is above Tcrit _will_ be
   * counted as thermal throttles, period.
   */
  ttl.upd = 0;
  switch(which) {
    case TTL_POWER:
      (state == TTL_OFF) ? mr_alrt_leave() : mr_alrt_enter();
      ttl.upd |= PM_ALRT_TTL_CHG;
      ttl.upd |= atomic_read(&alrt_onoff) ? PM_ALRT_TTL : 0;
      break;

    case TTL_THERMAL:
#if 1
      /*
       * Careful here: we may get a throttle ON while die > tcrit
       * and select thermal throttle correctly, and then get
       * the corresponding throttle OFF when die has fallen
       * below tcrit, in which case we must de-assert thermal
       * throttle.
       * As a shortcut, we de-assert both throttles if the
       * GPU_HOT signal gets de-asserted (which is correct).
       */
      if (state == TTL_OFF) {
        if (atomic_read(&pwr_onoff))
          ttl.upd |= PM_PWR_TTL_CHG;
        if (atomic_read(&tmp_onoff))
          ttl.upd |= PM_TRM_TTL_CHG;
        mr_pwr_leave();
        mr_tmp_leave();
      }
      else {
        if (ttl_tcrit && ttl.die < ttl_tcrit) {
          if (! atomic_read(&pwr_onoff))
            ttl.upd |= (PM_PWR_TTL_CHG | PM_PWR_TTL);
          mr_pwr_enter();
        }
        else {
          if (! atomic_read(&tmp_onoff))
            ttl.upd |= (PM_TRM_TTL_CHG | PM_TRM_TTL);
          mr_tmp_enter();
        }
      }
#else
      if (ttl_tcrit && ttl.die < ttl_tcrit) {
        (state == TTL_OFF) ? mr_pwr_leave() : mr_pwr_enter();
        ttl.upd |= PM_PWR_TTL_CHG;
        ttl.upd |= atomic_read(&pwr_onoff) ? PM_PWR_TTL : 0;
      }
      else {
        (state == TTL_OFF) ? mr_tmp_leave() : mr_tmp_enter();
        ttl.upd |= PM_TRM_TTL_CHG;
        ttl.upd |= atomic_read(&tmp_onoff) ? PM_TRM_TTL : 0;
      }
#endif
      break;
  }

  micras_ttl_send(&ttl);

#if 0
  printk("ttl - args: which %d, state %d\n", which, state);

  printk("ttl - therm: on %d, count %d, time %ld, start %ld\n",
	atomic_read(&tmp_onoff), atomic_read(&tmp_count),
	atomic_long_read(&tmp_time), atomic_long_read(&tmp_start));

  printk("ttl - power: on %d, count %d, time %ld, start %ld\n",
	atomic_read(&pwr_onoff), atomic_read(&pwr_count),
	atomic_long_read(&pwr_time), atomic_long_read(&pwr_start));

  printk("ttl - alert: on %d, count %d, time %ld, start %ld\n",
	atomic_read(&alrt_onoff), atomic_read(&alrt_count),
	atomic_long_read(&alrt_time), atomic_long_read(&alrt_start));
#endif

  atomic_dec(&pm_entry);
}


/*
 * Throttle signaling function (call from notifier chain)
 *
 * TBD: should we test for odd state transitions and recursions?
 */

static int
mr_pm_throttle_callback(struct notifier_block *nb, unsigned long event, void *msg)
{
  atomic_inc(&pm_entry);

  switch(event) {

    case EVENT_PROCHOT_ON:
      mr_throttle(TTL_THERMAL, TTL_ON);
      break;

    case EVENT_PROCHOT_OFF:
      mr_throttle(TTL_THERMAL, TTL_OFF);
      break;

    case EVENT_PWR_ALERT_ON:
      mr_throttle(TTL_POWER, TTL_ON);
      break;

    case EVENT_PWR_ALERT_OFF:
      mr_throttle(TTL_POWER, TTL_OFF);
      break;

    default:
      /*
       * Ignore whatever else is sent this way
       */
      break;
  }

  atomic_dec(&pm_entry);
  return 0;
}




/*
**
** Power management routines
**
** one_mmio_rd		Read one MMIO register into the save area
** one_mmio_wr		Write one MMIO register from the save area
**
** one_msr_rd		Read one MSR register into the save area
** one_msr_wr		Write one MSR register from the save area
**
** mr_suspend		Prepare for suspend, preserve CSRs to the save area
** mr_cancel		Suspend canceled, restore operating mode
** mr_resume		Recover from suspend, restore CSRs from the save area
**
** For now this stores all registers that are used by this module.
** In reality, only those registers on power planes turned off in
** deep sleep states need to be stored, but at this point it is
** not known which registers are in that group. This is a table
** driven mechanism that _only_ handles RAS related registers.
**
**TBD: Turn off MC handlers while in suspend?
**     There are both pro's and con's on this one, such as
**      + Disabling uncore is easy, just clear INT_EN
**      + Prevents MCs from interfering with PM state transitions
**      - Can hide corruption due to UC errors
**      - Requires a lot of IPIs to shut down core MC handling
**      + There's nobody to handle MCs when cores are asleep.
**      ? Can events hide in *BOX banks during suspend/resume
**        and fire when restoring the INT_EN register?
**      - Disabling core is not that easy (from a module).
**        Enabling core MCEs requires setting flag X86_CR4_MCE
**        in CR4 on every core _and_ writing ~0 to MSR IA32_MCG_CAP
**        on every CPU. Probably better to let per-CPU routines
**        like mce_suspend() and mce_resume() handle it, with
**        some care because we'd want to save all CTLs before
**        mce_suspend() runs and restore them after mce_resume().
**        Problem is how to get at these functions; they are not
**        exported and seem not to be hooked into the kernel's PM
**        call chains. Perhaps the sysclass abstraction ties into PM.
**        Even so, who's to invoke it and how?
*/

#define SAVE_BLOCK_MCA		1	/* Disable MC handling in suspend */
#define RAS_SAVE_MSR		1	/* Include global MSRs in suspend */
#define RAS_SAVE_CPU_MSR	0	/* Include per-CPU MSRs in suspend */

#define SBOX	1		/* SBOX register (index 0) */
#define DBOX	2		/* DBOX register (index 0..1) */
#define GBOX	3		/* GBOX register (index 0..7) */
#define TBOX	4		/* TBOX register (index 0..7) */
#define GMSR	5		/* Global MSR (index 0) */
#define LMSR	6		/* Per-CPU MSR (index 0..CONFIG_NR_CPUS-1) */

#define W64	(1 << 6)	/* 64 bit MMIO register (32 bit default) */
#define VLD	(1 << 7)	/* Register value valid, can be restored */

typedef struct _regrec {
  uint8_t	box;		/* Box type + width bit + valid bit */
  uint8_t	num;		/* Box index (or 0) */
  uint16_t	ofs;		/* MMIO byte offset / MSR number */
  uint64_t	reg;		/* Register value */
} RegRec;
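
/*
 * Layout example (drawn from the susp_mmio[] table below): the entry
 *	{ GBOX | W64, 3, GBOX_FBOX_MCA_CTL_LO, 0 }
 * describes the 64-bit FBOX MCA control register of GBOX instance 3;
 * one_mmio_rd() stores its value in .reg and sets VLD in .box, and
 * one_mmio_wr() writes .reg back only while VLD is set.
 */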

/*
 * Rumor has it that SBOX CSRs below 0x7000 will survive deep sleep.
 * Still, it seems safer to save/restore the CSRs that RAS writes to anyway.
 * We'll leave out a bunch of RO CSRs, most of which are HW status.
 * SCRATCH<n> CSRs are above 0x7000 and need to be preserved.
 *
 *TBD: Somebody else to preserve scratch CSRs not used by RAS?
 *     For now we save and restore all of them.
 */

static RegRec susp_mmio[] = {			/* Used in file */
  { SBOX, 0, SBOX_MCA_INT_EN, 0 },		/* Uncore, must be 1st */
  { SBOX, 0, SBOX_SCRATCH0, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH1, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH2, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH3, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH4, 0 },		/* Common, knc, */
  { SBOX, 0, SBOX_SCRATCH5, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH6, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH7, 0 },		/* Knc, knf */
  { SBOX, 0, SBOX_SCRATCH8, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH9, 0 },		/* Common, knc, knf */
  { SBOX, 0, SBOX_SCRATCH10, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH11, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH12, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH13, 0 },		/* Common */
  { SBOX, 0, SBOX_SCRATCH14, 0 },		/* - */
  { SBOX, 0, SBOX_SCRATCH15, 0 },		/* - */
// { SBOX, 0, SBOX_COMPONENT_ID, 0 },		/* Knc */
// { SBOX, 0, SBOX_SVIDCONTROL, 0 },		/* Knc */
// { SBOX, 0, SBOX_PCIE_PCI_SUBSYSTEM, 0 },	/* Common */
// { SBOX, 0, SBOX_PCIE_VENDOR_ID_DEVICE_ID, 0 },	/* Common */
// { SBOX, 0, SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8, 0 },	/* Common */
  { SBOX, 0, SBOX_OC_I2C_ICR + ICR_OFFSET, 0 },	/* Elog */
  { SBOX, 0, SBOX_OC_I2C_ICR + ISR_OFFSET, 0 },	/* Elog */
  { SBOX, 0, SBOX_OC_I2C_ICR + ISAR_OFFSET, 0 },	/* Elog */
  { SBOX, 0, SBOX_OC_I2C_ICR + IDBR_OFFSET, 0 },	/* Elog */
// { SBOX, 0, SBOX_OC_I2C_ICR + IBMR_OFFSET, 0 },	/* Elog */
// { SBOX, 0, SBOX_COREVOLT, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_COREFREQ, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_MEMVOLT, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_MEMORYFREQ, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_CURRENTRATIO, 0 },		/* Knc */
// { SBOX, 0, SBOX_BOARD_VOLTAGE_SENSE, 0 },	/* Knc, knf */
// { SBOX, 0, SBOX_THERMAL_STATUS, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_BOARD_TEMP1, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_BOARD_TEMP2, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_CURRENT_DIE_TEMP0, 0 },	/* Knc, knf */
// { SBOX, 0, SBOX_CURRENT_DIE_TEMP1, 0 },	/* Knc, knf */
// { SBOX, 0, SBOX_CURRENT_DIE_TEMP2, 0 },	/* Knc, knf */
// { SBOX, 0, SBOX_MAX_DIE_TEMP0, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_MAX_DIE_TEMP1, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_MAX_DIE_TEMP2, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_STATUS_FAN1, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_STATUS_FAN2, 0 },		/* Knc, knf */
// { SBOX, 0, SBOX_SPEED_OVERRIDE_FAN, 0 },	/* Knc, knf */
  { SBOX, 0, SBOX_MCA_INT_STAT, 0 },		/* Uncore */
// { SBOX, 0, SBOX_APICRT16, 0 },		/* Uncore */
  { SBOX, 0, SBOX_MCX_CTL_LO, 0 },		/* Uncore */
  { DBOX, 0, DBOX_MC2_CTL, 0 },			/* Uncore */
#ifdef CONFIG_MK1OM
  { DBOX, 1, DBOX_MC2_CTL, 0 },			/* Uncore */
#endif
  { GBOX | W64, 0, GBOX_FBOX_MCA_CTL_LO, 0 },	/* Uncore */
  { GBOX | W64, 1, GBOX_FBOX_MCA_CTL_LO, 0 },	/* Uncore */
  { GBOX | W64, 2, GBOX_FBOX_MCA_CTL_LO, 0 },	/* Uncore */
  { GBOX | W64, 3, GBOX_FBOX_MCA_CTL_LO, 0 },	/* Uncore */
#ifdef CONFIG_MK1OM
  { GBOX | W64, 4, GBOX_FBOX_MCA_CTL_LO, 0 },	/* Uncore */
  { GBOX | W64, 5, GBOX_FBOX_MCA_CTL_LO, 0 },	/* Uncore */
  { GBOX | W64, 6, GBOX_FBOX_MCA_CTL_LO, 0 },	/* Uncore */
  { GBOX | W64, 7, GBOX_FBOX_MCA_CTL_LO, 0 },	/* Uncore */
#endif
#ifdef CONFIG_MK1OM
  { TBOX | W64, 0, TXS_MCX_CONTROL, 0 },	/* Uncore */
  { TBOX | W64, 1, TXS_MCX_CONTROL, 0 },	/* Uncore */
  { TBOX | W64, 2, TXS_MCX_CONTROL, 0 },	/* Uncore */
  { TBOX | W64, 3, TXS_MCX_CONTROL, 0 },	/* Uncore */
  { TBOX | W64, 4, TXS_MCX_CONTROL, 0 },	/* Uncore */
  { TBOX | W64, 5, TXS_MCX_CONTROL, 0 },	/* Uncore */
  { TBOX | W64, 6, TXS_MCX_CONTROL, 0 },	/* Uncore */
  { TBOX | W64, 7, TXS_MCX_CONTROL, 0 },	/* Uncore */
#endif
};

#if RAS_SAVE_MSR
static RegRec susp_msr[] = {			/* Used in file */
  { GMSR, 0, MSR_IA32_MCG_STATUS, 0 },		/* Uncore, kernel */
};

#if RAS_SAVE_CPU_MSR
static RegRec susp_lcl_msr[4 * CONFIG_NR_CPUS] = {	/* Used in file */
  { LMSR, 0, MSR_IA32_MCx_CTL(0), 0 },		/* Core, kernel */
  { LMSR, 0, MSR_IA32_MCx_CTL(1), 0 },		/* Core, kernel */
  { LMSR, 0, MSR_IA32_MCx_CTL(2), 0 },		/* Core, kernel */
  { LMSR, 0, MSR_IA32_MCG_CTL, 0 },		/* Kernel */
  /*
   * The remaining entries are set up/replicated by pm_init()
   */
};
#endif
#endif


static void
one_mmio_rd(RegRec * r)
{
  switch(r->box & 0xf) {
    case SBOX:
      if (r->box & W64)
        r->reg = mr_sbox_rq(0, r->ofs);
      else
        r->reg = (uint64_t) mr_sbox_rl(0, r->ofs);
      break;
    case DBOX:
      if (r->box & W64)
        r->reg = mr_dbox_rq(r->num, r->ofs);
      else
        r->reg = (uint64_t) mr_dbox_rl(r->num, r->ofs);
      break;
    case GBOX:
      if (r->box & W64)
        r->reg = mr_gbox_rq(r->num, r->ofs);
      else
        r->reg = (uint64_t) mr_gbox_rl(r->num, r->ofs);
      break;
    case TBOX:
      if (mr_txs()) {
        if (r->box & W64)
          r->reg = mr_tbox_rq(r->num, r->ofs);
        else
          r->reg = (uint64_t) mr_tbox_rl(r->num, r->ofs);
      }
      break;
    default:
      r->box &= ~VLD;
      return;
  }
  r->box |= VLD;

#if PM_VERBOSE
  printk("mmio_rd: box %d, idx %3d, ofs %04x -> %llx\n",
	r->box & 0xf, r->num, r->ofs, r->reg);
#endif
}

static void
one_mmio_wr(RegRec * r)
{
  if (! (r->box & VLD))
    return;

  switch(r->box & 0xf) {
    case SBOX:
      if (r->box & W64)
        mr_sbox_wq(0, r->ofs, r->reg);
      else
        mr_sbox_wl(0, r->ofs, (uint32_t) r->reg);
      break;
    case DBOX:
      if (r->box & W64)
        mr_dbox_wq(r->num, r->ofs, r->reg);
      else
        mr_dbox_wl(r->num, r->ofs, (uint32_t) r->reg);
      break;
    case GBOX:
      if (r->box & W64)
        mr_gbox_wq(r->num, r->ofs, r->reg);
      else
        mr_gbox_wl(r->num, r->ofs, (uint32_t) r->reg);
      break;
    case TBOX:
      if (mr_txs()) {
        if (r->box & W64)
          mr_tbox_wq(r->num, r->ofs, r->reg);
        else
          mr_tbox_wl(r->num, r->ofs, (uint32_t) r->reg);
      }
      break;
  }
  r->box &= ~VLD;

#if PM_VERBOSE
  printk("mmio_wr: box %d, idx %3d, ofs %04x <- %llx\n",
	r->box & 0xf, r->num, r->ofs, r->reg);
#endif
}


#if RAS_SAVE_MSR
static void
one_msr_rd(RegRec * r)
{
  uint32_t hi, lo;

  switch(r->box & 0xf) {
    case GMSR:
      rdmsr(r->ofs, lo, hi);
      break;
#if RAS_SAVE_CPU_MSR
    case LMSR:
      rdmsr_on_cpu(r->num, r->ofs, &lo, &hi);
      break;
#endif
    default:
      r->box &= ~VLD;
      return;
  }
  r->reg = ((uint64_t) hi) << 32 | (uint64_t) lo;
  r->box |= VLD;

#if PM_VERBOSE
  printk("msr_rd: box %d, idx %3d, ofs %04x -> %llx\n",
	r->box & 0xf, r->num, r->ofs, r->reg);
#endif
}
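
/*
 * E.g. (illustrative numbers only): a rdmsr() that returns
 * hi = 0x00000001 and lo = 0x80000000 is assembled above into
 * r->reg = 0x0000000180000000; one_msr_wr() splits it back the
 * same way before handing it to wrmsr().
 */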

static void
one_msr_wr(RegRec * r)
{
  uint32_t hi, lo;

  if (! (r->box & VLD))
    return;

  hi = r->reg >> 32;
  lo = r->reg & 0xffffffff;
  switch(r->box & 0xf) {
    case GMSR:
      wrmsr(r->ofs, lo, hi);
      break;
#if RAS_SAVE_CPU_MSR
    case LMSR:
      wrmsr_on_cpu(r->num, r->ofs, lo, hi);
      break;
#endif
  }
  r->box &= ~VLD;

#if PM_VERBOSE
  printk("msr_wr: box %d, idx %3d, ofs %04x <- %llx\n",
	r->box & 0xf, r->num, r->ofs, r->reg);
#endif
}
#endif /* RAS_SAVE_MSR */


/*
 * Preserve all HW registers that will be lost in
 * deep sleep states. This will be SBOX registers
 * above offset 0x7000 and all other BOX registers.
 */

static void
mr_suspend(void)
{
  int i;

  atomic_inc(&pm_entry);

  /*
   * Save SBOX_MCA_INT_EN first and clear it.
   * No more uncore MCAs will get through.
   */
  one_mmio_rd(susp_mmio + 0);
#if SAVE_BLOCK_MCA
  mr_sbox_wl(0, SBOX_MCA_INT_EN, 0);
#endif

  /*
   * Save remaining BOX MMIOs
   */
  for(i = 1; i < ARRAY_SIZE(susp_mmio); i++)
    one_mmio_rd(susp_mmio + i);

#if RAS_SAVE_MSR
  /*
   * Save global MSRs and set MCIP.
   * No new exceptions will be asserted.
   */
  for(i = 0; i < ARRAY_SIZE(susp_msr); i++)
    one_msr_rd(susp_msr + i);
#if SAVE_BLOCK_MCA
  wrmsr(MSR_IA32_MCG_STATUS, MCG_STATUS_MCIP, 0);
#endif

#if RAS_SAVE_CPU_MSR
  /*
   * Save per-CPU MSRs
   */
  for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++)
    one_msr_rd(susp_lcl_msr + i);
#endif
#endif

  atomic_dec(&pm_entry);
}


/*
 * Undo side effects of a suspend call.
 * Nothing to do unless we turned MC handlers off.
 */

static void
mr_cancel(void)
{
  int i;

  atomic_inc(&pm_entry);

  /*
   * Restore SBOX_MCA_INT_EN to unblock uncore MCs.
   * Invalidate all other saved MMIO registers.
   */
  one_mmio_wr(susp_mmio + 0);
  for(i = 1; i < ARRAY_SIZE(susp_mmio); i++)
    susp_mmio[i].box &= ~VLD;

#if RAS_SAVE_MSR
  /*
   * Restore IA32_MCG_STATUS to unblock core MCs.
   * Invalidate all other saved MSR registers.
   */
  one_msr_wr(susp_msr + 0);
  for(i = 1; i < ARRAY_SIZE(susp_msr); i++)
    susp_msr[i].box &= ~VLD;

#if RAS_SAVE_CPU_MSR
  for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++)
    susp_lcl_msr[i].box &= ~VLD;
#endif
#endif

  atomic_dec(&pm_entry);
}


/*
 * Restore all HW registers that we use.
 */

static void
mr_resume(void)
{
  int i;

  atomic_inc(&pm_entry);

  /*
   * Clear uncore MCA banks (just in case)
   */
  if (susp_mmio[0].box & VLD)
    box_reset(0);

  /*
   * Restore all BOX MMIOs but SBOX_MCA_INT_EN
   */
  for(i = 1; i < ARRAY_SIZE(susp_mmio); i++)
    one_mmio_wr(susp_mmio + i);

  /*
   * Then restore SBOX_MCA_INT_EN to enable uncore MCAs
   */
  one_mmio_wr(susp_mmio + 0);

#if RAS_SAVE_MSR
  /*
   * Restore all global MSRs but IA32_MCG_STATUS
   */
  for(i = 1; i < ARRAY_SIZE(susp_msr); i++)
    one_msr_wr(susp_msr + i);

  /*
   * Then restore IA32_MCG_STATUS to allow core MCAs
   */
  one_msr_wr(susp_msr + 0);

#if RAS_SAVE_CPU_MSR
  /*
   * Restore all per-CPU MSRs
   */
  for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++)
    one_msr_wr(susp_lcl_msr + i);
#endif
#endif

  atomic_dec(&pm_entry);
}
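
/*
 * Design note on the ordering above: on suspend SBOX_MCA_INT_EN is
 * saved first and then cleared, and on resume it is restored last,
 * so uncore MCA interrupts stay blocked for the whole window where
 * the other *BOX control registers are in flux. IA32_MCG_STATUS
 * gets the same first/last treatment on the core side.
 */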


/*
 * Callback from PM notifier chain.
 * TBD: should we test for odd state transitions and recursions?
 */

static int
mr_pm_callback(struct notifier_block *nb, unsigned long event, void *msg)
{
  switch(event) {
    case MICPM_DEVEVENT_SUSPEND:
      mr_suspend();
      break;

    case MICPM_DEVEVENT_RESUME:
      mr_resume();
      break;

    case MICPM_DEVEVENT_FAIL_SUSPEND:
      mr_cancel();
      break;

    default:
      /*
       * Ignore whatever else is sent this way
       */
      break;
  }

  return 0;
}



/*
**
** The PM module loads before RAS, so we must set up
** the API to support power management, i.e. register with it.
** PM needs:
**  - Notification when MT changes certain variables.
**    Provided by a call-out list that the PM sets
**    at registration time.
**  - Access to MT calls.
**    The PM module can use micras_mt_call() for access.
**    Since PM loads first, this function needs to
**    be passed at registration time.
** RAS needs:
**  - List of core voltages (for CVOLT query).
**    We pass a pointer to the voltage list and the
**    voltage list counter to the PM module, which will
**    fill in the actual values (not available until the
**    core-freq driver loads).
**  - List of core frequencies (for CFREQ query).
**    Same solution as for CVOLT.
**  - Notifications for throttle state changes.
**  - Power management notifications for suspend/resume.
**
** Note: can one notifier block be inserted in multiple
**       chains? It's assumed not, which requires two blocks
**       both pointing to the same local function.
*/

extern struct mr_rsp_freq freq;
extern struct mr_rsp_volt volt;

struct micpm_params pm_reg;		/* Our data for PM */
struct micpm_callbacks pm_cb;		/* PM data for us */

extern void micpm_device_register(struct notifier_block *n);
extern void micpm_device_unregister(struct notifier_block *n);
extern void micpm_atomic_notifier_register(struct notifier_block *n);
extern void micpm_atomic_notifier_unregister(struct notifier_block *n);

static struct notifier_block ras_deviceevent = {
  .notifier_call = mr_pm_callback,
};

static struct notifier_block ras_throttle_event_ns = {
  .notifier_call = mr_pm_throttle_callback,
};

static struct notifier_block ras_throttle_event = {
  .notifier_call = mr_pm_throttle_callback,
};


/*
 * Set up PM callbacks and SCIF handler.
 */

static int
pm_mt_call(uint16_t cmd, void * buf)
{
  int err;

  atomic_inc(&pm_entry);
  err = micras_mt_call(cmd, buf);
  atomic_dec(&pm_entry);

  return err;
}


int __init
pm_init(void)
{
  extern int mr_smc_rd(uint8_t, uint32_t *);

#if RAS_SAVE_CPU_MSR
  /*
   * Preset MCA bank MSR register descriptions
   *
   *TBD: We have to use IPIs to read MSRs, which will wake up
   *     cores that are asleep when this function is called.
   *     The PM module may not like this at all.
   */
  int i, j;

  for(i = 1; i < nr_cpu_ids; i++) {
    j = 4 * i;
    susp_lcl_msr[j] = susp_lcl_msr[0];
    susp_lcl_msr[j + 1] = susp_lcl_msr[1];
    susp_lcl_msr[j + 2] = susp_lcl_msr[2];
    susp_lcl_msr[j + 3] = susp_lcl_msr[3];
    susp_lcl_msr[j].num = i;
    susp_lcl_msr[j + 1].num = i;
    susp_lcl_msr[j + 2].num = i;
    susp_lcl_msr[j + 3].num = i;
  }
#endif

  /*
   * Get temperature where power throttle becomes thermal throttle
   */
  mr_smc_rd(0x4c, &ttl_tcrit);

  /*
   * Register with the MIC Power Management driver.
   */
  pm_reg.volt_lst = volt.supt;
  pm_reg.volt_len = &volt.slen;
  pm_reg.volt_siz = ARRAY_SIZE(volt.supt);
  pm_reg.freq_lst = freq.supt;
  pm_reg.freq_len = &freq.slen;
  pm_reg.freq_siz = ARRAY_SIZE(freq.supt);
  pm_reg.mt_call = pm_mt_call;
  pm_reg.mt_ttl = mr_throttle;
  if (micpm_ras_register(&pm_cb, &pm_reg))
    goto fail_pm;

  /*
   * Get onto the PM notifier lists.
   * MicPm reports events in 2 chains, one atomic and one
   * blocking. Our callback will not block!
   */
  micpm_atomic_notifier_register(&ras_throttle_event_ns);
  micpm_notifier_register(&ras_throttle_event);

  if (boot_cpu_data.x86_mask == KNC_C_STEP)
    micpm_device_register(&ras_deviceevent);

  printk("RAS.pm: init complete\n");
  return 0;

fail_pm:
  printk("RAS.pm: init failed\n");
  return 1;
}
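
/*
 * Hookup sketch (illustrative only; the actual call sites live
 * elsewhere in the micras module, not in this file):
 *
 *	if (pm_init())
 *	  return -ENODEV;	// no throttle tracking or suspend hooks
 *	...
 *	pm_exit();		// on module unload
 */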


/*
 * Cleanup for module unload.
 * Clear/restore hooks in the native MCA handler.
 */

void __exit
pm_exit(void)
{
  /*
   * Get off the PM notifier lists
   */
  micpm_atomic_notifier_unregister(&ras_throttle_event_ns);
  micpm_notifier_unregister(&ras_throttle_event);

  if (boot_cpu_data.x86_mask == KNC_C_STEP)
    micpm_device_unregister(&ras_deviceevent);

  /*
   * De-register with the PM module.
   */
  micpm_ras_unregister();

  /*
   * Wait for any calls into this module to finish.
   */
  while(atomic_read(&pm_entry))
    cpu_relax();

  printk("RAS.pm: exit complete\n");
}

#endif /* USE_PM */