[xeon-phi-kernel-module] / ras / micras_pm.c
/*
* Copyright 2010-2017 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License, version 2,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Disclaimer: The codes contained in these modules may be specific to
* the Intel Software Development Platform codenamed Knights Ferry,
* and the Intel product codenamed Knights Corner, and are not backward
* compatible with other Intel products. Additionally, Intel will NOT
* support the codes or instruction set in future products.
*
* Intel offers no warranty of any kind regarding the code. This code is
* licensed on an "AS IS" basis and Intel is not obligated to provide
* any support, assistance, installation, training, or other services
* of any kind. Intel is also not obligated to provide any updates,
* enhancements or extensions. Intel specifically disclaims any warranty
* of merchantability, non-infringement, fitness for any particular
* purpose, and any other warranty.
*
* Further, Intel disclaims all liability of any kind, including but
* not limited to liability for infringement of any proprietary rights,
* relating to the use of the code, even if Intel is notified of the
* possibility of such liability. Except as expressly stated in an Intel
* license agreement provided with this code and agreed upon with Intel,
* no license, express or implied, by estoppel or otherwise, to any
* intellectual property rights is granted herein.
*/
/*
* RAS PM interface
*
* Contains code to handle interaction with the PM driver.
* This includes the initial upload of core voltages and
* frequencies, handling of 'turbo' mode, and accounting
* for and reporting of card throttles.
* This really is for KnC only.
*/
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/device.h>
#include <linux/sysfs.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/io.h>
#include <linux/cred.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/apic.h>
#include <asm/mic/mic_common.h>
#include <asm/mic/mic_knc/micsboxdefine.h>
#include <scif.h>
#include "micras.h"
#include "monahan.h"
#include <asm/mic/micpm_device.h>
#if USE_PM
static atomic_t pm_entry; /* Active calls from PM */
/*
* Local variables to keep track of throttle states
*
* onoff Set to 1 if throttling is in effect, otherwise 0
* count Count of complete throttles (not counting current).
* time Time spent in complete throttles
* start Time when current throttle started (or 0)
*
* Time is measured in jiffies and converted to mSecs at the end
* of a throttle period. Jiffies have lower resolution than mSecs.
* If a throttle starts and ends within the same jiffy, a standard
* penalty of 1/2 jiffy gets added.
*
*TBD: perhaps it's better simply to add 1/2 jiffy to every throttle
* period to compensate for round-down errors. That would be fair
* if the average throttle period is longer than 1 jiffy.
*
*TBD: Using atomics may be overkill. Calls from the RAS MT thread
* are guaranteed to be serialized, i.e. the report routine need
* not care about re-entrancy.
*/
static atomic_t tmp_onoff;
static atomic_t tmp_count;
static atomic_long_t tmp_time;
static atomic_long_t tmp_start;
static atomic_t pwr_onoff;
static atomic_t pwr_count;
static atomic_long_t pwr_time;
static atomic_long_t pwr_start;
static atomic_t alrt_onoff;
static atomic_t alrt_count;
static atomic_long_t alrt_time;
static atomic_long_t alrt_start;
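/*
* Throttle window accounting: one enter/leave pair per event type
* (power throttle, thermal throttle, power alert). 'enter' records
* the start jiffy once; 'leave' bumps the count and adds the elapsed
* time in mSec (or the 1/2 jiffy penalty if the throttle started and
* ended within the same jiffy).
*/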
static void
mr_pwr_enter(void)
{
if (atomic_xchg(&pwr_onoff, 1))
return;
atomic_long_set(&pwr_start, jiffies);
}
static void
mr_pwr_leave(void)
{
unsigned long then;
if (! atomic_xchg(&pwr_onoff, 0))
return;
then = atomic_long_xchg(&pwr_start, 0);
atomic_inc(&pwr_count);
if (jiffies == then)
atomic_long_add(jiffies_to_msecs(1) / 2, &pwr_time);
else
atomic_long_add(jiffies_to_msecs(jiffies - then), &pwr_time);
}
static void
mr_tmp_enter(void)
{
if (atomic_xchg(&tmp_onoff, 1))
return;
atomic_long_set(&tmp_start, jiffies);
}
static void
mr_tmp_leave(void)
{
unsigned long then;
if (! atomic_xchg(&tmp_onoff, 0))
return;
then = atomic_long_xchg(&tmp_start, 0);
atomic_inc(&tmp_count);
if (jiffies == then)
atomic_long_add(jiffies_to_msecs(1) / 2, &tmp_time);
else
atomic_long_add(jiffies_to_msecs(jiffies - then), &tmp_time);
}
static void
mr_alrt_enter(void)
{
if (atomic_xchg(&alrt_onoff, 1))
return;
atomic_long_set(&alrt_start, jiffies);
}
static void
mr_alrt_leave(void)
{
unsigned long then;
if (! atomic_xchg(&alrt_onoff, 0))
return;
then = atomic_long_xchg(&alrt_start, 0);
atomic_inc(&alrt_count);
if (jiffies == then)
atomic_long_add(jiffies_to_msecs(1) / 2, &alrt_time);
else
atomic_long_add(jiffies_to_msecs(jiffies - then), &alrt_time);
}
/*
* Report current throttle state(s) to MT.
* Simple copy of local variables, except for the time
* measurement, where current throttle (if any) is included.
* Don't want a lock to gate access to the local variables,
* so the atomics need to be read in the correct order:
* first the throttle state, then the time adder if a throttle
* is in progress, then the counters. If PM enters or leaves a
* throttle while the stats are being read, the worst case is that
* the time of the current throttle is not included until the next read.
*/
int
mr_pm_ttl(struct mr_rsp_ttl * rsp)
{
unsigned long then;
rsp->power.since = 0;
rsp->power.active = (uint8_t) atomic_read(&pwr_onoff);
if (rsp->power.active) {
then = atomic_long_read(&pwr_start);
if (then)
rsp->power.since = jiffies_to_msecs(jiffies - then);
}
rsp->power.count = atomic_read(&pwr_count);
rsp->power.time = atomic_long_read(&pwr_time);
rsp->thermal.since = 0;
rsp->thermal.active = (uint8_t) atomic_read(&tmp_onoff);
if (rsp->thermal.active) {
then = atomic_long_read(&tmp_start);
if (then)
rsp->thermal.since = jiffies_to_msecs(jiffies - then);
}
rsp->thermal.count = atomic_read(&tmp_count);
rsp->thermal.time = atomic_long_read(&tmp_time);
rsp->alert.since = 0;
rsp->alert.active = (uint8_t) atomic_read(&alrt_onoff);
if (rsp->alert.active) {
then = atomic_long_read(&alrt_start);
if (then)
rsp->alert.since = jiffies_to_msecs(jiffies - then);
}
rsp->alert.count = atomic_read(&alrt_count);
rsp->alert.time = atomic_long_read(&alrt_time);
return 0;
}
/*
* Throttle signaling function (call from PM)
*/
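/*
* Die temperature (read from the SMC in pm_init) at or above which
* a throttle event is accounted as thermal rather than power.
*/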
static int ttl_tcrit;
void
mr_throttle(int which, int state)
{
struct ttl_info ttl;
uint32_t tmp;
atomic_inc(&pm_entry);
tmp = mr_sbox_rl(0, SBOX_THERMAL_STATUS_2);
ttl.die = GET_BITS(19, 10, tmp);
/*
* PM is weird in its distinction between thermal and power throttles.
* Power below PLIM should be quiet. Power between PLim1 and PLim0
* results in TTL_POWER events. Power above PLim0 results in both
* TTL_POWER and TTL_THERMAL events, _even_ if the temperature is well
* below Tcrit. We handle this by maintaining three throttle related
* event types: thermal throttles, power throttles and power alerts.
* The power alert is flagged on entry as TTL_POWER, which is
* unambiguous. The two throttles both come in as TTL_THERMAL, so we
* use the current die temperature to determine whether it was a
* thermal threshold or the power limit that was exceeded. The
* consequence is that power throttles arriving while the temperature
* is above Tcrit _will_ be counted as thermal throttles, period.
*/
ttl.upd = 0;
switch(which) {
case TTL_POWER:
(state == TTL_OFF) ? mr_alrt_leave() : mr_alrt_enter();
ttl.upd |= PM_ALRT_TTL_CHG;
ttl.upd |= atomic_read(&alrt_onoff) ? PM_ALRT_TTL : 0;
break;
case TTL_THERMAL:
#if 1
/*
* Careful here: we may get a throttle ON while die > tcrit
* and correctly select thermal throttle, and then get the
* corresponding throttle OFF when the die has fallen below
* tcrit, in which case we must de-assert the thermal throttle.
* As a shortcut, we de-assert both throttles when the GPU_HOT
* signal gets de-asserted (which is correct).
*/
if (state == TTL_OFF) {
if (atomic_read(&pwr_onoff))
ttl.upd |= PM_PWR_TTL_CHG;
if (atomic_read(&tmp_onoff))
ttl.upd |= PM_TRM_TTL_CHG;
mr_pwr_leave();
mr_tmp_leave();
}
else {
if (ttl_tcrit && ttl.die < ttl_tcrit) {
if (! atomic_read(&pwr_onoff))
ttl.upd |= (PM_PWR_TTL_CHG | PM_PWR_TTL);
mr_pwr_enter();
}
else {
if (! atomic_read(&tmp_onoff))
ttl.upd |= (PM_TRM_TTL_CHG | PM_TRM_TTL);
mr_tmp_enter();
}
}
#else
if (ttl_tcrit && ttl.die < ttl_tcrit) {
(state == TTL_OFF) ? mr_pwr_leave() : mr_pwr_enter();
ttl.upd |= PM_PWR_TTL_CHG;
ttl.upd |= atomic_read(&pwr_onoff) ? PM_PWR_TTL : 0;
}
else {
(state == TTL_OFF) ? mr_tmp_leave() : mr_tmp_enter();
ttl.upd |= PM_TRM_TTL_CHG;
ttl.upd |= atomic_read(&tmp_onoff) ? PM_TRM_TTL : 0;
}
#endif
break;
}
micras_ttl_send(&ttl);
#if 0
printk("ttl - args: which %d, state %d\n", which, state);
printk("ttl - therm: on %d, count %d, time %ld, start %ld\n",
atomic_read(&tmp_onoff), atomic_read(&tmp_count),
atomic_long_read(&tmp_time), atomic_long_read(&tmp_start));
printk("ttl - power: on %d, count %d, time %ld, start %ld\n",
atomic_read(&pwr_onoff), atomic_read(&pwr_count),
atomic_long_read(&pwr_time), atomic_long_read(&pwr_start));
printk("ttl - alert: on %d, count %d, time %ld, start %ld\n",
atomic_read(&alrt_onoff), atomic_read(&alrt_count),
atomic_long_read(&alrt_time), atomic_long_read(&alrt_start));
#endif
atomic_dec(&pm_entry);
}
/*
* Throttle signaling function (call from notifier chain)
*
* TBD: should we test for odd state transitions and recursions?
*/
static int
mr_pm_throttle_callback(struct notifier_block *nb, unsigned long event, void *msg)
{
atomic_inc(&pm_entry);
switch(event) {
case EVENT_PROCHOT_ON:
mr_throttle(TTL_THERMAL, TTL_ON);
break;
case EVENT_PROCHOT_OFF:
mr_throttle(TTL_THERMAL, TTL_OFF);
break;
case EVENT_PWR_ALERT_ON:
mr_throttle(TTL_POWER, TTL_ON);
break;
case EVENT_PWR_ALERT_OFF:
mr_throttle(TTL_POWER, TTL_OFF);
break;
default:
/*
* Ignore whatever else is sent this way
*/
break;
}
atomic_dec(&pm_entry);
return 0;
}
/*
**
** Power management routines
**
** one_mmio_rd Read one MMIO register into its in-memory save slot
** one_mmio_wr Write one MMIO register back from its save slot
**
** one_msr_rd Read one MSR register into its in-memory save slot
** one_msr_wr Write one MSR register back from its save slot
**
** mr_suspend Prepare for suspend, preserve CSRs to the save table
** mr_cancel Suspend canceled, restore operating mode
** mr_resume Recover from suspend, restore CSRs from the save table
**
** For now this stores all registers that are used by this module.
** In reality, only those registers on power planes turned off in
** deep sleep states need to be stored, but at this point it is
** not known which registers are in that group. This is a table
** driven mechanism that _only_ handles RAS related registers.
**
**TBD: Turn off MC handlers while in suspend?
** There are both pros and cons on this one, such as
** + Disabling uncore is easy, just clear INT_EN
** + Prevents MCs from interfering with PM state transitions
** - Can hide corruption due to UC errors
** - Requires a lot of IPIs to shut down core MC handling
** + There's nobody to handle MCs when cores are asleep.
** ? Can events hide in *BOX banks during suspend/resume
** and fire when restoring the INT_EN register?
** - Disabling core is not that easy (from a module).
** Enabling core MCEs requires setting the X86_CR4_MCE flag
** in CR4 on every core _and_ writing ~0 to MSR IA32_MCG_CTL
** on every CPU. Probably better to let per-CPU routines
** like mce_suspend() and mce_resume() handle it, with
** some care because we'd want to save all CTLs before
** mce_suspend() runs and restore them after mce_resume().
** The problem is how to get at these functions; they are not
** exported and seem not to be hooked into the kernel's PM
** call chains. Perhaps the sysclass abstraction ties into PM.
** Even so, who's to invoke it and how?
*/
#define SAVE_BLOCK_MCA 1 /* Disable MC handling in suspend */
#define RAS_SAVE_MSR 1 /* Include global MSRs in suspend */
#define RAS_SAVE_CPU_MSR 0 /* Include per-CPU MSRs in suspend */
#define SBOX 1 /* SBOX register (index 0) */
#define DBOX 2 /* DBOX register (index 0..1) */
#define GBOX 3 /* GBOX register (index 0..7) */
#define TBOX 4 /* TBOX register (index 0..7) */
#define GMSR 5 /* Global MSR (index 0) */
#define LMSR 6 /* Per-CPU MSR (index 0..CONFIG_NR_CPUS-1) */
#define W64 (1 << 6) /* 64 bit MMIO register (32 bit default) */
#define VLD (1 << 7) /* Register value valid, can be restored */
typedef struct _regrec {
uint8_t box; /* Box type + width bit + valid bit */
uint8_t num; /* Box index (or 0) */
uint16_t ofs; /* MMIO byte offset / MSR number */
uint64_t reg; /* Register value */
} RegRec;
/*
* Rumor has it that SBOX CSRs below 0x7000 will survive deep sleep.
* It seems safer to save/restore the CSRs that RAS writes to anyway.
* We'll leave out a bunch of RO CSRs, most of which are HW status.
* SCRATCH<n> CSRs are above 0x7000 and need to be preserved.
*
*TBD: Should somebody else preserve the scratch CSRs not used by RAS?
* For now we save and restore all of them.
*/
static RegRec susp_mmio[] = { /* Used in file */
{ SBOX, 0, SBOX_MCA_INT_EN, 0 }, /* Uncore, must be 1st */
{ SBOX, 0, SBOX_SCRATCH0, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH1, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH2, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH3, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH4, 0 }, /* Common, knc, */
{ SBOX, 0, SBOX_SCRATCH5, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH6, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH7, 0 }, /* Knc, knf */
{ SBOX, 0, SBOX_SCRATCH8, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH9, 0 }, /* Common, knc, knf */
{ SBOX, 0, SBOX_SCRATCH10, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH11, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH12, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH13, 0 }, /* Common */
{ SBOX, 0, SBOX_SCRATCH14, 0 }, /* - */
{ SBOX, 0, SBOX_SCRATCH15, 0 }, /* - */
// { SBOX, 0, SBOX_COMPONENT_ID, 0 }, /* Knc */
// { SBOX, 0, SBOX_SVIDCONTROL, 0 }, /* Knc */
// { SBOX, 0, SBOX_PCIE_PCI_SUBSYSTEM, 0 }, /* Common */
// { SBOX, 0, SBOX_PCIE_VENDOR_ID_DEVICE_ID, 0 }, /* Common */
// { SBOX, 0, SBOX_PCIE_PCI_REVISION_ID_AND_C_0X8, 0 },/* Common */
{ SBOX, 0, SBOX_OC_I2C_ICR + ICR_OFFSET, 0 }, /* Elog */
{ SBOX, 0, SBOX_OC_I2C_ICR + ISR_OFFSET, 0 }, /* Elog */
{ SBOX, 0, SBOX_OC_I2C_ICR + ISAR_OFFSET, 0 }, /* Elog */
{ SBOX, 0, SBOX_OC_I2C_ICR + IDBR_OFFSET, 0 }, /* Elog */
// { SBOX, 0, SBOX_OC_I2C_ICR + IBMR_OFFSET, 0 }, /* Elog */
// { SBOX, 0, SBOX_COREVOLT, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_COREFREQ, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MEMVOLT, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MEMORYFREQ, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_CURRENTRATIO, 0 }, /* Knc */
// { SBOX, 0, SBOX_BOARD_VOLTAGE_SENSE, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_THERMAL_STATUS, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_BOARD_TEMP1, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_BOARD_TEMP2, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_CURRENT_DIE_TEMP0, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_CURRENT_DIE_TEMP1, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_CURRENT_DIE_TEMP2, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MAX_DIE_TEMP0, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MAX_DIE_TEMP1, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_MAX_DIE_TEMP2, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_STATUS_FAN1, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_STATUS_FAN2, 0 }, /* Knc, knf */
// { SBOX, 0, SBOX_SPEED_OVERRIDE_FAN, 0 }, /* Knc, knf */
{ SBOX, 0, SBOX_MCA_INT_STAT, 0 }, /* Uncore */
// { SBOX, 0, SBOX_APICRT16, 0 }, /* Uncore */
{ SBOX, 0, SBOX_MCX_CTL_LO, 0 }, /* Uncore */
{ DBOX, 0, DBOX_MC2_CTL, 0 }, /* Uncore */
#ifdef CONFIG_MK1OM
{ DBOX, 1, DBOX_MC2_CTL, 0 }, /* Uncore */
#endif
{ GBOX | W64, 0, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
{ GBOX | W64, 1, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
{ GBOX | W64, 2, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
{ GBOX | W64, 3, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
#ifdef CONFIG_MK1OM
{ GBOX | W64, 4, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
{ GBOX | W64, 5, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
{ GBOX | W64, 6, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
{ GBOX | W64, 7, GBOX_FBOX_MCA_CTL_LO, 0 }, /* Uncore */
#endif
#ifdef CONFIG_MK1OM
{ TBOX | W64, 0, TXS_MCX_CONTROL, 0 }, /* Uncore */
{ TBOX | W64, 1, TXS_MCX_CONTROL, 0 }, /* Uncore */
{ TBOX | W64, 2, TXS_MCX_CONTROL, 0 }, /* Uncore */
{ TBOX | W64, 3, TXS_MCX_CONTROL, 0 }, /* Uncore */
{ TBOX | W64, 4, TXS_MCX_CONTROL, 0 }, /* Uncore */
{ TBOX | W64, 5, TXS_MCX_CONTROL, 0 }, /* Uncore */
{ TBOX | W64, 6, TXS_MCX_CONTROL, 0 }, /* Uncore */
{ TBOX | W64, 7, TXS_MCX_CONTROL, 0 }, /* Uncore */
#endif
};
#if RAS_SAVE_MSR
static RegRec susp_msr[] = { /* Used in file */
{ GMSR, 0, MSR_IA32_MCG_STATUS, 0 }, /* Uncore, kernel */
};
#if RAS_SAVE_CPU_MSR
static RegRec susp_lcl_msr[4 * CONFIG_NR_CPUS] = { /* Used in file */
{ LMSR, 0, MSR_IA32_MCx_CTL(0), 0 }, /* Core, kernel */
{ LMSR, 0, MSR_IA32_MCx_CTL(1), 0 }, /* Core, kernel */
{ LMSR, 0, MSR_IA32_MCx_CTL(2), 0 }, /* Core, kernel */
{ LMSR, 0, MSR_IA32_MCG_CTL, 0 }, /* kernel */
/*
* The remaining entries are set up (replicated) by pm_init()
*/
};
#endif
#endif
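/*
* Read one MMIO register into its save table slot and mark the slot
* valid. The box type selects the access routine, W64 selects 64 vs
* 32 bit width, and TBOX registers are only read if the card has TXS.
*/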
static void
one_mmio_rd(RegRec * r)
{
switch(r->box & 0xf) {
case SBOX:
if (r->box & W64)
r->reg = mr_sbox_rq(0, r->ofs);
else
r->reg = (uint64_t) mr_sbox_rl(0, r->ofs);
break;
case DBOX:
if (r->box & W64)
r->reg = mr_dbox_rq(r->num, r->ofs);
else
r->reg = (uint64_t) mr_dbox_rl(r->num, r->ofs);
break;
case GBOX:
if (r->box & W64)
r->reg = mr_gbox_rq(r->num, r->ofs);
else
r->reg = (uint64_t) mr_gbox_rl(r->num, r->ofs);
break;
case TBOX:
if (mr_txs()) {
if (r->box & W64)
r->reg = mr_tbox_rq(r->num, r->ofs);
else
r->reg = (uint64_t) mr_tbox_rl(r->num, r->ofs);
}
break;
default:
r->box &= ~VLD;
return;
}
r->box |= VLD;
#if PM_VERBOSE
printk("mmio_rd: box %d, idx %3d, ofs %04x -> %llx\n",
r->box & 0xf, r->num, r->ofs, r->reg);
#endif
}
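/*
* Write one MMIO register back from its save table slot, skipping
* slots that were never marked valid, then clear the valid bit so a
* stale value cannot be restored twice.
*/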
static void
one_mmio_wr(RegRec * r)
{
if (! (r->box & VLD))
return;
switch(r->box & 0xf) {
case SBOX:
if (r->box & W64)
mr_sbox_wq(0, r->ofs, r->reg);
else
mr_sbox_wl(0, r->ofs, (uint32_t) r->reg);
break;
case DBOX:
if (r->box & W64)
mr_dbox_wq(r->num, r->ofs, r->reg);
else
mr_dbox_wl(r->num, r->ofs, (uint32_t) r->reg);
break;
case GBOX:
if (r->box & W64)
mr_gbox_wq(r->num, r->ofs, r->reg);
else
mr_gbox_wl(r->num, r->ofs, (uint32_t) r->reg);
break;
case TBOX:
if (mr_txs()) {
if (r->box & W64)
mr_tbox_wq(r->num, r->ofs, r->reg);
else
mr_tbox_wl(r->num, r->ofs, (uint32_t) r->reg);
}
break;
}
r->box &= ~VLD;
#if PM_VERBOSE
printk("mmio_wr: box %d, idx %3d, ofs %04x <- %llx\n",
r->box & 0xf, r->num, r->ofs, r->reg);
#endif
}
#if RAS_SAVE_MSR
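/*
* Read one MSR into its save table slot and mark the slot valid;
* global MSRs use rdmsr() locally, per-CPU MSRs use rdmsr_on_cpu().
*/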
static void
one_msr_rd(RegRec * r)
{
uint32_t hi, lo;
switch(r->box & 0xf) {
case GMSR:
rdmsr(r->ofs, lo, hi);
break;
#if RAS_SAVE_CPU_MSR
case LMSR:
rdmsr_on_cpu(r->num, r->ofs, &lo, &hi);
break;
#endif
default:
r->box &= ~VLD;
return;
}
r->reg = ((uint64_t) hi) << 32 | (uint64_t) lo;
r->box |= VLD;
#if PM_VERBOSE
printk("msr_rd: box %d, idx %3d, ofs %04x -> %llx\n",
r->box & 0xf, r->num, r->ofs, r->reg);
#endif
}
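/*
* Write one MSR back from its save table slot (wrmsr locally for
* global MSRs, wrmsr_on_cpu for per-CPU MSRs) and clear the valid bit.
*/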
static void
one_msr_wr(RegRec * r)
{
uint32_t hi, lo;
if (! (r->box & VLD))
return;
hi = r->reg >> 32;
lo = r->reg & 0xffffffff;
switch(r->box & 0xf) {
case GMSR:
wrmsr(r->ofs, lo, hi);
break;
#if RAS_SAVE_CPU_MSR
case LMSR:
wrmsr_on_cpu(r->num, r->ofs, lo, hi);
break;
#endif
}
r->box &= ~VLD;
#if PM_VERBOSE
printk("msr_wr: box %d, idx %3d, ofs %04x <- %llx\n",
r->box & 0xf, r->num, r->ofs, r->reg);
#endif
}
#endif /* RAS_SAVE_MSR */
/*
* Preserve all HW registers that will be lost in
* deep sleep states. These are SBOX registers
* above offset 0x7000 and all other BOX registers.
*/
static void
mr_suspend(void)
{
int i;
atomic_inc(&pm_entry);
/*
* Save SBOX_MCA_INT_EN first and clear it.
* No more uncore MCAs will get through.
*/
one_mmio_rd(susp_mmio + 0);
#if SAVE_BLOCK_MCA
mr_sbox_wl(0, SBOX_MCA_INT_EN, 0);
#endif
/*
* Save remaining BOX MMIOs
*/
for(i = 1; i < ARRAY_SIZE(susp_mmio); i++)
one_mmio_rd(susp_mmio + i);
#if RAS_SAVE_MSR
/*
* Save global MSRs and set MCIP.
* No new MC exceptions will be asserted.
*/
for(i = 0; i < ARRAY_SIZE(susp_msr); i++)
one_msr_rd(susp_msr + i);
#if SAVE_BLOCK_MCA
wrmsr(MSR_IA32_MCG_STATUS, MCG_STATUS_MCIP, 0);
#endif
#if RAS_SAVE_CPU_MSR
/*
* Save per-CPU MSRs
*/
for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++)
one_msr_rd(susp_lcl_msr + i);
#endif
#endif
atomic_dec(&pm_entry);
}
/*
* Undo side effects of a suspend call.
* Nothing to do unless we turned MC handlers off.
*/
static void
mr_cancel(void)
{
int i;
atomic_inc(&pm_entry);
/*
* Restore SBOX_MCA_INT_EN to unblock uncore MCs
* Invalidate all other saved MMIO registers.
*/
one_mmio_wr(susp_mmio + 0);
for(i = 1; i < ARRAY_SIZE(susp_mmio); i++)
susp_mmio[i].box &= ~VLD;
#if RAS_SAVE_MSR
/*
* Restore IA32_MCG_STATUS to unblock core MCs
* Invalidate all other saved MSR registers.
*/
one_msr_wr(susp_msr + 0);
for(i = 1; i < ARRAY_SIZE(susp_msr); i++)
susp_msr[i].box &= ~VLD;
#if RAS_SAVE_CPU_MSR
for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++)
susp_lcl_msr[i].box &= ~VLD;
#endif
#endif
atomic_dec(&pm_entry);
}
/*
* Restore all HW registers that we use.
*/
static void
mr_resume(void)
{
int i;
atomic_inc(&pm_entry);
/*
* Clear uncore MCA banks (just in case)
*/
if (susp_mmio[0].box & VLD)
box_reset(0);
/*
* Restore all BOX MMIOs but SBOX_MCA_INT_EN
*/
for(i = 1; i < ARRAY_SIZE(susp_mmio); i++)
one_mmio_wr(susp_mmio + i);
/*
* Then restore SBOX_MCA_INT_EN to enable uncore MCAs
*/
one_mmio_wr(susp_mmio + 0);
#if RAS_SAVE_MSR
/*
* Restore all global MSRs but IA32_MCG_STATUS
*/
for(i = 1; i < ARRAY_SIZE(susp_msr); i++)
one_msr_wr(susp_msr + i);
/*
* Then restore IA32_MCG_STATUS to allow core MCAs
*/
one_msr_wr(susp_msr + 0);
#if RAS_SAVE_CPU_MSR
/*
* Restore all per-cpu MSRs
*/
for(i = 0; i < ARRAY_SIZE(susp_lcl_msr); i++)
one_msr_wr(susp_lcl_msr + i);
#endif
#endif
atomic_dec(&pm_entry);
}
/*
* Callback from PM notifier chain.
* TBD: should we test for odd state transitions and recursions?
*/
static int
mr_pm_callback(struct notifier_block *nb, unsigned long event, void *msg)
{
switch(event) {
case MICPM_DEVEVENT_SUSPEND:
mr_suspend();
break;
case MICPM_DEVEVENT_RESUME:
mr_resume();
break;
case MICPM_DEVEVENT_FAIL_SUSPEND:
mr_cancel();
break;
default:
/*
* Ignore whatever else is sent this way
*/
break;
}
return 0;
}
/*
**
** The PM module loads before RAS, so we must set up
** the API to support power management, i.e. register
** with it at init time.
** PM needs:
** - Notification when MT changes certain variables.
** Provided by a call-out list that the PM sets
** at registration time.
** - Access to MT calls.
** The PM module can use micras_mt_call() for access.
** Since PM loads first, this function needs to
** be passed at registration time.
** RAS needs:
** - A list of core voltages (for the CVOLT query).
** We pass a pointer to the voltage list and the
** voltage list counter to the PM module, which will
** fill in the actual values (not available until
** the core-freq driver loads).
** - A list of core frequencies (for the CFREQ query).
** Same solution as for CVOLT.
** - Notifications of throttle state changes.
** - Power management notifications for suspend/resume.
**
** Note: can one notifier block be inserted in multiple
** chains? It's assumed not, which requires two blocks,
** both pointing to the same local function.
*/
extern struct mr_rsp_freq freq;
extern struct mr_rsp_volt volt;
struct micpm_params pm_reg; /* Our data for PM */
struct micpm_callbacks pm_cb; /* PM data for us */
extern void micpm_device_register(struct notifier_block *n);
extern void micpm_device_unregister(struct notifier_block *n);
extern void micpm_atomic_notifier_register(struct notifier_block *n);
extern void micpm_atomic_notifier_unregister(struct notifier_block *n);
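/*
* Notifier blocks: one for device suspend/resume events and two for
* throttle events. The PM driver runs both an atomic and a blocking
* throttle chain, and a notifier block is assumed to sit on only one
* chain at a time, hence two blocks sharing the same callback.
*/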
static struct notifier_block ras_deviceevent = {
.notifier_call = mr_pm_callback,
};
static struct notifier_block ras_throttle_event_ns = {
.notifier_call = mr_pm_throttle_callback,
};
static struct notifier_block ras_throttle_event = {
.notifier_call = mr_pm_throttle_callback,
};
/*
* Set up PM callbacks and SCIF handler.
*/
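/*
* MT access wrapper handed to the PM module at registration time.
* It brackets micras_mt_call() with the pm_entry counter so that
* module unload can wait for in-flight PM calls to drain.
*/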
static int
pm_mt_call(uint16_t cmd, void * buf)
{
int err;
atomic_inc(&pm_entry);
err = micras_mt_call(cmd, buf);
atomic_dec(&pm_entry);
return err;
}
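/*
* Module init for the PM interface: replicate the per-CPU MSR save
* table (if enabled), read Tcrit from the SMC, register the voltage
* and frequency lists plus MT accessors with the MIC PM driver, and
* hook into its throttle and (on KnC C-step) device notifier chains.
*/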
int __init
pm_init(void)
{
extern int mr_smc_rd(uint8_t, uint32_t *);
#if RAS_SAVE_CPU_MSR
/*
* Preset MCA bank MSR register descriptions
*
*TBD: We have to use IPIs to read MSRs, which will wake
* up cores that are asleep when this function is called.
* The PM module may not like this at all.
*/
int i, j;
for(i = 1; i < nr_cpu_ids; i++) {
j = 4 * i;
susp_lcl_msr[j] = susp_lcl_msr[0];
susp_lcl_msr[j + 1] = susp_lcl_msr[1];
susp_lcl_msr[j + 2] = susp_lcl_msr[2];
susp_lcl_msr[j + 3] = susp_lcl_msr[3];
susp_lcl_msr[j].num = i;
susp_lcl_msr[j + 1].num = i;
susp_lcl_msr[j + 2].num = i;
susp_lcl_msr[j + 3].num = i;
}
#endif
/*
* Get temperature where power throttle becomes thermal throttle
*/
mr_smc_rd(0x4c, &ttl_tcrit);
/*
* Register with the MIC Power Management driver.
*/
pm_reg.volt_lst = volt.supt;
pm_reg.volt_len = &volt.slen;
pm_reg.volt_siz = ARRAY_SIZE(volt.supt);
pm_reg.freq_lst = freq.supt;
pm_reg.freq_len = &freq.slen;
pm_reg.freq_siz = ARRAY_SIZE(freq.supt);
pm_reg.mt_call = pm_mt_call;
pm_reg.mt_ttl = mr_throttle;
if (micpm_ras_register(&pm_cb, &pm_reg))
goto fail_pm;
/*
* Get into the PM notifier lists.
* MicPm reports events on two chains, one atomic and one
* blocking. Our callback will not block!
*/
micpm_atomic_notifier_register(&ras_throttle_event_ns);
micpm_notifier_register(&ras_throttle_event);
if (boot_cpu_data.x86_mask == KNC_C_STEP)
micpm_device_register(&ras_deviceevent);
printk("RAS.pm: init complete\n");
return 0;
fail_pm:
printk("RAS.pm: init failed\n");
return 1;
}
/*
* Cleanup for module unload.
* Unhook from the PM notifier chains and de-register with the PM module.
*/
void __exit
pm_exit(void)
{
/*
* Get off the PM notifier list
*/
micpm_atomic_notifier_unregister(&ras_throttle_event_ns);
micpm_notifier_unregister(&ras_throttle_event);
if (boot_cpu_data.x86_mask == KNC_C_STEP)
micpm_device_unregister(&ras_deviceevent);
/*
* De-register with the PM module.
*/
micpm_ras_unregister();
/*
* Wait for any calls into this module to finish.
*/
while(atomic_read(&pm_entry))
cpu_relax();
printk("RAS.pm: exit complete\n");
}
#endif /* USE_PM */