/*
* Copyright 2010-2017 Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License, version 2,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Disclaimer: The codes contained in these modules may be specific to
* the Intel Software Development Platform codenamed Knights Ferry,
* and the Intel product codenamed Knights Corner, and are not backward
* compatible with other Intel products. Additionally, Intel will NOT
* support the codes or instruction set in future products.
*
* Intel offers no warranty of any kind regarding the code. This code is
* licensed on an "AS IS" basis and Intel is not obligated to provide
* any support, assistance, installation, training, or other services
* of any kind. Intel is also not obligated to provide any updates,
* enhancements or extensions. Intel specifically disclaims any warranty
* of merchantability, non-infringement, fitness for any particular
* purpose, and any other warranty.
*
* Further, Intel disclaims all liability of any kind, including but
* not limited to liability for infringement of any proprietary rights,
* relating to the use of the code, even if Intel is notified of the
* possibility of such liability. Except as expressly stated in an Intel
* license agreement provided with this code and agreed upon with Intel,
* no license, express or implied, by estoppel or otherwise, to any
* intellectual property rights is granted herein.
*/
/*
* RAS handler for core MC events
*
 * Contains code to intercept MC events and collect information
 * from core MCA banks on the originating core and, if necessary,
 * on all active cores.
*
 * In case of a severe event, defined by corrupted context,
 * the handler will add a record of the event in the designated
 * EEPROM hanging off the Over-Clocking I2C bus. Next, a message
 * is sent to the SMC (enabling IPMI notifications), and finally
 * a message is sent to the host via the MC SCIF connection
 * (if an MC SCIF session has been established).
*
 * Lesser events will also be sent to the host on an 'FYI' basis,
* but no record will be stored in the event log, nor will the
* SMC be notified.
*
 * Special cases of high-rate correctable errors may also cause
* events to be recorded in EEPROM on the assumption that the
* root cause will be detectable from maintenance mode.
*
 * The handler cannot expect any support from the OS while in
 * exception (NMI) context. Therefore, NMI-safe routines have
 * been added to mimic some kernel services, e.g. ee_print().
*/
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/io.h>
#include <linux/cpumask.h>
#include <asm/mce.h>
#include <asm/apic.h>
#include "micras.h"
/*
**
** Brief design notes:
** There are two ways this code normally will be entered.
**
** 1) From standard interrupt context (bottom-half).
**     This supports MC events picked up by
**     machine_check_poll(), i.e. events that aren't
**     causing state corruption (UC bit not set).
**
** 2) From exception/NMI context.
** This handles errors that _did_ flag processor
** state corruption (UC bit set, or other condition
** causing the kernel exception handler to pick it up).
**
** Both cases can happen simultaneously on different CPUs,
** which requires careful consideration of re-entrant code
** behaviour here. Particularly nasty is exception context, where
** normal spinlocks won't work (FYI: x86 spinlocks assume that
** disabling interrupts can protect a critical region, an
** assumption that is false when an exception/NMI occurs).
**
** Standard interrupt context entries occur when non-fatal and
** thus non-critical MC events are handled. In most cases this
** just results in a regular SCIF send of McInfo structs to the
** host. Note that the call chain originates in a callout from
** the timer thread, not from an interrupt service routine, so
** calling it standard interrupt context is somewhat misleading.
**
** Exception context messages are usually fatal and must be
** dealt with immediately, because otherwise the generic machine
** check handler may panic() the system when exiting the exception
** handler (default behavior, may be tweaked by altering 'threshold').
**
** In order to proceed we can either implement a locking mechanism
** at every API function entry, or we can let every function do its
** own thing independently. The latter is preferred, though it gets
** somewhat complicated because the API between the generic MC
** handling and the RAS module is in fact composed of several calls.
**
** If state between API calls needs to be tracked, then that can be
** done by means of pre-allocated arrays, similar to the generic
** handling in the Linux kernel. Currently the only state variable
** is the mask of CPUs that have been sent an IPI.
**
** Core MC events can be simulated by using the 'mce-inject' tool,
** consisting of a kernel module and a text mode application program.
** The 'mce-inject' module knows the difference between fatal and
** non-fatal events (defined by the UC bit) and acts differently
** in the two cases. Non-fatal injections cause machine_check_poll()
** to be called on all CPUs, resulting in events being reported to
** function mce_poll(). Fatal injections cause do_machine_check()
** to be called on all CPUs, resulting in calls to the mcc_exc_*
** routines below. Activities triggered by mce-inject are flagged
** as 'fake', and shall _NOT_ be logged in the EEPROM.
**
** Warning:
** Controls in the generic MC handling may cause the kernel to
** panic _even_ if no event was found in any MCA bank!
** It is not clear exactly how to capture that sort of event.
**
** Warning:
** The 'mce-inject' module uses different methods of invoking error
** handling routines, depending on the mce record (inject_flags).
** Specifically, the 'mce-inject' module may make use of broadcast
** NMIs to invoke machine_check_poll() or do_machine_check() on all
** CPUs, which will make these functions execute in exception context.
** The NMI broadcast mechanism is based on registering a handler on
** the 'die' notifier chain and then doing an
**   apic->send_IPI_mask(.., NMI_VECTOR),
** knowing that do_nmi() will invoke this notifier chain when no
** genuine cause of the NMI was found (i.e. if inb(61) returns 0xc0,
** [which is SERR + IOCHK on chipset register NSR]).
** Long story short: if 'mce-inject' is used we cannot expect that
** polling is done in standard interrupt context, and need to set
** the 'in exception context' flag for SCIF access.
**
*/
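/*
 * For illustration only: a hypothetical 'mce-inject' input file in the
 * style of the mce-test suite (the exact keywords should be verified
 * against the installed tool) that would surface here as a non-fatal
 * 'fake' event:
 *
 *	CPU 0 BANK 1
 *	STATUS VAL EN 0x10a
 *	ADDR 0x1234
 *
 * Adding UC to the STATUS line would instead route the injection through
 * do_machine_check() and the mcc_exc_* hooks below.
 */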
/*
* Hooks placed in the native machine check handler
* See file arch/x86/kernel/cpu/mcheck/mce.c for placement.
*
* poll After entering a non-UC event into mce_log.
* This happens in normal thread context, which
 *		means that kernel services are available.
* exc_flt Filter on correctable errors. If events occur
* at a very high rate they can severely slow
* down the system and/or crash it entirely.
* Logic here will disable reporting of some
* events if they are seen too often.
* exc_entry Entering MC exception handler.
* Called _after_ reading MCG_STATUS and the early
 *		severity assessment by mce_severity() has been
* performed on all banks, such that we get to
* know if the native MC handler will panic.
* exc_log After entering a UC event into mce_log.
* The logged mce record has all available
* details on the event, and this point is the
* best place to perform our RAS activities.
* exc_panic Right before the MC exception handler calls
* the panic function.
* exc_exit Exit the MC exception handler
* print Exception context safe printf to POST-card UART
*/
extern void (*mca_poll)(struct mce *, uint64_t, int);
extern void (*mca_exc_flt)(struct mce *, uint64_t, int);
extern void (*mca_exc_entry)(struct mce *, int, int, int, char *);
extern void (*mca_exc_log)(struct mce *, uint64_t, int, int, char *, int, int);
extern void (*mca_exc_panic)(struct mce *, char *, char *, int);
extern void (*mca_exc_exit)(struct mce *, int, int, int, int);
extern int (*mca_print)(char *, ...);
extern struct mce_log mcelog; /* Export from kernel */
extern struct mutex mce_read_mutex; /* Export from kernel */
static unsigned mcc_seen; /* Last event in kernel log */
int in_sync; /* Flag when sync'ing */
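/*
 * For orientation, a minimal sketch (never compiled) of how the patched
 * kernel MCA handler is assumed to invoke the hooks above. The real call
 * sites and argument values are defined by the patches applied to
 * arch/x86/kernel/cpu/mcheck/mce.c, not by this module; the '_sketch'
 * function names are illustration only (mca_exc_flt and mca_exc_panic
 * hook in similarly).
 */
#if 0
static void machine_check_poll_sketch(struct mce * m, uint64_t ctl, int fake)
{
	/* After a non-UC event has been entered into mce_log */
	if (mca_poll)
		mca_poll(m, ctl, fake);
}

static void do_machine_check_sketch(struct mce * m, int fake)
{
	int no_way_out = 0, severity = 0, worst = 0, order = -1;
	char * msg = NULL;

	/* After reading MCG_STATUS and early mce_severity() grading */
	if (mca_exc_entry)
		mca_exc_entry(m, fake, no_way_out, 0, msg);

	/* ... per-bank scan; after a UC event is entered into mce_log ... */
	if (mca_exc_log)
		mca_exc_log(m, 0, fake, no_way_out, msg, severity, worst);

	/* ... after mce_end()/mce_reign() have run ... */
	if (mca_exc_exit)
		mca_exc_exit(m, no_way_out, worst, 0, order);
}
#endif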
/*
* Convert a kernel mce record into a MC API format
*/
static void
mcc_conv(struct mce * mce, struct mce_info * mc)
{
mc->org = mce->bank;
mc->id = mce->extcpu;
#ifdef CONFIG_MK1OM
mc->pid = xlat_cpu[cpu_data(mc->id).apicid];
#endif
mc->stamp = mce->time;
mc->status = mce->status;
mc->addr = mce->addr;
mc->misc = mce->misc;
mc->flags = (mc->status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0;
}
/*
* Filter for correctable errors, may modify CTL value.
* The filter is pretty crude, we just want to protect
* ourselves from being run over by fast recurring events.
* We keep tabs of events seen in a static array.
*
* Algorithm is like this:
* - test if event is in filter list; if not exit filter.
* - search for instance of this event in history.
* - if not found, insert event in history (strike 1).
* - if found but time since last seen exceeds window,
* then treat event as new in history (new strike 1).
* - if found and within time window, bump strike counter.
* - if strike counter reach maximum, we're fed up and
* turn this event off by clearing the associated
* bit in the offending MCA bank's CTL register and
* send a 'filter' event notification to the host.
*
 * Advantages of this design are:
* - individual parameters for every filtered event.
* - only one event history array.
* - no periodic aging of events in history array.
* - no averaging over time required.
* - no moving/reordering of event history entries.
 *  - new events do not replace previously seen events.
* - filter reacts immediately when max reached.
*
* Disadvantages are:
* - linear search through filter array.
* - linear search through history array.
* - time parameter not obvious, it's really a limit
* on how old events in history are allowed to be.
* - in pathological cases the filter's reaction time
* will be max * window (when events trickle in at
* a rate just below the window size).
* - data in ADDR and MISC registers are not used to
* match current event with history. Should they be?
*
 * For now, both lists are short enough that introducing
 * more advanced searches is probably not going to help.
*
* On KnC the flash may have overrides of the mc_turnoff table.
*/
#define FT	(((17 * 60) + 30) * 60)		/* Default time window: 17.5 hours, in seconds */
static struct mc_hist {
uint32_t count; /* How many times seen */
uint64_t last; /* TSC last time seen */
struct mce_info mc; /* Local MC event record */
} mc_history[32];
static struct mc_disc {
uint8_t bank, ctl; /* Bank selector and control bit # */
uint16_t win; /* Time window (seconds) */
uint16_t max; /* Max count */
uint16_t mca_code; /* MCA code, status[15:0] */
uint16_t mdl_code; /* Model code, status[31:16] */
} mc_turnoff[] = {
{ 0, 3, FT, 2, 0x0150, 0x0000 }, /* MC0: J-Cache error */
{ 1, 0, FT, 2, 0x010a, 0x0001 }, /* MC1: L2 Tag error */
{ 1, 4, FT, 2, 0x010a, 0x0010 }, /* MC1: L2 Data error */
{ 2, 2, FT, 2, 0x010d, 0x0100 }, /* MC2: Tag State, ext TD */
{ 2, 2, FT, 2, 0x010d, 0x0101 }, /* MC2: Tag State, int TD */
{ 2, 3, FT, 2, 0x012d, 0x0110 }, /* MC2: Core Valid, ext TD */
{ 2, 3, FT, 2, 0x012d, 0x0111 }, /* MC2: Core Valid, int TD */
{ 3, 2, FT, 2, 0x010d, 0x0100 }, /* DBOX: Tag State error, ext TD */
{ 3, 2, FT, 2, 0x010d, 0x0101 }, /* DBOX: Tag State error, int TD */
{ 3, 3, FT, 2, 0x012d, 0x0110 }, /* DBOX: Core Valid error, ext TD */
{ 3, 3, FT, 2, 0x012d, 0x0111 }, /* DBOX: Core Valid error, int TD */
{ 4, 4, FT, 2, 0x0e0b, 0x0030 }, /* SBOX: PCI-e */
{ 5, 0, FT, 2, 0x0001, 0x0000 }, /* GBOX: Ch-0 retraining */
{ 5, 1, FT, 2, 0x0001, 0x0001 }, /* GBOX: Ch-1 retraining */
{ 5, 2, FT, 2, 0x0001, 0x0002 }, /* GBOX: Ch-0 ECC error */
{ 5, 3, FT, 2, 0x0001, 0x0003 }, /* GBOX: Ch-1 ECC error */
{ 6, 3, FT, 2, 0x010e, 0x0008 }, /* TBOX: T2 CRC error */
};
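/*
 * Example reading of the table above (illustration): an "MC1: L2 Tag
 * error" event arrives with GET_BITS(15,0,status) == 0x010a and
 * GET_BITS(31,16,status) == 0x0001, matching row { 1, 0, FT, 2, ... }.
 * Once 'max' (2) such events are seen from the same source within the
 * 'win' (FT seconds) window, micras_mc_filter() below returns (1 << 0),
 * and the caller clears that bit in the offending bank's CTL register.
 */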
#ifdef CONFIG_MK1OM
#define MC_FLT_SIG1 0x0e13c20f /* Start signature */
#define MC_FLT_SIG2 0xf1ec3df0 /* End signature */
#define MC_FLT_SIZE 0x200 /* Filter block length */
void
mcc_flt_parm(uint8_t * p)
{
uint16_t fnum;
/*
* Check signatures
*/
if (*((uint32_t *) p) != MC_FLT_SIG1 ||
*((uint32_t *)(p + MC_FLT_SIZE - 4)) != MC_FLT_SIG2) {
printk("mcc_flt_parm: signatures not found, (%08x, %08x)\n",
*((uint32_t *) p), *((uint32_t *)(p + MC_FLT_SIZE - 4)));
return;
}
/*
* After start signature comes filter count (uint16_t)
* followed by 'count' filter descriptors (struct mc_disc).
*/
fnum = *(uint16_t *)(p + 4);
if (fnum > ARRAY_SIZE(mc_turnoff) ||
fnum * sizeof(struct mc_disc) + 10 > MC_FLT_SIZE) {
printk("mcc_flt_parm: filter count %d not valid\n", fnum);
return;
}
/*
* Seems the table is legit, copy it over defaults.
*/
memset(mc_turnoff, '\0', sizeof(mc_turnoff));
memcpy(mc_turnoff, p + 6, fnum * sizeof(struct mc_disc));
#if MC_VERBOSE
{
int i;
for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) {
printk("Filter %2d: bank %d, ctl %d, win %d, max %d, mca %04x, mdl %04x\n",
i, mc_turnoff[i].bank, mc_turnoff[i].ctl, mc_turnoff[i].win,
mc_turnoff[i].max, mc_turnoff[i].mca_code, mc_turnoff[i].mdl_code);
}
}
#endif
}
#endif
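/*
 * For reference, the flash override block parsed above has this layout
 * (offsets relative to 'p'; inferred from the checks in mcc_flt_parm(),
 * not from a formal flash specification; assumes the natural 2-byte
 * alignment of struct mc_disc, i.e. sizeof(struct mc_disc) == 10):
 *
 *	+0x000	uint32_t	MC_FLT_SIG1 (0x0e13c20f)
 *	+0x004	uint16_t	filter count 'fnum'
 *	+0x006	struct mc_disc[fnum], 10 bytes each
 *	 ...	unused space up to the end signature
 *	+0x1fc	uint32_t	MC_FLT_SIG2 (0xf1ec3df0)
 */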
/*
* Frequency filter for core and un-core MC events
*/
uint32_t
micras_mc_filter(struct mce_info * mc, uint64_t tsc, int exc)
{
struct mc_disc * dsc;
struct mc_hist * hst;
uint64_t ostamp;
int i, oldest;
if (mc->status & MCI_STATUS_UC)
return 0;
/*
* Check if this event may be filtered
*/
dsc = mc_turnoff;
for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) {
if (dsc->bank == mc->org &&
dsc->mca_code == GET_BITS(15, 0, mc->status) &&
dsc->mdl_code == GET_BITS(31, 16, mc->status))
break;
dsc++;
}
if (i == ARRAY_SIZE(mc_turnoff))
return 0;
/*
* Have a candidate for filter.
* Have we seen this one before?
*/
oldest = 0;
ostamp = tsc;
hst = mc_history;
for(i = 0; i < ARRAY_SIZE(mc_history); i++) {
/*
* While scanning, find the oldest event too
*/
if (hst->last < ostamp) {
ostamp = hst->last;
oldest = i;
}
/*
* Does this match event in filter history?
* TBD: how much needs to match?
* For now: cpu (or box), bank, mca_code and model_code.
*/
if (hst->last &&
hst->mc.id == mc->id &&
hst->mc.org == mc->org &&
GET_BITS(15, 0, hst->mc.status) == GET_BITS(15, 0, mc->status) &&
GET_BITS(31, 16, hst->mc.status) == GET_BITS(31, 16, mc->status))
break;
hst++;
}
if (i == ARRAY_SIZE(mc_history)) {
/*
* Not seen this event before.
* 'oldest' is where to store this event.
*/
hst = mc_history + oldest;
hst->count = 1;
hst->last = tsc;
hst->mc = *mc;
return 0;
}
/*
* Already 'on file in history', test expiration date
*/
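	/* dsc->win is in seconds; cpu_khz * 1000LL gives TSC ticks per second */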
if (hst->last + dsc->win * (cpu_khz * 1000LL) < tsc) {
/*
* Matching history element had expired, just overwrite it
*/
hst->count = 1;
hst->last = tsc;
hst->mc = *mc;
return 0;
}
/*
* Filter element active, bump count and set last seen.
* We do _NOT_ want injected events to enter the EEPROM,
* so that flag is preserved over all event history
*/
hst->count++;
if (mc->flags & MC_FLG_FALSE)
hst->mc.flags |= MC_FLG_FALSE;
if (hst->count < dsc->max) {
hst->last = tsc;
return 0;
}
/*
* Threshold reached, event source needs to be silenced.
* Store a record of this in the EEPROM and send a
* notification to host about it. Once duly reported, clear
* event from the filter; it is not expected to show up again.
* Note: we report the _first_ event seen, not the
* event at hand. We could save array space
* by sending latest event (less info to keep).
*/
ee_printk("RAS: MCE filter #%d: bank %d, bit %d, limit %d, delta %d (mS)\n",
dsc - mc_turnoff, dsc->bank, dsc->ctl, dsc->max, (tsc - hst->last) / cpu_khz);
hst->mc.flags |= MC_FLG_FILTER;
#ifdef CONFIG_MK1OM
if (!(hst->mc.flags & MC_FLG_FALSE)) {
micras_mc_log(&hst->mc);
hst->mc.flags |= MC_FLG_LOG;
}
#endif
micras_mc_send(&hst->mc, exc);
hst->last = 0;
/*
* MC events are disabled by caller when a
* non-zero mask is returned by this routine.
*/
return (1 << dsc->ctl);
}
/*
* Remove/mask an 'enable-bit' from a core MCA bank.
 * Note: This applies to the _current_ cpu only. It is not explicitly
 *	 linked to the cpu that was ID'd in the incoming mce struct.
 *	 This happens to be OK for mcc_exc_flt(), mcc_poll() and mcc_exc_log().
*/
static void
mcc_ctl_mask(int bank, uint32_t msk)
{
uint32_t ctl_lo, ctl_hi;
rdmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi);
ctl_lo &= ~msk;
wrmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi);
#if MC_VERBOSE
ee_printk("RAS: ctl mask CPU %d, MC%d_CTL -> %x\n", smp_processor_id(), bank, ctl_lo);
#endif
}
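/*
 * Example (illustration): if micras_mc_filter() trips on the "MC1: L2
 * Tag error" row of mc_turnoff[], it returns (1 << 0); passing that
 * mask here with bank 1 clears bit 0 of the IA32_MC1_CTL MSR on the
 * current CPU, silencing that event source until CTL is rewritten.
 */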
/*
* Filtering of correctable core MC events
* Called from the exception handler.
*/
static void
mcc_exc_flt(struct mce * mce, uint64_t ctl, int fake)
{
struct mce_info mc;
uint32_t msk;
if (!mce)
return;
if (mce->status & MCI_STATUS_UC)
return;
mcc_conv(mce, &mc);
mc.ctl = ctl;
mc.flags = fake ? MC_FLG_FALSE : 0;
msk = micras_mc_filter(&mc, mce->tsc, 1);
if (msk)
mcc_ctl_mask(mce->bank, msk);
}
/*
 * The only action required for polled MC events is to
 * pass the event on to the SCIF channel (if connected).
 * The event should already have caused an exception (the
 * exception handler chooses to ignore corrected errors),
 * which means it has already been filtered.
 * Injected corrected events do not cause MCE exceptions
 * and thus escape filtering, so we'll filter them here.
*/
static void
mcc_poll(struct mce * mce, uint64_t ctl, int fake)
{
struct mce_info mc;
#if MC_VERBOSE
ee_printk("RAS: poll %d, fake %d, status %llx\n", mce->extcpu, fake, mce->status);
#endif
mcc_conv(mce, &mc);
mc.ctl = ctl;
mc.flags = fake ? MC_FLG_FALSE : 0;
#if BEAM_TEST
/*
* Under beam test we only want to send the SCIF message
*/
micras_mc_send(&mc, fake);
return;
#endif
if (micras_mc_send(&mc, fake))
mcc_seen = mcelog.next;
/*
	 * According to the MCA HAS, MCI_STATUS_VAL will only
	 * be set when an event's enable bit is set, in which
	 * case it is difficult to imagine how events without
	 * MCI_STATUS_EN set can appear here. The second clause
	 * of the test below may never actually happen on Kn{F,C}.
	 * Note: MC polling does not capture TSCs.
*/
if (fake || !(mc.status & MCI_STATUS_EN)) {
uint32_t msk;
msk = micras_mc_filter(&mc, rdtsc(), fake);
if (msk)
mcc_ctl_mask(mce->bank, msk);
}
}
/*
* One CPU entered do_machine_check().
* We get the initial mce record (which has cpu ID), early
* control variables and whether the event is injected.
*
 * Since KnF and KnC deviate from standard IA behavior by not
 * broadcasting core MC events to all CPUs, we'll try to fake
 * standard behavior in order to keep the generic machine check
 * code intact.
 * Therefore, if the event is real (fake flag unset) and this
 * CPU is the first to see it (mcc_exc_mask is empty), then
 * send an IPI to all other CPUs listed in the online cpumask
 * for vector #18. Later CPUs will see themselves marked in
 * mcc_exc_mask and return quickly.
*/
struct cpumask mcc_exc_mask; /* CPU's in mce ctx */
static atomic_t ipi_lock = ATOMIC_INIT(0); /* Lock on exc mask */
static void
mcc_exc_entry(struct mce * mce, int fake, int no_way_out, int entry, char * msg)
{
unsigned int cpu;
/*
*TBD: should we use 'extcpu' from the MCE record instead?
*/
cpu = smp_processor_id();
/*
	 * Injected events invoke all CPUs automatically
	 * by hooking into the NMI notify_die call chain.
* Nothing to do here.
*/
if (fake)
return;
#if 1
/*
* Avoid the IPI corralling circus on corrected errors,
* based on assessment entirely done by mce_severity().
* If the result (no_way_out) is MCE_NO_SEVERITY (=0), then
* at worst we may have a correctable error, and that does
* not warrant the system lockdown managed by mce_start()
* and mce_end().
* Note that MICs do not support newer status bits (MCG_SER_P)
	 * which causes the variable mce_ser to always be zero, and thus
* the test in the inner loop of do_machine_check() will be
* reduced to just testing for the UC bit.
*/
if (! no_way_out)
return;
#endif
/*
* Test for entry from MT thread IPIs (testing)
	 * or a 'soft' exception from an IPI issued from
	 * the handler of the first exception.
	 * No further action is needed in either case.
*/
if (cpumask_test_cpu(cpu, &mcc_exc_mask))
return;
/*
* Create mcc_exc_mask to flag which CPU's are
* to be included in the IPI. This mask is later
* used to determine who needs to EOI the local
* APIC after MC event handling.
*/
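	/*
	 * NMI-safe 'lock': atomic_xchg() on ipi_lock is used instead of a
	 * spinlock, which (per the design notes at the top of this file)
	 * cannot be trusted in exception context.
	 */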
while(atomic_xchg(&ipi_lock, 1))
cpu_relax();
smp_rmb();
if (cpumask_test_cpu(cpu, &mcc_exc_mask)) {
/*
* Another CPU got here first
*/
atomic_xchg(&ipi_lock, 0);
return;
}
cpumask_copy(&mcc_exc_mask, cpu_online_mask);
cpumask_clear_cpu(cpu, &mcc_exc_mask);
smp_wmb();
atomic_xchg(&ipi_lock, 0);
/*
	 * Simulate a broadcast by sending an IPI to all
* other CPUs.
*/
// apic->send_IPI_mask(&mcc_exc_mask, MCE_VECTOR);
apic->send_IPI_allbutself(MCE_VECTOR);
}
/*
* In do_machine_check() bank scan loop.
* Called from a lockdown, no synchronization needed.
* MC bank scan is complete and the mce event has been
 * entered into the kernel MC log.
*
*TBD: revise logic on HALT on UC events?
* From a state corruption point of view this
* _is_ a fatal error because UC bit was set.
* However, if the tolerance setting is set
* high enough, the generic MC handler may
 *	not choose to panic on this event.
* We currently do not have the tolerance value
* when recording this event, nor do we have
 *	other factors that mce_reign() uses to determine
 *	what to do after reporting the event to the host.
*/
static void
mcc_exc_log(struct mce * mce, uint64_t ctl, int fake,
int no_way_out, char * msg, int severity, int worst)
{
struct mce_info mc;
uint32_t msk;
#if MC_VERBOSE
ee_printk("RAS: log %d, wall %lld, nwo %d (%s), sev %d, wst %d\n",
mce->extcpu, mce->time, no_way_out, msg, severity, worst);
#endif
/*
* Create a message for the host.
*/
mcc_conv(mce, &mc);
mc.ctl = ctl;
mc.flags |= fake ? MC_FLG_FALSE : 0;
#if BEAM_TEST
/*
* Under beam test we only want to send the SCIF message
* This is guaranteed not to be called re-entrantly.
*/
micras_mc_send(&mc, 1);
return;
#endif
#ifdef CONFIG_MK1OM
/*
* If this is a true event then log it in the EEPROM and
* notify SMC that we've had a serious machine check error.
*/
if ((mc.flags & (MC_FLG_FALSE | MC_FLG_FATAL)) == MC_FLG_FATAL) {
micras_mc_log(&mc);
mc.flags |= MC_FLG_LOG;
/*
*TBD: Should this be deferred until the actual panic?
* The user can raise tolerance such that we in
* fact continue operating; in which case the SMC
* notification would be (somewhat) misleading.
*/
micras_mc_ipmi(&mc, 1);
}
#endif
/*
* Always notify host and sync to kernel log
*/
if (micras_mc_send(&mc, 1))
mcc_seen = mcelog.next;
#if RAS_HALT
if ((mc.flags & MC_FLG_FATAL) && !fake)
panic("FATAL core machine check event:\n"
"bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n",
mc.org, mc.id, mc.ctl, mc.status, mc.addr, mc.misc);
#endif
/*
* Correctable events can in fact reach us here if
* mce_no_way_out() tags them as critical (for other
* reasons than the UC flag, e.g. MCIP missing).
* If the tolerance setting is high enough to prevent
* such events to panic, we'd still want filtering.
*/
msk = micras_mc_filter(&mc, mce->tsc, 1);
if (msk)
mcc_ctl_mask(mce->bank, msk);
}
/*
* In mce_panic().
* Current event is about to make the kernel panic.
 * Sources of this call are:
 *    do_machine_check(),  when no_way_out is set
 *    mce_timed_out(),     when the CPU rendezvous failed
 *    mce_reign(),         when severity is high, a CPU hung, or no events
*/
static void
mcc_exc_panic(struct mce * mce, char * msg, char * exp, int fake)
{
/*
	 * Should the host be notified in this case?
	 * And if so, how should it be presented? We might not
	 * even have an mce record to show when this happens!
	 * If an mce is passed, it has already been seen and
	 * reported to the host by a call to mcc_exc_log().
	 * If mce is NULL, then this _is_ an MC related panic,
	 * but we have no data fitting for a host notification.
* Create a pseudo event and ship that?
*/
ee_printk("RAS: panic %d, wall %lld, msg %s, exp %s, fake %d\n",
mce->extcpu, mce->time, msg, exp, fake);
}
/*
* A CPU is leaving do_machine_check().
* We get this after the monarch has 'reigned' and
* the response to the event has been completed.
*/
static void
mcc_exc_exit(struct mce * mce, int no_way_out, int worst, int entry, int order)
{
unsigned int cpu;
int eoi;
cpu = smp_processor_id();
/*
* Assuming test_and_clear_bit() is atomic.
*/
smp_rmb();
eoi = cpumask_test_and_clear_cpu(cpu, &mcc_exc_mask);
smp_wmb();
if (eoi)
ack_APIC_irq();
}
/*
* Routine to scan the kernel's MC log.
 * Called when the SCIF MC session has been created, to bring the host
 * side up to date with prior unreported MC events, such as events
 * occurring while the MC session was not active (no peer listening
 * on the host) and events occurring before the RAS module was loaded.
*
* Notes:
* - This is always called in thread context.
* - There are no injection flags in the kernel
* MC log, i.e. no guarantee events are genuine.
* - The MC kernel log has been exported explicitly for this.
*
* On synchronization (or the lack thereof):
* Effectively the mcelog holds a static array of mce's where the
* 'finished' flag says whether mce content is valid or not. The
* 'next' field is the index of the first element in the array that
* has not been assigned for an MC event. It is incremented when a
* new event is entered, and reset to zero on reads to /dev/mcelog.
* The kernel's event log does not wrap, so it is safe to use it as
* an indicator of how many events (finished or not) are in it.
* The mcelog's next field is protected by RCU style mechanisms
* in the kernel MCA handler (see arch/x86/kernel/cpu/mcheck/mce.c).
 * For obvious reasons it is not genuine RCU, e.g. access to 'next'
 * isn't within an rcu_read_lock()/rcu_read_unlock() pair, just a
 * clever masking use of a lock in an RCU macro definition.
* There is no RCU moving data around, the mce array does not move,
* and the 'finished' flag is set after a wmb() on the mce contents
* which means this routine will not clash with the MCE handler.
* Collisions with memset() on reads from /dev/mcelog are prevented
* by locking of mce_read_mutex.
*/
void
mcc_sync(void)
{
struct mce_info mc;
unsigned seen;
if (mce_disabled)
return;
#if 0
/*
* Can't do this until bootstrap scrubs MC banks on all cards.
	 * It has been observed that MCA banks may _not_ be reset on card
	 * reboot, which means events picked up by the kernel before loading
	 * the RAS module may have occurred in a previous uOS run.
* Should be OK post early Jan '12 (flash ver 262, HSD 4115351).
*/
return;
#endif
/*
* Lock out kernel log access through /dev/mcelog
*/
mutex_lock(&mce_read_mutex);
/*
	 * Start over if the log has been cleared
*/
if (mcc_seen > mcelog.next)
mcc_seen = 0;
for(seen = mcc_seen; seen < mcelog.next; seen++) {
/*
* Basic checks. Index, CPU & bank must be reasonable.
*/
if (mcelog.entry[seen].finished) {
if (mcelog.entry[seen].cpu >= NR_CPUS ||
mcelog.entry[seen].bank >= 3) {
printk("mcc_sync: entry %d contains garbage, cpu %d, bank %d\n",
seen, mcelog.entry[seen].cpu, mcelog.entry[seen].bank);
continue;
}
/*
* Have good entry, can be UC, but it is 'old'.
*/
mcc_conv(&mcelog.entry[seen], &mc);
mc.ctl = 0;
#ifdef CONFIG_MK1OM
/*
	 * Log this event in the EEPROM and notify the SMC
	 * that we've had a serious machine check error.
*/
if (mc.flags & MC_FLG_FATAL) {
in_sync = 1;
micras_mc_log(&mc);
in_sync = 0;
mc.flags |= MC_FLG_LOG;
micras_mc_ipmi(&mc, 0);
}
#endif
/*
* Notify host about this too
*/
if (! micras_mc_send(&mc, 0))
break;
}
}
mcc_seen = mcelog.next;
/*
* Done, release lock
*/
mutex_unlock(&mce_read_mutex);
}
/*
 * Setup exception handlers by hooking into the
* kernel's native MCA handler.
*/
int __init
mcc_init(void)
{
if (mce_disabled) {
printk("RAS.core: disabled\n");
}
else {
mca_poll = mcc_poll;
mca_exc_flt = mcc_exc_flt;
mca_exc_entry = mcc_exc_entry;
mca_exc_log = mcc_exc_log;
mca_exc_panic = mcc_exc_panic;
mca_exc_exit = mcc_exc_exit;
mca_print = 0; /* For debug: ee_printk; */
printk("RAS.core: init complete\n");
}
return 0;
}
/*
* Cleanup for module unload.
* Clear/restore hooks in the native MCA handler.
*/
int __exit
mcc_exit(void)
{
mca_poll = 0;
mca_exc_flt = 0;
mca_exc_entry = 0;
mca_exc_log = 0;
mca_exc_panic = 0;
mca_exc_exit = 0;
mca_print = 0;
/*
	 * Links from the kernel's MCE handler are cut;
	 * wait for everybody in the handler to leave.
*/
while(atomic_read(&mce_entry))
cpu_relax();
printk("RAS.core: exit complete\n");
return 0;
}