| 1 | /* |
| 2 | * Copyright 2010-2017 Intel Corporation. |
| 3 | * |
| 4 | * This program is free software; you can redistribute it and/or modify |
| 5 | * it under the terms of the GNU General Public License, version 2, |
| 6 | * as published by the Free Software Foundation. |
| 7 | * |
| 8 | * This program is distributed in the hope that it will be useful, |
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 11 | * General Public License for more details. |
| 12 | * |
| 13 | * Disclaimer: The codes contained in these modules may be specific to |
| 14 | * the Intel Software Development Platform codenamed Knights Ferry, |
| 15 | * and the Intel product codenamed Knights Corner, and are not backward |
| 16 | * compatible with other Intel products. Additionally, Intel will NOT |
| 17 | * support the codes or instruction set in future products. |
| 18 | * |
| 19 | * Intel offers no warranty of any kind regarding the code. This code is |
| 20 | * licensed on an "AS IS" basis and Intel is not obligated to provide |
| 21 | * any support, assistance, installation, training, or other services |
| 22 | * of any kind. Intel is also not obligated to provide any updates, |
| 23 | * enhancements or extensions. Intel specifically disclaims any warranty |
| 24 | * of merchantability, non-infringement, fitness for any particular |
| 25 | * purpose, and any other warranty. |
| 26 | * |
| 27 | * Further, Intel disclaims all liability of any kind, including but |
| 28 | * not limited to liability for infringement of any proprietary rights, |
| 29 | * relating to the use of the code, even if Intel is notified of the |
| 30 | * possibility of such liability. Except as expressly stated in an Intel |
| 31 | * license agreement provided with this code and agreed upon with Intel, |
| 32 | * no license, express or implied, by estoppel or otherwise, to any |
| 33 | * intellectual property rights is granted herein. |
| 34 | */ |
| 35 | |
| 36 | /* |
| 37 | * RAS handler for core MC events |
| 38 | * |
| 39 | * Contains code to intercept MC events, collect information |
| 40 | * from core MCA banks on originating core and possibly on |
| 41 | * all active cores if necessary. |
| 42 | * |
| 43 | * In case of a severe event, defined by corrupted context, |
| 44 | * the handler will add a record of the event in the designated |
| 45 | * EEPROM hanging off the Over-Clocking I2C bus. Next a message |
| 46 | * will be sent to the SMC (enabling IPMI notifications) and at |
| 47 | * last a message is sent to host via the MC SCIF connection |
| 48 | * (if MC SCIF session has been established). |
| 49 | * |
| 50 | * Lesser events will also be sent to the host on a 'FYI' basis, |
| 51 | * but no record will be stored in the event log, nor will the |
| 52 | * SMC be notified. |
| 53 | * |
| 54 | * Special cases of high rate correctable errors may also cause |
| 55 | * events to be recorded in EEPROM on the assumption that the |
| 56 | * root cause will be detectable from maintenance mode. |
| 57 | * |
| 58 | * The handler cannot expect any support from the OS while in |
| 59 | * exception (NMI) context. Therefore, NMI-safe routines has |
| 60 | * been added to mimic some kernel services, e.g. ee_print(). |
| 61 | */ |
| 62 | |
| 63 | #include <linux/types.h> |
| 64 | #include <linux/errno.h> |
| 65 | #include <linux/kernel.h> |
| 66 | #include <linux/mm.h> |
| 67 | #include <linux/mm_types.h> |
| 68 | #include <linux/io.h> |
| 69 | #include <linux/cpumask.h> |
| 70 | #include <asm/mce.h> |
| 71 | #include <asm/apic.h> |
| 72 | #include "micras.h" |
| 73 | |
| 74 | |
| 75 | /* |
| 76 | ** |
| 77 | ** Brief design notes: |
| 78 | ** There are two ways this code normally will be entered. |
| 79 | ** |
| 80 | ** 1) From standard interrupt context (bottom-half). |
| 81 | ** This is supporting MC events picked up by the |
| 82 | ** machine_check_poll(), i.e. events that aren't |
**        causing state corruption (UC bit not set).
| 84 | ** |
| 85 | ** 2) From exception/NMI context. |
| 86 | ** This handles errors that _did_ flag processor |
| 87 | ** state corruption (UC bit set, or other condition |
| 88 | ** causing the kernel exception handler to pick it up). |
| 89 | ** |
| 90 | ** Both cases can happen simultaneously on different CPU's, |
| 91 | ** which require careful considerations about re-entrant code |
| 92 | ** behaviour here. Particularly nasty is exception context where |
| 93 | ** normal spinlocks won't work (FYI: x86 spinlocks assume interrupt |
| 94 | ** disable can protect a critical region, an assumption that is |
| 95 | ** false when an exception/NMI occur). |
| 96 | ** |
| 97 | ** Standard interrupt context entries occur when non-fatal and |
| 98 | ** thus non-critical MC events are handled. In most cases just |
| 99 | ** results in a regular SCIF send of McInfo structs to the host. |
| 100 | ** Note that the call chain origin is a callout from the timer |
| 101 | ** thread, not from an interrupt service routine, so to name |
| 102 | ** it as standard interrupt context is somewhat misleading. |
| 103 | ** |
** Exception context messages are usually fatal and must be
| 105 | ** dealt with immediately, because otherwise the generic machine |
| 106 | ** handler may panic() the system when exiting exception handler |
| 107 | ** (default behavior, may be tweaked by altering 'threshold'). |
| 108 | ** |
| 109 | ** In order to proceed we can either implement a locking mechanism |
| 110 | ** at every API function entry, or we can let every function do it's |
| 111 | ** thing independently. The latter is preferred, though it gets |
| 112 | ** somewhat complicated because the API between the generic MC |
| 113 | ** handling and RAS module is in fact composed of several calls. |
| 114 | ** |
| 115 | ** If state between API calls needs to be tracked then that can be |
| 116 | ** done by means of pre-allocated arrays, similar to the generic |
| 117 | ** handling in the Linux kernel. Currently the only state variable |
| 118 | ** is the mask of CPUs that has been sent an IPI. |
| 119 | ** |
| 120 | ** Core MC events can be simulated by using the 'mce-inject' tool, |
| 121 | ** consisting of a kernel module and a text mode application program. |
| 122 | ** The 'mce-inject' module knows the difference between fatal and |
| 123 | ** non-fatal events (defined by the UC bit) and acts differently |
| 124 | ** in the two cases. Non-fatal injections cause machine_check_poll() |
| 125 | ** to be called on all CPUs, resulting in events being reported to |
| 126 | ** function mce_poll(). Fatal injections cause do_machine_check() |
| 127 | ** to be called on all CPUs, resulting in calls to the mcc_exc_* |
| 128 | ** routines below. Activities triggered by mce-inject are flagged |
| 129 | ** as 'fake', and shall _NOT_ be logged in the EEPROM. |
| 130 | ** |
| 131 | ** Warning: |
| 132 | ** Controls in the generic MC handling may cause the kernel to |
| 133 | ** panic, _ALSO_ even if no event was found in any MCA banks!! |
| 134 | ** Not sure exactly how to capture that sort of event. |
| 135 | ** |
| 136 | ** Warning: |
| 137 | ** The 'mce-inject' module uses different methods of invoking error |
| 138 | ** handling routines, depending on the mce record (inject_flags). |
| 139 | ** Specifically, the 'mce-inject' module may use of broadcast NMIs |
| 140 | ** to invoke machine_check_poll() or do_machine_check() on all CPUs, |
| 141 | ** which will make these functions execute in exception context. |
| 142 | ** The NMI broadcast mechanism is based on registering a handler on |
| 143 | ** the 'die' notifier chain and then doing an |
| 144 | ** apic->send_IPI_mask(.., NMI_VECTOR), |
| 145 | ** knowing that do_nmi() will invoke this notifier chain when no |
| 146 | ** genuine cause of NMI was found (i.e. if inb(61) returns 0xc0, |
| 147 | ** [which is SERR + IOCHK on chipset register NSR]). |
| 148 | ** Long story short; if 'mce-inject' is used we can not expect that |
| 149 | ** polling is done in standard interrupt context, and need to set |
| 150 | ** the 'in exception context' flag for SCIF access. |
| 151 | ** |
| 152 | */ |
| 153 | |
| 154 | |
| 155 | /* |
| 156 | * Hooks placed in the native machine check handler |
| 157 | * See file arch/x86/kernel/cpu/mcheck/mce.c for placement. |
| 158 | * |
| 159 | * poll After entering a non-UC event into mce_log. |
| 160 | * This happens in normal thread context, which |
 *              means that kernel services are available.
| 162 | * exc_flt Filter on correctable errors. If events occur |
| 163 | * at a very high rate they can severely slow |
| 164 | * down the system and/or crash it entirely. |
| 165 | * Logic here will disable reporting of some |
| 166 | * events if they are seen too often. |
| 167 | * exc_entry Entering MC exception handler. |
| 168 | * Called _after_ reading MCG_STATUS and the early |
 *              severity assessment by mce_severity() has been
| 170 | * performed on all banks, such that we get to |
| 171 | * know if the native MC handler will panic. |
| 172 | * exc_log After entering a UC event into mce_log. |
| 173 | * The logged mce record has all available |
| 174 | * details on the event, and this point is the |
| 175 | * best place to perform our RAS activities. |
| 176 | * exc_panic Right before the MC exception handler calls |
| 177 | * the panic function. |
| 178 | * exc_exit Exit the MC exception handler |
| 179 | * print Exception context safe printf to POST-card UART |
| 180 | */ |
| 181 | |
| 182 | extern void (*mca_poll)(struct mce *, uint64_t, int); |
| 183 | extern void (*mca_exc_flt)(struct mce *, uint64_t, int); |
| 184 | extern void (*mca_exc_entry)(struct mce *, int, int, int, char *); |
| 185 | extern void (*mca_exc_log)(struct mce *, uint64_t, int, int, char *, int, int); |
| 186 | extern void (*mca_exc_panic)(struct mce *, char *, char *, int); |
| 187 | extern void (*mca_exc_exit)(struct mce *, int, int, int, int); |
| 188 | extern int (*mca_print)(char *, ...); |
| 189 | |
| 190 | extern struct mce_log mcelog; /* Export from kernel */ |
| 191 | extern struct mutex mce_read_mutex; /* Export from kernel */ |
| 192 | static unsigned mcc_seen; /* Last event in kernel log */ |
| 193 | int in_sync; /* Flag when sync'ing */ |
| 194 | |
| 195 | |
| 196 | /* |
| 197 | * Convert a kernel mce record into a MC API format |
| 198 | */ |
| 199 | |
| 200 | static void |
| 201 | mcc_conv(struct mce * mce, struct mce_info * mc) |
| 202 | { |
| 203 | mc->org = mce->bank; |
| 204 | mc->id = mce->extcpu; |
| 205 | #ifdef CONFIG_MK1OM |
| 206 | mc->pid = xlat_cpu[cpu_data(mc->id).apicid]; |
| 207 | #endif |
| 208 | mc->stamp = mce->time; |
| 209 | mc->status = mce->status; |
| 210 | mc->addr = mce->addr; |
| 211 | mc->misc = mce->misc; |
| 212 | mc->flags = (mc->status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0; |
| 213 | } |
| 214 | |
| 215 | |
| 216 | /* |
| 217 | * Filter for correctable errors, may modify CTL value. |
| 218 | * The filter is pretty crude, we just want to protect |
| 219 | * ourselves from being run over by fast recurring events. |
| 220 | * We keep tabs of events seen in a static array. |
| 221 | * |
| 222 | * Algorithm is like this: |
| 223 | * - test if event is in filter list; if not exit filter. |
| 224 | * - search for instance of this event in history. |
| 225 | * - if not found, insert event in history (strike 1). |
| 226 | * - if found but time since last seen exceeds window, |
| 227 | * then treat event as new in history (new strike 1). |
| 228 | * - if found and within time window, bump strike counter. |
| 229 | * - if strike counter reach maximum, we're fed up and |
| 230 | * turn this event off by clearing the associated |
| 231 | * bit in the offending MCA bank's CTL register and |
| 232 | * send a 'filter' event notification to the host. |
| 233 | * |
| 234 | * Advantages of this design is: |
| 235 | * - individual parameters for every filtered event. |
| 236 | * - only one event history array. |
| 237 | * - no periodic aging of events in history array. |
| 238 | * - no averaging over time required. |
| 239 | * - no moving/reordering of event history entries. |
| 240 | * - new events do not replace older seen event |
| 241 | * - filter reacts immediately when max reached. |
| 242 | * |
| 243 | * Disadvantages are: |
| 244 | * - linear search through filter array. |
| 245 | * - linear search through history array. |
| 246 | * - time parameter not obvious, it's really a limit |
| 247 | * on how old events in history are allowed to be. |
| 248 | * - in pathological cases the filter's reaction time |
| 249 | * will be max * window (when events trickle in at |
| 250 | * a rate just below the window size). |
| 251 | * - data in ADDR and MISC registers are not used to |
| 252 | * match current event with history. Should they be? |
| 253 | * |
| 254 | * For now, both lists are short enough that introducing |
| 255 | * more advanced searches probably are not going to help. |
| 256 | * |
| 257 | * On KnC the flash may have overrides of the mc_turnoff table. |
| 258 | */ |
| 259 | |
| 260 | #define FT ((17 * 60) + 30) * 60 /* Default time window: 17.5 hours */ |
| 261 | |
| 262 | static struct mc_hist { |
| 263 | uint32_t count; /* How many times seen */ |
| 264 | uint64_t last; /* TSC last time seen */ |
| 265 | struct mce_info mc; /* Local MC event record */ |
| 266 | } mc_history[32]; |
| 267 | |
| 268 | static struct mc_disc { |
| 269 | uint8_t bank, ctl; /* Bank selector and control bit # */ |
| 270 | uint16_t win; /* Time window (seconds) */ |
| 271 | uint16_t max; /* Max count */ |
| 272 | uint16_t mca_code; /* MCA code, status[15:0] */ |
| 273 | uint16_t mdl_code; /* Model code, status[31:16] */ |
| 274 | } mc_turnoff[] = { |
| 275 | { 0, 3, FT, 2, 0x0150, 0x0000 }, /* MC0: J-Cache error */ |
| 276 | { 1, 0, FT, 2, 0x010a, 0x0001 }, /* MC1: L2 Tag error */ |
| 277 | { 1, 4, FT, 2, 0x010a, 0x0010 }, /* MC1: L2 Data error */ |
| 278 | { 2, 2, FT, 2, 0x010d, 0x0100 }, /* MC2: Tag State, ext TD */ |
| 279 | { 2, 2, FT, 2, 0x010d, 0x0101 }, /* MC2: Tag State, int TD */ |
| 280 | { 2, 3, FT, 2, 0x012d, 0x0110 }, /* MC2: Core Valid, ext TD */ |
| 281 | { 2, 3, FT, 2, 0x012d, 0x0111 }, /* MC2: Core Valid, int TD */ |
| 282 | { 3, 2, FT, 2, 0x010d, 0x0100 }, /* DBOX: Tag State error, ext TD */ |
| 283 | { 3, 2, FT, 2, 0x010d, 0x0101 }, /* DBOX: Tag State error, int TD */ |
| 284 | { 3, 3, FT, 2, 0x012d, 0x0110 }, /* DBOX: Core Valid error, ext TD */ |
| 285 | { 3, 3, FT, 2, 0x012d, 0x0111 }, /* DBOX: Core Valid error, int TD */ |
| 286 | { 4, 4, FT, 2, 0x0e0b, 0x0030 }, /* SBOX: PCI-e */ |
| 287 | { 5, 0, FT, 2, 0x0001, 0x0000 }, /* GBOX: Ch-0 retraining */ |
| 288 | { 5, 1, FT, 2, 0x0001, 0x0001 }, /* GBOX: Ch-1 retraining */ |
| 289 | { 5, 2, FT, 2, 0x0001, 0x0002 }, /* GBOX: Ch-0 ECC error */ |
| 290 | { 5, 3, FT, 2, 0x0001, 0x0003 }, /* GBOX: Ch-1 ECC error */ |
| 291 | { 6, 3, FT, 2, 0x010e, 0x0008 }, /* TBOX: T2 CRC error */ |
| 292 | }; |
| 293 | |
| 294 | |
| 295 | #ifdef CONFIG_MK1OM |
| 296 | |
| 297 | #define MC_FLT_SIG1 0x0e13c20f /* Start signature */ |
| 298 | #define MC_FLT_SIG2 0xf1ec3df0 /* End signature */ |
| 299 | #define MC_FLT_SIZE 0x200 /* Filter block length */ |
| 300 | |
| 301 | void |
| 302 | mcc_flt_parm(uint8_t * p) |
| 303 | { |
| 304 | uint16_t fnum; |
| 305 | |
| 306 | /* |
| 307 | * Check signatures |
| 308 | */ |
| 309 | if (*((uint32_t *) p) != MC_FLT_SIG1 || |
| 310 | *((uint32_t *)(p + MC_FLT_SIZE - 4)) != MC_FLT_SIG2) { |
| 311 | printk("mcc_flt_parm: signatures not found, (%08x, %08x)\n", |
| 312 | *((uint32_t *) p), *((uint32_t *)(p + MC_FLT_SIZE - 4))); |
| 313 | return; |
| 314 | } |
| 315 | |
| 316 | /* |
| 317 | * After start signature comes filter count (uint16_t) |
| 318 | * followed by 'count' filter descriptors (struct mc_disc). |
| 319 | */ |
| 320 | fnum = *(uint16_t *)(p + 4); |
| 321 | if (fnum > ARRAY_SIZE(mc_turnoff) || |
| 322 | fnum * sizeof(struct mc_disc) + 10 > MC_FLT_SIZE) { |
| 323 | printk("mcc_flt_parm: filter count %d not valid\n", fnum); |
| 324 | return; |
| 325 | } |
| 326 | |
| 327 | /* |
| 328 | * Seems the table is legit, copy it over defaults. |
| 329 | */ |
| 330 | memset(mc_turnoff, '\0', sizeof(mc_turnoff)); |
| 331 | memcpy(mc_turnoff, p + 6, fnum * sizeof(struct mc_disc)); |
| 332 | #if MC_VERBOSE |
| 333 | { |
| 334 | int i; |
| 335 | |
| 336 | for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) { |
| 337 | printk("Filter %2d: bank %d, ctl %d, win %d, max %d, mca %04x, mdl %04x\n", |
| 338 | i, mc_turnoff[i].bank, mc_turnoff[i].ctl, mc_turnoff[i].win, |
| 339 | mc_turnoff[i].max, mc_turnoff[i].mca_code, mc_turnoff[i].mdl_code); |
| 340 | } |
| 341 | } |
| 342 | #endif |
| 343 | } |
| 344 | |
| 345 | #endif |
| 346 | |
| 347 | |
| 348 | /* |
| 349 | * Frequency filter for core and un-core MC events |
| 350 | */ |
| 351 | |
| 352 | uint32_t |
| 353 | micras_mc_filter(struct mce_info * mc, uint64_t tsc, int exc) |
| 354 | { |
| 355 | struct mc_disc * dsc; |
| 356 | struct mc_hist * hst; |
| 357 | uint64_t ostamp; |
| 358 | int i, oldest; |
| 359 | |
| 360 | if (mc->status & MCI_STATUS_UC) |
| 361 | return 0; |
| 362 | |
| 363 | /* |
| 364 | * Check if this event may be filtered |
| 365 | */ |
| 366 | dsc = mc_turnoff; |
| 367 | for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) { |
| 368 | if (dsc->bank == mc->org && |
| 369 | dsc->mca_code == GET_BITS(15, 0, mc->status) && |
| 370 | dsc->mdl_code == GET_BITS(31, 16, mc->status)) |
| 371 | break; |
| 372 | dsc++; |
| 373 | } |
| 374 | if (i == ARRAY_SIZE(mc_turnoff)) |
| 375 | return 0; |
| 376 | |
| 377 | /* |
| 378 | * Have a candidate for filter. |
| 379 | * Have we seen this one before? |
| 380 | */ |
| 381 | oldest = 0; |
| 382 | ostamp = tsc; |
| 383 | hst = mc_history; |
| 384 | for(i = 0; i < ARRAY_SIZE(mc_history); i++) { |
| 385 | /* |
| 386 | * While scanning, find the oldest event too |
| 387 | */ |
| 388 | if (hst->last < ostamp) { |
| 389 | ostamp = hst->last; |
| 390 | oldest = i; |
| 391 | } |
| 392 | |
| 393 | /* |
| 394 | * Does this match event in filter history? |
| 395 | * TBD: how much needs to match? |
| 396 | * For now: cpu (or box), bank, mca_code and model_code. |
| 397 | */ |
| 398 | if (hst->last && |
| 399 | hst->mc.id == mc->id && |
| 400 | hst->mc.org == mc->org && |
| 401 | GET_BITS(15, 0, hst->mc.status) == GET_BITS(15, 0, mc->status) && |
| 402 | GET_BITS(31, 16, hst->mc.status) == GET_BITS(31, 16, mc->status)) |
| 403 | break; |
| 404 | hst++; |
| 405 | } |
| 406 | if (i == ARRAY_SIZE(mc_history)) { |
| 407 | /* |
| 408 | * Not seen this event before. |
| 409 | * 'oldest' is where to store this event. |
| 410 | */ |
| 411 | hst = mc_history + oldest; |
| 412 | hst->count = 1; |
| 413 | hst->last = tsc; |
| 414 | hst->mc = *mc; |
| 415 | return 0; |
| 416 | } |
| 417 | |
| 418 | /* |
| 419 | * Already 'on file in history', test expiration date |
| 420 | */ |
| 421 | if (hst->last + dsc->win * (cpu_khz * 1000LL) < tsc) { |
| 422 | /* |
| 423 | * Matching history element had expired, just overwrite it |
| 424 | */ |
| 425 | hst->count = 1; |
| 426 | hst->last = tsc; |
| 427 | hst->mc = *mc; |
| 428 | return 0; |
| 429 | } |
| 430 | |
| 431 | /* |
| 432 | * Filter element active, bump count and set last seen. |
| 433 | * We do _NOT_ want injected events to enter the EEPROM, |
| 434 | * so that flag is preserved over all event history |
| 435 | */ |
| 436 | hst->count++; |
| 437 | if (mc->flags & MC_FLG_FALSE) |
| 438 | hst->mc.flags |= MC_FLG_FALSE; |
| 439 | if (hst->count < dsc->max) { |
| 440 | hst->last = tsc; |
| 441 | return 0; |
| 442 | } |
| 443 | |
| 444 | /* |
| 445 | * Threshold reached, event source needs to be silenced. |
| 446 | * Store a record of this in the EEPROM and send a |
| 447 | * notification to host about it. Once duly reported, clear |
| 448 | * event from the filter; it is not expected to show up again. |
| 449 | * Note: we report the _first_ event seen, not the |
| 450 | * event at hand. We could save array space |
| 451 | * by sending latest event (less info to keep). |
| 452 | */ |
| 453 | ee_printk("RAS: MCE filter #%d: bank %d, bit %d, limit %d, delta %d (mS)\n", |
| 454 | dsc - mc_turnoff, dsc->bank, dsc->ctl, dsc->max, (tsc - hst->last) / cpu_khz); |
| 455 | hst->mc.flags |= MC_FLG_FILTER; |
| 456 | #ifdef CONFIG_MK1OM |
| 457 | if (!(hst->mc.flags & MC_FLG_FALSE)) { |
| 458 | micras_mc_log(&hst->mc); |
| 459 | hst->mc.flags |= MC_FLG_LOG; |
| 460 | } |
| 461 | #endif |
| 462 | micras_mc_send(&hst->mc, exc); |
| 463 | hst->last = 0; |
| 464 | |
| 465 | /* |
| 466 | * MC events are disabled by caller when a |
| 467 | * non-zero mask is returned by this routine. |
| 468 | */ |
| 469 | return (1 << dsc->ctl); |
| 470 | } |
| 471 | |
| 472 | |
| 473 | /* |
| 474 | * Remove/mask an 'enable-bit' from a core MCA bank. |
| 475 | * Note: This applies to _current_ cpu only. It is not explicitly |
| 476 | * linked to the cpu that was ID'd in the incoming mce struct. |
| 477 | * Happens to be OK for mcc_exc_flt() and mcc_poll() and mcc_exc_log(). |
| 478 | */ |
| 479 | |
| 480 | static void |
| 481 | mcc_ctl_mask(int bank, uint32_t msk) |
| 482 | { |
| 483 | uint32_t ctl_lo, ctl_hi; |
| 484 | |
| 485 | rdmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi); |
| 486 | ctl_lo &= ~msk; |
| 487 | wrmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi); |
| 488 | |
| 489 | #if MC_VERBOSE |
| 490 | ee_printk("RAS: ctl mask CPU %d, MC%d_CTL -> %x\n", smp_processor_id(), bank, ctl_lo); |
| 491 | #endif |
| 492 | } |
| 493 | |
| 494 | |
| 495 | /* |
| 496 | * Filtering of correctable core MC events |
| 497 | * Called from the exception handler. |
| 498 | */ |
| 499 | |
| 500 | static void |
| 501 | mcc_exc_flt(struct mce * mce, uint64_t ctl, int fake) |
| 502 | { |
| 503 | struct mce_info mc; |
| 504 | uint32_t msk; |
| 505 | |
| 506 | if (!mce) |
| 507 | return; |
| 508 | |
| 509 | if (mce->status & MCI_STATUS_UC) |
| 510 | return; |
| 511 | |
| 512 | mcc_conv(mce, &mc); |
| 513 | mc.ctl = ctl; |
| 514 | mc.flags = fake ? MC_FLG_FALSE : 0; |
| 515 | msk = micras_mc_filter(&mc, mce->tsc, 1); |
| 516 | if (msk) |
| 517 | mcc_ctl_mask(mce->bank, msk); |
| 518 | } |
| 519 | |
| 520 | |
| 521 | /* |
| 522 | * Only action required for polled MC events is to |
| 523 | * pass the event on to the SCIF channel (if connected). |
| 524 | * The event should already have caused an excption (the |
| 525 | * exception handler choses to ignore corrected errors) |
| 526 | * which means it already has been filtered. |
| 527 | * Injected corrected events do not cause MCE exceptions |
| 528 | * and thus escaped filtering, so we'll filter them here. |
| 529 | */ |
| 530 | |
| 531 | static void |
| 532 | mcc_poll(struct mce * mce, uint64_t ctl, int fake) |
| 533 | { |
| 534 | struct mce_info mc; |
| 535 | |
| 536 | #if MC_VERBOSE |
| 537 | ee_printk("RAS: poll %d, fake %d, status %llx\n", mce->extcpu, fake, mce->status); |
| 538 | #endif |
| 539 | |
| 540 | mcc_conv(mce, &mc); |
| 541 | mc.ctl = ctl; |
| 542 | mc.flags = fake ? MC_FLG_FALSE : 0; |
| 543 | |
| 544 | #if BEAM_TEST |
| 545 | /* |
| 546 | * Under beam test we only want to send the SCIF message |
| 547 | */ |
| 548 | micras_mc_send(&mc, fake); |
| 549 | return; |
| 550 | #endif |
| 551 | |
| 552 | if (micras_mc_send(&mc, fake)) |
| 553 | mcc_seen = mcelog.next; |
| 554 | |
| 555 | /* |
| 556 | * According to MCA HAS the MCI_STATUS_VAL will only |
| 557 | * be set when an event's enable bit is set, in which |
| 558 | * case it is difficult to imagine how events without |
| 559 | * the MCI_STATUS_EN can appear here. The second clause |
| 560 | * of the test may never actually happen on Kn{F,C}. |
| 561 | * Note: MC polling does not capture TSCs |
| 562 | */ |
| 563 | if (fake || !(mc.status & MCI_STATUS_EN)) { |
| 564 | uint32_t msk; |
| 565 | |
| 566 | msk = micras_mc_filter(&mc, rdtsc(), fake); |
| 567 | if (msk) |
| 568 | mcc_ctl_mask(mce->bank, msk); |
| 569 | } |
| 570 | } |
| 571 | |
| 572 | |
| 573 | /* |
| 574 | * One CPU entered do_machine_check(). |
| 575 | * We get the initial mce record (which has cpu ID), early |
| 576 | * control variables and whether the event is injected. |
| 577 | * |
| 578 | * Since KnF and KnC deviate from the standard IA by not |
| 579 | * having the core MCAs broadcast to all CPU's we'll try |
| 580 | * to fake standard behavior in order to keep the generic |
| 581 | * machine check code intact. |
| 582 | * Therefore, if event is real (fake flag unset) and this |
| 583 | * CPU is the first seeing it (mcc_exc_mask is empty), |
| 584 | * then send IPI to all other CPU's listed in the online |
| 585 | * cpumask for vector #18. Later CPUs will see themselves |
| 586 | * marked in mcc_exc_mask and return quickly. |
| 587 | */ |
| 588 | |
| 589 | struct cpumask mcc_exc_mask; /* CPU's in mce ctx */ |
| 590 | static atomic_t ipi_lock = ATOMIC_INIT(0); /* Lock on exc mask */ |
| 591 | |
| 592 | static void |
| 593 | mcc_exc_entry(struct mce * mce, int fake, int no_way_out, int entry, char * msg) |
| 594 | { |
| 595 | unsigned int cpu; |
| 596 | |
| 597 | /* |
| 598 | *TBD: should we use 'extcpu' from the MCE record instead? |
| 599 | */ |
| 600 | cpu = smp_processor_id(); |
| 601 | |
| 602 | /* |
| 603 | * Injected events invokes all CPUs automatically |
| 604 | * by hooking into the NMI notify_die call_chain. |
| 605 | * Nothing to do here. |
| 606 | */ |
| 607 | if (fake) |
| 608 | return; |
| 609 | |
| 610 | #if 1 |
| 611 | /* |
| 612 | * Avoid the IPI corralling circus on corrected errors, |
| 613 | * based on assessment entirely done by mce_severity(). |
| 614 | * If the result (no_way_out) is MCE_NO_SEVERITY (=0), then |
| 615 | * at worst we may have a correctable error, and that does |
| 616 | * not warrant the system lockdown managed by mce_start() |
| 617 | * and mce_end(). |
| 618 | * Note that MICs do not support newer status bits (MCG_SER_P) |
| 619 | * which causes variable mce_ser always to be zero and thus |
| 620 | * the test in the inner loop of do_machine_check() will be |
| 621 | * reduced to just testing for the UC bit. |
| 622 | */ |
| 623 | if (! no_way_out) |
| 624 | return; |
| 625 | #endif |
| 626 | |
| 627 | /* |
| 628 | * Test for entry from MT thread IPIs (testing) |
| 629 | * or a 'soft' exception from a IPI issued from |
| 630 | * the handler of the first exception. |
| 631 | * No further action needed in both cases. |
| 632 | */ |
| 633 | if (cpumask_test_cpu(cpu, &mcc_exc_mask)) |
| 634 | return; |
| 635 | |
| 636 | /* |
| 637 | * Create mcc_exc_mask to flag which CPU's are |
| 638 | * to be included in the IPI. This mask is later |
| 639 | * used to determine who needs to EOI the local |
| 640 | * APIC after MC event handling. |
| 641 | */ |
| 642 | while(atomic_xchg(&ipi_lock, 1)) |
| 643 | cpu_relax(); |
| 644 | smp_rmb(); |
| 645 | if (cpumask_test_cpu(cpu, &mcc_exc_mask)) { |
| 646 | /* |
| 647 | * Another CPU got here first |
| 648 | */ |
| 649 | atomic_xchg(&ipi_lock, 0); |
| 650 | return; |
| 651 | } |
| 652 | cpumask_copy(&mcc_exc_mask, cpu_online_mask); |
| 653 | cpumask_clear_cpu(cpu, &mcc_exc_mask); |
| 654 | smp_wmb(); |
| 655 | atomic_xchg(&ipi_lock, 0); |
| 656 | |
| 657 | /* |
| 658 | * Simulate a broadcast ny sending IPI to all |
| 659 | * other CPUs. |
| 660 | */ |
| 661 | // apic->send_IPI_mask(&mcc_exc_mask, MCE_VECTOR); |
| 662 | apic->send_IPI_allbutself(MCE_VECTOR); |
| 663 | } |
| 664 | |
| 665 | |
| 666 | /* |
| 667 | * In do_machine_check() bank scan loop. |
| 668 | * Called from a lockdown, no synchronization needed. |
| 669 | * MC bank scan is complete and the mce event has been |
| 670 | * entered into the kernel MC log |
| 671 | * |
| 672 | *TBD: revise logic on HALT on UC events? |
| 673 | * From a state corruption point of view this |
| 674 | * _is_ a fatal error because UC bit was set. |
| 675 | * However, if the tolerance setting is set |
| 676 | * high enough, the generic MC handler may |
| 677 | * not chose to panic on this event. |
| 678 | * We currently do not have the tolerance value |
| 679 | * when recording this event, nor do we have |
| 680 | * other factors that mce_reign() use to determine |
| 681 | * what to do after reporting event to the host. |
| 682 | */ |
| 683 | |
| 684 | static void |
| 685 | mcc_exc_log(struct mce * mce, uint64_t ctl, int fake, |
| 686 | int no_way_out, char * msg, int severity, int worst) |
| 687 | { |
| 688 | struct mce_info mc; |
| 689 | uint32_t msk; |
| 690 | |
| 691 | #if MC_VERBOSE |
| 692 | ee_printk("RAS: log %d, wall %lld, nwo %d (%s), sev %d, wst %d\n", |
| 693 | mce->extcpu, mce->time, no_way_out, msg, severity, worst); |
| 694 | #endif |
| 695 | |
| 696 | /* |
| 697 | * Create a message for the host. |
| 698 | */ |
| 699 | mcc_conv(mce, &mc); |
| 700 | mc.ctl = ctl; |
| 701 | mc.flags |= fake ? MC_FLG_FALSE : 0; |
| 702 | |
| 703 | #if BEAM_TEST |
| 704 | /* |
| 705 | * Under beam test we only want to send the SCIF message |
| 706 | * This is guaranteed not to be called re-entrantly. |
| 707 | */ |
| 708 | micras_mc_send(&mc, 1); |
| 709 | return; |
| 710 | #endif |
| 711 | |
| 712 | #ifdef CONFIG_MK1OM |
| 713 | /* |
| 714 | * If this is a true event then log it in the EEPROM and |
| 715 | * notify SMC that we've had a serious machine check error. |
| 716 | */ |
| 717 | if ((mc.flags & (MC_FLG_FALSE | MC_FLG_FATAL)) == MC_FLG_FATAL) { |
| 718 | micras_mc_log(&mc); |
| 719 | mc.flags |= MC_FLG_LOG; |
| 720 | |
| 721 | /* |
| 722 | *TBD: Should this be deferred until the actual panic? |
| 723 | * The user can raise tolerance such that we in |
| 724 | * fact continue operating; in which case the SMC |
| 725 | * notification would be (somewhat) misleading. |
| 726 | */ |
| 727 | micras_mc_ipmi(&mc, 1); |
| 728 | } |
| 729 | #endif |
| 730 | |
| 731 | /* |
| 732 | * Always notify host and sync to kernel log |
| 733 | */ |
| 734 | if (micras_mc_send(&mc, 1)) |
| 735 | mcc_seen = mcelog.next; |
| 736 | |
| 737 | #if RAS_HALT |
| 738 | if ((mc.flags & MC_FLG_FATAL) && !fake) |
| 739 | panic("FATAL core machine check event:\n" |
| 740 | "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n", |
| 741 | mc.org, mc.id, mc.ctl, mc.status, mc.addr, mc.misc); |
| 742 | #endif |
| 743 | |
| 744 | /* |
| 745 | * Correctable events can in fact reach us here if |
| 746 | * mce_no_way_out() tags them as critical (for other |
| 747 | * reasons than the UC flag, e.g. MCIP missing). |
| 748 | * If the tolerance setting is high enough to prevent |
| 749 | * such events to panic, we'd still want filtering. |
| 750 | */ |
| 751 | msk = micras_mc_filter(&mc, mce->tsc, 1); |
| 752 | if (msk) |
| 753 | mcc_ctl_mask(mce->bank, msk); |
| 754 | } |
| 755 | |
| 756 | |
| 757 | /* |
| 758 | * In mce_panic(). |
| 759 | * Current event is about to make the kernel panic. |
| 760 | * Sources of this call are |
| 761 | * do_machine_check(), when no_way_out set |
| 762 | * mce_timed_out(), CPU rendez-vous failed |
| 763 | * mce_reign(), when severety high, a CPU hung, or no events |
| 764 | */ |
| 765 | |
| 766 | static void |
| 767 | mcc_exc_panic(struct mce * mce, char * msg, char * exp, int fake) |
| 768 | { |
| 769 | /* |
| 770 | * Should host be notified in this case? |
| 771 | * And if so, how should be presented, we might not |
| 772 | * even have a mce record to show when this happens! |
| 773 | * If an mce is passed, it has already been seen and |
| 774 | * reported to the host by a call to mcc_exc_log(). |
| 775 | * If mce is NULL, then this _is_ an MC relatedi panic, |
| 776 | * but we have no data fitting for a host notification. |
| 777 | * Create a pseudo event and ship that? |
| 778 | */ |
| 779 | ee_printk("RAS: panic %d, wall %lld, msg %s, exp %s, fake %d\n", |
| 780 | mce->extcpu, mce->time, msg, exp, fake); |
| 781 | } |
| 782 | |
| 783 | |
| 784 | /* |
| 785 | * A CPU is leaving do_machine_check(). |
| 786 | * We get this after the monarch has 'reigned' and |
| 787 | * the response to the event has been completed. |
| 788 | */ |
| 789 | |
| 790 | static void |
| 791 | mcc_exc_exit(struct mce * mce, int no_way_out, int worst, int entry, int order) |
| 792 | { |
| 793 | unsigned int cpu; |
| 794 | int eoi; |
| 795 | |
| 796 | cpu = smp_processor_id(); |
| 797 | |
| 798 | /* |
| 799 | * Assuming test_and_clear_bit() is atomic. |
| 800 | */ |
| 801 | smp_rmb(); |
| 802 | eoi = cpumask_test_and_clear_cpu(cpu, &mcc_exc_mask); |
| 803 | smp_wmb(); |
| 804 | if (eoi) |
| 805 | ack_APIC_irq(); |
| 806 | } |
| 807 | |
| 808 | |
| 809 | /* |
| 810 | * Routine to scan the kernel's MC log. |
| 811 | * Called when SCIF MC session has been created, to bring the host |
| 812 | * side up to date with prior unreported MC events, such as events |
| 813 | * occurring when MC session was not active (no peer was listening |
| 814 | * on the host) and events occurring before RAS module was loaded. |
| 815 | * |
| 816 | * Notes: |
| 817 | * - This is always called in thread context. |
| 818 | * - There are no injection flags in the kernel |
| 819 | * MC log, i.e. no guarantee events are genuine. |
| 820 | * - The MC kernel log has been exported explicitly for this. |
| 821 | * |
| 822 | * On synchronization (or the lack thereof): |
| 823 | * Effectively the mcelog holds a static array of mce's where the |
| 824 | * 'finished' flag says whether mce content is valid or not. The |
| 825 | * 'next' field is the index of the first element in the array that |
| 826 | * has not been assigned for an MC event. It is incremented when a |
| 827 | * new event is entered, and reset to zero on reads to /dev/mcelog. |
| 828 | * The kernel's event log does not wrap, so it is safe to use it as |
| 829 | * an indicator of how many events (finished or not) are in it. |
| 830 | * The mcelog's next field is protected by RCU style mechanisms |
| 831 | * in the kernel MCA handler (see arch/x86/kernel/cpu/mcheck/mce.c). |
| 832 | * For obvious reasons it is not genuine RCU, e.g. access to 'next' |
| 833 | * isn't within rcu_read_lock()/rcu_read_unlock() pair, just a clever |
| 834 | * masking use of a lock in an RCU macro definition. |
| 835 | * There is no RCU moving data around, the mce array does not move, |
| 836 | * and the 'finished' flag is set after a wmb() on the mce contents |
| 837 | * which means this routine will not clash with the MCE handler. |
| 838 | * Collisions with memset() on reads from /dev/mcelog are prevented |
| 839 | * by locking of mce_read_mutex. |
| 840 | */ |
| 841 | |
void
mcc_sync(void)
{
	struct mce_info mc;	/* Converted event handed to reporting helpers */
	unsigned seen;		/* Index into the kernel's mcelog entry array */

	/* Nothing to sync if machine checks are disabled entirely */
	if (mce_disabled)
		return;

#if 0
	/*
	 * Can't do this until bootstrap scrubs MC banks on all cards.
	 * It has been observed that MCA banks may _not_ be reset on card
	 * reboot which means events picked up by the kernel before loading
	 * the RAS module may have occurred in a previous uOS run.
	 * Should be OK post early Jan '12 (flash ver 262, HSD 4115351).
	 */
	return;
#endif

	/*
	 * Lock out kernel log access through /dev/mcelog.
	 * Per the block comment above, this prevents a concurrent
	 * read from clearing the log underneath us.
	 */
	mutex_lock(&mce_read_mutex);

	/*
	 * Start over if the log has been cleared
	 * (a read of /dev/mcelog resets mcelog.next to zero).
	 */
	if (mcc_seen > mcelog.next)
		mcc_seen = 0;

	/* Walk only the entries added since we last synced */
	for(seen = mcc_seen; seen < mcelog.next; seen++) {
		/*
		 * Basic checks. Index, CPU & bank must be reasonable.
		 * 'finished' guards entry validity (set after a wmb()
		 * on the mce contents, per the block comment above).
		 * NOTE(review): the bank limit of 3 looks target-specific
		 * (KNx core MCA bank count) — confirm before reuse.
		 */
		if (mcelog.entry[seen].finished) {
			if (mcelog.entry[seen].cpu >= NR_CPUS ||
			    mcelog.entry[seen].bank >= 3) {
				printk("mcc_sync: entry %d contains garbage, cpu %d, bank %d\n",
					seen, mcelog.entry[seen].cpu, mcelog.entry[seen].bank);
				continue;
			}

			/*
			 * Have good entry, can be UC, but it is 'old'.
			 * ctl is zeroed for these replayed events —
			 * presumably to mark them as historic; confirm
			 * against the consumers of mc.ctl.
			 */
			mcc_conv(&mcelog.entry[seen], &mc);
			mc.ctl = 0;

#ifdef CONFIG_MK1OM
			/*
			 * Log this event in the eeprom and notify
			 * that we've had a serious machine check error.
			 * in_sync brackets the EEPROM log call; its exact
			 * effect is defined elsewhere in this module.
			 */
			if (mc.flags & MC_FLG_FATAL) {
				in_sync = 1;
				micras_mc_log(&mc);
				in_sync = 0;
				mc.flags |= MC_FLG_LOG;
				micras_mc_ipmi(&mc, 0);
			}
#endif

			/*
			 * Notify host about this too. A failed send aborts
			 * the scan.
			 * NOTE(review): mcc_seen below still advances to
			 * mcelog.next, so entries after a failed send are
			 * never retried — verify this is intentional.
			 */
			if (! micras_mc_send(&mc, 0))
				break;
		}
	}
	/* Remember high-water mark so the next sync skips these entries */
	mcc_seen = mcelog.next;

	/*
	 * Done, release lock
	 */
	mutex_unlock(&mce_read_mutex);
}
| 919 | |
| 920 | |
| 921 | /* |
 * Setup exception handlers by hooking into the
| 923 | * kernel's native MCA handler. |
| 924 | */ |
| 925 | |
| 926 | int __init |
| 927 | mcc_init(void) |
| 928 | { |
| 929 | if (mce_disabled) { |
| 930 | printk("RAS.core: disabled\n"); |
| 931 | } |
| 932 | else { |
| 933 | mca_poll = mcc_poll; |
| 934 | mca_exc_flt = mcc_exc_flt; |
| 935 | mca_exc_entry = mcc_exc_entry; |
| 936 | mca_exc_log = mcc_exc_log; |
| 937 | mca_exc_panic = mcc_exc_panic; |
| 938 | mca_exc_exit = mcc_exc_exit; |
| 939 | mca_print = 0; /* For debug: ee_printk; */ |
| 940 | printk("RAS.core: init complete\n"); |
| 941 | } |
| 942 | |
| 943 | return 0; |
| 944 | } |
| 945 | |
| 946 | |
| 947 | /* |
| 948 | * Cleanup for module unload. |
| 949 | * Clear/restore hooks in the native MCA handler. |
| 950 | */ |
| 951 | |
| 952 | int __exit |
| 953 | mcc_exit(void) |
| 954 | { |
| 955 | mca_poll = 0; |
| 956 | mca_exc_flt = 0; |
| 957 | mca_exc_entry = 0; |
| 958 | mca_exc_log = 0; |
| 959 | mca_exc_panic = 0; |
| 960 | mca_exc_exit = 0; |
| 961 | mca_print = 0; |
| 962 | |
| 963 | /* |
| 964 | * Links from kernel's MCE handler cut, |
| 965 | * wait for everybody in handler to leave. |
| 966 | */ |
| 967 | while(atomic_read(&mce_entry)) |
| 968 | cpu_relax(); |
| 969 | |
| 970 | printk("RAS.core: exit complete\n"); |
| 971 | return 0; |
| 972 | } |
| 973 | |