/*
 * Copyright 2010-2017 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Disclaimer: The codes contained in these modules may be specific to
 * the Intel Software Development Platform codenamed Knights Ferry,
 * and the Intel product codenamed Knights Corner, and are not backward
 * compatible with other Intel products. Additionally, Intel will NOT
 * support the codes or instruction set in future products.
 *
 * Intel offers no warranty of any kind regarding the code. This code is
 * licensed on an "AS IS" basis and Intel is not obligated to provide
 * any support, assistance, installation, training, or other services
 * of any kind. Intel is also not obligated to provide any updates,
 * enhancements or extensions. Intel specifically disclaims any warranty
 * of merchantability, non-infringement, fitness for any particular
 * purpose, and any other warranty.
 *
 * Further, Intel disclaims all liability of any kind, including but
 * not limited to liability for infringement of any proprietary rights,
 * relating to the use of the code, even if Intel is notified of the
 * possibility of such liability. Except as expressly stated in an Intel
 * license agreement provided with this code and agreed upon with Intel,
 * no license, express or implied, by estoppel or otherwise, to any
 * intellectual property rights is granted herein.
 */

/*
 * RAS handler for core MC events
 *
 * Contains code to intercept MC events and collect information
 * from core MCA banks on the originating core, and possibly on
 * all active cores if necessary.
 *
 * In case of a severe event, defined by corrupted context,
 * the handler will add a record of the event in the designated
 * EEPROM hanging off the Over-Clocking I2C bus. Next, a message
 * is sent to the SMC (enabling IPMI notifications), and finally
 * a message is sent to the host via the MC SCIF connection
 * (if an MC SCIF session has been established).
 *
 * Lesser events will also be sent to the host on an 'FYI' basis,
 * but no record will be stored in the event log, nor will the
 * SMC be notified.
 *
 * Special cases of high-rate correctable errors may also cause
 * events to be recorded in EEPROM, on the assumption that the
 * root cause will be detectable from maintenance mode.
 *
 * The handler cannot expect any support from the OS while in
 * exception (NMI) context. Therefore, NMI-safe routines have
 * been added to mimic some kernel services, e.g. ee_print().
 */

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/io.h>
#include <linux/cpumask.h>
#include <asm/mce.h>
#include <asm/apic.h>
#include "micras.h"


/*
**
** Brief design notes:
** There are two ways this code normally will be entered.
**
** 1) From standard interrupt context (bottom-half).
**    This supports MC events picked up by
**    machine_check_poll(), i.e. events that aren't
**    causing state corruption (UC bit not set).
**
** 2) From exception/NMI context.
**    This handles errors that _did_ flag processor
**    state corruption (UC bit set, or other condition
**    causing the kernel exception handler to pick it up).
**
** Both cases can happen simultaneously on different CPUs,
** which requires careful consideration of re-entrant code
** behaviour here. Particularly nasty is exception context, where
** normal spinlocks won't work (FYI: x86 spinlocks assume interrupt
** disable can protect a critical region, an assumption that is
** false when an exception/NMI occurs).
**
** Standard interrupt context entries occur when non-fatal and
** thus non-critical MC events are handled. In most cases this just
** results in a regular SCIF send of McInfo structs to the host.
** Note that the call chain originates in a callout from the timer
** thread, not from an interrupt service routine, so naming it
** standard interrupt context is somewhat misleading.
**
** Exception context messages are usually fatal and must be
** dealt with immediately, because otherwise the generic machine
** handler may panic() the system when exiting the exception handler
** (default behavior, may be tweaked by altering 'threshold').
**
** In order to proceed we can either implement a locking mechanism
** at every API function entry, or we can let every function do its
** thing independently. The latter is preferred, though it gets
** somewhat complicated because the API between the generic MC
** handling and the RAS module is in fact composed of several calls.
**
** If state between API calls needs to be tracked then that can be
** done by means of pre-allocated arrays, similar to the generic
** handling in the Linux kernel. Currently the only state variable
** is the mask of CPUs that have been sent an IPI.
**
** Core MC events can be simulated by using the 'mce-inject' tool,
** consisting of a kernel module and a text mode application program.
** The 'mce-inject' module knows the difference between fatal and
** non-fatal events (defined by the UC bit) and acts differently
** in the two cases. Non-fatal injections cause machine_check_poll()
** to be called on all CPUs, resulting in events being reported to
** function mcc_poll(). Fatal injections cause do_machine_check()
** to be called on all CPUs, resulting in calls to the mcc_exc_*
** routines below. Activities triggered by mce-inject are flagged
** as 'fake', and shall _NOT_ be logged in the EEPROM.
**
** Warning:
** Controls in the generic MC handling may cause the kernel to
** panic, even when no event was found in any MCA banks!!
** Not sure exactly how to capture that sort of event.
**
** Warning:
** The 'mce-inject' module uses different methods of invoking error
** handling routines, depending on the mce record (inject_flags).
** Specifically, the 'mce-inject' module may use broadcast NMIs
** to invoke machine_check_poll() or do_machine_check() on all CPUs,
** which will make these functions execute in exception context.
** The NMI broadcast mechanism is based on registering a handler on
** the 'die' notifier chain and then doing an
**   apic->send_IPI_mask(.., NMI_VECTOR),
** knowing that do_nmi() will invoke this notifier chain when no
** genuine cause of NMI was found (i.e. if inb(61) returns 0xc0,
** [which is SERR + IOCHK on chipset register NSR]).
** Long story short: if 'mce-inject' is used we cannot expect that
** polling is done in standard interrupt context, and need to set
** the 'in exception context' flag for SCIF access.
**
*/


/*
 * Hooks placed in the native machine check handler.
 * See file arch/x86/kernel/cpu/mcheck/mce.c for placement.
 *
 * poll       After entering a non-UC event into mce_log.
 *            This happens in normal thread context, which
 *            means that kernel services are available.
 * exc_flt    Filter on correctable errors. If events occur
 *            at a very high rate they can severely slow
 *            down the system and/or crash it entirely.
 *            Logic here will disable reporting of some
 *            events if they are seen too often.
 * exc_entry  Entering the MC exception handler.
 *            Called _after_ reading MCG_STATUS, and after the
 *            early severity assessment by mce_severity() has
 *            been performed on all banks, such that we get to
 *            know if the native MC handler will panic.
 * exc_log    After entering a UC event into mce_log.
 *            The logged mce record has all available
 *            details on the event, and this point is the
 *            best place to perform our RAS activities.
 * exc_panic  Right before the MC exception handler calls
 *            the panic function.
 * exc_exit   Exiting the MC exception handler.
 * print      Exception context safe printf to the POST-card UART.
 */

extern void (*mca_poll)(struct mce *, uint64_t, int);
extern void (*mca_exc_flt)(struct mce *, uint64_t, int);
extern void (*mca_exc_entry)(struct mce *, int, int, int, char *);
extern void (*mca_exc_log)(struct mce *, uint64_t, int, int, char *, int, int);
extern void (*mca_exc_panic)(struct mce *, char *, char *, int);
extern void (*mca_exc_exit)(struct mce *, int, int, int, int);
extern int (*mca_print)(char *, ...);
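
/*
 * Illustration only: a minimal sketch of how the patched native MCA
 * handler (arch/x86/kernel/cpu/mcheck/mce.c) is assumed to invoke the
 * hooks above. The hook names are real (they are set in mcc_init()
 * below); the surrounding kernel code is a simplified assumption and
 * is kept in '#if 0' so it is never compiled.
 */
#if 0
static void example_poll_hook_site(struct mce * m, uint64_t ctl)
{
  /* ...machine_check_poll() has read a non-UC event into 'm'... */
  mce_log(m);                   /* enter event into the kernel MC log */
  if (mca_poll)
    mca_poll(m, ctl, 0);        /* last arg is non-zero for injected events */
}
#endif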

extern struct mce_log mcelog;         /* Export from kernel */
extern struct mutex mce_read_mutex;   /* Export from kernel */
static unsigned mcc_seen;             /* Last event in kernel log */
int in_sync;                          /* Flag set while sync'ing */


/*
 * Convert a kernel mce record into the MC API format
 */

static void
mcc_conv(struct mce * mce, struct mce_info * mc)
{
  mc->org = mce->bank;
  mc->id = mce->extcpu;
#ifdef CONFIG_MK1OM
  mc->pid = xlat_cpu[cpu_data(mc->id).apicid];
#endif
  mc->stamp = mce->time;
  mc->status = mce->status;
  mc->addr = mce->addr;
  mc->misc = mce->misc;
  mc->flags = (mc->status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0;
}


/*
 * Filter for correctable errors, may modify CTL value.
 * The filter is pretty crude; we just want to protect
 * ourselves from being run over by fast-recurring events.
 * We keep tabs on events seen in a static array.
 *
 * The algorithm is as follows:
 *  - test if event is in filter list; if not, exit filter.
 *  - search for an instance of this event in history.
 *  - if not found, insert event in history (strike 1).
 *  - if found but time since last seen exceeds window,
 *    then treat event as new in history (new strike 1).
 *  - if found and within time window, bump strike counter.
 *  - if strike counter reaches maximum, we're fed up and
 *    turn this event off by clearing the associated
 *    bit in the offending MCA bank's CTL register and
 *    send a 'filter' event notification to the host.
 *
 * Advantages of this design are:
 *  - individual parameters for every filtered event.
 *  - only one event history array.
 *  - no periodic aging of events in history array.
 *  - no averaging over time required.
 *  - no moving/reordering of event history entries.
 *  - new events do not replace older seen events.
 *  - filter reacts immediately when max is reached.
 *
 * Disadvantages are:
 *  - linear search through filter array.
 *  - linear search through history array.
 *  - time parameter not obvious; it's really a limit
 *    on how old events in history are allowed to be.
 *  - in pathological cases the filter's reaction time
 *    will be max * window (when events trickle in at
 *    a rate just below the window size).
 *  - data in ADDR and MISC registers are not used to
 *    match current event with history. Should they be?
 *
 * For now, both lists are short enough that introducing
 * more advanced searches is probably not going to help.
 *
 * On KnC the flash may have overrides of the mc_turnoff table.
 */

#define FT  (((17 * 60) + 30) * 60)  /* Default time window: 17.5 hours */

static struct mc_hist {
  uint32_t count;            /* How many times seen */
  uint64_t last;             /* TSC last time seen */
  struct mce_info mc;        /* Local MC event record */
} mc_history[32];

static struct mc_disc {
  uint8_t bank, ctl;         /* Bank selector and control bit # */
  uint16_t win;              /* Time window (seconds) */
  uint16_t max;              /* Max count */
  uint16_t mca_code;         /* MCA code, status[15:0] */
  uint16_t mdl_code;         /* Model code, status[31:16] */
} mc_turnoff[] = {
  { 0, 3, FT, 2, 0x0150, 0x0000 },  /* MC0: J-Cache error */
  { 1, 0, FT, 2, 0x010a, 0x0001 },  /* MC1: L2 Tag error */
  { 1, 4, FT, 2, 0x010a, 0x0010 },  /* MC1: L2 Data error */
  { 2, 2, FT, 2, 0x010d, 0x0100 },  /* MC2: Tag State, ext TD */
  { 2, 2, FT, 2, 0x010d, 0x0101 },  /* MC2: Tag State, int TD */
  { 2, 3, FT, 2, 0x012d, 0x0110 },  /* MC2: Core Valid, ext TD */
  { 2, 3, FT, 2, 0x012d, 0x0111 },  /* MC2: Core Valid, int TD */
  { 3, 2, FT, 2, 0x010d, 0x0100 },  /* DBOX: Tag State error, ext TD */
  { 3, 2, FT, 2, 0x010d, 0x0101 },  /* DBOX: Tag State error, int TD */
  { 3, 3, FT, 2, 0x012d, 0x0110 },  /* DBOX: Core Valid error, ext TD */
  { 3, 3, FT, 2, 0x012d, 0x0111 },  /* DBOX: Core Valid error, int TD */
  { 4, 4, FT, 2, 0x0e0b, 0x0030 },  /* SBOX: PCI-e */
  { 5, 0, FT, 2, 0x0001, 0x0000 },  /* GBOX: Ch-0 retraining */
  { 5, 1, FT, 2, 0x0001, 0x0001 },  /* GBOX: Ch-1 retraining */
  { 5, 2, FT, 2, 0x0001, 0x0002 },  /* GBOX: Ch-0 ECC error */
  { 5, 3, FT, 2, 0x0001, 0x0003 },  /* GBOX: Ch-1 ECC error */
  { 6, 3, FT, 2, 0x010e, 0x0008 },  /* TBOX: T2 CRC error */
};
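
/*
 * Worked example (illustration only): a GBOX Ch-1 ECC event arrives
 * with mc->org == 5, status[15:0] == 0x0001 and status[31:16] == 0x0003,
 * matching the { 5, 3, FT, 2, 0x0001, 0x0003 } row above. The filter
 * below extracts the two code fields with GET_BITS() from micras.h,
 * along the lines of:
 */
#if 0
static int example_match(struct mce_info * mc, struct mc_disc * dsc)
{
  return dsc->bank == mc->org &&
         dsc->mca_code == GET_BITS(15, 0, mc->status) &&
         dsc->mdl_code == GET_BITS(31, 16, mc->status);
}
#endif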


#ifdef CONFIG_MK1OM

#define MC_FLT_SIG1 0x0e13c20f  /* Start signature */
#define MC_FLT_SIG2 0xf1ec3df0  /* End signature */
#define MC_FLT_SIZE 0x200       /* Filter block length */

void
mcc_flt_parm(uint8_t * p)
{
  uint16_t fnum;

  /*
   * Check signatures
   */
  if (*((uint32_t *) p) != MC_FLT_SIG1 ||
      *((uint32_t *)(p + MC_FLT_SIZE - 4)) != MC_FLT_SIG2) {
    printk("mcc_flt_parm: signatures not found, (%08x, %08x)\n",
           *((uint32_t *) p), *((uint32_t *)(p + MC_FLT_SIZE - 4)));
    return;
  }

  /*
   * After the start signature comes the filter count (uint16_t),
   * followed by 'count' filter descriptors (struct mc_disc).
   */
  fnum = *(uint16_t *)(p + 4);
  if (fnum > ARRAY_SIZE(mc_turnoff) ||
      fnum * sizeof(struct mc_disc) + 10 > MC_FLT_SIZE) {
    printk("mcc_flt_parm: filter count %d not valid\n", fnum);
    return;
  }

  /*
   * Seems the table is legit, copy it over the defaults.
   */
  memset(mc_turnoff, '\0', sizeof(mc_turnoff));
  memcpy(mc_turnoff, p + 6, fnum * sizeof(struct mc_disc));
#if MC_VERBOSE
  {
    int i;

    for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) {
      printk("Filter %2d: bank %d, ctl %d, win %d, max %d, mca %04x, mdl %04x\n",
             i, mc_turnoff[i].bank, mc_turnoff[i].ctl, mc_turnoff[i].win,
             mc_turnoff[i].max, mc_turnoff[i].mca_code, mc_turnoff[i].mdl_code);
    }
  }
#endif
}

#endif
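
/*
 * For reference, the flash override block parsed above is laid out
 * as follows (derived from the checks in mcc_flt_parm):
 *
 *   offset 0                uint32_t  MC_FLT_SIG1 (0x0e13c20f)
 *   offset 4                uint16_t  filter count 'fnum'
 *   offset 6                'fnum' entries of struct mc_disc
 *   offset MC_FLT_SIZE - 4  uint32_t  MC_FLT_SIG2 (0xf1ec3df0)
 */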


/*
 * Frequency filter for core and un-core MC events
 */

uint32_t
micras_mc_filter(struct mce_info * mc, uint64_t tsc, int exc)
{
  struct mc_disc * dsc;
  struct mc_hist * hst;
  uint64_t ostamp;
  int i, oldest;

  if (mc->status & MCI_STATUS_UC)
    return 0;

  /*
   * Check if this event may be filtered
   */
  dsc = mc_turnoff;
  for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) {
    if (dsc->bank == mc->org &&
        dsc->mca_code == GET_BITS(15, 0, mc->status) &&
        dsc->mdl_code == GET_BITS(31, 16, mc->status))
      break;
    dsc++;
  }
  if (i == ARRAY_SIZE(mc_turnoff))
    return 0;

  /*
   * Have a candidate for the filter.
   * Have we seen this one before?
   */
  oldest = 0;
  ostamp = tsc;
  hst = mc_history;
  for(i = 0; i < ARRAY_SIZE(mc_history); i++) {
    /*
     * While scanning, find the oldest event too
     */
    if (hst->last < ostamp) {
      ostamp = hst->last;
      oldest = i;
    }

    /*
     * Does this match an event in filter history?
     * TBD: how much needs to match?
     * For now: cpu (or box), bank, mca_code and model_code.
     */
    if (hst->last &&
        hst->mc.id == mc->id &&
        hst->mc.org == mc->org &&
        GET_BITS(15, 0, hst->mc.status) == GET_BITS(15, 0, mc->status) &&
        GET_BITS(31, 16, hst->mc.status) == GET_BITS(31, 16, mc->status))
      break;
    hst++;
  }
  if (i == ARRAY_SIZE(mc_history)) {
    /*
     * Have not seen this event before.
     * 'oldest' is where to store this event.
     */
    hst = mc_history + oldest;
    hst->count = 1;
    hst->last = tsc;
    hst->mc = *mc;
    return 0;
  }

  /*
   * Already 'on file' in history, test expiration date
   */
  if (hst->last + dsc->win * (cpu_khz * 1000LL) < tsc) {
    /*
     * Matching history element has expired, just overwrite it
     */
    hst->count = 1;
    hst->last = tsc;
    hst->mc = *mc;
    return 0;
  }

  /*
   * Filter element active, bump count and set last seen.
   * We do _NOT_ want injected events to enter the EEPROM,
   * so that flag is preserved over all event history.
   */
  hst->count++;
  if (mc->flags & MC_FLG_FALSE)
    hst->mc.flags |= MC_FLG_FALSE;
  if (hst->count < dsc->max) {
    hst->last = tsc;
    return 0;
  }

  /*
   * Threshold reached, event source needs to be silenced.
   * Store a record of this in the EEPROM and send a
   * notification to the host about it. Once duly reported, clear
   * the event from the filter; it is not expected to show up again.
   * Note: we report the _first_ event seen, not the
   *       event at hand. We could save array space
   *       by sending the latest event (less info to keep).
   */
  ee_printk("RAS: MCE filter #%d: bank %d, bit %d, limit %d, delta %d (ms)\n",
            dsc - mc_turnoff, dsc->bank, dsc->ctl, dsc->max, (tsc - hst->last) / cpu_khz);
  hst->mc.flags |= MC_FLG_FILTER;
#ifdef CONFIG_MK1OM
  if (!(hst->mc.flags & MC_FLG_FALSE)) {
    micras_mc_log(&hst->mc);
    hst->mc.flags |= MC_FLG_LOG;
  }
#endif
  micras_mc_send(&hst->mc, exc);
  hst->last = 0;

  /*
   * MC events are disabled by the caller when a
   * non-zero mask is returned by this routine.
   */
  return (1 << dsc->ctl);
}
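
/*
 * Note on the expiry test above: 'win' is in seconds while the history
 * timestamps are raw TSC values, so the window is scaled by
 * cpu_khz * 1000 (cpu_khz being TSC ticks per millisecond).
 * E.g., assuming a ~1.1 GHz core clock (cpu_khz == 1100000), the
 * default 63000 second window is about 6.9e13 TSC ticks.
 */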


/*
 * Remove/mask an 'enable-bit' from a core MCA bank.
 * Note: This applies to the _current_ cpu only. It is not explicitly
 *       linked to the cpu that was ID'd in the incoming mce struct.
 *       That happens to be OK for mcc_exc_flt(), mcc_poll() and mcc_exc_log().
 */

static void
mcc_ctl_mask(int bank, uint32_t msk)
{
  uint32_t ctl_lo, ctl_hi;

  rdmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi);
  ctl_lo &= ~msk;
  wrmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi);

#if MC_VERBOSE
  ee_printk("RAS: ctl mask CPU %d, MC%d_CTL -> %x\n", smp_processor_id(), bank, ctl_lo);
#endif
}
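
/*
 * Example (illustration only): if micras_mc_filter() returns (1 << 3)
 * for a bank 0 event (the MC0 J-Cache row in mc_turnoff above), then
 * mcc_ctl_mask(0, 1 << 3) clears bit 3 of MC0_CTL on the current CPU,
 * silencing further reports of that event from this core.
 */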


/*
 * Filtering of correctable core MC events.
 * Called from the exception handler.
 */

static void
mcc_exc_flt(struct mce * mce, uint64_t ctl, int fake)
{
  struct mce_info mc;
  uint32_t msk;

  if (!mce)
    return;

  if (mce->status & MCI_STATUS_UC)
    return;

  mcc_conv(mce, &mc);
  mc.ctl = ctl;
  mc.flags = fake ? MC_FLG_FALSE : 0;
  msk = micras_mc_filter(&mc, mce->tsc, 1);
  if (msk)
    mcc_ctl_mask(mce->bank, msk);
}


/*
 * The only action required for polled MC events is to
 * pass the event on to the SCIF channel (if connected).
 * The event should already have caused an exception (the
 * exception handler chooses to ignore corrected errors),
 * which means it has already been filtered.
 * Injected corrected events do not cause MCE exceptions
 * and thus escaped filtering, so we'll filter them here.
 */

static void
mcc_poll(struct mce * mce, uint64_t ctl, int fake)
{
  struct mce_info mc;

#if MC_VERBOSE
  ee_printk("RAS: poll %d, fake %d, status %llx\n", mce->extcpu, fake, mce->status);
#endif

  mcc_conv(mce, &mc);
  mc.ctl = ctl;
  mc.flags = fake ? MC_FLG_FALSE : 0;

#if BEAM_TEST
  /*
   * Under beam test we only want to send the SCIF message
   */
  micras_mc_send(&mc, fake);
  return;
#endif

  if (micras_mc_send(&mc, fake))
    mcc_seen = mcelog.next;

  /*
   * According to the MCA HAS, MCI_STATUS_VAL will only
   * be set when an event's enable bit is set, in which
   * case it is difficult to imagine how events without
   * MCI_STATUS_EN can appear here. The second clause
   * of the test may never actually happen on Kn{F,C}.
   * Note: MC polling does not capture TSCs.
   */
  if (fake || !(mc.status & MCI_STATUS_EN)) {
    uint32_t msk;

    msk = micras_mc_filter(&mc, rdtsc(), fake);
    if (msk)
      mcc_ctl_mask(mce->bank, msk);
  }
}


/*
 * One CPU entered do_machine_check().
 * We get the initial mce record (which has the cpu ID), early
 * control variables, and whether the event is injected.
 *
 * Since KnF and KnC deviate from standard IA by not
 * having core MCAs broadcast to all CPUs, we'll try
 * to fake standard behavior in order to keep the generic
 * machine check code intact.
 * Therefore, if the event is real (fake flag unset) and this
 * CPU is the first seeing it (mcc_exc_mask is empty),
 * then send an IPI to all other CPUs listed in the online
 * cpumask for vector #18. Later CPUs will see themselves
 * marked in mcc_exc_mask and return quickly.
 */

struct cpumask mcc_exc_mask;                /* CPUs in mce ctx */
static atomic_t ipi_lock = ATOMIC_INIT(0);  /* Lock on exc mask */

static void
mcc_exc_entry(struct mce * mce, int fake, int no_way_out, int entry, char * msg)
{
  unsigned int cpu;

  /*
   *TBD: should we use 'extcpu' from the MCE record instead?
   */
  cpu = smp_processor_id();

  /*
   * Injected events invoke all CPUs automatically
   * by hooking into the NMI notify_die call chain.
   * Nothing to do here.
   */
  if (fake)
    return;

#if 1
  /*
   * Avoid the IPI corralling circus on corrected errors,
   * based on assessment entirely done by mce_severity().
   * If the result (no_way_out) is MCE_NO_SEVERITY (=0), then
   * at worst we may have a correctable error, and that does
   * not warrant the system lockdown managed by mce_start()
   * and mce_end().
   * Note that MICs do not support the newer status bits (MCG_SER_P),
   * which causes the variable mce_ser always to be zero, and thus
   * the test in the inner loop of do_machine_check() will be
   * reduced to just testing for the UC bit.
   */
  if (! no_way_out)
    return;
#endif

  /*
   * Test for entry from MT thread IPIs (testing)
   * or a 'soft' exception from an IPI issued from
   * the handler of the first exception.
   * No further action is needed in either case.
   */
  if (cpumask_test_cpu(cpu, &mcc_exc_mask))
    return;

  /*
   * Create mcc_exc_mask to flag which CPUs are
   * to be included in the IPI. This mask is later
   * used to determine who needs to EOI the local
   * APIC after MC event handling.
   */
  while(atomic_xchg(&ipi_lock, 1))
    cpu_relax();
  smp_rmb();
  if (cpumask_test_cpu(cpu, &mcc_exc_mask)) {
    /*
     * Another CPU got here first
     */
    atomic_xchg(&ipi_lock, 0);
    return;
  }
  cpumask_copy(&mcc_exc_mask, cpu_online_mask);
  cpumask_clear_cpu(cpu, &mcc_exc_mask);
  smp_wmb();
  atomic_xchg(&ipi_lock, 0);

  /*
   * Simulate a broadcast by sending an IPI to all
   * other CPUs.
   */
  // apic->send_IPI_mask(&mcc_exc_mask, MCE_VECTOR);
  apic->send_IPI_allbutself(MCE_VECTOR);
}


/*
 * In the do_machine_check() bank scan loop.
 * Called from a lockdown, no synchronization needed.
 * The MC bank scan is complete and the mce event has been
 * entered into the kernel MC log.
 *
 *TBD: revise logic on HALT on UC events?
 *     From a state corruption point of view this
 *     _is_ a fatal error because the UC bit was set.
 *     However, if the tolerance setting is set
 *     high enough, the generic MC handler may
 *     choose not to panic on this event.
 *     We currently do not have the tolerance value
 *     when recording this event, nor do we have the
 *     other factors that mce_reign() uses to determine
 *     what to do after reporting the event to the host.
 */

static void
mcc_exc_log(struct mce * mce, uint64_t ctl, int fake,
            int no_way_out, char * msg, int severity, int worst)
{
  struct mce_info mc;
  uint32_t msk;

#if MC_VERBOSE
  ee_printk("RAS: log %d, wall %lld, nwo %d (%s), sev %d, wst %d\n",
            mce->extcpu, mce->time, no_way_out, msg, severity, worst);
#endif

  /*
   * Create a message for the host.
   */
  mcc_conv(mce, &mc);
  mc.ctl = ctl;
  mc.flags |= fake ? MC_FLG_FALSE : 0;

#if BEAM_TEST
  /*
   * Under beam test we only want to send the SCIF message.
   * This is guaranteed not to be called re-entrantly.
   */
  micras_mc_send(&mc, 1);
  return;
#endif

#ifdef CONFIG_MK1OM
  /*
   * If this is a true event then log it in the EEPROM and
   * notify the SMC that we've had a serious machine check error.
   */
  if ((mc.flags & (MC_FLG_FALSE | MC_FLG_FATAL)) == MC_FLG_FATAL) {
    micras_mc_log(&mc);
    mc.flags |= MC_FLG_LOG;

    /*
     *TBD: Should this be deferred until the actual panic?
     *     The user can raise tolerance such that we in
     *     fact continue operating; in which case the SMC
     *     notification would be (somewhat) misleading.
     */
    micras_mc_ipmi(&mc, 1);
  }
#endif

  /*
   * Always notify the host and sync to the kernel log
   */
  if (micras_mc_send(&mc, 1))
    mcc_seen = mcelog.next;

#if RAS_HALT
  if ((mc.flags & MC_FLG_FATAL) && !fake)
    panic("FATAL core machine check event:\n"
          "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n",
          mc.org, mc.id, mc.ctl, mc.status, mc.addr, mc.misc);
#endif

  /*
   * Correctable events can in fact reach us here if
   * mce_no_way_out() tags them as critical (for reasons
   * other than the UC flag, e.g. MCIP missing).
   * If the tolerance setting is high enough to keep
   * such events from panicking the system, we'd still want filtering.
   */
  msk = micras_mc_filter(&mc, mce->tsc, 1);
  if (msk)
    mcc_ctl_mask(mce->bank, msk);
}


/*
 * In mce_panic().
 * The current event is about to make the kernel panic.
 * Sources of this call are:
 *   do_machine_check(), when no_way_out is set
 *   mce_timed_out(),    CPU rendezvous failed
 *   mce_reign(),        when severity is high, a CPU hung, or no events
 */

static void
mcc_exc_panic(struct mce * mce, char * msg, char * exp, int fake)
{
  /*
   * Should the host be notified in this case?
   * And if so, how should it be presented? We might not
   * even have an mce record to show when this happens!
   * If an mce is passed, it has already been seen and
   * reported to the host by a call to mcc_exc_log().
   * If mce is NULL, then this _is_ an MC related panic,
   * but we have no data fitting for a host notification.
   * Create a pseudo event and ship that?
   */
  if (! mce) {
    ee_printk("RAS: panic, msg %s, exp %s, fake %d\n", msg, exp, fake);
    return;
  }
  ee_printk("RAS: panic %d, wall %lld, msg %s, exp %s, fake %d\n",
            mce->extcpu, mce->time, msg, exp, fake);
}


/*
 * A CPU is leaving do_machine_check().
 * We get this after the monarch has 'reigned' and
 * the response to the event has been completed.
 */

static void
mcc_exc_exit(struct mce * mce, int no_way_out, int worst, int entry, int order)
{
  unsigned int cpu;
  int eoi;

  cpu = smp_processor_id();

  /*
   * Assuming test_and_clear_bit() is atomic.
   */
  smp_rmb();
  eoi = cpumask_test_and_clear_cpu(cpu, &mcc_exc_mask);
  smp_wmb();
  if (eoi)
    ack_APIC_irq();
}


/*
 * Routine to scan the kernel's MC log.
 * Called when the SCIF MC session has been created, to bring the host
 * side up to date with prior unreported MC events, such as events
 * occurring when the MC session was not active (no peer was listening
 * on the host) and events occurring before the RAS module was loaded.
 *
 * Notes:
 *  - This is always called in thread context.
 *  - There are no injection flags in the kernel
 *    MC log, i.e. no guarantee events are genuine.
 *  - The MC kernel log has been exported explicitly for this.
 *
 * On synchronization (or the lack thereof):
 * Effectively, mcelog holds a static array of mce's where the
 * 'finished' flag says whether mce content is valid or not. The
 * 'next' field is the index of the first element in the array that
 * has not been assigned to an MC event. It is incremented when a
 * new event is entered, and reset to zero on reads of /dev/mcelog.
 * The kernel's event log does not wrap, so it is safe to use it as
 * an indicator of how many events (finished or not) are in it.
 * The mcelog's 'next' field is protected by RCU-style mechanisms
 * in the kernel MCA handler (see arch/x86/kernel/cpu/mcheck/mce.c).
 * For obvious reasons it is not genuine RCU, e.g. access to 'next'
 * isn't within an rcu_read_lock()/rcu_read_unlock() pair, just a
 * clever masking use of a lock in an RCU macro definition.
 * There is no RCU moving data around, the mce array does not move,
 * and the 'finished' flag is set after a wmb() on the mce contents,
 * which means this routine will not clash with the MCE handler.
 * Collisions with memset() on reads from /dev/mcelog are prevented
 * by locking of mce_read_mutex.
 */

void
mcc_sync(void)
{
  struct mce_info mc;
  unsigned seen;

  if (mce_disabled)
    return;

#if 0
  /*
   * Can't do this until bootstrap scrubs MC banks on all cards.
   * It has been observed that MCA banks may _not_ be reset on card
   * reboot, which means events picked up by the kernel before loading
   * the RAS module may have occurred in a previous uOS run.
   * Should be OK post early Jan '12 (flash ver 262, HSD 4115351).
   */
  return;
#endif

  /*
   * Lock out kernel log access through /dev/mcelog
   */
  mutex_lock(&mce_read_mutex);

  /*
   * Start over if the log has been cleared
   */
  if (mcc_seen > mcelog.next)
    mcc_seen = 0;

  for(seen = mcc_seen; seen < mcelog.next; seen++) {
    /*
     * Basic checks. Index, CPU & bank must be reasonable.
     */
    if (mcelog.entry[seen].finished) {
      if (mcelog.entry[seen].cpu >= NR_CPUS ||
          mcelog.entry[seen].bank >= 3) {
        printk("mcc_sync: entry %d contains garbage, cpu %d, bank %d\n",
               seen, mcelog.entry[seen].cpu, mcelog.entry[seen].bank);
        continue;
      }

      /*
       * Have a good entry; it can be UC, but it is 'old'.
       */
      mcc_conv(&mcelog.entry[seen], &mc);
      mc.ctl = 0;

#ifdef CONFIG_MK1OM
      /*
       * Log this event in the EEPROM and notify the SMC
       * that we've had a serious machine check error.
       */
      if (mc.flags & MC_FLG_FATAL) {
        in_sync = 1;
        micras_mc_log(&mc);
        in_sync = 0;
        mc.flags |= MC_FLG_LOG;
        micras_mc_ipmi(&mc, 0);
      }
#endif

      /*
       * Notify the host about this too
       */
      if (! micras_mc_send(&mc, 0))
        break;
    }
  }
  mcc_seen = mcelog.next;

  /*
   * Done, release lock
   */
  mutex_unlock(&mce_read_mutex);
}


/*
 * Set up exception handlers by hooking into the
 * kernel's native MCA handler.
 */

int __init
mcc_init(void)
{
  if (mce_disabled) {
    printk("RAS.core: disabled\n");
  }
  else {
    mca_poll = mcc_poll;
    mca_exc_flt = mcc_exc_flt;
    mca_exc_entry = mcc_exc_entry;
    mca_exc_log = mcc_exc_log;
    mca_exc_panic = mcc_exc_panic;
    mca_exc_exit = mcc_exc_exit;
    mca_print = 0;    /* For debug: ee_printk; */
    printk("RAS.core: init complete\n");
  }

  return 0;
}


/*
 * Cleanup for module unload.
 * Clear/restore hooks in the native MCA handler.
 */

int __exit
mcc_exit(void)
{
  mca_poll = 0;
  mca_exc_flt = 0;
  mca_exc_entry = 0;
  mca_exc_log = 0;
  mca_exc_panic = 0;
  mca_exc_exit = 0;
  mca_print = 0;

  /*
   * Links from the kernel's MCE handler are cut;
   * wait for everybody in the handler to leave.
   */
  while(atomic_read(&mce_entry))
    cpu_relax();

  printk("RAS.core: exit complete\n");
  return 0;
}