Commit | Line | Data |
---|---|---|
800f879a AT |
1 | /* |
2 | * Copyright 2010-2017 Intel Corporation. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the GNU General Public License, version 2, | |
6 | * as published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, | |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
11 | * General Public License for more details. | |
12 | * | |
13 | * Disclaimer: The codes contained in these modules may be specific to | |
14 | * the Intel Software Development Platform codenamed Knights Ferry, | |
15 | * and the Intel product codenamed Knights Corner, and are not backward | |
16 | * compatible with other Intel products. Additionally, Intel will NOT | |
17 | * support the codes or instruction set in future products. | |
18 | * | |
19 | * Intel offers no warranty of any kind regarding the code. This code is | |
20 | * licensed on an "AS IS" basis and Intel is not obligated to provide | |
21 | * any support, assistance, installation, training, or other services | |
22 | * of any kind. Intel is also not obligated to provide any updates, | |
23 | * enhancements or extensions. Intel specifically disclaims any warranty | |
24 | * of merchantability, non-infringement, fitness for any particular | |
25 | * purpose, and any other warranty. | |
26 | * | |
27 | * Further, Intel disclaims all liability of any kind, including but | |
28 | * not limited to liability for infringement of any proprietary rights, | |
29 | * relating to the use of the code, even if Intel is notified of the | |
30 | * possibility of such liability. Except as expressly stated in an Intel | |
31 | * license agreement provided with this code and agreed upon with Intel, | |
32 | * no license, express or implied, by estoppel or otherwise, to any | |
33 | * intellectual property rights is granted herein. | |
34 | */ | |
35 | ||
36 | /* | |
37 | * RAS handler for core MC events | |
38 | * | |
39 | * Contains code to intercept MC events, collect information | |
40 | * from core MCA banks on originating core and possibly on | |
41 | * all active cores if necessary. | |
42 | * | |
43 | * In case of a severe event, defined by corrupted context, | |
44 | * the handler will add a record of the event in the designated | |
45 | * EEPROM hanging off the Over-Clocking I2C bus. Next a message | |
46 | * will be sent to the SMC (enabling IPMI notifications) and at | |
47 | * last a message is sent to host via the MC SCIF connection | |
48 | * (if MC SCIF session has been established). | |
49 | * | |
50 | * Lesser events will also be sent to the host on a 'FYI' basis, | |
51 | * but no record will be stored in the event log, nor will the | |
52 | * SMC be notified. | |
53 | * | |
54 | * Special cases of high rate correctable errors may also cause | |
55 | * events to be recorded in EEPROM on the assumption that the | |
56 | * root cause will be detectable from maintenance mode. | |
57 | * | |
58 | * The handler cannot expect any support from the OS while in | |
59 | * exception (NMI) context. Therefore, NMI-safe routines have |
60 | * been added to mimic some kernel services, e.g. ee_print(). | |
61 | */ | |
62 | ||
63 | #include <linux/types.h> | |
64 | #include <linux/errno.h> | |
65 | #include <linux/kernel.h> | |
66 | #include <linux/mm.h> | |
67 | #include <linux/mm_types.h> | |
68 | #include <linux/io.h> | |
69 | #include <linux/cpumask.h> | |
70 | #include <asm/mce.h> | |
71 | #include <asm/apic.h> | |
72 | #include "micras.h" | |
73 | ||
74 | ||
75 | /* | |
76 | ** | |
77 | ** Brief design notes: | |
78 | ** There are two ways this code normally will be entered. | |
79 | ** | |
80 | ** 1) From standard interrupt context (bottom-half). | |
81 | ** This is supporting MC events picked up by the | |
82 | ** machine_check_poll(), i.e. events that aren't | |
83 | ** causing state corruption (UC bit not set). | |
84 | ** | |
85 | ** 2) From exception/NMI context. | |
86 | ** This handles errors that _did_ flag processor | |
87 | ** state corruption (UC bit set, or other condition | |
88 | ** causing the kernel exception handler to pick it up). | |
89 | ** | |
90 | ** Both cases can happen simultaneously on different CPU's, | |
91 | ** which require careful considerations about re-entrant code | |
92 | ** behaviour here. Particularly nasty is exception context where | |
93 | ** normal spinlocks won't work (FYI: x86 spinlocks assume interrupt | |
94 | ** disable can protect a critical region, an assumption that is | |
95 | ** false when an exception/NMI occur). | |
96 | ** | |
97 | ** Standard interrupt context entries occur when non-fatal and | |
98 | ** thus non-critical MC events are handled. In most cases just | |
99 | ** results in a regular SCIF send of McInfo structs to the host. | |
100 | ** Note that the call chain origin is a callout from the timer | |
101 | ** thread, not from an interrupt service routine, so to name | |
102 | ** it as standard interrupt context is somewhat misleading. | |
103 | ** | |
104 | ** Exception context messages are usually fatal and must be | |
105 | ** dealt with immediately, because otherwise the generic machine | |
106 | ** handler may panic() the system when exiting exception handler | |
107 | ** (default behavior, may be tweaked by altering 'threshold'). | |
108 | ** | |
109 | ** In order to proceed we can either implement a locking mechanism | |
110 | ** at every API function entry, or we can let every function do it's | |
111 | ** thing independently. The latter is preferred, though it gets | |
112 | ** somewhat complicated because the API between the generic MC | |
113 | ** handling and RAS module is in fact composed of several calls. | |
114 | ** | |
115 | ** If state between API calls needs to be tracked then that can be | |
116 | ** done by means of pre-allocated arrays, similar to the generic | |
117 | ** handling in the Linux kernel. Currently the only state variable | |
118 | ** is the mask of CPUs that has been sent an IPI. | |
119 | ** | |
120 | ** Core MC events can be simulated by using the 'mce-inject' tool, | |
121 | ** consisting of a kernel module and a text mode application program. | |
122 | ** The 'mce-inject' module knows the difference between fatal and | |
123 | ** non-fatal events (defined by the UC bit) and acts differently | |
124 | ** in the two cases. Non-fatal injections cause machine_check_poll() | |
125 | ** to be called on all CPUs, resulting in events being reported to | |
126 | ** function mce_poll(). Fatal injections cause do_machine_check() | |
127 | ** to be called on all CPUs, resulting in calls to the mcc_exc_* | |
128 | ** routines below. Activities triggered by mce-inject are flagged | |
129 | ** as 'fake', and shall _NOT_ be logged in the EEPROM. | |
130 | ** | |
131 | ** Warning: | |
132 | ** Controls in the generic MC handling may cause the kernel to | |
133 | ** panic, _ALSO_ even if no event was found in any MCA banks!! | |
134 | ** Not sure exactly how to capture that sort of event. | |
135 | ** | |
136 | ** Warning: | |
137 | ** The 'mce-inject' module uses different methods of invoking error | |
138 | ** handling routines, depending on the mce record (inject_flags). | |
139 | ** Specifically, the 'mce-inject' module may use of broadcast NMIs | |
140 | ** to invoke machine_check_poll() or do_machine_check() on all CPUs, | |
141 | ** which will make these functions execute in exception context. | |
142 | ** The NMI broadcast mechanism is based on registering a handler on | |
143 | ** the 'die' notifier chain and then doing an | |
144 | ** apic->send_IPI_mask(.., NMI_VECTOR), | |
145 | ** knowing that do_nmi() will invoke this notifier chain when no | |
146 | ** genuine cause of NMI was found (i.e. if inb(61) returns 0xc0, | |
147 | ** [which is SERR + IOCHK on chipset register NSR]). | |
148 | ** Long story short; if 'mce-inject' is used we can not expect that | |
149 | ** polling is done in standard interrupt context, and need to set | |
150 | ** the 'in exception context' flag for SCIF access. | |
151 | ** | |
152 | */ | |
153 | ||
154 | ||
/*
 * Hooks placed in the native machine check handler.
 * See file arch/x86/kernel/cpu/mcheck/mce.c for placement.
 *
 * poll        After entering a non-UC event into mce_log.
 *             This happens in normal thread context, which
 *             means that kernel services are available.
 * exc_flt     Filter on correctable errors. If events occur
 *             at a very high rate they can severely slow
 *             down the system and/or crash it entirely.
 *             Logic here will disable reporting of some
 *             events if they are seen too often.
 * exc_entry   Entering MC exception handler.
 *             Called _after_ reading MCG_STATUS and the early
 *             severity assessment by mce_severity() has been
 *             performed on all banks, such that we get to
 *             know if the native MC handler will panic.
 * exc_log     After entering a UC event into mce_log.
 *             The logged mce record has all available
 *             details on the event, and this point is the
 *             best place to perform our RAS activities.
 * exc_panic   Right before the MC exception handler calls
 *             the panic function.
 * exc_exit    Exit the MC exception handler.
 * print       Exception context safe printf to POST-card UART.
 */

extern void (*mca_poll)(struct mce *, uint64_t, int);
extern void (*mca_exc_flt)(struct mce *, uint64_t, int);
extern void (*mca_exc_entry)(struct mce *, int, int, int, char *);
extern void (*mca_exc_log)(struct mce *, uint64_t, int, int, char *, int, int);
extern void (*mca_exc_panic)(struct mce *, char *, char *, int);
extern void (*mca_exc_exit)(struct mce *, int, int, int, int);
extern int (*mca_print)(char *, ...);

extern struct mce_log mcelog;           /* Export from kernel */
extern struct mutex mce_read_mutex;     /* Export from kernel */
static unsigned mcc_seen;               /* Last event in kernel log */
int in_sync;                            /* Flag when sync'ing */
194 | ||
195 | ||
196 | /* | |
197 | * Convert a kernel mce record into a MC API format | |
198 | */ | |
199 | ||
200 | static void | |
201 | mcc_conv(struct mce * mce, struct mce_info * mc) | |
202 | { | |
203 | mc->org = mce->bank; | |
204 | mc->id = mce->extcpu; | |
205 | #ifdef CONFIG_MK1OM | |
206 | mc->pid = xlat_cpu[cpu_data(mc->id).apicid]; | |
207 | #endif | |
208 | mc->stamp = mce->time; | |
209 | mc->status = mce->status; | |
210 | mc->addr = mce->addr; | |
211 | mc->misc = mce->misc; | |
212 | mc->flags = (mc->status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0; | |
213 | } | |
214 | ||
215 | ||
/*
 * Filter for correctable errors, may modify CTL value.
 * The filter is pretty crude, we just want to protect
 * ourselves from being run over by fast recurring events.
 * We keep tabs of events seen in a static array.
 *
 * Algorithm is like this:
 *  - test if event is in filter list; if not exit filter.
 *  - search for instance of this event in history.
 *  - if not found, insert event in history (strike 1).
 *  - if found but time since last seen exceeds window,
 *    then treat event as new in history (new strike 1).
 *  - if found and within time window, bump strike counter.
 *  - if strike counter reach maximum, we're fed up and
 *    turn this event off by clearing the associated
 *    bit in the offending MCA bank's CTL register and
 *    send a 'filter' event notification to the host.
 *
 * Advantages of this design is:
 *  - individual parameters for every filtered event.
 *  - only one event history array.
 *  - no periodic aging of events in history array.
 *  - no averaging over time required.
 *  - no moving/reordering of event history entries.
 *  - new events do not replace older seen event
 *  - filter reacts immediately when max reached.
 *
 * Disadvantages are:
 *  - linear search through filter array.
 *  - linear search through history array.
 *  - time parameter not obvious, it's really a limit
 *    on how old events in history are allowed to be.
 *  - in pathological cases the filter's reaction time
 *    will be max * window (when events trickle in at
 *    a rate just below the window size).
 *  - data in ADDR and MISC registers are not used to
 *    match current event with history. Should they be?
 *
 * For now, both lists are short enough that introducing
 * more advanced searches probably are not going to help.
 *
 * On KnC the flash may have overrides of the mc_turnoff table
 * (see mcc_flt_parm() below).
 */

#define FT ((17 * 60) + 30) * 60 /* Default time window: 17.5 hours */

/* History of filter-candidate events seen so far (LRU by 'last' TSC) */
static struct mc_hist {
  uint32_t count;       /* How many times seen */
  uint64_t last;        /* TSC last time seen */
  struct mce_info mc;   /* Local MC event record */
} mc_history[32];

/* Descriptors of events eligible for filtering; defaults below
 * may be overridden from flash on KnC via mcc_flt_parm(). */
static struct mc_disc {
  uint8_t bank, ctl;    /* Bank selector and control bit # */
  uint16_t win;         /* Time window (seconds) */
  uint16_t max;         /* Max count */
  uint16_t mca_code;    /* MCA code, status[15:0] */
  uint16_t mdl_code;    /* Model code, status[31:16] */
} mc_turnoff[] = {
  { 0, 3, FT, 2, 0x0150, 0x0000 }, /* MC0: J-Cache error */
  { 1, 0, FT, 2, 0x010a, 0x0001 }, /* MC1: L2 Tag error */
  { 1, 4, FT, 2, 0x010a, 0x0010 }, /* MC1: L2 Data error */
  { 2, 2, FT, 2, 0x010d, 0x0100 }, /* MC2: Tag State, ext TD */
  { 2, 2, FT, 2, 0x010d, 0x0101 }, /* MC2: Tag State, int TD */
  { 2, 3, FT, 2, 0x012d, 0x0110 }, /* MC2: Core Valid, ext TD */
  { 2, 3, FT, 2, 0x012d, 0x0111 }, /* MC2: Core Valid, int TD */
  { 3, 2, FT, 2, 0x010d, 0x0100 }, /* DBOX: Tag State error, ext TD */
  { 3, 2, FT, 2, 0x010d, 0x0101 }, /* DBOX: Tag State error, int TD */
  { 3, 3, FT, 2, 0x012d, 0x0110 }, /* DBOX: Core Valid error, ext TD */
  { 3, 3, FT, 2, 0x012d, 0x0111 }, /* DBOX: Core Valid error, int TD */
  { 4, 4, FT, 2, 0x0e0b, 0x0030 }, /* SBOX: PCI-e */
  { 5, 0, FT, 2, 0x0001, 0x0000 }, /* GBOX: Ch-0 retraining */
  { 5, 1, FT, 2, 0x0001, 0x0001 }, /* GBOX: Ch-1 retraining */
  { 5, 2, FT, 2, 0x0001, 0x0002 }, /* GBOX: Ch-0 ECC error */
  { 5, 3, FT, 2, 0x0001, 0x0003 }, /* GBOX: Ch-1 ECC error */
  { 6, 3, FT, 2, 0x010e, 0x0008 }, /* TBOX: T2 CRC error */
};
293 | ||
294 | ||
#ifdef CONFIG_MK1OM

#define MC_FLT_SIG1 0x0e13c20f  /* Start signature */
#define MC_FLT_SIG2 0xf1ec3df0  /* End signature */
#define MC_FLT_SIZE 0x200       /* Filter block length */

/*
 * Parse a filter override block (read from flash) and, if it
 * is well formed, replace the built-in mc_turnoff defaults.
 * Layout of the block at 'p':
 *   [0..3]   start signature
 *   [4..5]   descriptor count
 *   [6..]    'count' struct mc_disc descriptors
 *   [SIZE-4] end signature
 * A block failing signature or count validation is ignored.
 */
void
mcc_flt_parm(uint8_t * p)
{
  uint32_t sig1, sig2;
  uint16_t count;

  /*
   * Both signatures must match before anything is trusted.
   */
  sig1 = *((uint32_t *) p);
  sig2 = *((uint32_t *)(p + MC_FLT_SIZE - 4));
  if (sig1 != MC_FLT_SIG1 || sig2 != MC_FLT_SIG2) {
    printk("mcc_flt_parm: signatures not found, (%08x, %08x)\n", sig1, sig2);
    return;
  }

  /*
   * Descriptor count follows the start signature; it must fit
   * both the local mc_turnoff array and the flash block itself
   * (6 byte header + descriptors + 4 byte end signature).
   */
  count = *(uint16_t *)(p + 4);
  if (count > ARRAY_SIZE(mc_turnoff) ||
      count * sizeof(struct mc_disc) + 10 > MC_FLT_SIZE) {
    printk("mcc_flt_parm: filter count %d not valid\n", count);
    return;
  }

  /*
   * Table is legit; wipe the defaults and copy it in.
   */
  memset(mc_turnoff, 0, sizeof(mc_turnoff));
  memcpy(mc_turnoff, p + 6, count * sizeof(struct mc_disc));
#if MC_VERBOSE
  {
    int i;

    for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++)
      printk("Filter %2d: bank %d, ctl %d, win %d, max %d, mca %04x, mdl %04x\n",
        i, mc_turnoff[i].bank, mc_turnoff[i].ctl, mc_turnoff[i].win,
        mc_turnoff[i].max, mc_turnoff[i].mca_code, mc_turnoff[i].mdl_code);
  }
#endif
}

#endif
346 | ||
347 | ||
348 | /* | |
349 | * Frequency filter for core and un-core MC events | |
350 | */ | |
351 | ||
352 | uint32_t | |
353 | micras_mc_filter(struct mce_info * mc, uint64_t tsc, int exc) | |
354 | { | |
355 | struct mc_disc * dsc; | |
356 | struct mc_hist * hst; | |
357 | uint64_t ostamp; | |
358 | int i, oldest; | |
359 | ||
360 | if (mc->status & MCI_STATUS_UC) | |
361 | return 0; | |
362 | ||
363 | /* | |
364 | * Check if this event may be filtered | |
365 | */ | |
366 | dsc = mc_turnoff; | |
367 | for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) { | |
368 | if (dsc->bank == mc->org && | |
369 | dsc->mca_code == GET_BITS(15, 0, mc->status) && | |
370 | dsc->mdl_code == GET_BITS(31, 16, mc->status)) | |
371 | break; | |
372 | dsc++; | |
373 | } | |
374 | if (i == ARRAY_SIZE(mc_turnoff)) | |
375 | return 0; | |
376 | ||
377 | /* | |
378 | * Have a candidate for filter. | |
379 | * Have we seen this one before? | |
380 | */ | |
381 | oldest = 0; | |
382 | ostamp = tsc; | |
383 | hst = mc_history; | |
384 | for(i = 0; i < ARRAY_SIZE(mc_history); i++) { | |
385 | /* | |
386 | * While scanning, find the oldest event too | |
387 | */ | |
388 | if (hst->last < ostamp) { | |
389 | ostamp = hst->last; | |
390 | oldest = i; | |
391 | } | |
392 | ||
393 | /* | |
394 | * Does this match event in filter history? | |
395 | * TBD: how much needs to match? | |
396 | * For now: cpu (or box), bank, mca_code and model_code. | |
397 | */ | |
398 | if (hst->last && | |
399 | hst->mc.id == mc->id && | |
400 | hst->mc.org == mc->org && | |
401 | GET_BITS(15, 0, hst->mc.status) == GET_BITS(15, 0, mc->status) && | |
402 | GET_BITS(31, 16, hst->mc.status) == GET_BITS(31, 16, mc->status)) | |
403 | break; | |
404 | hst++; | |
405 | } | |
406 | if (i == ARRAY_SIZE(mc_history)) { | |
407 | /* | |
408 | * Not seen this event before. | |
409 | * 'oldest' is where to store this event. | |
410 | */ | |
411 | hst = mc_history + oldest; | |
412 | hst->count = 1; | |
413 | hst->last = tsc; | |
414 | hst->mc = *mc; | |
415 | return 0; | |
416 | } | |
417 | ||
418 | /* | |
419 | * Already 'on file in history', test expiration date | |
420 | */ | |
421 | if (hst->last + dsc->win * (cpu_khz * 1000LL) < tsc) { | |
422 | /* | |
423 | * Matching history element had expired, just overwrite it | |
424 | */ | |
425 | hst->count = 1; | |
426 | hst->last = tsc; | |
427 | hst->mc = *mc; | |
428 | return 0; | |
429 | } | |
430 | ||
431 | /* | |
432 | * Filter element active, bump count and set last seen. | |
433 | * We do _NOT_ want injected events to enter the EEPROM, | |
434 | * so that flag is preserved over all event history | |
435 | */ | |
436 | hst->count++; | |
437 | if (mc->flags & MC_FLG_FALSE) | |
438 | hst->mc.flags |= MC_FLG_FALSE; | |
439 | if (hst->count < dsc->max) { | |
440 | hst->last = tsc; | |
441 | return 0; | |
442 | } | |
443 | ||
444 | /* | |
445 | * Threshold reached, event source needs to be silenced. | |
446 | * Store a record of this in the EEPROM and send a | |
447 | * notification to host about it. Once duly reported, clear | |
448 | * event from the filter; it is not expected to show up again. | |
449 | * Note: we report the _first_ event seen, not the | |
450 | * event at hand. We could save array space | |
451 | * by sending latest event (less info to keep). | |
452 | */ | |
453 | ee_printk("RAS: MCE filter #%d: bank %d, bit %d, limit %d, delta %d (mS)\n", | |
454 | dsc - mc_turnoff, dsc->bank, dsc->ctl, dsc->max, (tsc - hst->last) / cpu_khz); | |
455 | hst->mc.flags |= MC_FLG_FILTER; | |
456 | #ifdef CONFIG_MK1OM | |
457 | if (!(hst->mc.flags & MC_FLG_FALSE)) { | |
458 | micras_mc_log(&hst->mc); | |
459 | hst->mc.flags |= MC_FLG_LOG; | |
460 | } | |
461 | #endif | |
462 | micras_mc_send(&hst->mc, exc); | |
463 | hst->last = 0; | |
464 | ||
465 | /* | |
466 | * MC events are disabled by caller when a | |
467 | * non-zero mask is returned by this routine. | |
468 | */ | |
469 | return (1 << dsc->ctl); | |
470 | } | |
471 | ||
472 | ||
473 | /* | |
474 | * Remove/mask an 'enable-bit' from a core MCA bank. | |
475 | * Note: This applies to _current_ cpu only. It is not explicitly | |
476 | * linked to the cpu that was ID'd in the incoming mce struct. | |
477 | * Happens to be OK for mcc_exc_flt() and mcc_poll() and mcc_exc_log(). | |
478 | */ | |
479 | ||
480 | static void | |
481 | mcc_ctl_mask(int bank, uint32_t msk) | |
482 | { | |
483 | uint32_t ctl_lo, ctl_hi; | |
484 | ||
485 | rdmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi); | |
486 | ctl_lo &= ~msk; | |
487 | wrmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi); | |
488 | ||
489 | #if MC_VERBOSE | |
490 | ee_printk("RAS: ctl mask CPU %d, MC%d_CTL -> %x\n", smp_processor_id(), bank, ctl_lo); | |
491 | #endif | |
492 | } | |
493 | ||
494 | ||
495 | /* | |
496 | * Filtering of correctable core MC events | |
497 | * Called from the exception handler. | |
498 | */ | |
499 | ||
500 | static void | |
501 | mcc_exc_flt(struct mce * mce, uint64_t ctl, int fake) | |
502 | { | |
503 | struct mce_info mc; | |
504 | uint32_t msk; | |
505 | ||
506 | if (!mce) | |
507 | return; | |
508 | ||
509 | if (mce->status & MCI_STATUS_UC) | |
510 | return; | |
511 | ||
512 | mcc_conv(mce, &mc); | |
513 | mc.ctl = ctl; | |
514 | mc.flags = fake ? MC_FLG_FALSE : 0; | |
515 | msk = micras_mc_filter(&mc, mce->tsc, 1); | |
516 | if (msk) | |
517 | mcc_ctl_mask(mce->bank, msk); | |
518 | } | |
519 | ||
520 | ||
521 | /* | |
522 | * Only action required for polled MC events is to | |
523 | * pass the event on to the SCIF channel (if connected). | |
524 | * The event should already have caused an excption (the | |
525 | * exception handler choses to ignore corrected errors) | |
526 | * which means it already has been filtered. | |
527 | * Injected corrected events do not cause MCE exceptions | |
528 | * and thus escaped filtering, so we'll filter them here. | |
529 | */ | |
530 | ||
531 | static void | |
532 | mcc_poll(struct mce * mce, uint64_t ctl, int fake) | |
533 | { | |
534 | struct mce_info mc; | |
535 | ||
536 | #if MC_VERBOSE | |
537 | ee_printk("RAS: poll %d, fake %d, status %llx\n", mce->extcpu, fake, mce->status); | |
538 | #endif | |
539 | ||
540 | mcc_conv(mce, &mc); | |
541 | mc.ctl = ctl; | |
542 | mc.flags = fake ? MC_FLG_FALSE : 0; | |
543 | ||
544 | #if BEAM_TEST | |
545 | /* | |
546 | * Under beam test we only want to send the SCIF message | |
547 | */ | |
548 | micras_mc_send(&mc, fake); | |
549 | return; | |
550 | #endif | |
551 | ||
552 | if (micras_mc_send(&mc, fake)) | |
553 | mcc_seen = mcelog.next; | |
554 | ||
555 | /* | |
556 | * According to MCA HAS the MCI_STATUS_VAL will only | |
557 | * be set when an event's enable bit is set, in which | |
558 | * case it is difficult to imagine how events without | |
559 | * the MCI_STATUS_EN can appear here. The second clause | |
560 | * of the test may never actually happen on Kn{F,C}. | |
561 | * Note: MC polling does not capture TSCs | |
562 | */ | |
563 | if (fake || !(mc.status & MCI_STATUS_EN)) { | |
564 | uint32_t msk; | |
565 | ||
566 | msk = micras_mc_filter(&mc, rdtsc(), fake); | |
567 | if (msk) | |
568 | mcc_ctl_mask(mce->bank, msk); | |
569 | } | |
570 | } | |
571 | ||
572 | ||
/*
 * One CPU entered do_machine_check().
 * We get the initial mce record (which has cpu ID), early
 * control variables and whether the event is injected.
 *
 * Since KnF and KnC deviate from the standard IA by not
 * having the core MCAs broadcast to all CPU's we'll try
 * to fake standard behavior in order to keep the generic
 * machine check code intact.
 * Therefore, if event is real (fake flag unset) and this
 * CPU is the first seeing it (mcc_exc_mask is empty),
 * then send IPI to all other CPU's listed in the online
 * cpumask for vector #18. Later CPUs will see themselves
 * marked in mcc_exc_mask and return quickly.
 *
 * Locking note: standard spinlocks are unsafe in exception/NMI
 * context, so a hand-rolled atomic_xchg() spin lock guards
 * mcc_exc_mask; the explicit barriers order mask reads/writes
 * across CPUs relative to the lock operations.
 */

struct cpumask mcc_exc_mask;                 /* CPU's in mce ctx */
static atomic_t ipi_lock = ATOMIC_INIT(0);   /* Lock on exc mask */

static void
mcc_exc_entry(struct mce * mce, int fake, int no_way_out, int entry, char * msg)
{
  unsigned int cpu;

  /*
   *TBD: should we use 'extcpu' from the MCE record instead?
   */
  cpu = smp_processor_id();

  /*
   * Injected events invokes all CPUs automatically
   * by hooking into the NMI notify_die call_chain.
   * Nothing to do here.
   */
  if (fake)
    return;

#if 1
  /*
   * Avoid the IPI corralling circus on corrected errors,
   * based on assessment entirely done by mce_severity().
   * If the result (no_way_out) is MCE_NO_SEVERITY (=0), then
   * at worst we may have a correctable error, and that does
   * not warrant the system lockdown managed by mce_start()
   * and mce_end().
   * Note that MICs do not support newer status bits (MCG_SER_P)
   * which causes variable mce_ser always to be zero and thus
   * the test in the inner loop of do_machine_check() will be
   * reduced to just testing for the UC bit.
   */
  if (! no_way_out)
    return;
#endif

  /*
   * Test for entry from MT thread IPIs (testing)
   * or a 'soft' exception from a IPI issued from
   * the handler of the first exception.
   * No further action needed in both cases.
   */
  if (cpumask_test_cpu(cpu, &mcc_exc_mask))
    return;

  /*
   * Create mcc_exc_mask to flag which CPU's are
   * to be included in the IPI. This mask is later
   * used to determine who needs to EOI the local
   * APIC after MC event handling.
   */
  while(atomic_xchg(&ipi_lock, 1))
    cpu_relax();
  smp_rmb();
  if (cpumask_test_cpu(cpu, &mcc_exc_mask)) {
    /*
     * Another CPU got here first
     */
    atomic_xchg(&ipi_lock, 0);
    return;
  }
  cpumask_copy(&mcc_exc_mask, cpu_online_mask);
  cpumask_clear_cpu(cpu, &mcc_exc_mask);
  smp_wmb();
  atomic_xchg(&ipi_lock, 0);

  /*
   * Simulate a broadcast by sending IPI to all
   * other CPUs.
   */
  // apic->send_IPI_mask(&mcc_exc_mask, MCE_VECTOR);
  apic->send_IPI_allbutself(MCE_VECTOR);
}
664 | ||
665 | ||
666 | /* | |
667 | * In do_machine_check() bank scan loop. | |
668 | * Called from a lockdown, no synchronization needed. | |
669 | * MC bank scan is complete and the mce event has been | |
670 | * entered into the kernel MC log | |
671 | * | |
672 | *TBD: revise logic on HALT on UC events? | |
673 | * From a state corruption point of view this | |
674 | * _is_ a fatal error because UC bit was set. | |
675 | * However, if the tolerance setting is set | |
676 | * high enough, the generic MC handler may | |
677 | * not chose to panic on this event. | |
678 | * We currently do not have the tolerance value | |
679 | * when recording this event, nor do we have | |
680 | * other factors that mce_reign() use to determine | |
681 | * what to do after reporting event to the host. | |
682 | */ | |
683 | ||
684 | static void | |
685 | mcc_exc_log(struct mce * mce, uint64_t ctl, int fake, | |
686 | int no_way_out, char * msg, int severity, int worst) | |
687 | { | |
688 | struct mce_info mc; | |
689 | uint32_t msk; | |
690 | ||
691 | #if MC_VERBOSE | |
692 | ee_printk("RAS: log %d, wall %lld, nwo %d (%s), sev %d, wst %d\n", | |
693 | mce->extcpu, mce->time, no_way_out, msg, severity, worst); | |
694 | #endif | |
695 | ||
696 | /* | |
697 | * Create a message for the host. | |
698 | */ | |
699 | mcc_conv(mce, &mc); | |
700 | mc.ctl = ctl; | |
701 | mc.flags |= fake ? MC_FLG_FALSE : 0; | |
702 | ||
703 | #if BEAM_TEST | |
704 | /* | |
705 | * Under beam test we only want to send the SCIF message | |
706 | * This is guaranteed not to be called re-entrantly. | |
707 | */ | |
708 | micras_mc_send(&mc, 1); | |
709 | return; | |
710 | #endif | |
711 | ||
712 | #ifdef CONFIG_MK1OM | |
713 | /* | |
714 | * If this is a true event then log it in the EEPROM and | |
715 | * notify SMC that we've had a serious machine check error. | |
716 | */ | |
717 | if ((mc.flags & (MC_FLG_FALSE | MC_FLG_FATAL)) == MC_FLG_FATAL) { | |
718 | micras_mc_log(&mc); | |
719 | mc.flags |= MC_FLG_LOG; | |
720 | ||
721 | /* | |
722 | *TBD: Should this be deferred until the actual panic? | |
723 | * The user can raise tolerance such that we in | |
724 | * fact continue operating; in which case the SMC | |
725 | * notification would be (somewhat) misleading. | |
726 | */ | |
727 | micras_mc_ipmi(&mc, 1); | |
728 | } | |
729 | #endif | |
730 | ||
731 | /* | |
732 | * Always notify host and sync to kernel log | |
733 | */ | |
734 | if (micras_mc_send(&mc, 1)) | |
735 | mcc_seen = mcelog.next; | |
736 | ||
737 | #if RAS_HALT | |
738 | if ((mc.flags & MC_FLG_FATAL) && !fake) | |
739 | panic("FATAL core machine check event:\n" | |
740 | "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n", | |
741 | mc.org, mc.id, mc.ctl, mc.status, mc.addr, mc.misc); | |
742 | #endif | |
743 | ||
744 | /* | |
745 | * Correctable events can in fact reach us here if | |
746 | * mce_no_way_out() tags them as critical (for other | |
747 | * reasons than the UC flag, e.g. MCIP missing). | |
748 | * If the tolerance setting is high enough to prevent | |
749 | * such events to panic, we'd still want filtering. | |
750 | */ | |
751 | msk = micras_mc_filter(&mc, mce->tsc, 1); | |
752 | if (msk) | |
753 | mcc_ctl_mask(mce->bank, msk); | |
754 | } | |
755 | ||
756 | ||
757 | /* | |
758 | * In mce_panic(). | |
759 | * Current event is about to make the kernel panic. | |
760 | * Sources of this call are | |
761 | * do_machine_check(), when no_way_out set | |
762 | * mce_timed_out(), CPU rendez-vous failed | |
763 | * mce_reign(), when severety high, a CPU hung, or no events | |
764 | */ | |
765 | ||
766 | static void | |
767 | mcc_exc_panic(struct mce * mce, char * msg, char * exp, int fake) | |
768 | { | |
769 | /* | |
770 | * Should host be notified in this case? | |
771 | * And if so, how should be presented, we might not | |
772 | * even have a mce record to show when this happens! | |
773 | * If an mce is passed, it has already been seen and | |
774 | * reported to the host by a call to mcc_exc_log(). | |
775 | * If mce is NULL, then this _is_ an MC relatedi panic, | |
776 | * but we have no data fitting for a host notification. | |
777 | * Create a pseudo event and ship that? | |
778 | */ | |
779 | ee_printk("RAS: panic %d, wall %lld, msg %s, exp %s, fake %d\n", | |
780 | mce->extcpu, mce->time, msg, exp, fake); | |
781 | } | |
782 | ||
783 | ||
784 | /* | |
785 | * A CPU is leaving do_machine_check(). | |
786 | * We get this after the monarch has 'reigned' and | |
787 | * the response to the event has been completed. | |
788 | */ | |
789 | ||
/*
 * Note: all parameters are part of the MCA hook signature; none are
 * used here — this hook only handles the per-CPU interrupt ACK.
 */
static void
mcc_exc_exit(struct mce * mce, int no_way_out, int worst, int entry, int order)
{
	unsigned int cpu;
	int eoi;		/* non-zero if this CPU still owes an APIC EOI */

	cpu = smp_processor_id();

	/*
	 * Assuming test_and_clear_bit() is atomic.
	 */
	smp_rmb();
	eoi = cpumask_test_and_clear_cpu(cpu, &mcc_exc_mask);
	smp_wmb();
	/*
	 * Only ACK if this CPU's bit was still set in mcc_exc_mask,
	 * so the APIC is acknowledged at most once per flagged CPU.
	 */
	if (eoi)
		ack_APIC_irq();
}
807 | ||
808 | ||
809 | /* | |
810 | * Routine to scan the kernel's MC log. | |
811 | * Called when SCIF MC session has been created, to bring the host | |
812 | * side up to date with prior unreported MC events, such as events | |
813 | * occurring when MC session was not active (no peer was listening | |
814 | * on the host) and events occurring before RAS module was loaded. | |
815 | * | |
816 | * Notes: | |
817 | * - This is always called in thread context. | |
818 | * - There are no injection flags in the kernel | |
819 | * MC log, i.e. no guarantee events are genuine. | |
820 | * - The MC kernel log has been exported explicitly for this. | |
821 | * | |
822 | * On synchronization (or the lack thereof): | |
823 | * Effectively the mcelog holds a static array of mce's where the | |
824 | * 'finished' flag says whether mce content is valid or not. The | |
825 | * 'next' field is the index of the first element in the array that | |
826 | * has not been assigned for an MC event. It is incremented when a | |
827 | * new event is entered, and reset to zero on reads to /dev/mcelog. | |
828 | * The kernel's event log does not wrap, so it is safe to use it as | |
829 | * an indicator of how many events (finished or not) are in it. | |
830 | * The mcelog's next field is protected by RCU style mechanisms | |
831 | * in the kernel MCA handler (see arch/x86/kernel/cpu/mcheck/mce.c). | |
832 | * For obvious reasons it is not genuine RCU, e.g. access to 'next' | |
833 | * isn't within rcu_read_lock()/rcu_read_unlock() pair, just a clever | |
834 | * masking use of a lock in an RCU macro definition. | |
835 | * There is no RCU moving data around, the mce array does not move, | |
836 | * and the 'finished' flag is set after a wmb() on the mce contents | |
837 | * which means this routine will not clash with the MCE handler. | |
838 | * Collisions with memset() on reads from /dev/mcelog are prevented | |
839 | * by locking of mce_read_mutex. | |
840 | */ | |
841 | ||
842 | void | |
843 | mcc_sync(void) | |
844 | { | |
845 | struct mce_info mc; | |
846 | unsigned seen; | |
847 | ||
848 | if (mce_disabled) | |
849 | return; | |
850 | ||
851 | #if 0 | |
852 | /* | |
853 | * Can't do this until bootstrap scrubs MC banks on all cards. | |
854 | * It has been observed that MCA banks may _not_ be reset on card | |
855 | * reboot which means events picked up by the kernel before loading | |
856 | * the RAS module may have occured in a previous uOS run. | |
857 | * Should be OK post early Jan '12 (flash ver 262, HSD 4115351). | |
858 | */ | |
859 | return; | |
860 | #endif | |
861 | ||
862 | /* | |
863 | * Lock out kernel log access through /dev/mcelog | |
864 | */ | |
865 | mutex_lock(&mce_read_mutex); | |
866 | ||
867 | /* | |
868 | * Start over if the log has been cleared cleared | |
869 | */ | |
870 | if (mcc_seen > mcelog.next) | |
871 | mcc_seen = 0; | |
872 | ||
873 | for(seen = mcc_seen; seen < mcelog.next; seen++) { | |
874 | /* | |
875 | * Basic checks. Index, CPU & bank must be reasonable. | |
876 | */ | |
877 | if (mcelog.entry[seen].finished) { | |
878 | if (mcelog.entry[seen].cpu >= NR_CPUS || | |
879 | mcelog.entry[seen].bank >= 3) { | |
880 | printk("mcc_sync: entry %d contains garbage, cpu %d, bank %d\n", | |
881 | seen, mcelog.entry[seen].cpu, mcelog.entry[seen].bank); | |
882 | continue; | |
883 | } | |
884 | ||
885 | /* | |
886 | * Have good entry, can be UC, but it is 'old'. | |
887 | */ | |
888 | mcc_conv(&mcelog.entry[seen], &mc); | |
889 | mc.ctl = 0; | |
890 | ||
891 | #ifdef CONFIG_MK1OM | |
892 | /* | |
893 | * Log this event in the eeprom and notify | |
894 | * that we've had a serious machine check error. | |
895 | */ | |
896 | if (mc.flags & MC_FLG_FATAL) { | |
897 | in_sync = 1; | |
898 | micras_mc_log(&mc); | |
899 | in_sync = 0; | |
900 | mc.flags |= MC_FLG_LOG; | |
901 | micras_mc_ipmi(&mc, 0); | |
902 | } | |
903 | #endif | |
904 | ||
905 | /* | |
906 | * Notify host about this too | |
907 | */ | |
908 | if (! micras_mc_send(&mc, 0)) | |
909 | break; | |
910 | } | |
911 | } | |
912 | mcc_seen = mcelog.next; | |
913 | ||
914 | /* | |
915 | * Done, release lock | |
916 | */ | |
917 | mutex_unlock(&mce_read_mutex); | |
918 | } | |
919 | ||
920 | ||
921 | /* | |
 * Setup exception handlers by hooking into the
923 | * kernel's native MCA handler. | |
924 | */ | |
925 | ||
926 | int __init | |
927 | mcc_init(void) | |
928 | { | |
929 | if (mce_disabled) { | |
930 | printk("RAS.core: disabled\n"); | |
931 | } | |
932 | else { | |
933 | mca_poll = mcc_poll; | |
934 | mca_exc_flt = mcc_exc_flt; | |
935 | mca_exc_entry = mcc_exc_entry; | |
936 | mca_exc_log = mcc_exc_log; | |
937 | mca_exc_panic = mcc_exc_panic; | |
938 | mca_exc_exit = mcc_exc_exit; | |
939 | mca_print = 0; /* For debug: ee_printk; */ | |
940 | printk("RAS.core: init complete\n"); | |
941 | } | |
942 | ||
943 | return 0; | |
944 | } | |
945 | ||
946 | ||
947 | /* | |
948 | * Cleanup for module unload. | |
949 | * Clear/restore hooks in the native MCA handler. | |
950 | */ | |
951 | ||
int __exit
mcc_exit(void)
{
	/*
	 * Clear all hooks first so the kernel's MCE handler stops
	 * calling into this module before we start waiting below.
	 */
	mca_poll = 0;
	mca_exc_flt = 0;
	mca_exc_entry = 0;
	mca_exc_log = 0;
	mca_exc_panic = 0;
	mca_exc_exit = 0;
	mca_print = 0;

	/*
	 * Links from kernel's MCE handler cut,
	 * wait for everybody in handler to leave.
	 * Spin until mce_entry drops to zero, i.e. no CPU is still
	 * executing inside the MCE handler (and thus possibly inside
	 * module code) — only then is it safe to unload.
	 */
	while(atomic_read(&mce_entry))
		cpu_relax();

	printk("RAS.core: exit complete\n");
	return 0;
}
973 |