| 1 | /* |
| 2 | * Copyright 2010-2017 Intel Corporation. |
| 3 | * |
| 4 | * This program is free software; you can redistribute it and/or modify |
| 5 | * it under the terms of the GNU General Public License, version 2, |
| 6 | * as published by the Free Software Foundation. |
| 7 | * |
| 8 | * This program is distributed in the hope that it will be useful, |
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 11 | * General Public License for more details. |
| 12 | * |
| 13 | * Disclaimer: The codes contained in these modules may be specific to |
| 14 | * the Intel Software Development Platform codenamed Knights Ferry, |
| 15 | * and the Intel product codenamed Knights Corner, and are not backward |
| 16 | * compatible with other Intel products. Additionally, Intel will NOT |
| 17 | * support the codes or instruction set in future products. |
| 18 | * |
| 19 | * Intel offers no warranty of any kind regarding the code. This code is |
| 20 | * licensed on an "AS IS" basis and Intel is not obligated to provide |
| 21 | * any support, assistance, installation, training, or other services |
| 22 | * of any kind. Intel is also not obligated to provide any updates, |
| 23 | * enhancements or extensions. Intel specifically disclaims any warranty |
| 24 | * of merchantability, non-infringement, fitness for any particular |
| 25 | * purpose, and any other warranty. |
| 26 | * |
| 27 | * Further, Intel disclaims all liability of any kind, including but |
| 28 | * not limited to liability for infringement of any proprietary rights, |
| 29 | * relating to the use of the code, even if Intel is notified of the |
| 30 | * possibility of such liability. Except as expressly stated in an Intel |
| 31 | * license agreement provided with this code and agreed upon with Intel, |
| 32 | * no license, express or implied, by estoppel or otherwise, to any |
| 33 | * intellectual property rights is granted herein. |
| 34 | */ |
| 35 | |
| 36 | /* |
| 37 | * RAS handler for core MC events |
| 38 | * |
| 39 | * Contains code to intercept MC events, collect information |
| 40 | * from core MCA banks on originating core and possibly on |
| 41 | * all active cores if necessary. |
| 42 | * |
| 43 | * In case of a severe event, defined by corrupted context, |
| 44 | * the handler will add a record of the event in the designated |
| 45 | * EEPROM hanging off the Over-Clocking I2C bus. Next a message |
| 46 | * will be sent to the SMC (enabling IPMI notifications) and at |
| 47 | * last a message is sent to host via the MC SCIF connection |
| 48 | * (if MC SCIF session has been established). |
| 49 | * |
| 50 | * Lesser events will also be sent to the host on a 'FYI' basis, |
| 51 | * but no record will be stored in the event log, nor will the |
| 52 | * SMC be notified. |
| 53 | * |
| 54 | * Special cases of high rate correctable errors may also cause |
| 55 | * events to be recorded in EEPROM on the assumption that the |
| 56 | * root cause will be detectable from maintenance mode. |
| 57 | * |
| 58 | * The handler cannot expect any support from the OS while in |
| 59 | * exception (NMI) context. Therefore, NMI-safe routines has |
| 60 | * been added to mimic some kernel services, e.g. ee_print(). |
| 61 | */ |
| 62 | |
| 63 | #include <linux/types.h> |
| 64 | #include <linux/errno.h> |
| 65 | #include <linux/kernel.h> |
| 66 | #include <linux/mm.h> |
| 67 | #include <linux/mm_types.h> |
| 68 | #include <linux/io.h> |
| 69 | #include <linux/cpumask.h> |
| 70 | #include <asm/mce.h> |
| 71 | #include <asm/apic.h> |
| 72 | #include "micras.h" |
| 73 | |
| 74 | |
| 75 | /* |
| 76 | ** |
| 77 | ** Brief design notes: |
| 78 | ** There are two ways this code normally will be entered. |
| 79 | ** |
| 80 | ** 1) From standard interrupt context (bottom-half). |
| 81 | ** This is supporting MC events picked up by the |
| 82 | ** machine_check_poll(), i.e. events that aren't |
**        causing state corruption (UC bit not set).
| 84 | ** |
| 85 | ** 2) From exception/NMI context. |
| 86 | ** This handles errors that _did_ flag processor |
| 87 | ** state corruption (UC bit set, or other condition |
| 88 | ** causing the kernel exception handler to pick it up). |
| 89 | ** |
| 90 | ** Both cases can happen simultaneously on different CPU's, |
| 91 | ** which require careful considerations about re-entrant code |
| 92 | ** behaviour here. Particularly nasty is exception context where |
| 93 | ** normal spinlocks won't work (FYI: x86 spinlocks assume interrupt |
| 94 | ** disable can protect a critical region, an assumption that is |
| 95 | ** false when an exception/NMI occur). |
| 96 | ** |
| 97 | ** Standard interrupt context entries occur when non-fatal and |
| 98 | ** thus non-critical MC events are handled. In most cases just |
| 99 | ** results in a regular SCIF send of McInfo structs to the host. |
| 100 | ** Note that the call chain origin is a callout from the timer |
| 101 | ** thread, not from an interrupt service routine, so to name |
| 102 | ** it as standard interrupt context is somewhat misleading. |
| 103 | ** |
** Exception context messages are usually fatal and must be
| 105 | ** dealt with immediately, because otherwise the generic machine |
| 106 | ** handler may panic() the system when exiting exception handler |
| 107 | ** (default behavior, may be tweaked by altering 'threshold'). |
| 108 | ** |
| 109 | ** In order to proceed we can either implement a locking mechanism |
| 110 | ** at every API function entry, or we can let every function do it's |
| 111 | ** thing independently. The latter is preferred, though it gets |
| 112 | ** somewhat complicated because the API between the generic MC |
| 113 | ** handling and RAS module is in fact composed of several calls. |
| 114 | ** |
| 115 | ** If state between API calls needs to be tracked then that can be |
| 116 | ** done by means of pre-allocated arrays, similar to the generic |
| 117 | ** handling in the Linux kernel. Currently the only state variable |
| 118 | ** is the mask of CPUs that has been sent an IPI. |
| 119 | ** |
| 120 | ** Core MC events can be simulated by using the 'mce-inject' tool, |
| 121 | ** consisting of a kernel module and a text mode application program. |
| 122 | ** The 'mce-inject' module knows the difference between fatal and |
| 123 | ** non-fatal events (defined by the UC bit) and acts differently |
| 124 | ** in the two cases. Non-fatal injections cause machine_check_poll() |
| 125 | ** to be called on all CPUs, resulting in events being reported to |
| 126 | ** function mce_poll(). Fatal injections cause do_machine_check() |
| 127 | ** to be called on all CPUs, resulting in calls to the mcc_exc_* |
| 128 | ** routines below. Activities triggered by mce-inject are flagged |
| 129 | ** as 'fake', and shall _NOT_ be logged in the EEPROM. |
| 130 | ** |
| 131 | ** Warning: |
| 132 | ** Controls in the generic MC handling may cause the kernel to |
| 133 | ** panic, _ALSO_ even if no event was found in any MCA banks!! |
| 134 | ** Not sure exactly how to capture that sort of event. |
| 135 | ** |
| 136 | ** Warning: |
| 137 | ** The 'mce-inject' module uses different methods of invoking error |
| 138 | ** handling routines, depending on the mce record (inject_flags). |
| 139 | ** Specifically, the 'mce-inject' module may use of broadcast NMIs |
| 140 | ** to invoke machine_check_poll() or do_machine_check() on all CPUs, |
| 141 | ** which will make these functions execute in exception context. |
| 142 | ** The NMI broadcast mechanism is based on registering a handler on |
| 143 | ** the 'die' notifier chain and then doing an |
| 144 | ** apic->send_IPI_mask(.., NMI_VECTOR), |
| 145 | ** knowing that do_nmi() will invoke this notifier chain when no |
| 146 | ** genuine cause of NMI was found (i.e. if inb(61) returns 0xc0, |
| 147 | ** [which is SERR + IOCHK on chipset register NSR]). |
| 148 | ** Long story short; if 'mce-inject' is used we can not expect that |
| 149 | ** polling is done in standard interrupt context, and need to set |
| 150 | ** the 'in exception context' flag for SCIF access. |
| 151 | ** |
| 152 | */ |
| 153 | |
| 154 | |
| 155 | /* |
| 156 | * Hooks placed in the native machine check handler |
| 157 | * See file arch/x86/kernel/cpu/mcheck/mce.c for placement. |
| 158 | * |
| 159 | * poll After entering a non-UC event into mce_log. |
| 160 | * This happens in normal thread context, which |
 *              means that kernel services are available.
| 162 | * exc_flt Filter on correctable errors. If events occur |
| 163 | * at a very high rate they can severely slow |
| 164 | * down the system and/or crash it entirely. |
| 165 | * Logic here will disable reporting of some |
| 166 | * events if they are seen too often. |
| 167 | * exc_entry Entering MC exception handler. |
| 168 | * Called _after_ reading MCG_STATUS and the early |
 *              severity assessment by mce_severity() has been
| 170 | * performed on all banks, such that we get to |
| 171 | * know if the native MC handler will panic. |
| 172 | * exc_log After entering a UC event into mce_log. |
| 173 | * The logged mce record has all available |
| 174 | * details on the event, and this point is the |
| 175 | * best place to perform our RAS activities. |
| 176 | * exc_panic Right before the MC exception handler calls |
| 177 | * the panic function. |
| 178 | * exc_exit Exit the MC exception handler |
| 179 | * print Exception context safe printf to POST-card UART |
| 180 | */ |
| 181 | |
| 182 | extern void (*mca_poll)(struct mce *, uint64_t, int); |
| 183 | extern void (*mca_exc_flt)(struct mce *, uint64_t, int); |
| 184 | extern void (*mca_exc_entry)(struct mce *, int, int, int, char *); |
| 185 | extern void (*mca_exc_log)(struct mce *, uint64_t, int, int, char *, int, int); |
| 186 | extern void (*mca_exc_panic)(struct mce *, char *, char *, int); |
| 187 | extern void (*mca_exc_exit)(struct mce *, int, int, int, int); |
| 188 | extern int (*mca_print)(char *, ...); |
| 189 | |
| 190 | extern struct mce_log mcelog; /* Export from kernel */ |
| 191 | extern struct mutex mce_read_mutex; /* Export from kernel */ |
| 192 | static unsigned mcc_seen; /* Last event in kernel log */ |
| 193 | int in_sync; /* Flag when sync'ing */ |
| 194 | |
| 195 | |
| 196 | /* |
| 197 | * Convert a kernel mce record into a MC API format |
| 198 | */ |
| 199 | |
| 200 | static void |
| 201 | mcc_conv(struct mce * mce, struct mce_info * mc) |
| 202 | { |
| 203 | mc->org = mce->bank; |
| 204 | mc->id = mce->extcpu; |
| 205 | #ifdef CONFIG_MK1OM |
| 206 | mc->pid = xlat_cpu[cpu_data(mc->id).apicid]; |
| 207 | #endif |
| 208 | mc->stamp = mce->time; |
| 209 | mc->status = mce->status; |
| 210 | mc->addr = mce->addr; |
| 211 | mc->misc = mce->misc; |
| 212 | mc->flags = (mc->status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0; |
| 213 | } |
| 214 | |
| 215 | |
| 216 | /* |
| 217 | * Filter for correctable errors, may modify CTL value. |
| 218 | * The filter is pretty crude, we just want to protect |
| 219 | * ourselves from being run over by fast recurring events. |
| 220 | * We keep tabs of events seen in a static array. |
| 221 | * |
| 222 | * Algorithm is like this: |
| 223 | * - test if event is in filter list; if not exit filter. |
| 224 | * - search for instance of this event in history. |
| 225 | * - if not found, insert event in history (strike 1). |
| 226 | * - if found but time since last seen exceeds window, |
| 227 | * then treat event as new in history (new strike 1). |
| 228 | * - if found and within time window, bump strike counter. |
| 229 | * - if strike counter reach maximum, we're fed up and |
| 230 | * turn this event off by clearing the associated |
| 231 | * bit in the offending MCA bank's CTL register and |
| 232 | * send a 'filter' event notification to the host. |
| 233 | * |
| 234 | * Advantages of this design is: |
| 235 | * - individual parameters for every filtered event. |
| 236 | * - only one event history array. |
| 237 | * - no periodic aging of events in history array. |
| 238 | * - no averaging over time required. |
| 239 | * - no moving/reordering of event history entries. |
| 240 | * - new events do not replace older seen event |
| 241 | * - filter reacts immediately when max reached. |
| 242 | * |
| 243 | * Disadvantages are: |
| 244 | * - linear search through filter array. |
| 245 | * - linear search through history array. |
| 246 | * - time parameter not obvious, it's really a limit |
| 247 | * on how old events in history are allowed to be. |
| 248 | * - in pathological cases the filter's reaction time |
| 249 | * will be max * window (when events trickle in at |
| 250 | * a rate just below the window size). |
| 251 | * - data in ADDR and MISC registers are not used to |
| 252 | * match current event with history. Should they be? |
| 253 | * |
| 254 | * For now, both lists are short enough that introducing |
| 255 | * more advanced searches probably are not going to help. |
| 256 | * |
| 257 | * On KnC the flash may have overrides of the mc_turnoff table. |
| 258 | */ |
| 259 | |
| 260 | #define FT ((17 * 60) + 30) * 60 /* Default time window: 17.5 hours */ |
| 261 | |
| 262 | static struct mc_hist { |
| 263 | uint32_t count; /* How many times seen */ |
| 264 | uint64_t last; /* TSC last time seen */ |
| 265 | struct mce_info mc; /* Local MC event record */ |
| 266 | } mc_history[32]; |
| 267 | |
| 268 | static struct mc_disc { |
| 269 | uint8_t bank, ctl; /* Bank selector and control bit # */ |
| 270 | uint16_t win; /* Time window (seconds) */ |
| 271 | uint16_t max; /* Max count */ |
| 272 | uint16_t mca_code; /* MCA code, status[15:0] */ |
| 273 | uint16_t mdl_code; /* Model code, status[31:16] */ |
| 274 | } mc_turnoff[] = { |
| 275 | { 0, 3, FT, 2, 0x0150, 0x0000 }, /* MC0: J-Cache error */ |
| 276 | { 1, 0, FT, 2, 0x010a, 0x0001 }, /* MC1: L2 Tag error */ |
| 277 | { 1, 4, FT, 2, 0x010a, 0x0010 }, /* MC1: L2 Data error */ |
| 278 | { 2, 2, FT, 2, 0x010d, 0x0100 }, /* MC2: Tag State, ext TD */ |
| 279 | { 2, 2, FT, 2, 0x010d, 0x0101 }, /* MC2: Tag State, int TD */ |
| 280 | { 2, 3, FT, 2, 0x012d, 0x0110 }, /* MC2: Core Valid, ext TD */ |
| 281 | { 2, 3, FT, 2, 0x012d, 0x0111 }, /* MC2: Core Valid, int TD */ |
| 282 | { 3, 2, FT, 2, 0x010d, 0x0100 }, /* DBOX: Tag State error, ext TD */ |
| 283 | { 3, 2, FT, 2, 0x010d, 0x0101 }, /* DBOX: Tag State error, int TD */ |
| 284 | { 3, 3, FT, 2, 0x012d, 0x0110 }, /* DBOX: Core Valid error, ext TD */ |
| 285 | { 3, 3, FT, 2, 0x012d, 0x0111 }, /* DBOX: Core Valid error, int TD */ |
| 286 | { 4, 4, FT, 2, 0x0e0b, 0x0030 }, /* SBOX: PCI-e */ |
| 287 | { 5, 0, FT, 2, 0x0001, 0x0000 }, /* GBOX: Ch-0 retraining */ |
| 288 | { 5, 1, FT, 2, 0x0001, 0x0001 }, /* GBOX: Ch-1 retraining */ |
| 289 | { 5, 2, FT, 2, 0x0001, 0x0002 }, /* GBOX: Ch-0 ECC error */ |
| 290 | { 5, 3, FT, 2, 0x0001, 0x0003 }, /* GBOX: Ch-1 ECC error */ |
| 291 | { 6, 3, FT, 2, 0x010e, 0x0008 }, /* TBOX: T2 CRC error */ |
| 292 | }; |
| 293 | |
| 294 | |
| 295 | #ifdef CONFIG_MK1OM |
| 296 | |
| 297 | #define MC_FLT_SIG1 0x0e13c20f /* Start signature */ |
| 298 | #define MC_FLT_SIG2 0xf1ec3df0 /* End signature */ |
| 299 | #define MC_FLT_SIZE 0x200 /* Filter block length */ |
| 300 | |
| 301 | void |
| 302 | mcc_flt_parm(uint8_t * p) |
| 303 | { |
| 304 | uint16_t fnum; |
| 305 | |
| 306 | /* |
| 307 | * Check signatures |
| 308 | */ |
| 309 | if (*((uint32_t *) p) != MC_FLT_SIG1 || |
| 310 | *((uint32_t *)(p + MC_FLT_SIZE - 4)) != MC_FLT_SIG2) { |
| 311 | printk("mcc_flt_parm: signatures not found, (%08x, %08x)\n", |
| 312 | *((uint32_t *) p), *((uint32_t *)(p + MC_FLT_SIZE - 4))); |
| 313 | return; |
| 314 | } |
| 315 | |
| 316 | /* |
| 317 | * After start signature comes filter count (uint16_t) |
| 318 | * followed by 'count' filter descriptors (struct mc_disc). |
| 319 | */ |
| 320 | fnum = *(uint16_t *)(p + 4); |
| 321 | if (fnum > ARRAY_SIZE(mc_turnoff) || |
| 322 | fnum * sizeof(struct mc_disc) + 10 > MC_FLT_SIZE) { |
| 323 | printk("mcc_flt_parm: filter count %d not valid\n", fnum); |
| 324 | return; |
| 325 | } |
| 326 | |
| 327 | /* |
| 328 | * Seems the table is legit, copy it over defaults. |
| 329 | */ |
| 330 | memset(mc_turnoff, '\0', sizeof(mc_turnoff)); |
| 331 | memcpy(mc_turnoff, p + 6, fnum * sizeof(struct mc_disc)); |
| 332 | #if MC_VERBOSE |
| 333 | { |
| 334 | int i; |
| 335 | |
| 336 | for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) { |
| 337 | printk("Filter %2d: bank %d, ctl %d, win %d, max %d, mca %04x, mdl %04x\n", |
| 338 | i, mc_turnoff[i].bank, mc_turnoff[i].ctl, mc_turnoff[i].win, |
| 339 | mc_turnoff[i].max, mc_turnoff[i].mca_code, mc_turnoff[i].mdl_code); |
| 340 | } |
| 341 | } |
| 342 | #endif |
| 343 | } |
| 344 | |
| 345 | #endif |
| 346 | |
| 347 | |
| 348 | /* |
| 349 | * Frequency filter for core and un-core MC events |
| 350 | */ |
| 351 | |
| 352 | uint32_t |
| 353 | micras_mc_filter(struct mce_info * mc, uint64_t tsc, int exc) |
| 354 | { |
| 355 | struct mc_disc * dsc; |
| 356 | struct mc_hist * hst; |
| 357 | uint64_t ostamp; |
| 358 | int i, oldest; |
| 359 | |
| 360 | if (mc->status & MCI_STATUS_UC) |
| 361 | return 0; |
| 362 | |
| 363 | /* |
| 364 | * Check if this event may be filtered |
| 365 | */ |
| 366 | dsc = mc_turnoff; |
| 367 | for(i = 0; i < ARRAY_SIZE(mc_turnoff); i++) { |
| 368 | if (dsc->bank == mc->org && |
| 369 | dsc->mca_code == GET_BITS(15, 0, mc->status) && |
| 370 | dsc->mdl_code == GET_BITS(31, 16, mc->status)) |
| 371 | break; |
| 372 | dsc++; |
| 373 | } |
| 374 | if (i == ARRAY_SIZE(mc_turnoff)) |
| 375 | return 0; |
| 376 | |
| 377 | /* |
| 378 | * Have a candidate for filter. |
| 379 | * Have we seen this one before? |
| 380 | */ |
| 381 | oldest = 0; |
| 382 | ostamp = tsc; |
| 383 | hst = mc_history; |
| 384 | for(i = 0; i < ARRAY_SIZE(mc_history); i++) { |
| 385 | /* |
| 386 | * While scanning, find the oldest event too |
| 387 | */ |
| 388 | if (hst->last < ostamp) { |
| 389 | ostamp = hst->last; |
| 390 | oldest = i; |
| 391 | } |
| 392 | |
| 393 | /* |
| 394 | * Does this match event in filter history? |
| 395 | * TBD: how much needs to match? |
| 396 | * For now: cpu (or box), bank, mca_code and model_code. |
| 397 | */ |
| 398 | if (hst->last && |
| 399 | hst->mc.id == mc->id && |
| 400 | hst->mc.org == mc->org && |
| 401 | GET_BITS(15, 0, hst->mc.status) == GET_BITS(15, 0, mc->status) && |
| 402 | GET_BITS(31, 16, hst->mc.status) == GET_BITS(31, 16, mc->status)) |
| 403 | break; |
| 404 | hst++; |
| 405 | } |
| 406 | if (i == ARRAY_SIZE(mc_history)) { |
| 407 | /* |
| 408 | * Not seen this event before. |
| 409 | * 'oldest' is where to store this event. |
| 410 | */ |
| 411 | hst = mc_history + oldest; |
| 412 | hst->count = 1; |
| 413 | hst->last = tsc; |
| 414 | hst->mc = *mc; |
| 415 | return 0; |
| 416 | } |
| 417 | |
| 418 | /* |
| 419 | * Already 'on file in history', test expiration date |
| 420 | */ |
| 421 | if (hst->last + dsc->win * (cpu_khz * 1000LL) < tsc) { |
| 422 | /* |
| 423 | * Matching history element had expired, just overwrite it |
| 424 | */ |
| 425 | hst->count = 1; |
| 426 | hst->last = tsc; |
| 427 | hst->mc = *mc; |
| 428 | return 0; |
| 429 | } |
| 430 | |
| 431 | /* |
| 432 | * Filter element active, bump count and set last seen. |
| 433 | * We do _NOT_ want injected events to enter the EEPROM, |
| 434 | * so that flag is preserved over all event history |
| 435 | */ |
| 436 | hst->count++; |
| 437 | if (mc->flags & MC_FLG_FALSE) |
| 438 | hst->mc.flags |= MC_FLG_FALSE; |
| 439 | if (hst->count < dsc->max) { |
| 440 | hst->last = tsc; |
| 441 | return 0; |
| 442 | } |
| 443 | |
| 444 | /* |
| 445 | * Threshold reached, event source needs to be silenced. |
| 446 | * Store a record of this in the EEPROM and send a |
| 447 | * notification to host about it. Once duly reported, clear |
| 448 | * event from the filter; it is not expected to show up again. |
| 449 | * Note: we report the _first_ event seen, not the |
| 450 | * event at hand. We could save array space |
| 451 | * by sending latest event (less info to keep). |
| 452 | */ |
| 453 | ee_printk("RAS: MCE filter #%d: bank %d, bit %d, limit %d, delta %d (mS)\n", |
| 454 | dsc - mc_turnoff, dsc->bank, dsc->ctl, dsc->max, (tsc - hst->last) / cpu_khz); |
| 455 | hst->mc.flags |= MC_FLG_FILTER; |
| 456 | #ifdef CONFIG_MK1OM |
| 457 | if (!(hst->mc.flags & MC_FLG_FALSE)) { |
| 458 | micras_mc_log(&hst->mc); |
| 459 | hst->mc.flags |= MC_FLG_LOG; |
| 460 | } |
| 461 | #endif |
| 462 | micras_mc_send(&hst->mc, exc); |
| 463 | hst->last = 0; |
| 464 | |
| 465 | /* |
| 466 | * MC events are disabled by caller when a |
| 467 | * non-zero mask is returned by this routine. |
| 468 | */ |
| 469 | return (1 << dsc->ctl); |
| 470 | } |
| 471 | |
| 472 | |
| 473 | /* |
| 474 | * Remove/mask an 'enable-bit' from a core MCA bank. |
| 475 | * Note: This applies to _current_ cpu only. It is not explicitly |
| 476 | * linked to the cpu that was ID'd in the incoming mce struct. |
| 477 | * Happens to be OK for mcc_exc_flt() and mcc_poll() and mcc_exc_log(). |
| 478 | */ |
| 479 | |
| 480 | static void |
| 481 | mcc_ctl_mask(int bank, uint32_t msk) |
| 482 | { |
| 483 | uint32_t ctl_lo, ctl_hi; |
| 484 | |
| 485 | rdmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi); |
| 486 | ctl_lo &= ~msk; |
| 487 | wrmsr(MSR_IA32_MCx_CTL(bank), ctl_lo, ctl_hi); |
| 488 | |
| 489 | #if MC_VERBOSE |
| 490 | ee_printk("RAS: ctl mask CPU %d, MC%d_CTL -> %x\n", smp_processor_id(), bank, ctl_lo); |
| 491 | #endif |
| 492 | } |
| 493 | |
| 494 | |
| 495 | /* |
| 496 | * Filtering of correctable core MC events |
| 497 | * Called from the exception handler. |
| 498 | */ |
| 499 | |
| 500 | static void |
| 501 | mcc_exc_flt(struct mce * mce, uint64_t ctl, int fake) |
| 502 | { |
| 503 | struct mce_info mc; |
| 504 | uint32_t msk; |
| 505 | |
| 506 | if (!mce) |
| 507 | return; |
| 508 | |
| 509 | if (mce->status & MCI_STATUS_UC) |
| 510 | return; |
| 511 | |
| 512 | mcc_conv(mce, &mc); |
| 513 | mc.ctl = ctl; |
| 514 | mc.flags = fake ? MC_FLG_FALSE : 0; |
| 515 | msk = micras_mc_filter(&mc, mce->tsc, 1); |
| 516 | if (msk) |
| 517 | mcc_ctl_mask(mce->bank, msk); |
| 518 | } |
| 519 | |
| 520 | |
| 521 | /* |
| 522 | * Only action required for polled MC events is to |
| 523 | * pass the event on to the SCIF channel (if connected). |
| 524 | * The event should already have caused an excption (the |
| 525 | * exception handler choses to ignore corrected errors) |
| 526 | * which means it already has been filtered. |
| 527 | * Injected corrected events do not cause MCE exceptions |
| 528 | * and thus escaped filtering, so we'll filter them here. |
| 529 | */ |
| 530 | |
| 531 | static void |
| 532 | mcc_poll(struct mce * mce, uint64_t ctl, int fake) |
| 533 | { |
| 534 | struct mce_info mc; |
| 535 | |
| 536 | #if MC_VERBOSE |
| 537 | ee_printk("RAS: poll %d, fake %d, status %llx\n", mce->extcpu, fake, mce->status); |
| 538 | #endif |
| 539 | |
| 540 | mcc_conv(mce, &mc); |
| 541 | mc.ctl = ctl; |
| 542 | mc.flags = fake ? MC_FLG_FALSE : 0; |
| 543 | |
| 544 | #if BEAM_TEST |
| 545 | /* |
| 546 | * Under beam test we only want to send the SCIF message |
| 547 | */ |
| 548 | micras_mc_send(&mc, fake); |
| 549 | return; |
| 550 | #endif |
| 551 | |
| 552 | if (micras_mc_send(&mc, fake)) |
| 553 | mcc_seen = mcelog.next; |
| 554 | |
| 555 | /* |
| 556 | * According to MCA HAS the MCI_STATUS_VAL will only |
| 557 | * be set when an event's enable bit is set, in which |
| 558 | * case it is difficult to imagine how events without |
| 559 | * the MCI_STATUS_EN can appear here. The second clause |
| 560 | * of the test may never actually happen on Kn{F,C}. |
| 561 | * Note: MC polling does not capture TSCs |
| 562 | */ |
| 563 | if (fake || !(mc.status & MCI_STATUS_EN)) { |
| 564 | uint32_t msk; |
| 565 | |
| 566 | msk = micras_mc_filter(&mc, rdtsc(), fake); |
| 567 | if (msk) |
| 568 | mcc_ctl_mask(mce->bank, msk); |
| 569 | } |
| 570 | } |
| 571 | |
| 572 | |
| 573 | /* |
| 574 | * One CPU entered do_machine_check(). |
| 575 | * We get the initial mce record (which has cpu ID), early |
| 576 | * control variables and whether the event is injected. |
| 577 | * |
| 578 | * Since KnF and KnC deviate from the standard IA by not |
| 579 | * having the core MCAs broadcast to all CPU's we'll try |
| 580 | * to fake standard behavior in order to keep the generic |
| 581 | * machine check code intact. |
| 582 | * Therefore, if event is real (fake flag unset) and this |
| 583 | * CPU is the first seeing it (mcc_exc_mask is empty), |
| 584 | * then send IPI to all other CPU's listed in the online |
| 585 | * cpumask for vector #18. Later CPUs will see themselves |
| 586 | * marked in mcc_exc_mask and return quickly. |
| 587 | */ |
| 588 | |
| 589 | struct cpumask mcc_exc_mask; /* CPU's in mce ctx */ |
| 590 | static atomic_t ipi_lock = ATOMIC_INIT(0); /* Lock on exc mask */ |
| 591 | |
| 592 | static void |
| 593 | mcc_exc_entry(struct mce * mce, int fake, int no_way_out, int entry, char * msg) |
| 594 | { |
| 595 | unsigned int cpu; |
| 596 | |
| 597 | /* |
| 598 | *TBD: should we use 'extcpu' from the MCE record instead? |
| 599 | */ |
| 600 | cpu = smp_processor_id(); |
| 601 | |
| 602 | /* |
| 603 | * Injected events invokes all CPUs automatically |
| 604 | * by hooking into the NMI notify_die call_chain. |
| 605 | * Nothing to do here. |
| 606 | */ |
| 607 | if (fake) |
| 608 | return; |
| 609 | |
| 610 | #if 1 |
| 611 | /* |
| 612 | * Avoid the IPI corralling circus on corrected errors, |
| 613 | * based on assessment entirely done by mce_severity(). |
| 614 | * If the result (no_way_out) is MCE_NO_SEVERITY (=0), then |
| 615 | * at worst we may have a correctable error, and that does |
| 616 | * not warrant the system lockdown managed by mce_start() |
| 617 | * and mce_end(). |
| 618 | * Note that MICs do not support newer status bits (MCG_SER_P) |
| 619 | * which causes variable mce_ser always to be zero and thus |
| 620 | * the test in the inner loop of do_machine_check() will be |
| 621 | * reduced to just testing for the UC bit. |
| 622 | */ |
| 623 | if (! no_way_out) |
| 624 | return; |
| 625 | #endif |
| 626 | |
| 627 | /* |
| 628 | * Test for entry from MT thread IPIs (testing) |
| 629 | * or a 'soft' exception from a IPI issued from |
| 630 | * the handler of the first exception. |
| 631 | * No further action needed in both cases. |
| 632 | */ |
| 633 | if (cpumask_test_cpu(cpu, &mcc_exc_mask)) |
| 634 | return; |
| 635 | |
| 636 | /* |
| 637 | * Create mcc_exc_mask to flag which CPU's are |
| 638 | * to be included in the IPI. This mask is later |
| 639 | * used to determine who needs to EOI the local |
| 640 | * APIC after MC event handling. |
| 641 | */ |
| 642 | while(atomic_xchg(&ipi_lock, 1)) |
| 643 | cpu_relax(); |
| 644 | smp_rmb(); |
| 645 | if (cpumask_test_cpu(cpu, &mcc_exc_mask)) { |
| 646 | /* |
| 647 | * Another CPU got here first |
| 648 | */ |
| 649 | atomic_xchg(&ipi_lock, 0); |
| 650 | return; |
| 651 | } |
| 652 | cpumask_copy(&mcc_exc_mask, cpu_online_mask); |
| 653 | cpumask_clear_cpu(cpu, &mcc_exc_mask); |
| 654 | smp_wmb(); |
| 655 | atomic_xchg(&ipi_lock, 0); |
| 656 | |
| 657 | /* |
| 658 | * Simulate a broadcast ny sending IPI to all |
| 659 | * other CPUs. |
| 660 | */ |
| 661 | // apic->send_IPI_mask(&mcc_exc_mask, MCE_VECTOR); |
| 662 | apic->send_IPI_allbutself(MCE_VECTOR); |
| 663 | } |
| 664 | |
| 665 | |
| 666 | /* |
| 667 | * In do_machine_check() bank scan loop. |
| 668 | * Called from a lockdown, no synchronization needed. |
| 669 | * MC bank scan is complete and the mce event has been |
| 670 | * entered into the kernel MC log |
| 671 | * |
| 672 | *TBD: revise logic on HALT on UC events? |
| 673 | * From a state corruption point of view this |
| 674 | * _is_ a fatal error because UC bit was set. |
| 675 | * However, if the tolerance setting is set |
| 676 | * high enough, the generic MC handler may |
| 677 | * not chose to panic on this event. |
| 678 | * We currently do not have the tolerance value |
| 679 | * when recording this event, nor do we have |
| 680 | * other factors that mce_reign() use to determine |
| 681 | * what to do after reporting event to the host. |
| 682 | */ |
| 683 | |
| 684 | static void |
| 685 | mcc_exc_log(struct mce * mce, uint64_t ctl, int fake, |
| 686 | int no_way_out, char * msg, int severity, int worst) |
| 687 | { |
| 688 | struct mce_info mc; |
| 689 | uint32_t msk; |
| 690 | |
| 691 | #if MC_VERBOSE |
| 692 | ee_printk("RAS: log %d, wall %lld, nwo %d (%s), sev %d, wst %d\n", |
| 693 | mce->extcpu, mce->time, no_way_out, msg, severity, worst); |
| 694 | #endif |
| 695 | |
| 696 | /* |
| 697 | * Create a message for the host. |
| 698 | */ |
| 699 | mcc_conv(mce, &mc); |
| 700 | mc.ctl = ctl; |
| 701 | mc.flags |= fake ? MC_FLG_FALSE : 0; |
| 702 | |
| 703 | #if BEAM_TEST |
| 704 | /* |
| 705 | * Under beam test we only want to send the SCIF message |
| 706 | * This is guaranteed not to be called re-entrantly. |
| 707 | */ |
| 708 | micras_mc_send(&mc, 1); |
| 709 | return; |
| 710 | #endif |
| 711 | |
| 712 | #ifdef CONFIG_MK1OM |
| 713 | /* |
| 714 | * If this is a true event then log it in the EEPROM and |
| 715 | * notify SMC that we've had a serious machine check error. |
| 716 | */ |
| 717 | if ((mc.flags & (MC_FLG_FALSE | MC_FLG_FATAL)) == MC_FLG_FATAL) { |
| 718 | micras_mc_log(&mc); |
| 719 | mc.flags |= MC_FLG_LOG; |
| 720 | |
| 721 | /* |
| 722 | *TBD: Should this be deferred until the actual panic? |
| 723 | * The user can raise tolerance such that we in |
| 724 | * fact continue operating; in which case the SMC |
| 725 | * notification would be (somewhat) misleading. |
| 726 | */ |
| 727 | micras_mc_ipmi(&mc, 1); |
| 728 | } |
| 729 | #endif |
| 730 | |
| 731 | /* |
| 732 | * Always notify host and sync to kernel log |
| 733 | */ |
| 734 | if (micras_mc_send(&mc, 1)) |
| 735 | mcc_seen = mcelog.next; |
| 736 | |
| 737 | #if RAS_HALT |
| 738 | if ((mc.flags & MC_FLG_FATAL) && !fake) |
| 739 | panic("FATAL core machine check event:\n" |
| 740 | "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n", |
| 741 | mc.org, mc.id, mc.ctl, mc.status, mc.addr, mc.misc); |
| 742 | #endif |
| 743 | |
| 744 | /* |
| 745 | * Correctable events can in fact reach us here if |
| 746 | * mce_no_way_out() tags them as critical (for other |
| 747 | * reasons than the UC flag, e.g. MCIP missing). |
| 748 | * If the tolerance setting is high enough to prevent |
| 749 | * such events to panic, we'd still want filtering. |
| 750 | */ |
| 751 | msk = micras_mc_filter(&mc, mce->tsc, 1); |
| 752 | if (msk) |
| 753 | mcc_ctl_mask(mce->bank, msk); |
| 754 | } |
| 755 | |
| 756 | |
| 757 | /* |
| 758 | * In mce_panic(). |
| 759 | * Current event is about to make the kernel panic. |
| 760 | * Sources of this call are |
| 761 | * do_machine_check(), when no_way_out set |
| 762 | * mce_timed_out(), CPU rendez-vous failed |
| 763 | * mce_reign(), when severety high, a CPU hung, or no events |
| 764 | */ |
| 765 | |
| 766 | static void |
| 767 | mcc_exc_panic(struct mce * mce, char * msg, char * exp, int fake) |
| 768 | { |
| 769 | /* |
| 770 | * Should host be notified in this case? |
| 771 | * And if so, how should be presented, we might not |
| 772 | * even have a mce record to show when this happens! |
| 773 | * If an mce is passed, it has already been seen and |
| 774 | * reported to the host by a call to mcc_exc_log(). |
| 775 | * If mce is NULL, then this _is_ an MC relatedi panic, |
| 776 | * but we have no data fitting for a host notification. |
| 777 | * Create a pseudo event and ship that? |
| 778 | */ |
| 779 | ee_printk("RAS: panic %d, wall %lld, msg %s, exp %s, fake %d\n", |
| 780 | mce->extcpu, mce->time, msg, exp, fake); |
| 781 | } |
| 782 | |
| 783 | |
| 784 | /* |
| 785 | * A CPU is leaving do_machine_check(). |
| 786 | * We get this after the monarch has 'reigned' and |
| 787 | * the response to the event has been completed. |
| 788 | */ |
| 789 | |
| 790 | static void |
| 791 | mcc_exc_exit(struct mce * mce, int no_way_out, int worst, int entry, int order) |
| 792 | { |
| 793 | unsigned int cpu; |
| 794 | int eoi; |
| 795 | |
| 796 | cpu = smp_processor_id(); |
| 797 | |
| 798 | /* |
| 799 | * Assuming test_and_clear_bit() is atomic. |
| 800 | */ |
| 801 | smp_rmb(); |
| 802 | eoi = cpumask_test_and_clear_cpu(cpu, &mcc_exc_mask); |
| 803 | smp_wmb(); |
| 804 | if (eoi) |
| 805 | ack_APIC_irq(); |
| 806 | } |
| 807 | |
| 808 | |
| 809 | /* |
| 810 | * Routine to scan the kernel's MC log. |
| 811 | * Called when SCIF MC session has been created, to bring the host |
| 812 | * side up to date with prior unreported MC events, such as events |
| 813 | * occurring when MC session was not active (no peer was listening |
| 814 | * on the host) and events occurring before RAS module was loaded. |
| 815 | * |
| 816 | * Notes: |
| 817 | * - This is always called in thread context. |
| 818 | * - There are no injection flags in the kernel |
| 819 | * MC log, i.e. no guarantee events are genuine. |
| 820 | * - The MC kernel log has been exported explicitly for this. |
| 821 | * |
| 822 | * On synchronization (or the lack thereof): |
| 823 | * Effectively the mcelog holds a static array of mce's where the |
| 824 | * 'finished' flag says whether mce content is valid or not. The |
| 825 | * 'next' field is the index of the first element in the array that |
| 826 | * has not been assigned for an MC event. It is incremented when a |
| 827 | * new event is entered, and reset to zero on reads to /dev/mcelog. |
| 828 | * The kernel's event log does not wrap, so it is safe to use it as |
| 829 | * an indicator of how many events (finished or not) are in it. |
| 830 | * The mcelog's next field is protected by RCU style mechanisms |
| 831 | * in the kernel MCA handler (see arch/x86/kernel/cpu/mcheck/mce.c). |
| 832 | * For obvious reasons it is not genuine RCU, e.g. access to 'next' |
| 833 | * isn't within rcu_read_lock()/rcu_read_unlock() pair, just a clever |
| 834 | * masking use of a lock in an RCU macro definition. |
| 835 | * There is no RCU moving data around, the mce array does not move, |
| 836 | * and the 'finished' flag is set after a wmb() on the mce contents |
| 837 | * which means this routine will not clash with the MCE handler. |
| 838 | * Collisions with memset() on reads from /dev/mcelog are prevented |
| 839 | * by locking of mce_read_mutex. |
| 840 | */ |
| 841 | |
void
mcc_sync(void)
{
	struct mce_info mc;	/* Converted event handed to reporting helpers */
	unsigned seen;		/* Index into the kernel's mcelog entry array */

	/* Nothing to sync if machine checks are disabled entirely */
	if (mce_disabled)
		return;

#if 0
	/*
	 * Can't do this until bootstrap scrubs MC banks on all cards.
	 * It has been observed that MCA banks may _not_ be reset on card
	 * reboot which means events picked up by the kernel before loading
	 * the RAS module may have occurred in a previous uOS run.
	 * Should be OK post early Jan '12 (flash ver 262, HSD 4115351).
	 */
	return;
#endif

	/*
	 * Lock out kernel log access through /dev/mcelog.
	 * Per the block comment above, this prevents a concurrent
	 * read from clearing the log underneath us.
	 */
	mutex_lock(&mce_read_mutex);

	/*
	 * Start over if the log has been cleared
	 * (a read of /dev/mcelog resets mcelog.next to zero).
	 */
	if (mcc_seen > mcelog.next)
		mcc_seen = 0;

	/* Walk only the entries added since we last synced */
	for(seen = mcc_seen; seen < mcelog.next; seen++) {
		/*
		 * Basic checks. Index, CPU & bank must be reasonable.
		 * 'finished' guards entry validity (set after a wmb()
		 * on the mce contents, per the block comment above).
		 * NOTE(review): the bank limit of 3 looks target-specific
		 * (KNx core MCA bank count) — confirm before reuse.
		 */
		if (mcelog.entry[seen].finished) {
			if (mcelog.entry[seen].cpu >= NR_CPUS ||
			    mcelog.entry[seen].bank >= 3) {
				printk("mcc_sync: entry %d contains garbage, cpu %d, bank %d\n",
					seen, mcelog.entry[seen].cpu, mcelog.entry[seen].bank);
				continue;
			}

			/*
			 * Have good entry, can be UC, but it is 'old'.
			 * ctl is zeroed for these replayed events —
			 * presumably to mark them as historic; confirm
			 * against the consumers of mc.ctl.
			 */
			mcc_conv(&mcelog.entry[seen], &mc);
			mc.ctl = 0;

#ifdef CONFIG_MK1OM
			/*
			 * Log this event in the eeprom and notify
			 * that we've had a serious machine check error.
			 * in_sync brackets the EEPROM log call; its exact
			 * effect is defined elsewhere in this module.
			 */
			if (mc.flags & MC_FLG_FATAL) {
				in_sync = 1;
				micras_mc_log(&mc);
				in_sync = 0;
				mc.flags |= MC_FLG_LOG;
				micras_mc_ipmi(&mc, 0);
			}
#endif

			/*
			 * Notify host about this too. A failed send aborts
			 * the scan.
			 * NOTE(review): mcc_seen below still advances to
			 * mcelog.next, so entries after a failed send are
			 * never retried — verify this is intentional.
			 */
			if (! micras_mc_send(&mc, 0))
				break;
		}
	}
	/* Remember high-water mark so the next sync skips these entries */
	mcc_seen = mcelog.next;

	/*
	 * Done, release lock
	 */
	mutex_unlock(&mce_read_mutex);
}
| 919 | |
| 920 | |
| 921 | /* |
 * Setup exception handlers by hooking into the
| 923 | * kernel's native MCA handler. |
| 924 | */ |
| 925 | |
| 926 | int __init |
| 927 | mcc_init(void) |
| 928 | { |
| 929 | if (mce_disabled) { |
| 930 | printk("RAS.core: disabled\n"); |
| 931 | } |
| 932 | else { |
| 933 | mca_poll = mcc_poll; |
| 934 | mca_exc_flt = mcc_exc_flt; |
| 935 | mca_exc_entry = mcc_exc_entry; |
| 936 | mca_exc_log = mcc_exc_log; |
| 937 | mca_exc_panic = mcc_exc_panic; |
| 938 | mca_exc_exit = mcc_exc_exit; |
| 939 | mca_print = 0; /* For debug: ee_printk; */ |
| 940 | printk("RAS.core: init complete\n"); |
| 941 | } |
| 942 | |
| 943 | return 0; |
| 944 | } |
| 945 | |
| 946 | |
| 947 | /* |
| 948 | * Cleanup for module unload. |
| 949 | * Clear/restore hooks in the native MCA handler. |
| 950 | */ |
| 951 | |
| 952 | int __exit |
| 953 | mcc_exit(void) |
| 954 | { |
| 955 | mca_poll = 0; |
| 956 | mca_exc_flt = 0; |
| 957 | mca_exc_entry = 0; |
| 958 | mca_exc_log = 0; |
| 959 | mca_exc_panic = 0; |
| 960 | mca_exc_exit = 0; |
| 961 | mca_print = 0; |
| 962 | |
| 963 | /* |
| 964 | * Links from kernel's MCE handler cut, |
| 965 | * wait for everybody in handler to leave. |
| 966 | */ |
| 967 | while(atomic_read(&mce_entry)) |
| 968 | cpu_relax(); |
| 969 | |
| 970 | printk("RAS.core: exit complete\n"); |
| 971 | return 0; |
| 972 | } |
| 973 | |