/* [xeon-phi-kernel-module] ras/micras_uncore.c */

/*
 * Copyright 2010-2017 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Disclaimer: The codes contained in these modules may be specific to
 * the Intel Software Development Platform codenamed Knights Ferry,
 * and the Intel product codenamed Knights Corner, and are not backward
 * compatible with other Intel products. Additionally, Intel will NOT
 * support the codes or instruction set in future products.
 *
 * Intel offers no warranty of any kind regarding the code. This code is
 * licensed on an "AS IS" basis and Intel is not obligated to provide
 * any support, assistance, installation, training, or other services
 * of any kind. Intel is also not obligated to provide any updates,
 * enhancements or extensions. Intel specifically disclaims any warranty
 * of merchantability, non-infringement, fitness for any particular
 * purpose, and any other warranty.
 *
 * Further, Intel disclaims all liability of any kind, including but
 * not limited to liability for infringement of any proprietary rights,
 * relating to the use of the code, even if Intel is notified of the
 * possibility of such liability. Except as expressly stated in an Intel
 * license agreement provided with this code and agreed upon with Intel,
 * no license, express or implied, by estoppel or otherwise, to any
 * intellectual property rights is granted herein.
 */

/*
 * RAS handler for uncore MC events
 *
 * Contains code to intercept MC events, collect information
 * from uncore MCA banks and handle the situation.
 *
 * In case of a severe event, defined by corrupted context,
 * the handler will add a record of the event in the designated
 * EEPROM hanging off the Over Clocking I2C bus. After that
 * a message will be sent to the SMC (enabling IPMI notifications)
 * and finally a message is sent to the host via the MC SCIF
 * connection.
 *
 * Lesser events will also be sent to the host on an 'FYI' basis,
 * but no record will be stored in the event log.
 *
 * This is in all aspects similar to the reaction to a severe
 * core MC event. Differences are in the MC bank access (mmio),
 * and that the event is delivered via an interrupt instead of
 * an exception. Still, the handler cannot expect any support
 * from the OS.
 */

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/nmi.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/processor.h>
#include <asm/mic/mic_common.h>
#include <asm/mic/mic_knc/autobaseaddress.h>
#include <asm/mic/mic_knc/micsboxdefine.h>
#include "micras.h"


/*
 * Hooks placed in the native machine check handler.
 * See file arch/x86/kernel/traps.c for placement.
 *
 * nmi  Entered NMI exception handler.
 *      Called before any other tests, which allows us
 *      to test for and handle un-core MCA events before
 *      the traditional NMI handling.
 *      Note that the mce-inject mechanism also uses
 *      NMIs to distribute calls to do_machine_check().
 */

extern int (*mca_nmi)(int);


/*
 * Table of un-core MCA banks.
 * Though there are differences in register count and sizes, un-core bank
 * registers are always spaced 8 bytes apart, so all we need to know is
 * the location of the first MCA bank register (CTL) to find them.
 * If a bank is present, the bank register offsets for ctl, status, addr,
 * and misc are thus 0, 8, 16, and 24 respectively.
 * Default CTL masks are pulled from the register documentation.
 * Some SKUs don't have support for all BOXes, but that will be handled
 * at runtime in the support code, not at compile time by this table.
 */


#ifdef CONFIG_ML1OM
#define SBOX_DEF  0x000e          /* All (7) */
#define DBOX_DEF  0x0003          /* All (2) */
#define GBOX_DEF  0x0003          /* All (2) */
#endif
#ifdef CONFIG_MK1OM
#define SBOX_DEF  0x03ce          /* All - PCIe errors (7) */
#define DBOX_DEF  0x000f          /* All (4) */
#define GBOX_DEF  0x3ffffffff     /* All (34) */
#define TBOX_DEF  0x001f          /* All (5) */
#endif

#define MCU_CTL_64   (1 << 0)     /* Bank has 64 bit CTL register */
#define MCU_NO_ADDR  (1 << 1)     /* Bank has no ADDR register */
#define MCU_ADDR_32  (1 << 2)     /* Bank has 32 bit ADDR register */
#define MCU_NO_MISC  (1 << 3)     /* Bank has no MISC register */
#define MCU_MISC_64  (1 << 4)     /* Bank has 64 bit MISC register */

#define MCU_CTRL  0
#define MCU_STAT  8
#define MCU_ADDR  16
#define MCU_MISC  24

typedef struct _mcu_rec {
  uint8_t   num;                          /* 'BOX' count */
  uint8_t   org;                          /* Origin code */
  uint8_t   qflg;                         /* Quirk flags */
  uint16_t  ofs;                          /* MCA bank base offset */
  uint64_t  ctl;                          /* Initial CTL mask */
  uint32_t  (*rl)(int, uint32_t);         /* 32-bit MMIO read */
  void      (*wl)(int, uint32_t, uint32_t);   /* 32-bit MMIO write */
  uint64_t  (*rq)(int, uint32_t);         /* 64-bit MMIO read */
  void      (*wq)(int, uint32_t, uint64_t);   /* 64-bit MMIO write */
} McuRec;

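/*
 * Illustration only (not compiled in, and not part of the original
 * driver): thanks to the fixed 8 byte register spacing noted above,
 * a single pair of hypothetical accessors could reach any register
 * of any bank. A minimal sketch using just the McuRec fields:
 */
#if 0
static inline uint64_t
mcu_bank_rq(McuRec * mr, int box, uint16_t reg)
{
  /* reg is one of MCU_CTRL, MCU_STAT, MCU_ADDR or MCU_MISC */
  return mr->rq(box, mr->ofs + reg);
}

static inline void
mcu_bank_wq(McuRec * mr, int box, uint16_t reg, uint64_t val)
{
  mr->wq(box, mr->ofs + reg, val);
}
#endif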

static McuRec mcu_src[] = {
  { 1,        MC_ORG_SBOX, MCU_MISC_64, SBOX_MCX_CTL_LO,
              SBOX_DEF, mr_sbox_rl, mr_sbox_wl, mr_sbox_rq, mr_sbox_wq },
  { DBOX_NUM, MC_ORG_DBOX, MCU_NO_MISC, DBOX_MC2_CTL,
              DBOX_DEF, mr_dbox_rl, mr_dbox_wl, mr_dbox_rq, mr_dbox_wq },
  { GBOX_NUM, MC_ORG_GBOX, MCU_CTL_64, GBOX_FBOX_MCA_CTL_LO,
              GBOX_DEF, mr_gbox_rl, mr_gbox_wl, mr_gbox_rq, mr_gbox_wq },
#ifdef CONFIG_MK1OM
  { TBOX_NUM, MC_ORG_TBOX, MCU_CTL_64 | MCU_NO_MISC | MCU_ADDR_32, TXS_MCX_CONTROL,
              TBOX_DEF, mr_tbox_rl, mr_tbox_wl, mr_tbox_rq, mr_tbox_wq },
#endif
};

#define GBOX_BROKEN  1    /* Set if GBOX MCA bank is broken */

#if GBOX_BROKEN
/*
 * Si design managed to break the GBOX MCA bank concept
 * by not filling useful data into the ADDR and MISC registers.
 * Instead they use a bunch of registers in another part
 * of the GBOX (mbox to be specific) to hold this info.
 * In order to get at the right register it is necessary
 * to partially decode the STATUS register and from there
 * select a GBOX.MBOX register.
 * Since the new registers are all 32 bits wide, we'll stick
 * the value into the MISC register if the Misc_V bit of STATUS
 * is not set. The following table is used for register selection:
 *
 *  model code   base   width   Chan   Notes
 *       0       017c     32      0    26 bit address, CRC (retrain)
 *       1       097c     32      1    26 bit address, CRC (retrain)
 *       2       01e0     32      0    26 bit address, ECC
 *       3       09e0     32      1    26 bit address, ECC
 *       4       01dc     32      0    26 bit address, UC CAPE
 *       5       09dc     32      1    26 bit address, UC CAPE
 *      31       01a4     32      0    26 bit address, UC ECC
 *      32       09a4     32      1    26 bit address, UC ECC
 *
 * Note: model code is simply the enable bit number in CTL
 */

static struct liu {
  uint16_t  mcode;
  uint16_t  base;
} liu[] = {
  {  0, 0x17c },    /* Correctable CRC (retrain) ch 0 */
  {  1, 0x97c },    /* Correctable CRC (retrain) ch 1 */
  {  2, 0x1e0 },    /* Correctable ECC, ch 0 */
  {  3, 0x9e0 },    /* Correctable ECC, ch 1 */
  {  4, 0x1dc },    /* Uncorrectable CAPE, ch 0 */
  {  5, 0x9dc },    /* Uncorrectable CAPE, ch 1 */
  { 31, 0x1a4 },    /* Uncorrectable ECC, ch 0 */
  { 32, 0x9a4 }     /* Uncorrectable ECC, ch 1 */
};

static void
mcu_gbox_fixup(McuRec * mr, int num, MceInfo * mi)
{
  int       i;
  uint16_t  mcode;

  /*
   * Skip if Status.Misc_v set
   */
  if (mi->status & (1ULL << 59))
    return;

  /*
   * Get model code and if it's in the array, then read
   * the addressed register into MISC. We don't set the
   * Status.Misc_v bit because we want to distinguish
   * this hack from the real MCA bank register.
   */
  mcode = GET_BITS(31, 16, mi->status);
  for(i = 0; i < ARRAY_SIZE(liu); i++)
    if (liu[i].mcode == mcode) {
      mi->misc = (uint64_t) mr->rl(num, liu[i].base);
      break;
    }
}
#endif
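
/*
 * Worked example (illustration only): a GBOX STATUS value with
 * Misc_V (bit 59) clear and model code GET_BITS(31, 16) == 2
 * matches entry 2 in the table above, i.e. a correctable ECC
 * event on channel 0, so mcu_gbox_fixup() reads GBOX.MBOX
 * register 0x1e0 into mi->misc.
 */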

/*
 * Read Ctrl, Addr and Misc registers from an un-core MCA bank.
 * The Status register is read/cleared in mcu_scan().
 */

static void
mcu_read(McuRec * mr, int num, MceInfo * mi)
{
  if (mr->qflg & MCU_CTL_64)
    mi->ctl = mr->rq(num, mr->ofs + MCU_CTRL);
  else
    mi->ctl = (uint64_t) mr->rl(num, mr->ofs + MCU_CTRL);

  if (mr->qflg & MCU_NO_ADDR)
    mi->addr = 0;
  else {
    if (mr->qflg & MCU_ADDR_32)
      mi->addr = (uint64_t) mr->rl(num, mr->ofs + MCU_ADDR);
    else
      mi->addr = mr->rq(num, mr->ofs + MCU_ADDR);
  }

  if (mr->qflg & MCU_NO_MISC)
    mi->misc = 0;
  else {
    if (mr->qflg & MCU_MISC_64)
      mi->misc = mr->rq(num, mr->ofs + MCU_MISC);
    else
      mi->misc = (uint64_t) mr->rl(num, mr->ofs + MCU_MISC);
  }

#if GBOX_BROKEN
  if (mr->org == MC_ORG_GBOX)
    mcu_gbox_fixup(mr, num, mi);
#endif
}


/*
 * Reset one un-core MCA bank.
 * Any quirks go here.
 */

static void
mcu_reset(McuRec * mr, int num, int arm)
{
  uint64_t ctl;

  mr->wq(num, mr->ofs + MCU_STAT, 0);

  if (! (mr->qflg & MCU_NO_ADDR)) {
    if (mr->qflg & MCU_ADDR_32)
      mr->wl(num, mr->ofs + MCU_ADDR, 0);
    else
      mr->wq(num, mr->ofs + MCU_ADDR, 0);
  }

  if (! (mr->qflg & MCU_NO_MISC)) {
    if (mr->qflg & MCU_MISC_64)
      mr->wq(num, mr->ofs + MCU_MISC, 0);
    else
      mr->wl(num, mr->ofs + MCU_MISC, 0);
  }

  ctl = arm ? mr->ctl : 0;

#ifdef CONFIG_MK1OM
  if (ctl && mr->org == MC_ORG_SBOX && mic_hw_stepping(0) == KNC_A_STEP)
    ctl &= ~PUT_BIT(3, 1);                /* A0 SBOX 'unclaimed address' bug */

  if (ctl && mr->org == MC_ORG_GBOX && mr_mch() != 16)
    ctl &= ~(uint64_t) PUT_BIT(6, 1);     /* B0 GBOX 'Invalid Channel' (SKU 3 & 4) */
#endif

  if (mr->qflg & MCU_CTL_64)
    mr->wq(num, mr->ofs + MCU_CTRL, ctl);
  else
    mr->wl(num, mr->ofs + MCU_CTRL, ctl);
}


/*
 * Un-core MC bank pre-scan.
 * Walk through all un-core MC sources to see if any events are pending.
 * Stops on the 1st match where STATUS has the VAL bit set. On some BOXes,
 * like the GBOX, an interrupt may be signalled without the EN bit being
 * set. See HSD 4116374 for details.
 */

static int
mcu_prescan(void)
{
  int       i, j;
  uint64_t  status;
  struct _mcu_rec * mr;

  for(i = 0; i < ARRAY_SIZE(mcu_src); i++) {
    mr = mcu_src + i;

#ifdef CONFIG_MK1OM
    if (mr->org == MC_ORG_TBOX && !mr_txs())
      continue;
#endif

    for(j = 0; j < mr->num; j++) {
      status = mr->rq(j, mr->ofs + MCU_STAT);
      if (status & MCI_STATUS_VAL)
        return 1;
    }
  }

  return 0;
}


/*
 * Un-core MC bank scanner.
 * Walks through all un-core MC sources for new events.
 * If any are found, then process them the same way as core events.
 */

static int
mcu_scan(void)
{
  MceInfo  mc, uc;
  int      gone, seen;
  int      i, j;
  struct _mcu_rec * mr;

  /*
   * Walk list of known un-core MC sources
   */
  gone = seen = 0;
  memset(&uc, 0, sizeof(uc));
  for(i = 0; i < ARRAY_SIZE(mcu_src); i++) {
    mr = mcu_src + i;

#ifdef CONFIG_MK1OM
    if (mr->org == MC_ORG_TBOX && !mr_txs())
      continue;
#endif

    for(j = 0; j < mr->num; j++) {

      /*
       * Read status to see if we have something of interest.
       * As per HSD 4116374 the status register is cleared
       * after read, if it had valid content.
       *TBD: Clear unconditionally?
       */
      mc.status = mr->rq(j, mr->ofs + MCU_STAT);
      if (mc.status & MCI_STATUS_VAL)
        mr->wq(j, mr->ofs + MCU_STAT, 0);
      else
        continue;

      /*
       * Bank had valid content (VAL bit set).
       * Verify the event was subscribed to (EN bit set).
       * If not, the event is ignored.
       */
      if (! (mc.status & MCI_STATUS_EN))
        continue;

      /*
       * Valid and enabled event, read remaining bank registers.
       */
      seen++;
      mcu_read(mr, j, &mc);

      /*
       * Fill out blanks in the MceInfo record
       */
      mc.org = mr->org;
      mc.id = j;
      mc.stamp = get_seconds();
      mc.flags = (mc.status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0;

      /*
       * If there is any way to detect injected errors, then this is
       * the place to do so and indicate it by the MC_FLG_FALSE flag.
       */

      if (mc.flags & MC_FLG_FATAL) {
#ifdef CONFIG_MK1OM
#if MC_VERBOSE
        ee_printk("Uncore fatal MC: org %d, id %d, status %llx\n", mc.org, mc.id, mc.status);
#endif

        /*
         * Log UC events in the eeprom.
         */
        micras_mc_log(&mc);
        mc.flags |= MC_FLG_LOG;

        /*
         * Notify SMC that we've had a serious machine check error.
         */
        micras_mc_ipmi(&mc, 1);
#endif
        /*
         * Remember 1st fatal (UC) event
         */
        if (! gone++)
          uc = mc;
      }

      /*
       * Notify host
       */
      micras_mc_send(&mc, 1);

      /*
       * Filter corrected errors.
       */
      if (! (mc.flags & MC_FLG_FATAL)) {
        uint64_t tsc, msk;

        tsc = rdtsc();
        msk = micras_mc_filter(&mc, tsc, 1);
        if (msk) {
#if MC_VERBOSE
          ee_printk("Uncore filter: org %d, id %d, ctrl %llx, mask %llx\n", mc.org, mc.id, mc.ctl, msk);
#endif
          if (mr->qflg & MCU_CTL_64)
            mr->wq(j, mr->ofs + MCU_CTRL, mc.ctl & ~msk);
          else
            mr->wl(j, mr->ofs + MCU_CTRL, (uint32_t)(mc.ctl & ~msk));
        }
      }

      /*
       * Any event post processing goes here.
       * This would be things like cache line refresh and such.
       * Actual algorithms are TBD.
       */
    }
  }

#if RAS_HALT
  if (gone) {
    atomic_inc(&mce_entry);
    panic("FATAL un-core machine check event:\n"
          "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n",
          uc.org, uc.id, uc.ctl, uc.status, uc.addr, uc.misc);
  }
#endif

  return seen;
}


/*
 * NMI handler.
 *
 * Once we get control in the 1st interrupt (NMI or regular), we'll
 * use IPIs from the local APIC to force all active CPUs into
 * our RAS NMI handler, similar to the core MC handler.
 * After that, the same logic as for the generic MC handler is
 * applied to corral all CPUs through well defined rendez-vous
 * points where only one CPU gets to run the un-core MC event
 * scan while everybody else is sitting in a holding pen.
 * If containment wasn't an issue we could simply let the BP
 * run the scan without involving other CPUs at all.
 */
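
/*
 * The resulting per-CPU flow, as implemented in mcu_nmi() below
 * (sketch only, special-casing of the SBOX target CPU elided):
 *
 *   order = mcu_wait();    // entry + hold rendez-vous
 *   if (mcu_go(order))     // order 1 runs mcu_scan(), others hold
 *     mcu_scan();          // on rendez-vous timeout, scan anyway
 */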

#define SPINUNIT    50
#define SERIAL_MCU  0

struct cpumask  mcu_exc_mask;     /* NMI recipients */
static int      mcu_cpu = -1;     /* SBOX target CPU */
#if MCU_NMI
static uint64_t mcu_redir;        /* SBOX I/O-APIC redirection entry */
static uint64_t mcu_old_redir;    /* Restore value for redirection entry */
#else
unsigned int    mcu_eoi;          /* 1st interrupt from local APIC */
#endif
static atomic_t mcu_callin;       /* Entry rendez-vous gate */
static atomic_t mcu_leavin;       /* Hold rendez-vous gate */


static int
mcu_timed_out(int64_t * timeout)
{
  if (*timeout < SPINUNIT)
    return 1;

  *timeout -= SPINUNIT;
  touch_nmi_watchdog();
  ndelay(SPINUNIT);

  return 0;
}
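
/*
 * Budget arithmetic (editor's note, not in the original source): each
 * poll above burns SPINUNIT (50) ns of the caller's budget and delays
 * by the same amount, so a 1 second budget permits NSEC_PER_SEC /
 * SPINUNIT, i.e. roughly 20 million polls, before a rendez-vous is
 * abandoned.
 */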


static int
mcu_wait(void)
{
  int     cpus, order;
  int64_t timeout;

  cpus = num_online_cpus();
  timeout = 1 * NSEC_PER_SEC;   /* 1 Second */

  /*
   * Flush all caches
   */

  /*
   * 'Entry' rendez-vous point.
   * Wait here until all CPUs have entered.
   */
  order = atomic_inc_return(&mcu_callin);
  while(atomic_read(&mcu_callin) != cpus) {
    if (mcu_timed_out(&timeout)) {
      /*
       * Timeout waiting for CPU enter rendez-vous
       */
      return -1;
    }
  }

  /*
   * 'Hold' rendez-vous point.
   * All CPUs drop by here 'simultaneously'.
   * The first CPU that 'enter'ed (order of 1) will
   * fall thru while the others wait until their
   * number comes up in the 'leavin' counter
   * (or if a timeout happens). This also has a
   * serializing effect, where one CPU leaves this
   * loop at a time.
   */
  if (order == 1) {
#if SERIAL_MCU
    atomic_set(&mcu_leavin, 1);
#endif
  }
  else {
    while(atomic_read(&mcu_leavin) < order) {
      if (mcu_timed_out(&timeout)) {
        /*
         * Timeout waiting in CPU hold rendez-vous
         */
        return -1;
      }
    }
  }

  return order;
}


static int
mcu_go(int order)
{
  int     ret;
  int64_t timeout;

  ret = -1;
  if (order < 0)
    goto mcu_reset;

#if SERIAL_MCU
  /*
   * If any 'per-CPU' activity is needed in isolation
   * (one CPU at a time) then that code needs to go here.
   */

  atomic_inc(&mcu_leavin);    /* Next CPU out of hold */
#endif

  timeout = NSEC_PER_SEC;     /* 1 Second */
  if (order == 1) {
    int cpus;

    /*
     * The first CPU that entered (order of 1) waits here
     * for the others to leave the 'hold' loop in mcu_wait()
     * and enter the 'exit' rendez-vous loop below.
     * Once they are there, it will run the uncore MCA bank
     * scan while the others are parked in the 'exit' loop below.
     */
    cpus = num_online_cpus();
#if SERIAL_MCU
    while(atomic_read(&mcu_leavin) <= cpus) {
      if (mcu_timed_out(&timeout)) {
        /*
         * Timeout waiting for CPU exit rendez-vous
         */
        goto mcu_reset;
      }
    }
#else
    atomic_set(&mcu_leavin, cpus);
#endif
    mcu_scan();
    ret = 0;
  }
  else {
    /*
     * Exit rendez-vous point.
     */
    while(atomic_read(&mcu_leavin) != 0) {
      if (mcu_timed_out(&timeout)) {
        /*
         * Timeout waiting in CPU exit rendez-vous
         */
        goto mcu_reset;
      }
    }
    return 0;
  }

  /*
   * Reset rendez-vous counters, letting all CPUs
   * leave this function 'simultaneously'.
   */
mcu_reset:
  atomic_set(&mcu_callin, 0);
  atomic_set(&mcu_leavin, 0);
  return ret;
}


/*
 * NMI exception handler.
 * It is uncertain whether all cpumask_* functions imply barriers,
 * so, erring on the safe side, explicit barriers are used.
 */

#if BEAM_TEST
static int
mcu_nmi(int cpu)
{
#ifdef CONFIG_MK1OM
  uint32_t mcg_status_lo, mcg_status_hi;
#endif
  struct _mcu_rec * mr;
  MceInfo  mc;
  int      i, j;

  if (cpu != mcu_cpu)
    return 0;

  if (! mcu_prescan())
    return 0;

  wbinvd();

#ifdef CONFIG_MK1OM
  rdmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi);
  wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo | MCG_STATUS_MCIP, mcg_status_hi);
#endif

  for(i = 0; i < ARRAY_SIZE(mcu_src); i++) {
    mr = mcu_src + i;

#ifdef CONFIG_MK1OM
    if (mr->org == MC_ORG_TBOX && !mr_txs())
      continue;
#endif

    for(j = 0; j < mr->num; j++) {
      mc.status = mr->rq(j, mr->ofs + MCU_STAT);

      if (! (mc.status & MCI_STATUS_VAL))
        continue;

      if (! (mc.status & MCI_STATUS_EN)) {
        mr->wq(j, mr->ofs + MCU_STAT, 0);
        continue;
      }

      mcu_read(mr, j, &mc);
      mr->wq(j, mr->ofs + MCU_STAT, 0);

      mc.org = mr->org;
      mc.id = j;
      mc.stamp = get_seconds();
      mc.flags = (mc.status & MCI_STATUS_UC) ? MC_FLG_FATAL : 0;

      micras_mc_send(&mc, 1);
    }
  }

#ifdef CONFIG_MK1OM
  wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi);
#endif
  return 1;

  /*
   * Damn compiler options !!!!!!
   * Don't want more changes than this routine, so
   * added dummies to shut up gcc about unused code.
   */
  i = mcu_wait();
  mcu_go(i);
}
#else

static atomic_t mcu_entry;

static int
mcu_nmi(int cpu)
{
#ifdef CONFIG_MK1OM
  uint32_t mcg_status_lo, mcg_status_hi;
#endif
  int order, eoi;

  atomic_inc(&mcu_entry);

  /*
   * Get MCA status from SBOX.
   */
#if 0
  /*
   * If no source bits are set, this was not an un-core MCA.
   * This would work if SBOX_MCA_INT_STAT actually worked
   * as described in both the HAS and the register specification.
   * Unfortunately, it doesn't, as per tribal knowledge errata.
   */
  uint32_t int_stat, int_en;

  int_en = mr_sbox_rl(0, SBOX_MCA_INT_EN);
  int_stat = mr_sbox_rl(0, SBOX_MCA_INT_STAT);
  if (! (int_en & int_stat)) {
    atomic_dec(&mcu_entry);
    return 0;
  }
#else
  /*
   * Instead of having a single source of pending un-core MCA events,
   * we now have to walk all BOXes to check if there is a valid event
   * pending in one of them. That is much more expensive, as we have
   * to check this on all NMIs, including our own cascade NMIs used
   * to corral all CPUs in their rendez-vous point(s). We try to avoid
   * this scan if there already is an un-core NMI in progress.
   * We know that:
   *   un-core MCA NMIs are sent to just one CPU, mcu_cpu
   *   CPUs targeted in the cascade are in mcu_exc_mask
   *   a non-zero atomic variable 'mcu_callin' tells a cascade is in progress
   */
  if (!cpumask_empty(&mcu_exc_mask))
    goto invited;
  if (cpu != mcu_cpu) {
    atomic_dec(&mcu_entry);
    return 0;
  }

  /*
   * On CPU 0 and no un-core handling in progress!
   * Then scan all BOXes for valid events pending.
   * If there wasn't any, this is a false alarm and
   * we'll re-connect MC lines and return.
   */
  if (! mcu_prescan()) {
    atomic_dec(&mcu_entry);
    return 0;
  }

invited:
#endif

  /*
   * Flush all caches.
   * This is uncore, so it should not be necessary to
   * empty internal (L1) caches, but it doesn't harm either.
   */
  wbinvd();

  /*
   * We do not want to be interrupted by a core MC
   * exception while handling an NMI. We can block
   * core MC events by setting MCG_STATUS_MCIP.
   * This is an MSR, so it has to be done on all CPUs.
   * On KnC that is; KnF does not have that MSR.
   */
#ifdef CONFIG_MK1OM
  rdmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi);
  wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo | MCG_STATUS_MCIP, mcg_status_hi);
#endif

  /*
   * Special for the SBOX NMI target CPU:
   * - disconnect un-core MC lines from the SBOX I/O-APIC, such
   *   that we don't get stacked NMIs in the Local APICs.
   * - simulate an NMI broadcast by sending NMI to all _other_
   *   active CPUs via IPIs. The SBOX could do a broadcast,
   *   but that would send NMIs to sleeping CPUs too, which
   *   we prefer to avoid if possible.
   *TBD: should creating the mcu_exc_mask be protected by a
   *   lock, similar to core events? Who can interfere?
   */
  if (cpu == mcu_cpu) {
    mr_sbox_wl(0, SBOX_MCA_INT_EN, 0);
    cpumask_copy(&mcu_exc_mask, cpu_online_mask);
    cpumask_clear_cpu(cpu, &mcu_exc_mask);
    smp_wmb();
    // apic->send_IPI_mask(&mcu_exc_mask, NMI_VECTOR);
    apic->send_IPI_allbutself(NMI_VECTOR);
#if !MCU_NMI
    if (mcu_eoi) {
      smp_rmb();
      cpumask_set_cpu(cpu, &mcu_exc_mask);
      smp_wmb();
      mcu_eoi = 0;
    }
#endif
  }

  /*
   * Corral all CPUs through the rendez-vous point maze.
   * It guarantees that:
   * - No CPU leaves mcu_wait() until all have entered.
   * - One CPU leaves mcu_wait() at a time.
   * - No CPU leaves mcu_go() until all have entered.
   * - While one CPU is in transit between mcu_wait()
   *   and mcu_go(), all other CPUs are sitting in
   *   tight busy-wait loops in either function.
   * - All CPUs leave mcu_go() at the same time.
   * If there is any 'per-cpu' activity that needs to be
   * run in isolation, it must be placed between mcu_wait()
   * and mcu_go().
   */
  order = mcu_wait();
  if (mcu_go(order)) {
    /*
     * Timeout waiting at one of the rendez-vous points.
     * Scan the un-core MCA banks just in case.
     */
    mcu_scan();
  }

  /*
   * Special for the SBOX NMI target CPU:
   * - reconnect un-core MC lines through to the SBOX I/O-APIC.
   *   If new events already are pending, then this will
   *   result in a 'rising-edge' trigger to the I/O-APIC.
   */
  if (cpu == mcu_cpu)
    mr_sbox_wl(0, SBOX_MCA_INT_EN, mr_txs() ? 0x0fffff07 : 0xff07);

  /*
   * If this CPU got its NMI from an IPI, then it must
   * send an ACK to its local APIC (I think).
   */
  smp_rmb();
  eoi = cpumask_test_and_clear_cpu(cpu, &mcu_exc_mask);
  smp_wmb();
  if (eoi)
    ack_APIC_irq();

  /*
   * Restore core MCG status and return 1, indicating to the
   * kernel NMI handler that we've handled it.
   *TBD: reduce to one write per core instead of one per thread?
   */
#ifdef CONFIG_MK1OM
  wrmsr(MSR_IA32_MCG_STATUS, mcg_status_lo, mcg_status_hi);
#endif
  atomic_dec(&mcu_entry);
  return 1;
}
#endif


#if !MCU_NMI
/*
 * MCA handler if using standard interrupts.
 * It's just a trampoline to convert a regular interrupt
 * into an NMI, which is only needed if the I/O-APIC can't
 * generate an NMI.
 *
 *TBD: remove all this? It is not used on KnC, and the KnFs
 *   I've tested this on have all been OK sending NMIs.
 */

static irqreturn_t
sbox_handler(int irq, void * tag)
{
  /*
   * Convert this regular interrupt into an NMI.
   */
  mcu_cpu = smp_processor_id();
  mcu_eoi = 1;
  apic->send_IPI_self(NMI_VECTOR);
  return IRQ_HANDLED;
}
#endif


/*
 * Reset all uncore MCA banks to defaults
 */

void
box_reset(int arm)
{
  int i, j;
  struct _mcu_rec * mr;

  for(i = 0; i < ARRAY_SIZE(mcu_src); i++) {
    mr = mcu_src + i;

#ifdef CONFIG_MK1OM
    if (mr->org == MC_ORG_TBOX && !mr_txs())
      continue;
#endif

    for(j = 0; j < mr->num; j++) {
      uint64_t status;

      /*
       *TBD: Do we want to pick up existing MCA events or drop
       *   them because we don't know _when_ they occurred?
       *   Reporting them would require an internal buffer because
       *   it's unlikely the SCIF MC session is up at this point.
       *   For now we just enter events into the system log.
       */
      status = mr->rq(j, mr->ofs + MCU_STAT);
      if (status & MCI_STATUS_VAL) {
        MceInfo mc;

        mcu_read(mr, j, &mc);
        printk("RAS.uncore: discard MC event:\n"
               "bnk %d, id %d, ctl %llx, stat %llx, addr %llx, misc %llx\n",
               mr->org, j, mc.ctl, status, mc.addr, mc.misc);
      }

      /*
       * Reset MCA bank registers.
       */
      mcu_reset(mr, j, arm);
    }
  }
}


/*
 * Setup interrupt handlers by hooking into the SBOX's I/O-APIC.
 * For now, we send an NMI to a single CPU and let it process the
 * event. This may need to be expanded into a broadcast NMI, similar
 * to what the generic core MC event handler does, in order to keep
 * containment as high as we possibly can.
 *
 *TBD: code a dual rendez-vous mechanism on all active CPUs.
 */

int __init
mcu_init(void)
{
#if MC_VERBOSE
  int i, j;
#endif

  if (mce_disabled) {
    printk("RAS.uncore: disabled\n");
  }
  else {
    /*
     * Clear rendez-vous counters
     */
    atomic_set(&mcu_callin, 0);
    atomic_set(&mcu_leavin, 0);

#if MC_VERBOSE
    /*
     * For debug only:
     * Record all SBOX I/O-APIC registers to the kernel log.
     */
    printk("SBOX_APICIDR: %x\n", mr_sbox_rl(0, SBOX_APICIDR));
    printk("SBOX_APICVER: %x\n", mr_sbox_rl(0, SBOX_APICVER));
    printk("SBOX_APICAPR: %x\n", mr_sbox_rl(0, SBOX_APICAPR));
    for(i = 0; i < 26 ; i++)
      printk("APICRT%d: %llx\n", i, mr_sbox_rq(0, SBOX_APICRT0 + (8 * i)));
    for(i = 0; i < 8 ; i++)
      printk("APICICR%d: %llx\n", i, mr_sbox_rq(0, SBOX_APICICR0 + (8 * i)));
    printk("SBOX_MCA_INT_EN: %x\n", mr_sbox_rl(0, SBOX_MCA_INT_EN));
    printk("SBOX_MCA_INT_STAT: %x\n", mr_sbox_rl(0, SBOX_MCA_INT_STAT));
#endif

    /*
     * Disconnect un-core MC lines from the SBOX I/O-APIC, setup the
     * individual BOXes, and clear any un-core MC pending flags
     * from the SBOX I/O-APIC.
     */
    mr_sbox_wl(0, SBOX_MCA_INT_EN, 0);
    box_reset(1);
    mr_sbox_wl(0, SBOX_MCA_INT_STAT, 0);

    /*
     * Setup the SBOX I/O-APIC.
     * Un-core MC events are routed through a mask in register
     * SBOX_MCA_INT_EN into I/O-APIC redirection table entry #16.
     * Ideally we want all uncore MC events to be handled similar
     * to core MCAs, which means we'd like an NMI on all CPUs.
     * On KnF the I/O-APIC may not trigger an NMI (PoC security)
     * and on KnC, where NMI delivery is possible, it appears not
     * to be ideal to broadcast it to all CPUs because it could
     * wake up cores put to sleep by power management rules.
     * See MCA HAS, SBOX HAS Vol 4, and A0 Vol 2 for details.
     *
     * The redirection table entry has the following format:
     *   47:32  Destination ID field
     *   17     Interrupt set (testing: trigger an interrupt)
     *   16     Interrupt mask (0=enable, 1=disable)
     *   15     Trigger mode (0=edge, 1=level)
     *   14     Remote IRR (0=inactive, 1=accepted)
     *   13     Interrupt polarity (0=active_high, 1=active_low)
     *   12     Delivery status (0=idle, 1=send_pending)
     *   11     Destination mode (0=physical, 1=logical)
     *   10:8   Delivery mode (0=fixed, low, SMI, rsvd, NMI, INIT, rsvd, ext)
     *   7:0    Interrupt vector
     *
     * The I/O-APIC input is 'rising edge', so we'd need to select
     * it to be edge triggered, active high.
     */
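    /*
     * Worked example (illustration only, not in the original source):
     * NMI delivery is mode 4 in bits 10:8, i.e. 0x400, and the target
     * APIC ID goes in bits 47:32. With APIC ID 0 the entry built in
     * the MCU_NMI branch below is 0x400, or 0x10400 while the mask
     * bit (16) is held high during programming.
     */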
#if MCU_NMI
    /*
     * If event delivery by NMI is preferred, we want it delivered on
     * the BP. There is already an NMI handler present, so we have to
     * tap into the existing NMI handler for the event notifications.
     *
     * The bit-fiddling below says:
     *   NMI delivery | Destination CPU APIC ID
     */
    mcu_cpu = 0;
    mcu_redir = PUT_BITS(10, 8, 4) | PUT_BITS(47, 32, (uint64_t) cpu_data(mcu_cpu).apicid);
    mcu_old_redir = mr_sbox_rq(0, SBOX_APICRT16);
    mr_sbox_wq(0, SBOX_APICRT16, mcu_redir | PUT_BITS(16, 16, 1));
    mr_sbox_wq(0, SBOX_APICRT16, mcu_redir);
#else
    /*
     * If event delivery by regular interrupt is preferred, then all
     * I/O-APIC setup will be handled by calling request_irq(16,..).
     * There is no guarantee that the event will be sent to the BP
     * (though it's more than likely), so we'll defer identifying the
     * event handling CPU (mcu_cpu) till we receive the callback from
     * the interrupt handling sub-system.
     * The sbox_handler() function just converts the callback into an
     * NMI, because the only way containment can be achieved is to be
     * able to lock down the system completely, which is not realistic
     * using regular interrupts.
     */
    mcu_eoi = 0;
    (void) request_irq(16, sbox_handler, IRQF_TRIGGER_HIGH, "un-core mce", (void *) 42);
#endif

    /*
     * Finally, place a hook in the NMI handler in case there's
     * an un-core event pending, and connect un-core MC lines
     * through to the SBOX I/O-APIC. From this point onwards we
     * can get uncore MC events at any time.
     */
    mca_nmi = mcu_nmi;
    mr_sbox_wl(0, SBOX_MCA_INT_EN, mr_txs() ? 0x0fffff07 : 0xff07);

#if MC_VERBOSE
    /*
     * For debug only:
     * Record initial uncore MCA banks to the kernel log.
     */
    printk("RAS.uncore: dumping all banks\n");

    /*
     * Dump all MCA registers we set to the kernel log.
     */
    for(i = 0; i < ARRAY_SIZE(mcu_src); i++) {
      char * boxname;
      struct _mcu_rec * mr;
      uint64_t ctl, stat, addr, misc;

      mr = mcu_src + i;
#ifdef CONFIG_MK1OM
      if (mr->org == MC_ORG_TBOX && !mr_txs())
        continue;
#endif
      switch(mr->org) {
        case MC_ORG_SBOX: boxname = "SBOX"; break;
        case MC_ORG_DBOX: boxname = "DBOX"; break;
        case MC_ORG_GBOX: boxname = "GBOX"; break;
        case MC_ORG_TBOX: boxname = "TBOX"; break;
        default:          boxname = "??";   /* Damn compiler */
      }

      for(j = 0; j < mr->num; j++) {

        if (mr->qflg & MCU_CTL_64)
          ctl = mr->rq(j, mr->ofs + MCU_CTRL);
        else
          ctl = (uint64_t) mr->rl(j, mr->ofs + MCU_CTRL);

        stat = mr->rq(j, mr->ofs + MCU_STAT);

        if (mr->qflg & MCU_NO_ADDR)
          addr = 0;
        else {
          if (mr->qflg & MCU_ADDR_32)
            addr = (uint64_t) mr->rl(j, mr->ofs + MCU_ADDR);
          else
            addr = mr->rq(j, mr->ofs + MCU_ADDR);
        }

        if (mr->qflg & MCU_NO_MISC)
          misc = 0;
        else {
          if (mr->qflg & MCU_MISC_64)
            misc = mr->rq(j, mr->ofs + MCU_MISC);
          else
            misc = (uint64_t) mr->rl(j, mr->ofs + MCU_MISC);
        }

        printk("RAS.uncore: %s[%d] = { %llx, %llx, %llx, %llx }\n",
               boxname, j, ctl, stat, addr, misc);
      }
    }
    printk("RAS.uncore: MCA_INT_EN = %x\n", mr_sbox_rl(0, SBOX_MCA_INT_EN));
    printk("RAS.uncore: APICRT16 = %llx\n", mr_sbox_rq(0, SBOX_APICRT16));
#endif

    printk("RAS.uncore: init complete\n");
  }

  return 0;
}


/*
 * Cleanup for module unload.
 * Clear/restore hooks in the SBOX's I/O-APIC.
 */

int __exit
mcu_exit(void)
{
  if (! mce_disabled) {

    /*
     * Disconnect uncore MC lines from the SBOX I/O-APIC.
     * No new uncore MC interrupts will be generated.
     */
    mr_sbox_wl(0, SBOX_MCA_INT_EN, 0);

    /*
     * Disconnect exception handler.
     */
#if MCU_NMI
    mcu_redir = 0;
    mr_sbox_wq(0, SBOX_APICRT16, mcu_old_redir);
#else
    mcu_eoi = 0;
    free_irq(16, (void *) 42);
#endif

    /*
     * Cut the link from the kernel's NMI handler and
     * wait for everybody in the handler to leave.
     */
    mca_nmi = 0;
    while(atomic_read(&mcu_entry))
      cpu_relax();
    mcu_cpu = -1;

    /*
     * No more events will be received; clear
     * MC reporting in all BOXes (just in case).
     */
    box_reset(0);
  }

  printk("RAS.uncore: exit complete\n");
  return 0;
}