1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Family-specific decoder callbacks; allocated and filled in by mce_amd_init(). */
6 static struct amd_decoder_ops *fam_ops;
/* Mask handed to XEC() by most bank decoders when extracting the extended error code. */
8 static u8 xec_mask = 0xf;
/* Overridden to 0x3 in one branch of mce_amd_init(); no reader is visible in this view. */
9 static u8 nb_err_cpumask = 0xf;
/* Tested by amd_filter_mce() for bank 4 records with xec 0x5; set via amd_report_gart_errors(). */
11 static bool report_gart_errors;
/* Callback invoked by decode_mc4_mce(); cleared to NULL by amd_unregister_ecc_decoder(). */
12 static void (*nb_bus_decoder)(int node_id, struct mce *m);
/*
 * amd_report_gart_errors - set the file-scope report_gart_errors flag.
 * @v: new flag value; amd_filter_mce() tests the flag for bank 4 records
 *     whose extended error code is 0x5.
 */
14 void amd_report_gart_errors(bool v)
16 report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * amd_register_ecc_decoder - exported entry point taking a decoder callback.
 * @f: callback with the same signature as nb_bus_decoder.
 *
 * NOTE(review): the function body is not visible in this view; presumably it
 * stores @f in nb_bus_decoder - confirm against the full file.
 */
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * amd_unregister_ecc_decoder - drop the callback stored in nb_bus_decoder.
 * @f: callback the caller believes is currently installed.
 *
 * NOTE(review): lines around the statements below are missing from this view,
 * so any enclosing condition is not shown.
 */
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
/* Warn when the caller is removing a callback other than the installed one. */
29 WARN_ON(nb_bus_decoder != f);
31 nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37 * string representation for the different MCA reported error types, see F3x48
41 /* transaction type */
/* Exported string table; presumably indexed through TT_MSG() - macro not visible here. */
42 const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs);
/*
 * Exported cache-level names (amd_decode_err_code() prints LL_MSG() under the
 * "cache level" label); presumably indexed through LL_MSG() - macro not
 * visible here.
 */
46 const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs);
49 /* memory transaction type */
/*
 * Exported string table; presumably indexed through R4_MSG() - macro not
 * visible here.
 * NOTE(review): the closing line of the initializer is missing from this view.
 */
50 const char * const rrrr_msgs[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs);
55 /* participating processor */
/* Exported string table; presumably indexed through PP_MSG() - macro not visible here. */
56 const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs);
/*
 * Exported timeout-state names; presumably indexed through TO_MSG(), which
 * amd_decode_err_code() prints next to the participating processor.
 */
60 const char * const to_msgs[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs);
/*
 * Exported memory/IO names (amd_decode_err_code() prints II_MSG() under the
 * "mem/io" label); presumably indexed through II_MSG() - macro not visible
 * here.
 */
64 const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Description strings used by f15h_mc1_mce(), which indexes this table with
 * xec, xec-2 or xec-4. The trailing "xec = ..." comments give the extended
 * error code that maps onto the annotated entry.
 * NOTE(review): the tail of the table is not visible in this view.
 */
67 static const char * const f15h_mc1_mce_desc[] = {
68 "UC during a demand linefill from L2",
69 "Parity error during data load from IC",
70 "Parity error for IC valid bit",
71 "Main tag parity error",
72 "Parity error in prediction queue",
73 "PFB data/address parity error",
74 "Parity error in the branch status reg",
75 "PFB promotion address error",
76 "Tag error during probe/victimization",
77 "Parity error for IC probe tag valid bit",
78 "PFB non-cacheable bit parity error",
79 "PFB valid bit parity error", /* xec = 0xd */
80 "Microcode Patch Buffer", /* xec = 0x10 */
/*
 * Description strings used by f15h_mc2_mce(), which indexes this table with
 * xec - 0x4 or xec - 0x7.
 * NOTE(review): some entries and the closing line are missing from this view.
 */
87 static const char * const f15h_mc2_mce_desc[] = {
88 "Fill ECC error on data fills", /* xec = 0x4 */
89 "Fill parity error on insn fills",
90 "Prefetcher request FIFO parity error",
91 "PRQ address parity error",
92 "PRQ data parity error",
95 "WCB Data parity error",
96 "VB Data ECC or parity error",
97 "L2 Tag ECC error", /* xec = 0x10 */
98 "Hard L2 Tag ECC error",
99 "Multiple hits on L2 tag",
101 "PRB address parity error"
/*
 * Description strings used by decode_mc4_mce(), which indexes this table
 * either directly with xec or with xec minus an offset.
 * NOTE(review): some entries and the closing line are missing from this view.
 */
104 static const char * const mc4_mce_desc[] = {
105 "DRAM ECC error detected on the NB",
106 "CRC error detected on HT link",
107 "Link-defined sync error packets detected on HT link",
110 "Invalid GART PTE entry during GART table walk",
111 "Unsupported atomic RMW received from an IO link",
112 "Watchdog timeout due to lack of progress",
113 "DRAM ECC error detected on the NB",
114 "SVM DMA Exclusion Vector error",
115 "HT data error detected on link",
116 "Protocol error (link, L3, probe filter)",
117 "NB internal arrays parity error",
118 "DRAM addr/ctl signals parity error",
119 "IO link transmission error",
120 "L3 data cache ECC error", /* xec = 0x1c */
121 "L3 cache tag error",
122 "L3 LRU parity bits error",
123 "ECC Error in the Probe Filter directory"
/*
 * Description strings used by decode_mc5_mce(), which indexes this table
 * directly with xec.
 * NOTE(review): some entries and the closing line are missing from this view.
 */
126 static const char * const mc5_mce_desc[] = {
127 "CPU Watchdog timer expire",
128 "Wakeup array dest tag",
132 "Retire dispatch queue",
133 "Mapper checkpoint array",
134 "Physical register file EX0 port",
135 "Physical register file EX1 port",
136 "Physical register file AG0 port",
137 "Physical register file AG1 port",
138 "Flag register file",
/*
 * f12h_mc0_mce - MC0 decode helper; installed as fam_ops->mc0_mce by
 * mce_amd_init() and also called by f10h_mc0_mce().
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code.
 *
 * Continues the line started by the caller with pr_cont().
 * NOTE(review): the conditions ahead of the first message and the return
 * statements are missing from this view.
 */
142 static bool f12h_mc0_mce(u16 ec, u8 xec)
151 pr_cont("during L1 linefill from L2.\n");
/* L1 level: data/tag error qualified by the R4 message for ec. */
152 else if (ll == LL_L1)
153 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
/*
 * f10h_mc0_mce - MC0 decode helper that recognizes the data-scrub signature
 * and defers to f12h_mc0_mce() for decoding.
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code.
 *
 * NOTE(review): the lines that end the if-branch are missing from this view.
 */
160 static bool f10h_mc0_mce(u16 ec, u8 xec)
/* Generic memory transaction at L1 level is reported as a data scrub. */
162 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
163 pr_cont("during data scrub.\n");
/* Delegate to the f12h decoder and pass its result through. */
166 return f12h_mc0_mce(ec, xec);
/*
 * k8_mc0_mce - MC0 decode helper that reports a system-linefill message and
 * defers to f10h_mc0_mce() for decoding.
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code.
 *
 * NOTE(review): the condition guarding the message is missing from this view.
 */
169 static bool k8_mc0_mce(u16 ec, u8 xec)
172 pr_cont("during system linefill.\n");
/* Delegate to the f10h decoder and pass its result through. */
176 return f10h_mc0_mce(ec, xec);
/*
 * f14h_mc0_mce - MC0 decode helper; installed as fam_ops->mc0_mce in one
 * branch of mce_amd_init().
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code.
 *
 * NOTE(review): most branch headers and all return statements are missing
 * from this view; the comments below cover only what is visible.
 */
179 static bool f14h_mc0_mce(u16 ec, u8 xec)
/* Signatures that are not a data transaction at L1 level take this branch. */
186 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
/* R4_DRD is worded as a load/hw prefetch, any other value as a store. */
192 pr_cont("Data/Tag parity error due to %s.\n",
193 (r4 == R4_DRD ? "load/hw prf" : "store"));
196 pr_cont("Copyback parity error on a tag miss.\n");
199 pr_cont("Tag parity error during snoop.\n");
204 } else if (BUS_ERROR(ec)) {
/* Bus errors take this branch unless II is MEM or IO and LL is LL_LG. */
206 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
209 pr_cont("System read data error on a ");
213 pr_cont("TLB reload.\n");
/*
 * f15h_mc0_mce - MC0 decode helper; installed as fam_ops->mc0_mce in one
 * branch of mce_amd_init().
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code.
 *
 * NOTE(review): the selection logic that picks between the messages below
 * and the return statements are missing from this view.
 */
231 static bool f15h_mc0_mce(u16 ec, u8 xec)
239 pr_cont("Data Array access error.\n");
243 pr_cont("UC error during a linefill from L2/NB.\n");
248 pr_cont("STQ access error.\n");
252 pr_cont("SCB access error.\n");
256 pr_cont("Tag error.\n");
260 pr_cont("LDQ access error.\n");
266 } else if (BUS_ERROR(ec)) {
269 pr_cont("System Read Data Error.\n");
/* Fallback message that prints the raw xec value. */
271 pr_cont(" Internal error condition type %d.\n", xec);
/*
 * decode_mc0_mce - decode and print an MC0 bank error.
 * @m: MCE record; EC and XEC are extracted from m->status.
 *
 * NOTE(review): the outer TLB-error condition and the early returns are
 * missing from this view.
 */
278 static void decode_mc0_mce(struct mce *m)
280 u16 ec = EC(m->status);
281 u8 xec = XEC(m->status, xec_mask);
/* Start the report line; the pr_cont() calls below and in the helpers finish it. */
283 pr_emerg(HW_ERR "MC0 Error: ");
285 /* TLB error signatures are the same across families */
287 if (TT(ec) == TT_DATA) {
/* xec 2 is a locked miss, any other non-zero xec a multimatch, zero a parity error. */
288 pr_cont("%s TLB %s.\n", LL_MSG(ec),
289 ((xec == 2) ? "locked miss"
290 : (xec ? "multimatch" : "parity")));
/* Everything else goes to the family-specific MC0 callback. */
293 } else if (fam_ops->mc0_mce(ec, xec))
296 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
/*
 * k8_mc1_mce - MC1 decode helper; installed as fam_ops->mc1_mce for several
 * families in mce_amd_init().
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code.
 *
 * NOTE(review): the first condition, the inner selection logic and the
 * return statements are missing from this view.
 */
299 static bool k8_mc1_mce(u16 ec, u8 xec)
308 pr_cont("during a linefill from L2.\n");
309 else if (ll == 0x1) {
312 pr_cont("Parity error during data load.\n");
316 pr_cont("Copyback Parity/Victim error.\n");
320 pr_cont("Tag Snoop error.\n");
/*
 * f14h_mc1_mce - MC1 decode helper; installed as fam_ops->mc1_mce in one
 * branch of mce_amd_init().
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code.
 *
 * NOTE(review): the statement after the first condition, the first branch of
 * the if/else chain and the return statements are missing from this view.
 */
333 static bool f14h_mc1_mce(u16 ec, u8 xec)
/* Signatures with TT other than 0 or LL other than 1 take this branch. */
339 if (TT(ec) != 0 || LL(ec) != 1)
343 pr_cont("Data/tag array parity error for a tag hit.\n");
344 else if (r4 == R4_SNOOP)
345 pr_cont("Tag error during snoop/victimization.\n");
/*
 * f15h_mc1_mce - MC1 decode helper; installed as fam_ops->mc1_mce in one
 * branch of mce_amd_init().
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code, used to index f15h_mc1_mce_desc[].
 *
 * NOTE(review): the selection logic that picks the offset for each xec value
 * and the return statements are missing from this view.
 */
352 static bool f15h_mc1_mce(u16 ec, u8 xec)
/* Direct lookup. */
361 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
/* Lookup with xec shifted down by 2. */
365 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
/* Lookup with xec shifted down by 4. */
369 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
/* Same offset of 4, wrapped in a "Decoder ... parity error" message. */
373 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
/*
 * decode_mc1_mce - decode and print an MC1 bank error.
 * @m: MCE record; EC and XEC are extracted from m->status.
 *
 * NOTE(review): the first condition of the if/else chain and the early
 * returns are missing from this view.
 */
382 static void decode_mc1_mce(struct mce *m)
384 u16 ec = EC(m->status);
385 u8 xec = XEC(m->status, xec_mask);
/* Start the report line; the pr_cont() calls below and in the helpers finish it. */
387 pr_emerg(HW_ERR "MC1 Error: ");
/* Non-zero xec is a multimatch, zero a parity error. */
390 pr_cont("%s TLB %s.\n", LL_MSG(ec),
391 (xec ? "multimatch" : "parity error"));
392 else if (BUS_ERROR(ec)) {
/* k8 is true on family 0xf when bit 58 of the status word is set. */
393 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
395 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
/* Everything else goes to the family-specific MC1 callback. */
396 } else if (fam_ops->mc1_mce(ec, xec))
399 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
/*
 * k8_mc2_mce - MC2 decode helper; installed as fam_ops->mc2_mce for several
 * families in mce_amd_init().
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code.
 *
 * The messages start with " in ..." or ": ..." because they continue the
 * "MC2 Error" line opened by the caller.
 * NOTE(review): the first conditions, some inner branches and the return
 * statements are missing from this view.
 */
402 static bool k8_mc2_mce(u16 ec, u8 xec)
407 pr_cont(" in the write data buffers.\n");
409 pr_cont(" in the victim data buffers.\n");
/* xec 0x2 combined with a memory error: L2 cache tag error. */
410 else if (xec == 0x2 && MEM_ERROR(ec))
411 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
/* xec 0x0 is split further by the error-code class. */
412 else if (xec == 0x0) {
414 pr_cont(": %s error in a Page Descriptor Cache or "
415 "Guest TLB.\n", TT_MSG(ec));
416 else if (BUS_ERROR(ec))
417 pr_cont(": %s/ECC error in data read from NB: %s.\n",
418 R4_MSG(ec), PP_MSG(ec));
419 else if (MEM_ERROR(ec)) {
423 pr_cont(": %s error during data copyback.\n",
426 pr_cont(": %s parity/ECC error during data "
427 "access from L2.\n", R4_MSG(ec));
/*
 * f15h_mc2_mce - MC2 decode helper; installed as fam_ops->mc2_mce in one
 * branch of mce_amd_init().
 * @ec:  error code taken from MCi_STATUS.
 * @xec: extended error code, used to index f15h_mc2_mce_desc[].
 *
 * NOTE(review): the first condition, the xec selection logic and the return
 * statements are missing from this view.
 */
438 static bool f15h_mc2_mce(u16 ec, u8 xec)
444 pr_cont("Data parity TLB read error.\n");
446 pr_cont("Poison data provided for TLB fill.\n");
449 } else if (BUS_ERROR(ec)) {
453 pr_cont("Error during attempted NB data read.\n");
454 } else if (MEM_ERROR(ec)) {
/* Table lookup with xec shifted down by 0x4. */
457 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
/* Table lookup with xec shifted down by 0x7. */
461 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
472 static void decode_mc2_mce(struct mce *m)
474 u16 ec = EC(m->status);
475 u8 xec = XEC(m->status, xec_mask);
477 pr_emerg(HW_ERR "MC2 Error: ");
479 if (!fam_ops->mc2_mce(ec, xec))
480 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
/*
 * decode_mc3_mce - decode and print an MC3 bank error.
 * @m: MCE record; EC and XEC are extracted from m->status.
 *
 * NOTE(review): the xec checks, the r4 declaration and the early returns are
 * missing from this view.
 */
483 static void decode_mc3_mce(struct mce *m)
485 u16 ec = EC(m->status);
486 u8 xec = XEC(m->status, xec_mask);
/* Families 0x14 and later are not expected to report MC3 errors. */
488 if (boot_cpu_data.x86 >= 0x14) {
489 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
490 " please report on LKML.\n");
494 pr_emerg(HW_ERR "MC3 Error");
/* Anything that is not a bus error with a DRD or DWR transaction takes this branch. */
499 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
502 pr_cont(" during %s.\n", R4_MSG(ec));
509 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
/*
 * decode_mc4_mce - decode and print an MC4 bank error.
 * @m: MCE record; the node id is looked up from m->extcpu.
 *
 * Unlike the other bank decoders, XEC is extracted with a fixed 0x1f mask
 * instead of xec_mask.
 * NOTE(review): the branch selection logic, the "offset" declaration and the
 * early returns are missing from this view.
 */
512 static void decode_mc4_mce(struct mce *m)
514 struct cpuinfo_x86 *c = &boot_cpu_data;
515 int node_id = amd_get_nb_id(m->extcpu);
516 u16 ec = EC(m->status);
517 u8 xec = XEC(m->status, 0x1f);
520 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
525 /* special handling for DRAM ECCs */
526 if (xec == 0x0 || xec == 0x8) {
527 /* no ECCs on F11h */
531 pr_cont("%s.\n", mc4_mce_desc[xec]);
/*
 * Pass the record on to the registered callback.
 * NOTE(review): any NULL check on nb_bus_decoder is not visible here - confirm.
 */
534 nb_bus_decoder(node_id, m);
541 pr_cont("GART Table Walk data error.\n");
542 else if (BUS_ERROR(ec))
543 pr_cont("DMA Exclusion Vector Table Walk error.\n");
/* Family 0x15 gets a dedicated message here. */
549 if (boot_cpu_data.x86 == 0x15)
550 pr_cont("Compute Unit Data Error.\n");
/* Generic table lookup with xec shifted down by offset. */
563 pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
567 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
/*
 * decode_mc5_mce - decode and print an MC5 bank error.
 * @m: MCE record; XEC is extracted from m->status.
 *
 * NOTE(review): the statement following the family check, the condition on
 * the second lookup and the early returns are missing from this view.
 */
570 static void decode_mc5_mce(struct mce *m)
572 struct cpuinfo_x86 *c = &boot_cpu_data;
573 u8 xec = XEC(m->status, xec_mask);
/* Families 0xf and 0x11 are singled out before any decoding. */
575 if (c->x86 == 0xf || c->x86 == 0x11)
578 pr_emerg(HW_ERR "MC5 Error: ");
/* xec 0x0 and 0xc print the table entry as is. */
580 if (xec == 0x0 || xec == 0xc)
581 pr_cont("%s.\n", mc5_mce_desc[xec]);
/* Other entries get a " parity error" suffix. */
583 pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
590 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
/*
 * decode_mc6_mce - decode and print an MC6 bank error.
 * @m: MCE record; XEC is extracted from m->status.
 *
 * Prints the name of a unit followed by the shared " parity error." suffix.
 * NOTE(review): the selection logic that maps xec to a unit name and the
 * early returns are missing from this view.
 */
593 static void decode_mc6_mce(struct mce *m)
595 u8 xec = XEC(m->status, xec_mask);
597 pr_emerg(HW_ERR "MC6 Error: ");
601 pr_cont("Free List");
605 pr_cont("Physical Register File");
609 pr_cont("Retire Queue");
613 pr_cont("Scheduler table");
617 pr_cont("Status Register File");
/* Suffix appended after the unit name. */
625 pr_cont(" parity error.\n");
630 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
/*
 * amd_decode_err_code - print the individual fields of an MCA error code.
 * @ec: error code; amd_decode_mce() passes the low 16 bits of the status word.
 *
 * Emits one line with cache level, mem/io, transaction type, memory
 * transaction type, participating processor and timeout state.
 * NOTE(review): most of the conditions that decide which fields are printed
 * are missing from this view.
 */
633 static inline void amd_decode_err_code(u16 ec)
636 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
639 pr_cont(", mem/io: %s", II_MSG(ec));
641 pr_cont(", tx: %s", TT_MSG(ec));
/* The memory transaction type is printed for memory and bus errors. */
643 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
644 pr_cont(", mem-tx: %s", R4_MSG(ec));
647 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
654 * Filter out unwanted MCE signatures here.
/*
 * amd_filter_mce - decide whether an MCE record matches a filtered signature.
 * @m: MCE record to inspect.
 *
 * amd_decode_mce() tests the result before decoding.
 * NOTE(review): the return statements are missing from this view, so the
 * polarity of the result is not shown here.
 */
656 static bool amd_filter_mce(struct mce *m)
/* Five-bit extended error code starting at bit 16 of the status word. */
658 u8 xec = (m->status >> 16) & 0x1f;
661 * NB GART TLB error reporting is disabled by default.
/* Bank 4, xec 0x5, and reporting not enabled through amd_report_gart_errors(). */
663 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
669 static const char *decode_error_status(struct mce *m)
671 if (m->status & MCI_STATUS_UC) {
672 if (m->status & MCI_STATUS_PCC)
673 return "System Fatal error.";
674 if (m->mcgstatus & MCG_STATUS_RIPV)
675 return "Uncorrected, software restartable error.";
676 return "Uncorrected, software containable error.";
679 if (m->status & MCI_STATUS_DEFERRED)
680 return "Deferred error.";
682 return "Corrected error, no action required.";
/*
 * amd_decode_mce - notifier callback that decodes and prints an MCE record.
 * @nb:   notifier block (amd_mce_dec_nb points its .notifier_call here).
 * @val:  notifier value; not used in the visible lines.
 * @data: pointer to the struct mce being reported.
 *
 * NOTE(review): the per-bank dispatch, the "ecc" declaration, several
 * conditions and the return statements are missing from this view.
 */
685 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
687 struct mce *m = (struct mce *)data;
/* CPU description of the CPU named by m->extcpu. */
688 struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
/* Consult the signature filter before decoding. */
691 if (amd_filter_mce(m))
727 pr_emerg(HW_ERR "Error Status: %s\n", decode_error_status(m));
/* Raw status line: CPU family:model:stepping, bank number and flag names. */
729 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
731 c->x86, c->x86_model, c->x86_mask,
733 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
734 ((m->status & MCI_STATUS_UC) ? "UE" : "CE"),
735 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
736 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
737 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
741 ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
742 ((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));
744 /* do the two bits[14:13] together */
/* Two-bit field at bits 46:45 of the 64-bit status word. */
745 ecc = (m->status >> 45) & 0x3;
/* A value of 2 prints "CECC", anything else that reaches this line "UECC". */
747 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
749 pr_cont("]: 0x%016llx\n", m->status);
/* The address is printed only when the status word flags it as valid. */
751 if (m->status & MCI_STATUS_ADDRV)
752 pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr);
/* The low 16 bits of the status word are the MCA error code. */
754 amd_decode_err_code(m->status & 0xffff);
758 EXPORT_SYMBOL_GPL(amd_decode_mce);
/*
 * Notifier block routing decode-chain events to amd_decode_mce(); registered
 * in mce_amd_init() and unregistered in mce_amd_exit().
 */
760 static struct notifier_block amd_mce_dec_nb = {
761 .notifier_call = amd_decode_mce,
/*
 * mce_amd_init - set up family-specific decoding and join the MCE decode chain.
 *
 * Checks the boot CPU's vendor and family, allocates fam_ops, fills in the
 * mc0/mc1/mc2 callbacks for the detected family and registers
 * amd_mce_dec_nb. Run as an early initcall.
 * NOTE(review): the switch/case labels, the allocation-failure handling and
 * the return statements are missing from this view, so which family selects
 * which group of assignments is not shown here.
 */
764 static int __init mce_amd_init(void)
766 struct cpuinfo_x86 *c = &boot_cpu_data;
/* AMD CPUs only. */
768 if (c->x86_vendor != X86_VENDOR_AMD)
/* Families outside the 0xf..0x15 range are singled out here. */
771 if (c->x86 < 0xf || c->x86 > 0x15)
774 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
/* K8 callbacks for all three banks. */
780 fam_ops->mc0_mce = k8_mc0_mce;
781 fam_ops->mc1_mce = k8_mc1_mce;
782 fam_ops->mc2_mce = k8_mc2_mce;
/* F10h MC0 callback, K8 callbacks for MC1 and MC2. */
786 fam_ops->mc0_mce = f10h_mc0_mce;
787 fam_ops->mc1_mce = k8_mc1_mce;
788 fam_ops->mc2_mce = k8_mc2_mce;
/* Another family that uses the K8 callbacks for all three banks. */
792 fam_ops->mc0_mce = k8_mc0_mce;
793 fam_ops->mc1_mce = k8_mc1_mce;
794 fam_ops->mc2_mce = k8_mc2_mce;
/* F12h MC0 callback, K8 callbacks for MC1 and MC2. */
798 fam_ops->mc0_mce = f12h_mc0_mce;
799 fam_ops->mc1_mce = k8_mc1_mce;
800 fam_ops->mc2_mce = k8_mc2_mce;
/* F14h MC0/MC1 callbacks, K8 MC2 callback; also narrows nb_err_cpumask. */
804 nb_err_cpumask = 0x3;
805 fam_ops->mc0_mce = f14h_mc0_mce;
806 fam_ops->mc1_mce = f14h_mc1_mce;
807 fam_ops->mc2_mce = k8_mc2_mce;
/* F15h callbacks for all three banks. */
812 fam_ops->mc0_mce = f15h_mc0_mce;
813 fam_ops->mc1_mce = f15h_mc1_mce;
814 fam_ops->mc2_mce = f15h_mc2_mce;
/* Warning for a family value that none of the groups above handles. */
818 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
823 pr_info("MCE: In-kernel MCE decoding enabled.\n");
825 mce_register_decode_chain(&amd_mce_dec_nb);
829 early_initcall(mce_amd_init);
/*
 * mce_amd_exit - module exit handler; removes amd_mce_dec_nb from the MCE
 * decode chain.
 * NOTE(review): a line of the body is missing from this view; whether fam_ops
 * is released here cannot be confirmed.
 */
832 static void __exit mce_amd_exit(void)
834 mce_unregister_decode_chain(&amd_mce_dec_nb);
/* Module metadata and registration of mce_amd_exit() as the exit handler. */
838 MODULE_DESCRIPTION("AMD MCE decoder");
839 MODULE_ALIAS("edac-mce-amd");
840 MODULE_LICENSE("GPL");
841 module_exit(mce_amd_exit);