1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Per-family decoder callbacks; selected and allocated in mce_amd_init(). */
6 static struct amd_decoder_ops *fam_ops;
/* Mask applied to the extended error code, MCi_STATUS[20:16]; family-dependent. */
8 static u8 xec_mask = 0xf;
/* Mask for the error-CPU field in NBSH; narrowed to 0x3 for family 0x14 at init. */
9 static u8 nb_err_cpumask = 0xf;
/* GART TLB errors are filtered out by default; toggled via amd_report_gart_errors(). */
11 static bool report_gart_errors;
/* Optional EDAC callback forwarded decodable NB errors by amd_decode_nb_mce(). */
12 static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
/*
 * amd_report_gart_errors - enable/disable reporting of GART TLB errors,
 * which amd_filter_mce() suppresses by default.
 */
14 void amd_report_gart_errors(bool v)
16 	report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * amd_register_ecc_decoder - install an EDAC callback to receive decoded
 * northbridge MCEs. Body not visible in this view; presumably assigns @f
 * to nb_bus_decoder — confirm against the full source.
 */
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * amd_unregister_ecc_decoder - remove the registered EDAC callback.
 * Warns if @f is not the currently installed decoder, then clears
 * nb_bus_decoder regardless.
 */
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
29 	WARN_ON(nb_bus_decoder != f);
31 	nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
/* MCA error-signature decode tables, indexed by fields of the low 16
 * status bits via the *_MSG() accessor macros. */
37 * string representation for the different MCA reported error types, see F3x48
41 /* transaction type */
42 const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs);
/* cache level */
46 const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs);
49 /* memory transaction type */
50 const char *rrrr_msgs[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs);
55 /* participating processor */
56 const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs);
/* timeout status */
60 const char *to_msgs[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs);
/* memory or I/O origin */
64 const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Descriptions for family 0x10 northbridge extended error codes;
 * indexed by (xec - offset) in f10h_nb_mce().
 */
67 static const char *f10h_nb_mce_desc[] = {
69 "Protocol error (link, L3, probe filter, etc.)",
70 "Parity error in NB-internal arrays",
71 "Link Retry due to IO link transmission error",
72 "L3 ECC data cache error",
73 "ECC error in L3 cache tag",
74 "L3 LRU parity bits error",
75 "ECC Error in the Probe Filter directory"
/*
 * Family 0x12 data-cache MCE decoder. The guarding conditions for these
 * messages are not visible in this view — the truncated lines presumably
 * test ec/xec signatures before each pr_cont(); confirm in full source.
 */
78 static bool f12h_dc_mce(u16 ec, u8 xec)
87 pr_cont("during L1 linefill from L2.\n");
89 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
/*
 * Family 0x10 DC MCE decoder: its only F10h-specific signature is a
 * data-scrub error (R4 == GEN at cache level L1); everything else is
 * delegated to the family 0x12 decoder.
 */
96 static bool f10h_dc_mce(u16 ec, u8 xec)
98 u8 r4 = (ec >> 4) & 0xf;
/* NOTE(review): extraction of `ll` from ec is not visible here. */
101 if (r4 == R4_GEN && ll == LL_L1) {
102 pr_cont("during data scrub.\n");
105 return f12h_dc_mce(ec, xec);
/*
 * K8 (family 0xf) DC MCE decoder: prints the system-linefill case (guard
 * not visible here), then falls through to the family 0x10 decoder for
 * the shared signatures.
 */
108 static bool k8_dc_mce(u16 ec, u8 xec)
111 pr_cont("during system linefill.\n");
115 return f10h_dc_mce(ec, xec);
/*
 * Family 0x14 DC MCE decoder: memory errors cover data/tag parity on
 * load/store/copyback/snoop; bus errors cover system read data faults.
 */
118 static bool f14h_dc_mce(u16 ec, u8 xec)
/* r4: memory transaction type, ec[7:4]; tt: transaction type, ec[3:2]. */
120 u8 r4 = (ec >> 4) & 0xf;
122 u8 tt = (ec >> 2) & 0x3;
/* Only data-type, L1-level memory errors are decodable on this family. */
128 if (tt != TT_DATA || ll != LL_L1)
134 pr_cont("Data/Tag parity error due to %s.\n",
135 (r4 == R4_DRD ? "load/hw prf" : "store"));
138 pr_cont("Copyback parity error on a tag miss.\n");
141 pr_cont("Tag parity error during snoop.\n");
146 } else if (BUS_ERROR(ec)) {
/* Bus errors must target memory or I/O at the "LG" cache level. */
148 if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
151 pr_cont("System read data error on a ");
155 pr_cont("TLB reload.\n");
/*
 * Family 0x15 DC MCE decoder. The xec dispatch guards for the array/
 * queue access messages are not visible in this truncated view.
 */
173 static bool f15h_dc_mce(u16 ec, u8 xec)
181 pr_cont("Data Array access error.\n");
185 pr_cont("UC error during a linefill from L2/NB.\n");
190 pr_cont("STQ access error.\n");
194 pr_cont("SCB access error.\n");
198 pr_cont("Tag error.\n");
202 pr_cont("LDQ access error.\n");
208 } else if (BUS_ERROR(ec)) {
211 pr_cont("during system linefill.\n");
/* xec distinguishes an internal livelock (1) from a deadlock condition. */
213 pr_cont(" Internal %s condition.\n",
214 ((xec == 1) ? "livelock" : "deadlock"));
/*
 * Top-level data-cache MCE decode: TLB signatures are handled here since
 * they are identical across families; everything else is delegated to
 * the per-family dc_mce() callback, with a fallback "corrupted" message.
 */
221 static void amd_decode_dc_mce(struct mce *m)
/* ec: low 16 error-code bits; xec: extended code masked per family. */
223 u16 ec = m->status & 0xffff;
224 u8 xec = (m->status >> 16) & xec_mask;
226 pr_emerg(HW_ERR "Data Cache Error: ");
228 /* TLB error signatures are the same across families */
230 u8 tt = (ec >> 2) & 0x3;
233 pr_cont("%s TLB %s.\n", LL_MSG(ec),
234 ((xec == 2) ? "locked miss"
235 : (xec ? "multimatch" : "parity")));
/* Unrecognized signature: warn that the MCE record may be corrupted. */
238 } else if (fam_ops->dc_mce(ec, xec))
241 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
/*
 * K8 (family 0xf) instruction-cache MCE decoder: L2 linefill errors and
 * the L1 parity/copyback/snoop signatures keyed on r4.
 */
244 static bool k8_ic_mce(u16 ec)
247 u8 r4 = (ec >> 4) & 0xf;
254 pr_cont("during a linefill from L2.\n");
/* ll == 0x1: L1-level error; decode by memory transaction type. */
255 else if (ll == 0x1) {
258 pr_cont("Parity error during data load.\n");
262 pr_cont("Copyback Parity/Victim error.\n");
266 pr_cont("Tag Snoop error.\n");
/*
 * Family 0x14 instruction-cache MCE decoder: only tt == 0 at cache
 * level 1 is decodable; distinguishes tag-hit parity errors from
 * snoop/victimization tag errors via r4.
 */
279 static bool f14h_ic_mce(u16 ec)
282 u8 tt = (ec >> 2) & 0x3;
283 u8 r4 = (ec >> 4) & 0xf;
287 if (tt != 0 || ll != 1)
291 pr_cont("Data/tag array parity error for a tag hit.\n");
292 else if (r4 == R4_SNOOP)
293 pr_cont("Tag error during snoop/victimization.\n");
/*
 * Top-level instruction-cache MCE decode: common TLB and bus-error
 * signatures handled here, family-specific ones via fam_ops->ic_mce().
 */
300 static void amd_decode_ic_mce(struct mce *m)
302 u16 ec = m->status & 0xffff;
303 u8 xec = (m->status >> 16) & xec_mask;
305 pr_emerg(HW_ERR "Instruction Cache Error: ");
308 pr_cont("%s TLB %s.\n", LL_MSG(ec),
309 (xec ? "multimatch" : "parity error"));
310 else if (BUS_ERROR(ec)) {
/* On K8, status bit 58 distinguishes a system linefill from an NB data
 * read — NOTE(review): bit semantics taken from the code only; confirm
 * against the AMD BKDG. */
311 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
313 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
314 } else if (fam_ops->ic_mce(ec))
317 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
/*
 * Decode a Bus Unit MCE: write/victim buffer errors, L2 tag errors, and
 * (for xec == 0) TLB/page-descriptor-cache, NB read, and L2 data errors.
 */
320 static void amd_decode_bu_mce(struct mce *m)
322 u32 ec = m->status & 0xffff;
323 u32 xec = (m->status >> 16) & xec_mask;
325 pr_emerg(HW_ERR "Bus Unit Error");
328 pr_cont(" in the write data buffers.\n");
330 pr_cont(" in the victim data buffers.\n");
331 else if (xec == 0x2 && MEM_ERROR(ec))
332 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
/* xec == 0: discriminate further on the error-code type bits. */
333 else if (xec == 0x0) {
335 pr_cont(": %s error in a Page Descriptor Cache or "
336 "Guest TLB.\n", TT_MSG(ec));
337 else if (BUS_ERROR(ec))
338 pr_cont(": %s/ECC error in data read from NB: %s.\n",
339 RRRR_MSG(ec), PP_MSG(ec));
340 else if (MEM_ERROR(ec)) {
341 u8 rrrr = (ec >> 4) & 0xf;
344 pr_cont(": %s error during data copyback.\n",
346 else if (rrrr <= 0x1)
347 pr_cont(": %s parity/ECC error during data "
348 "access from L2.\n", RRRR_MSG(ec));
359 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
/*
 * Decode a Load/Store unit MCE. Family 0x14 has no LS MCE bank, so
 * seeing one there indicates a bug worth reporting upstream.
 */
362 static void amd_decode_ls_mce(struct mce *m)
364 u16 ec = m->status & 0xffff;
365 u8 xec = (m->status >> 16) & xec_mask;
367 if (boot_cpu_data.x86 == 0x14) {
368 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
369 " please report on LKML.\n");
373 pr_emerg(HW_ERR "Load Store Error");
376 u8 r4 = (ec >> 4) & 0xf;
/* Only bus errors with a data-read or data-write transaction decode. */
378 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
381 pr_cont(" during %s.\n", RRRR_MSG(ec));
388 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
/*
 * K8 northbridge MCE decoder: HT CRC, GART PTE, IO atomic RMW, DRAM ECC
 * and DRAM address/control parity signatures (guards mostly truncated).
 */
391 static bool k8_nb_mce(u16 ec, u8 xec)
397 pr_cont("CRC error detected on HT link.\n");
401 pr_cont("Invalid GART PTE entry during GART table walk.\n");
405 pr_cont("Unsupported atomic RMW received from an IO link.\n");
/* Family 0x11 takes a different path here — branch body not visible. */
410 if (boot_cpu_data.x86 == 0x11)
413 pr_cont("DRAM ECC error detected on the NB.\n");
417 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * Family 0x10 northbridge MCE decoder: first tries the K8-compatible
 * signatures, then the F10h-specific ones, finally indexing into
 * f10h_nb_mce_desc[] for the remaining extended codes.
 */
428 static bool f10h_nb_mce(u16 ec, u8 xec)
433 if (k8_nb_mce(ec, xec))
447 pr_cont("GART Table Walk data error.\n");
448 else if (BUS_ERROR(ec))
449 pr_cont("DMA Exclusion Vector Table Walk error.\n");
467 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
/* No-op NB decoder for families without a decodable NB bank (installed
 * for families 0x12 and 0x14 in mce_amd_init()); body not visible here. */
473 static bool nb_noop_mce(u16 ec, u8 xec)
/*
 * amd_decode_nb_mce - decode a northbridge MCE for @node_id.
 * Prints the reporting core, the signatures common to all families
 * (HT sync, master/target abort, watchdog, DEV), then defers to
 * fam_ops->nb_mce() and finally forwards decodable DRAM/GART errors
 * (xec 0x0/0x8 on K8/F10h) to the registered nb_bus_decoder.
 */
478 void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
480 u8 xec = (m->status >> 16) & 0x1f;
481 u16 ec = m->status & 0xffff;
/* The high half of MCi_STATUS carries the NB status-high (NBSH) bits. */
482 u32 nbsh = (u32)(m->status >> 32);
484 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
487 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
488 * value encoding has changed so interpret those differently
490 if ((boot_cpu_data.x86 == 0x10) &&
491 (boot_cpu_data.x86_model > 7)) {
492 if (nbsh & K8_NBSH_ERR_CPU_VAL)
493 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask))
/* Pre-revD encoding: a bitmask of associated cores; print the highest. */
495 u8 assoc_cpus = nbsh & nb_err_cpumask;
498 pr_cont(", core: %d", fls(assoc_cpus) - 1);
503 pr_cont("Sync error (sync packets on HT link detected).\n");
507 pr_cont("HT Master abort.\n");
511 pr_cont("HT Target abort.\n");
515 pr_cont("NB Watchdog timeout.\n");
519 pr_cont("SVM DMA Exclusion Vector error.\n");
526 if (!fam_ops->nb_mce(ec, xec))
/* Only K8/F10h DRAM ECC (0x8) or GART (0x0) errors reach EDAC. */
529 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
530 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
531 nb_bus_decoder(node_id, m, nbcfg);
536 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
538 EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
/*
 * Decode a Fixed-Issue Reorder buffer MCE. Families 0xf and 0x11 have no
 * FR bank (the branch body is truncated here); otherwise the only known
 * signature is the CPU watchdog timer expiry, matched on all low 16 bits.
 */
540 static void amd_decode_fr_mce(struct mce *m)
542 if (boot_cpu_data.x86 == 0xf ||
543 boot_cpu_data.x86 == 0x11)
546 /* we have only one error signature so match all fields at once. */
547 if ((m->status & 0xffff) == 0x0f0f) {
548 pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
553 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
/*
 * Print the generic breakdown of the low 16 error-code bits: TLB, memory,
 * or bus error class with the relevant *_MSG() fields; anything else is
 * reported as unknown.
 */
556 static inline void amd_decode_err_code(u16 ec)
559 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
560 TT_MSG(ec), LL_MSG(ec));
561 } else if (MEM_ERROR(ec)) {
562 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
563 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
564 } else if (BUS_ERROR(ec)) {
565 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
566 "Participating Processor: %s\n",
567 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
570 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
574 * Filter out unwanted MCE signatures here.
576 static bool amd_filter_mce(struct mce *m)
578 u8 xec = (m->status >> 16) & 0x1f;
581 * NB GART TLB error reporting is disabled by default.
/* Bank 4 is the northbridge; xec 0x5 is the GART TLB error signature. */
583 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
/*
 * amd_decode_mce - MCE decoder-chain notifier callback.
 * Drops filtered signatures, prints the common status flags (UC/OVER/PCC
 * and the ECC type from status bits [46:45]), dispatches on the bank
 * number to the per-unit decoders, and finishes with the generic
 * error-code breakdown.
 */
589 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
591 struct mce *m = (struct mce *)data;
594 if (amd_filter_mce(m))
597 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
599 pr_cont("%sorrected error, other errors lost: %s, "
600 "CPU context corrupt: %s",
601 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
602 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
603 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
605 /* do the two bits[14:13] together */
606 ecc = (m->status >> 45) & 0x3;
608 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
/* Bank dispatch (switch/case labels truncated from this view):
 * DC, IC, BU, LS, NB, FR decoders, in bank order. */
614 amd_decode_dc_mce(m);
618 amd_decode_ic_mce(m);
622 amd_decode_bu_mce(m);
626 amd_decode_ls_mce(m);
/* NB errors need the node id of the CPU that logged the error. */
630 node = amd_get_nb_id(m->extcpu);
631 amd_decode_nb_mce(node, m, 0);
635 amd_decode_fr_mce(m);
642 amd_decode_err_code(m->status & 0xffff);
646 EXPORT_SYMBOL_GPL(amd_decode_mce);
/* Notifier registered on x86_mce_decoder_chain in mce_amd_init(). */
648 static struct notifier_block amd_mce_dec_nb = {
649 	.notifier_call = amd_decode_mce,
/*
 * mce_amd_init - verify this is a supported AMD family (0xf..0x12, or
 * 0x14 with model <= 0xf), allocate fam_ops, install the per-family
 * decoder callbacks, and register on the MCE decoder notifier chain.
 */
652 static int __init mce_amd_init(void)
654 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
657 if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
658 (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
661 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
/* NOTE(review): the NULL check after kzalloc() is not visible in this
 * view — confirm allocation failure is handled. */
665 switch (boot_cpu_data.x86) {
/* Family 0xf (K8) */
667 fam_ops->dc_mce = k8_dc_mce;
668 fam_ops->ic_mce = k8_ic_mce;
669 fam_ops->nb_mce = k8_nb_mce;
/* Family 0x10 */
673 fam_ops->dc_mce = f10h_dc_mce;
674 fam_ops->ic_mce = k8_ic_mce;
675 fam_ops->nb_mce = f10h_nb_mce;
/* Family 0x11 */
679 fam_ops->dc_mce = k8_dc_mce;
680 fam_ops->ic_mce = k8_ic_mce;
681 fam_ops->nb_mce = f10h_nb_mce;
/* Family 0x12: no decodable NB bank */
685 fam_ops->dc_mce = f12h_dc_mce;
686 fam_ops->ic_mce = k8_ic_mce;
687 fam_ops->nb_mce = nb_noop_mce;
/* Family 0x14: narrower error-CPU field, no decodable NB bank */
691 nb_err_cpumask = 0x3;
692 fam_ops->dc_mce = f14h_dc_mce;
693 fam_ops->ic_mce = f14h_ic_mce;
694 fam_ops->nb_mce = nb_noop_mce;
/* Family 0x15 (ic_mce/nb_mce assignments not visible in this view) */
699 fam_ops->dc_mce = f15h_dc_mce;
703 printk(KERN_WARNING "Huh? What family is that: %d?!\n",
709 pr_info("MCE: In-kernel MCE decoding enabled.\n");
711 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
715 early_initcall(mce_amd_init);
/* Module teardown: drop our notifier from the MCE decoder chain. */
718 static void __exit mce_amd_exit(void)
720 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
/* Module metadata. */
724 MODULE_DESCRIPTION("AMD MCE decoder");
725 MODULE_ALIAS("edac-mce-amd");
726 MODULE_LICENSE("GPL");
727 module_exit(mce_amd_exit);