1 #include <linux/module.h>
2 #include <linux/slab.h>
/* Per-family decoder callbacks; allocated and filled in by mce_amd_init(). */
6 static struct amd_decoder_ops *fam_ops;
/* Mask passed to XEC() by the bank decoders when extracting xec from m->status. */
8 static u8 xec_mask = 0xf;
/*
 * Mask applied to the upper status word in amd_decode_nb_mce() when the
 * core number is printed; mce_amd_init() lowers it to 0x3 in one of its
 * family setups.
 */
9 static u8 nb_err_cpumask = 0xf;
/* Written by amd_report_gart_errors(), tested by amd_filter_mce(). */
11 static bool report_gart_errors;
/*
 * Optional callback invoked from amd_decode_nb_mce(); reset to NULL by
 * amd_unregister_ecc_decoder().
 */
12 static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
/*
 * Set the report_gart_errors flag to @v. amd_filter_mce() tests this flag
 * for bank 4 errors with xec 0x5.
 */
14 void amd_report_gart_errors(bool v)
16 report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * Register @f as the ECC decoder callback.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * stores @f in nb_bus_decoder, the pointer that
 * amd_unregister_ecc_decoder() resets - confirm.
 */
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * Drop the ECC decoder callback: WARN_ON() fires if @f is not the pointer
 * currently held in nb_bus_decoder, and nb_bus_decoder is reset to NULL.
 * NOTE(review): any condition guarding these two statements is not visible
 * in this excerpt.
 */
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
29 WARN_ON(nb_bus_decoder != f);
31 nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37 * string representation for the different MCA reported error types, see F3x48
41 /* transaction type names (presumably looked up via the TT_MSG() macro - confirm) */
42 const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs);
/* cache level names (presumably looked up via the LL_MSG() macro - confirm) */
46 const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs);
49 /* memory transaction type names (presumably looked up via the R4_MSG() macro - confirm) */
50 const char *rrrr_msgs[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs);
55 /* participating processor names (presumably looked up via the PP_MSG() macro - confirm) */
56 const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs);
/* timeout state names (presumably looked up via the TO_MSG() macro - confirm) */
60 const char *to_msgs[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs);
/* memory vs. i/o names (presumably looked up via the II_MSG() macro - confirm) */
64 const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Northbridge error descriptions printed by f10h_nb_mce(), which indexes
 * this table with xec - offset. The table is only ever read, so the
 * pointers themselves are const as well, matching the other static
 * description tables in this file.
 */
67 static const char * const f10h_nb_mce_desc[] = {
69 "Protocol error (link, L3, probe filter, etc.)",
70 "Parity error in NB-internal arrays",
71 "Link Retry due to IO link transmission error",
72 "L3 ECC data cache error",
73 "ECC error in L3 cache tag",
74 "L3 LRU parity bits error",
75 "ECC Error in the Probe Filter directory"
/*
 * Instruction cache error descriptions printed by f15h_ic_mce(), which
 * indexes this table with xec, xec-2 or xec-4. The xec annotations below
 * mark where the index offset changes.
 */
78 static const char * const f15h_ic_mce_desc[] = {
79 "UC during a demand linefill from L2",
80 "Parity error during data load from IC",
81 "Parity error for IC valid bit",
82 "Main tag parity error",
83 "Parity error in prediction queue",
84 "PFB data/address parity error",
85 "Parity error in the branch status reg",
86 "PFB promotion address error",
87 "Tag error during probe/victimization",
88 "Parity error for IC probe tag valid bit",
89 "PFB non-cacheable bit parity error",
90 "PFB valid bit parity error", /* xec = 0xd */
91 "patch RAM", /* xec = 0x10 */
/*
 * Combined unit error descriptions printed by amd_decode_cu_mce(), which
 * indexes this table with xec - 0x4 or xec - 0x7. The xec annotations mark
 * the first entry of each group.
 */
98 static const char * const f15h_cu_mce_desc[] = {
99 "Fill ECC error on data fills", /* xec = 0x4 */
100 "Fill parity error on insn fills",
101 "Prefetcher request FIFO parity error",
102 "PRQ address parity error",
103 "PRQ data parity error",
105 "WCC Data ECC error",
106 "WCB Data parity error",
108 "L2 Tag ECC error", /* xec = 0x10 */
109 "Hard L2 Tag ECC error",
110 "Multiple hits on L2 tag",
112 "PRB address parity error"
/*
 * Descriptions printed by amd_decode_fr_mce(), indexed directly by xec.
 * That function prints the entries for xec 0x0 and 0xc as they are and has
 * a second format that appends " parity error".
 */
115 static const char * const fr_ex_mce_desc[] = {
116 "CPU Watchdog timer expire",
117 "Wakeup array dest tag",
121 "Retire dispatch queue",
122 "Mapper checkpoint array",
123 "Physical register file EX0 port",
124 "Physical register file EX1 port",
125 "Physical register file AG0 port",
126 "Physical register file AG1 port",
127 "Flag register file",
128 "DE correctable error could not be corrected"
/*
 * f12h data cache decoder: one of the fam_ops->dc_mce callbacks installed
 * by mce_amd_init(), and the fallback of f10h_dc_mce(). Continues, via
 * pr_cont(), the "Data Cache Error: " line started by amd_decode_dc_mce(),
 * which tests the bool result.
 */
131 static bool f12h_dc_mce(u16 ec, u8 xec)
140 pr_cont("during L1 linefill from L2.\n");
141 else if (ll == LL_L1)
142 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
/*
 * f10h data cache decoder: reports an R4_GEN transaction at LL_L1 as
 * "during data scrub" and returns the result of f12h_dc_mce() for the
 * remaining signatures.
 */
149 static bool f10h_dc_mce(u16 ec, u8 xec)
151 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
152 pr_cont("during data scrub.\n");
155 return f12h_dc_mce(ec, xec);
/*
 * K8 data cache decoder: prints "during system linefill." for one case
 * (its condition is not visible in this excerpt) and returns the result of
 * f10h_dc_mce() for the remaining signatures.
 */
158 static bool k8_dc_mce(u16 ec, u8 xec)
161 pr_cont("during system linefill.\n");
165 return f10h_dc_mce(ec, xec);
/*
 * f14h data cache decoder. One path checks for a data transaction
 * (TT_DATA) at LL_L1 and prints parity error details; the BUS_ERROR path
 * checks for II_MEM or II_IO at LL_LG and prints what the system read data
 * error happened on.
 */
168 static bool f14h_dc_mce(u16 ec, u8 xec)
175 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
181 pr_cont("Data/Tag parity error due to %s.\n",
182 (r4 == R4_DRD ? "load/hw prf" : "store"));
185 pr_cont("Copyback parity error on a tag miss.\n");
188 pr_cont("Tag parity error during snoop.\n");
193 } else if (BUS_ERROR(ec)) {
195 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
198 pr_cont("System read data error on a ");
202 pr_cont("TLB reload.\n");
/*
 * f15h data cache decoder: names the structure that reported the error
 * (data array, STQ, SCB, tag, LDQ, ...). The BUS_ERROR path reports a
 * system linefill or an internal condition, where xec 1 is printed as
 * "livelock" and any other value as "deadlock".
 */
220 static bool f15h_dc_mce(u16 ec, u8 xec)
228 pr_cont("Data Array access error.\n");
232 pr_cont("UC error during a linefill from L2/NB.\n");
237 pr_cont("STQ access error.\n");
241 pr_cont("SCB access error.\n");
245 pr_cont("Tag error.\n");
249 pr_cont("LDQ access error.\n");
255 } else if (BUS_ERROR(ec)) {
258 pr_cont("during system linefill.\n");
260 pr_cont(" Internal %s condition.\n",
261 ((xec == 1) ? "livelock" : "deadlock"));
/*
 * Decode a data cache MCE from @m: print the "Data Cache Error: " header,
 * decode TLB errors here, and hand the other signatures to the per-family
 * fam_ops->dc_mce() callback. "Corrupted DC MCE info?" is the fallback
 * message. For TLB errors, xec 2 prints "locked miss", any other non-zero
 * xec prints "multimatch" and xec 0 prints "parity".
 */
268 static void amd_decode_dc_mce(struct mce *m)
270 u16 ec = EC(m->status);
271 u8 xec = XEC(m->status, xec_mask);
273 pr_emerg(HW_ERR "Data Cache Error: ");
275 /* TLB error signatures are the same across families */
277 if (TT(ec) == TT_DATA) {
278 pr_cont("%s TLB %s.\n", LL_MSG(ec),
279 ((xec == 2) ? "locked miss"
280 : (xec ? "multimatch" : "parity")));
283 } else if (fam_ops->dc_mce(ec, xec))
286 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
/*
 * K8 instruction cache decoder; mce_amd_init() installs it as
 * fam_ops->ic_mce in several of its family setups. Reports errors during a
 * linefill from L2 and, on the ll == 0x1 path, data load parity, copyback
 * and tag snoop errors.
 */
289 static bool k8_ic_mce(u16 ec, u8 xec)
298 pr_cont("during a linefill from L2.\n");
299 else if (ll == 0x1) {
302 pr_cont("Parity error during data load.\n");
306 pr_cont("Copyback Parity/Victim error.\n");
310 pr_cont("Tag Snoop error.\n");
/*
 * f14h instruction cache decoder: checks TT(ec) against 0 and LL(ec)
 * against 1, then reports either a data/tag array parity error for a tag
 * hit or, for R4_SNOOP, a tag error during snoop/victimization.
 */
323 static bool f14h_ic_mce(u16 ec, u8 xec)
329 if (TT(ec) != 0 || LL(ec) != 1)
333 pr_cont("Data/tag array parity error for a tag hit.\n");
334 else if (r4 == R4_SNOOP)
335 pr_cont("Tag error during snoop/victimization.\n");
/*
 * f15h instruction cache decoder: prints an entry of f15h_ic_mce_desc,
 * indexed by xec, xec-2 or xec-4 (the range checks selecting the offset
 * are not visible in this excerpt). Entries reached through xec-4 are
 * wrapped as "Decoder ... parity error".
 */
342 static bool f15h_ic_mce(u16 ec, u8 xec)
351 pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
355 pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
359 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
/*
 * Decode an instruction cache MCE from @m: print the
 * "Instruction Cache Error: " header, decode TLB and bus errors here, and
 * hand the other signatures to the per-family fam_ops->ic_mce() callback.
 * "Corrupted IC MCE info?" is the fallback message.
 */
368 static void amd_decode_ic_mce(struct mce *m)
370 u16 ec = EC(m->status);
371 u8 xec = XEC(m->status, xec_mask);
373 pr_emerg(HW_ERR "Instruction Cache Error: ");
376 pr_cont("%s TLB %s.\n", LL_MSG(ec),
377 (xec ? "multimatch" : "parity error"));
378 else if (BUS_ERROR(ec)) {
/* family 0xf with status bit 58 set reports a system linefill, anything else an NB data read */
379 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
381 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
382 } else if (fam_ops->ic_mce(ec, xec))
385 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
/*
 * Decode a bus unit MCE from @m. After the "Bus Unit Error" header the
 * message is selected by xec (values 0x0 and 0x2 are visible in this
 * excerpt) and, for xec 0x0, by the error class (BUS_ERROR, MEM_ERROR, ...).
 * "Corrupted BU MCE info?" is the fallback message.
 */
388 static void amd_decode_bu_mce(struct mce *m)
390 u16 ec = EC(m->status);
391 u8 xec = XEC(m->status, xec_mask);
393 pr_emerg(HW_ERR "Bus Unit Error");
396 pr_cont(" in the write data buffers.\n");
398 pr_cont(" in the victim data buffers.\n");
399 else if (xec == 0x2 && MEM_ERROR(ec))
400 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
401 else if (xec == 0x0) {
403 pr_cont(": %s error in a Page Descriptor Cache or "
404 "Guest TLB.\n", TT_MSG(ec));
405 else if (BUS_ERROR(ec))
406 pr_cont(": %s/ECC error in data read from NB: %s.\n",
407 R4_MSG(ec), PP_MSG(ec));
408 else if (MEM_ERROR(ec)) {
412 pr_cont(": %s error during data copyback.\n",
415 pr_cont(": %s parity/ECC error during data "
416 "access from L2.\n", R4_MSG(ec));
427 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
/*
 * Decode a combined unit MCE from @m. After the "Combined Unit Error: "
 * header it reports TLB errors, bus errors (NB data read) and memory
 * errors; the latter print entries of f15h_cu_mce_desc indexed by
 * xec - 0x4 or xec - 0x7. "Corrupted CU MCE info?" is the fallback message.
 */
430 static void amd_decode_cu_mce(struct mce *m)
432 u16 ec = EC(m->status);
433 u8 xec = XEC(m->status, xec_mask);
435 pr_emerg(HW_ERR "Combined Unit Error: ");
439 pr_cont("Data parity TLB read error.\n");
441 pr_cont("Poison data provided for TLB fill.\n");
444 } else if (BUS_ERROR(ec)) {
448 pr_cont("Error during attempted NB data read.\n");
449 } else if (MEM_ERROR(ec)) {
452 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]);
456 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]);
467 pr_emerg(HW_ERR "Corrupted CU MCE info?\n");
/*
 * Decode a load/store MCE from @m. On family 0x14 and later such an error
 * is unexpected, so a request to report it is printed. Otherwise the
 * "Load Store Error" header is printed and the signature is checked for a
 * BUS_ERROR with an R4_DRD or R4_DWR transaction. "Corrupted LS MCE info?"
 * is the fallback message.
 */
470 static void amd_decode_ls_mce(struct mce *m)
472 u16 ec = EC(m->status);
473 u8 xec = XEC(m->status, xec_mask);
475 if (boot_cpu_data.x86 >= 0x14) {
476 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
477 " please report on LKML.\n");
481 pr_emerg(HW_ERR "Load Store Error");
486 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
489 pr_cont(" during %s.\n", R4_MSG(ec));
496 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
/*
 * K8 northbridge decoder: one of the fam_ops->nb_mce callbacks installed by
 * mce_amd_init(), and the first thing f10h_nb_mce() tries. Prints the
 * message for HT link CRC, GART PTE, unsupported atomic RMW, DRAM ECC and
 * DRAM addr/ctl parity errors. Family 0x11 is special-cased (the details
 * are not visible in this excerpt).
 */
499 static bool k8_nb_mce(u16 ec, u8 xec)
505 pr_cont("CRC error detected on HT link.\n");
509 pr_cont("Invalid GART PTE entry during GART table walk.\n");
513 pr_cont("Unsupported atomic RMW received from an IO link.\n");
518 if (boot_cpu_data.x86 == 0x11)
521 pr_cont("DRAM ECC error detected on the NB.\n");
525 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * f10h northbridge decoder: tries k8_nb_mce() first, then handles further
 * signatures itself (table walk errors, a "Compute Unit Data Error" case
 * on family 0x15) and prints entries of f10h_nb_mce_desc indexed by
 * xec - offset.
 */
536 static bool f10h_nb_mce(u16 ec, u8 xec)
541 if (k8_nb_mce(ec, xec))
555 pr_cont("GART Table Walk data error.\n");
556 else if (BUS_ERROR(ec))
557 pr_cont("DMA Exclusion Vector Table Walk error.\n");
565 if (boot_cpu_data.x86 == 0x15)
566 pr_cont("Compute Unit Data Error.\n");
584 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
/*
 * fam_ops->nb_mce callback used by the family setups in mce_amd_init()
 * that install neither k8_nb_mce() nor f10h_nb_mce().
 * NOTE(review): the body is not visible in this excerpt; the name suggests
 * it performs no decoding - confirm what it returns.
 */
590 static bool nb_noop_mce(u16 ec, u8 xec)
/*
 * Decode a northbridge MCE from @m for node @node_id.
 *
 * Unlike the other bank decoders, xec is extracted with a fixed 0x1f mask
 * rather than xec_mask. nbsh is the upper 32 bits of the status word and
 * supplies the core information. @nbcfg is only forwarded to the
 * registered nb_bus_decoder callback; amd_decode_mce() passes 0 for it.
 * "Corrupted NB MCE info?" is the fallback message.
 */
595 void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
597 u16 ec = EC(m->status);
598 u8 xec = XEC(m->status, 0x1f);
599 u32 nbsh = (u32)(m->status >> 32);
601 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
604 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
605 * value encoding has changed so interpret those differently
607 if ((boot_cpu_data.x86 == 0x10) &&
608 (boot_cpu_data.x86_model > 7)) {
609 if (nbsh & K8_NBSH_ERR_CPU_VAL)
610 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
612 u8 assoc_cpus = nbsh & nb_err_cpumask;
615 pr_cont(", core: %d", fls(assoc_cpus) - 1);
620 pr_cont("Sync error (sync packets on HT link detected).\n");
624 pr_cont("HT Master abort.\n");
628 pr_cont("HT Target abort.\n");
632 pr_cont("NB Watchdog timeout.\n");
636 pr_cont("SVM DMA Exclusion Vector error.\n");
643 if (!fam_ops->nb_mce(ec, xec))
/* only families 0xf and 0x10 pass xec 0x0/0x8 errors on to a registered bus decoder */
646 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
647 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
648 nb_bus_decoder(node_id, m, nbcfg);
653 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
655 EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
/*
 * Decode an MCE from @m that is reported as "Execution Unit" on family
 * 0x15 and as "FIROB" on other families. Families 0xf and 0x11 are singled
 * out by the first check, and families other than 0x15 with a non-zero xec
 * by the second. The description comes from fr_ex_mce_desc[xec].
 * "Corrupted FR MCE info?" is the fallback message.
 */
657 static void amd_decode_fr_mce(struct mce *m)
659 struct cpuinfo_x86 *c = &boot_cpu_data;
660 u8 xec = XEC(m->status, xec_mask);
662 if (c->x86 == 0xf || c->x86 == 0x11)
665 if (c->x86 != 0x15 && xec != 0x0)
668 pr_emerg(HW_ERR "%s Error: ",
669 (c->x86 == 0x15 ? "Execution Unit" : "FIROB"));
671 if (xec == 0x0 || xec == 0xc)
672 pr_cont("%s.\n", fr_ex_mce_desc[xec]);
674 pr_cont("%s parity error.\n", fr_ex_mce_desc[xec]);
681 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
/*
 * Decode a floating point unit MCE from @m: after the
 * "Floating Point Unit Error: " header, print the name of the affected
 * structure followed by " parity error.". The xec values selecting each
 * name are not visible in this excerpt. "Corrupted FP MCE info?" is the
 * fallback message.
 */
684 static void amd_decode_fp_mce(struct mce *m)
686 u8 xec = XEC(m->status, xec_mask);
688 pr_emerg(HW_ERR "Floating Point Unit Error: ");
692 pr_cont("Free List");
696 pr_cont("Physical Register File");
700 pr_cont("Retire Queue");
704 pr_cont("Scheduler table");
708 pr_cont("Status Register File");
716 pr_cont(" parity error.\n");
721 pr_emerg(HW_ERR "Corrupted FP MCE info?\n");
/*
 * Print the generic fields of the 16-bit error code @ec on one line: cache
 * level, mem/io, transaction type, memory transaction type and
 * participating processor with timeout state. Which fields are printed
 * depends on the error class; only the MEM_ERROR/BUS_ERROR check is
 * visible in this excerpt.
 */
724 static inline void amd_decode_err_code(u16 ec)
727 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
730 pr_cont(", mem/io: %s", II_MSG(ec));
732 pr_cont(", tx: %s", TT_MSG(ec));
734 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
735 pr_cont(", mem-tx: %s", R4_MSG(ec));
738 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
745 * Filter out unwanted MCE signatures here.
/*
 * Predicate that amd_decode_mce() evaluates before decoding @m. The only
 * signature tested here is bank 4 with xec 0x5 while report_gart_errors is
 * unset; xec is taken from status bits 16 and up with a 0x1f mask.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably true means "skip this MCE" - confirm.
 */
747 static bool amd_filter_mce(struct mce *m)
749 u8 xec = (m->status >> 16) & 0x1f;
752 * NB GART TLB error reporting is disabled by default.
754 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
/*
 * Notifier callback (see amd_mce_dec_nb) that decodes one MCE.
 *
 * @data points to the struct mce to decode; @nb and @val are not used by
 * the lines visible in this excerpt. After amd_filter_mce() has been
 * consulted, a one-line summary of the status flags and the raw status
 * value is printed, one of the amd_decode_*_mce() helpers is called, and
 * finally the low 16 bits of the status go to amd_decode_err_code().
 */
760 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
762 struct mce *m = (struct mce *)data;
763 struct cpuinfo_x86 *c = &boot_cpu_data;
766 if (amd_filter_mce(m))
769 pr_emerg(HW_ERR "MC%d_STATUS[%s|%s|%s|%s|%s",
771 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
772 ((m->status & MCI_STATUS_UC) ? "UE" : "CE"),
773 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
774 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
775 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
779 ((m->status & BIT_64(44)) ? "Deferred" : "-"),
780 ((m->status & BIT_64(43)) ? "Poison" : "-"));
/* bits 45-46 of the 64-bit status are bits 13-14 of its upper 32-bit half */
782 /* do the two bits[14:13] together */
783 ecc = (m->status >> 45) & 0x3;
785 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
787 pr_cont("]: 0x%016llx\n", m->status);
792 amd_decode_dc_mce(m);
796 amd_decode_ic_mce(m);
801 amd_decode_cu_mce(m);
803 amd_decode_bu_mce(m);
807 amd_decode_ls_mce(m);
/* the northbridge decoder needs the node id of the reporting CPU; nbcfg is passed as 0 */
811 node = amd_get_nb_id(m->extcpu);
812 amd_decode_nb_mce(node, m, 0);
816 amd_decode_fr_mce(m);
820 amd_decode_fp_mce(m);
827 amd_decode_err_code(m->status & 0xffff);
831 EXPORT_SYMBOL_GPL(amd_decode_mce);
/*
 * Notifier block that hooks amd_decode_mce() into x86_mce_decoder_chain;
 * registered in mce_amd_init() and removed in mce_amd_exit().
 */
833 static struct notifier_block amd_mce_dec_nb = {
834 .notifier_call = amd_decode_mce,
/*
 * Initialisation, run as an early_initcall.
 *
 * Checks that the CPU vendor is AMD and applies a family/model test that
 * is true for everything except families 0xf-0x12 and families 0x14/0x15
 * with model <= 0xf. It then allocates fam_ops, fills in the dc/ic/nb
 * callbacks for the detected family (the case labels are not visible in
 * this excerpt) and registers amd_mce_dec_nb on x86_mce_decoder_chain. An
 * unrecognised family is reported with the "Huh?" warning.
 */
837 static int __init mce_amd_init(void)
839 struct cpuinfo_x86 *c = &boot_cpu_data;
841 if (c->x86_vendor != X86_VENDOR_AMD)
844 if ((c->x86 < 0xf || c->x86 > 0x12) &&
845 (c->x86 != 0x14 || c->x86_model > 0xf) &&
846 (c->x86 != 0x15 || c->x86_model > 0xf))
849 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
855 fam_ops->dc_mce = k8_dc_mce;
856 fam_ops->ic_mce = k8_ic_mce;
857 fam_ops->nb_mce = k8_nb_mce;
861 fam_ops->dc_mce = f10h_dc_mce;
862 fam_ops->ic_mce = k8_ic_mce;
863 fam_ops->nb_mce = f10h_nb_mce;
867 fam_ops->dc_mce = k8_dc_mce;
868 fam_ops->ic_mce = k8_ic_mce;
869 fam_ops->nb_mce = f10h_nb_mce;
873 fam_ops->dc_mce = f12h_dc_mce;
874 fam_ops->ic_mce = k8_ic_mce;
875 fam_ops->nb_mce = nb_noop_mce;
/* this setup also narrows the core mask used by amd_decode_nb_mce() */
879 nb_err_cpumask = 0x3;
880 fam_ops->dc_mce = f14h_dc_mce;
881 fam_ops->ic_mce = f14h_ic_mce;
882 fam_ops->nb_mce = nb_noop_mce;
887 fam_ops->dc_mce = f15h_dc_mce;
888 fam_ops->ic_mce = f15h_ic_mce;
889 fam_ops->nb_mce = f10h_nb_mce;
893 printk(KERN_WARNING "Huh? What family is that: %d?!\n", c->x86);
898 pr_info("MCE: In-kernel MCE decoding enabled.\n");
900 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
904 early_initcall(mce_amd_init);
/*
 * Module exit: remove amd_mce_dec_nb from x86_mce_decoder_chain so that
 * amd_decode_mce() is no longer called.
 * NOTE(review): whether fam_ops is released here is not visible in this
 * excerpt - confirm.
 */
907 static void __exit mce_amd_exit(void)
909 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
913 MODULE_DESCRIPTION("AMD MCE decoder");
914 MODULE_ALIAS("edac-mce-amd");
915 MODULE_LICENSE("GPL");
916 module_exit(mce_amd_exit);