From a450e8f55a57d049ac3afe218f06567e12d6b4f5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Sat, 22 Nov 2014 21:58:09 +1100 Subject: [PATCH] powerpc/eeh: Dump PHB diag-data early On PowerNV platform, PHB diag-data is dumped after stopping device drivers. In case of recursive EEH errors, the kernel is usually crashed before dumping PHB diag-data for the second EEH error. It's hard to locate the root cause of the second EEH error without PHB diag-data. The patch adds one more EEH option "eeh=early_log", which helps dumping PHB diag-data immediately once frozen PE is detected, in order to get the PHB diag-data for the second EEH error. Signed-off-by: Gavin Shan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/eeh.h | 1 + arch/powerpc/kernel/eeh.c | 2 ++ arch/powerpc/platforms/powernv/eeh-ioda.c | 13 ++++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 2e633b41712a..0652ebe117af 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -39,6 +39,7 @@ struct device_node; #define EEH_PROBE_MODE_DEV 0x04 /* From PCI device */ #define EEH_PROBE_MODE_DEVTREE 0x08 /* From device tree */ #define EEH_ENABLE_IO_FOR_LOG 0x10 /* Enable IO for log */ +#define EEH_EARLY_DUMP_LOG 0x20 /* Dump log immediately */ /* * Delay for PE reset, all in ms diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index f1c6b115cb37..05be77d9ea0e 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -143,6 +143,8 @@ static int __init eeh_setup(char *str) { if (!strcmp(str, "off")) eeh_add_flag(EEH_FORCE_DISABLED); + else if (!strcmp(str, "early_log")) + eeh_add_flag(EEH_EARLY_DUMP_LOG); return 1; } diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index fb38fe4dba89..2809c9895288 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -353,6 +353,9 @@ static int ioda_eeh_get_phb_state(struct eeh_pe *pe) } else if (!(pe->state & EEH_PE_ISOLATED)) { eeh_pe_state_mark(pe, EEH_PE_ISOLATED); ioda_eeh_phb_diag(pe); + + if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) + pnv_pci_dump_phb_diag_data(pe->phb, pe->data); } return result; @@ -451,6 +454,9 @@ static int ioda_eeh_get_pe_state(struct eeh_pe *pe) eeh_pe_state_mark(pe, EEH_PE_ISOLATED); ioda_eeh_phb_diag(pe); + + if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) + pnv_pci_dump_phb_diag_data(pe->phb, pe->data); } return result; @@ -730,7 +736,8 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option) static int ioda_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len) { - pnv_pci_dump_phb_diag_data(pe->phb, pe->data); + if (!eeh_has_flag(EEH_EARLY_DUMP_LOG)) + pnv_pci_dump_phb_diag_data(pe->phb, pe->data); return 0; } @@ -1086,6 +1093,10 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) !((*pe)->state & EEH_PE_ISOLATED)) { eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); ioda_eeh_phb_diag(*pe); + + if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) + pnv_pci_dump_phb_diag_data((*pe)->phb, + (*pe)->data); } /* -- 2.39.5