From 442305b152778f07504e9fdf64815d4841279bbe Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab
Date: Mon, 22 Jun 2009 22:48:29 -0300
Subject: [PATCH] i7core_edac: Add a memory check routine, based on device 3 function 4

This function appears only on Xeon 5500 datasheet. Yet, testing with a
Xeon 3503 showed that this is also implemented on other Nehalem
processors.

At the first read, MC_TEST_ERR_RCV1 and MC_TEST_ERR_RCV0 can contain any
value. Modify CE error logic to update the error count only after the
second read.

An alternative approach would be to do a write at rcv0 and rcv1
registers, but it seemed better to keep them untouched, since BIOS might
eventually assume that they are exclusive for their usage.

Signed-off-by: Mauro Carvalho Chehab
---
 drivers/edac/i7core_edac.c | 115 ++++++++++++++++++++++++++++++++++---
 1 file changed, 108 insertions(+), 7 deletions(-)

diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 190596af601a..b5dbc2b83961 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -62,6 +62,18 @@
 #define MC_STATUS	0x4c
 #define MC_MAX_DOD	0x64
 
+/*
+ * OFFSETS for Device 3 Function 4, as indicated on Xeon 5500 datasheet:
+ * http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf
+ */
+
+#define MC_TEST_ERR_RCV1	0x60
+  #define DIMM2_COR_ERR(r)			((r) & 0x7fff)
+
+#define MC_TEST_ERR_RCV0	0x64
+  #define DIMM1_COR_ERR(r)			(((r) >> 16) & 0x7fff)
+  #define DIMM0_COR_ERR(r)			((r) & 0x7fff)
+
 /* OFFSETS for Devices 4,5 and 6 Function 0 */
 
 #define MC_CHANNEL_DIMM_INIT_PARAMS 0x58
@@ -136,8 +148,9 @@
  */
 
 #define NUM_CHANS 3
-#define NUM_MCR_FUNCS  4
-#define NUM_CHAN_FUNCS 3
+#define MAX_DIMMS 3		/* Max DIMMS per channel */
+#define MAX_MCR_FUNC  4
+#define MAX_CHAN_FUNC 3
 
 struct i7core_info {
 	u32	mc_control;
@@ -159,8 +172,8 @@ struct i7core_inject {
 };
 
 struct i7core_channel {
-	u32 ranks;
-	u32 dimms;
+	u32		ranks;
+	u32		dimms;
 };
 
 struct pci_id_descr {
@@ -171,11 +184,16 @@ struct pci_id_descr {
 };
 
 struct i7core_pvt {
-	struct pci_dev		*pci_mcr[NUM_MCR_FUNCS];
-	struct pci_dev		*pci_ch[NUM_CHANS][NUM_CHAN_FUNCS];
+	struct pci_dev		*pci_mcr[MAX_MCR_FUNC + 1];
+	struct pci_dev		*pci_ch[NUM_CHANS][MAX_CHAN_FUNC + 1];
 	struct i7core_info	info;
 	struct i7core_inject	inject;
 	struct i7core_channel	channel[NUM_CHANS];
+
+	int			ce_count_available;
+	unsigned long		ce_count[MAX_DIMMS]; /* ECC corrected errors counts per dimm */
+	int			last_ce_count[MAX_DIMMS];
+
 };
 
 /* Device name and register DID (Device ID) */
@@ -749,6 +767,19 @@ static ssize_t i7core_inject_enable_show(struct mem_ctl_info *mci,
 	return sprintf(data, "%d\n", pvt->inject.enable);
 }
 
+static ssize_t i7core_ce_regs_show(struct mem_ctl_info *mci, char *data)
+{
+	struct i7core_pvt *pvt = mci->pvt_info;
+
+	if (!pvt->ce_count_available)
+		return sprintf(data, "unavailable\n");
+
+	return sprintf(data, "dimm0: %lu\ndimm1: %lu\ndimm2: %lu\n",
+			pvt->ce_count[0],
+			pvt->ce_count[1],
+			pvt->ce_count[2]);
+}
+
 /*
  * Sysfs struct
  */
@@ -789,6 +820,13 @@ static struct mcidev_sysfs_attribute i7core_inj_attrs[] = {
 		},
 		.show  = i7core_inject_enable_show,
 		.store = i7core_inject_enable_store,
+	}, {
+		.attr = {
+			.name = "corrected_error_counts",
+			.mode = S_IRUGO
+		},
+		.show  = i7core_ce_regs_show,
+		.store = NULL,
 	},
 };
 
@@ -879,13 +917,76 @@ static int i7core_get_devices(struct mem_ctl_info *mci, struct pci_dev *mcidev)
 	return 0;
 }
 
+/****************************************************************************
+			Error check routines
+ ****************************************************************************/
+
+/* This function is based on the device 3 function 4 registers as described on:
+ * Intel Xeon Processor 5500 Series Datasheet Volume 2
+ *	http://www.intel.com/Assets/PDF/datasheet/321322.pdf
+ * also available at:
+ *	http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf
+ */
+static void check_mc_test_err(struct mem_ctl_info *mci)
+{
+	struct i7core_pvt *pvt = mci->pvt_info;
+	u32 rcv1, rcv0;
+	int new0, new1, new2;
+
+	if (!pvt->pci_mcr[4]) {
+		debugf0("%s MCR registers not found\n", __func__);
+		return;
+	}
+
+	/* Corrected error reads */
+	pci_read_config_dword(pvt->pci_mcr[4], MC_TEST_ERR_RCV1, &rcv1);
+	pci_read_config_dword(pvt->pci_mcr[4], MC_TEST_ERR_RCV0, &rcv0);
+
+	/* Extract the per-DIMM corrected error counter fields */
+	new2 = DIMM2_COR_ERR(rcv1);
+	new1 = DIMM1_COR_ERR(rcv0);
+	new0 = DIMM0_COR_ERR(rcv0);
+
+	debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n",
+		(pvt->ce_count_available ? "UPDATE" : "READ"),
+		rcv1, rcv0, new0, new1, new2);
+
+	/* Updates CE counters if it is not the first time here */
+	if (pvt->ce_count_available) {
+		/* Fields are 15 bits wide: a negative delta is a wrap modulo 0x8000 */
+		int add0, add1, add2;
+
+		add2 = new2 - pvt->last_ce_count[2];
+		add1 = new1 - pvt->last_ce_count[1];
+		add0 = new0 - pvt->last_ce_count[0];
+
+		if (add2 < 0)
+			add2 += 0x8000;
+		pvt->ce_count[2] += add2;
+
+		if (add1 < 0)
+			add1 += 0x8000;
+		pvt->ce_count[1] += add1;
+
+		if (add0 < 0)
+			add0 += 0x8000;
+		pvt->ce_count[0] += add0;
+	} else
+		pvt->ce_count_available = 1;
+
+	/* Keep the raw readings as the baseline for the next call */
+	pvt->last_ce_count[2] = new2;
+	pvt->last_ce_count[1] = new1;
+	pvt->last_ce_count[0] = new0;
+}
+
 /*
  * i7core_check_error	Retrieve and process errors reported by the
  *			hardware. Called by the Core module.
  */
 static void i7core_check_error(struct mem_ctl_info *mci)
 {
-	/* FIXME: need a real code here */
+	check_mc_test_err(mci);
 }
 
 /*
-- 
2.39.5