git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - drivers/edac/amd64_edac.c
Merge tag 'stable/for-linus-3.8-rc0-bugfix-tag' of git://git.kernel.org/pub/scm/linux...
[karo-tx-linux.git] / drivers / edac / amd64_edac.c
index cc8e7c78a23ca22972bfa570dc88d1fcf449e979..f74a684269ff6c8ceb25241d82816ddbab035888 100644 (file)
@@ -60,8 +60,8 @@ struct scrubrate {
        { 0x00, 0UL},        /* scrubbing off */
 };
 
-static int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,
-                                     u32 *val, const char *func)
+int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,
+                              u32 *val, const char *func)
 {
        int err = 0;
 
@@ -423,7 +423,6 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
                             u64 *hole_offset, u64 *hole_size)
 {
        struct amd64_pvt *pvt = mci->pvt_info;
-       u64 base;
 
        /* only revE and later have the DRAM Hole Address Register */
        if (boot_cpu_data.x86 == 0xf && pvt->ext_model < K8_REV_E) {
@@ -462,10 +461,8 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
         * addresses in the hole so that they start at 0x100000000.
         */
 
-       base = dhar_base(pvt);
-
-       *hole_base = base;
-       *hole_size = (0x1ull << 32) - base;
+       *hole_base = dhar_base(pvt);
+       *hole_size = (1ULL << 32) - *hole_base;
 
        if (boot_cpu_data.x86 > 0xf)
                *hole_offset = f10_dhar_offset(pvt);
@@ -513,15 +510,15 @@ static u64 sys_addr_to_dram_addr(struct mem_ctl_info *mci, u64 sys_addr)
 {
        struct amd64_pvt *pvt = mci->pvt_info;
        u64 dram_base, hole_base, hole_offset, hole_size, dram_addr;
-       int ret = 0;
+       int ret;
 
        dram_base = get_dram_base(pvt, pvt->mc_node_id);
 
        ret = amd64_get_dram_hole_info(mci, &hole_base, &hole_offset,
                                      &hole_size);
        if (!ret) {
-               if ((sys_addr >= (1ull << 32)) &&
-                   (sys_addr < ((1ull << 32) + hole_size))) {
+               if ((sys_addr >= (1ULL << 32)) &&
+                   (sys_addr < ((1ULL << 32) + hole_size))) {
                        /* use DHAR to translate SysAddr to DramAddr */
                        dram_addr = sys_addr - hole_offset;
 
@@ -712,10 +709,10 @@ static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci,
 
 /* Map the Error address to a PAGE and PAGE OFFSET. */
 static inline void error_address_to_page_and_offset(u64 error_address,
-                                                   u32 *page, u32 *offset)
+                                                   struct err_info *err)
 {
-       *page = (u32) (error_address >> PAGE_SHIFT);
-       *offset = ((u32) error_address) & ~PAGE_MASK;
+       err->page = (u32) (error_address >> PAGE_SHIFT);
+       err->offset = ((u32) error_address) & ~PAGE_MASK;
 }
 
 /*
@@ -1026,59 +1023,44 @@ static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range)
 }
 
 static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
-                                   u16 syndrome)
+                                   struct err_info *err)
 {
-       struct mem_ctl_info *src_mci;
        struct amd64_pvt *pvt = mci->pvt_info;
-       int channel, csrow;
-       u32 page, offset;
 
-       error_address_to_page_and_offset(sys_addr, &page, &offset);
+       error_address_to_page_and_offset(sys_addr, err);
 
        /*
         * Find out which node the error address belongs to. This may be
         * different from the node that detected the error.
         */
-       src_mci = find_mc_by_sys_addr(mci, sys_addr);
-       if (!src_mci) {
+       err->src_mci = find_mc_by_sys_addr(mci, sys_addr);
+       if (!err->src_mci) {
                amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n",
                             (unsigned long)sys_addr);
-               edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-                                    page, offset, syndrome,
-                                    -1, -1, -1,
-                                    "failed to map error addr to a node",
-                                    "");
+               err->err_code = ERR_NODE;
                return;
        }
 
        /* Now map the sys_addr to a CSROW */
-       csrow = sys_addr_to_csrow(src_mci, sys_addr);
-       if (csrow < 0) {
-               edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-                                    page, offset, syndrome,
-                                    -1, -1, -1,
-                                    "failed to map error addr to a csrow",
-                                    "");
+       err->csrow = sys_addr_to_csrow(err->src_mci, sys_addr);
+       if (err->csrow < 0) {
+               err->err_code = ERR_CSROW;
                return;
        }
 
        /* CHIPKILL enabled */
        if (pvt->nbcfg & NBCFG_CHIPKILL) {
-               channel = get_channel_from_ecc_syndrome(mci, syndrome);
-               if (channel < 0) {
+               err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
+               if (err->channel < 0) {
                        /*
                         * Syndrome didn't map, so we don't know which of the
                         * 2 DIMMs is in error. So we need to ID 'both' of them
                         * as suspect.
                         */
-                       amd64_mc_warn(src_mci, "unknown syndrome 0x%04x - "
+                       amd64_mc_warn(err->src_mci, "unknown syndrome 0x%04x - "
                                      "possible error reporting race\n",
-                                     syndrome);
-                       edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-                                            page, offset, syndrome,
-                                            csrow, -1, -1,
-                                            "unknown syndrome - possible error reporting race",
-                                            "");
+                                     err->syndrome);
+                       err->err_code = ERR_CHANNEL;
                        return;
                }
        } else {
@@ -1090,13 +1072,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
                 * was obtained from email communication with someone at AMD.
                 * (Wish the email was placed in this comment - norsk)
                 */
-               channel = ((sys_addr & BIT(3)) != 0);
+               err->channel = ((sys_addr & BIT(3)) != 0);
        }
-
-       edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, src_mci, 1,
-                            page, offset, syndrome,
-                            csrow, channel, -1,
-                            "", "");
 }
 
 static int ddr2_cs_size(unsigned i, bool dct_width)
@@ -1482,7 +1459,7 @@ static u64 f1x_swap_interleaved_region(struct amd64_pvt *pvt, u64 sys_addr)
 
 /* For a given @dram_range, check if @sys_addr falls within it. */
 static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
-                                 u64 sys_addr, int *nid, int *chan_sel)
+                                 u64 sys_addr, int *chan_sel)
 {
        int cs_found = -EINVAL;
        u64 chan_addr;
@@ -1555,15 +1532,14 @@ static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range,
 
        cs_found = f1x_lookup_addr_in_dct(chan_addr, node_id, channel);
 
-       if (cs_found >= 0) {
-               *nid = node_id;
+       if (cs_found >= 0)
                *chan_sel = channel;
-       }
+
        return cs_found;
 }
 
 static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
-                                      int *node, int *chan_sel)
+                                      int *chan_sel)
 {
        int cs_found = -EINVAL;
        unsigned range;
@@ -1577,8 +1553,7 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
                    (get_dram_limit(pvt, range) >= sys_addr)) {
 
                        cs_found = f1x_match_to_this_node(pvt, range,
-                                                         sys_addr, node,
-                                                         chan_sel);
+                                                         sys_addr, chan_sel);
                        if (cs_found >= 0)
                                break;
                }
@@ -1594,22 +1569,15 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
  * (MCX_ADDR).
  */
 static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
-                                    u16 syndrome)
+                                    struct err_info *err)
 {
        struct amd64_pvt *pvt = mci->pvt_info;
-       u32 page, offset;
-       int nid, csrow, chan = 0;
 
-       error_address_to_page_and_offset(sys_addr, &page, &offset);
+       error_address_to_page_and_offset(sys_addr, err);
 
-       csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan);
-
-       if (csrow < 0) {
-               edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-                                    page, offset, syndrome,
-                                    -1, -1, -1,
-                                    "failed to map error addr to a csrow",
-                                    "");
+       err->csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &err->channel);
+       if (err->csrow < 0) {
+               err->err_code = ERR_CSROW;
                return;
        }
 
@@ -1619,12 +1587,7 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
         * this point.
         */
        if (dct_ganging_enabled(pvt))
-               chan = get_channel_from_ecc_syndrome(mci, syndrome);
-
-       edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-                            page, offset, syndrome,
-                            csrow, chan, -1,
-                            "", "");
+               err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome);
 }
 
 /*
@@ -1633,14 +1596,11 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr,
  */
 static void amd64_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
 {
-       int dimm, size0, size1, factor = 0;
+       int dimm, size0, size1;
        u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases;
        u32 dbam  = ctrl ? pvt->dbam1 : pvt->dbam0;
 
        if (boot_cpu_data.x86 == 0xf) {
-               if (pvt->dclr0 & WIDTH_128)
-                       factor = 1;
-
                /* K8 families < revF not supported yet */
               if (pvt->ext_model < K8_REV_F)
                        return;
@@ -1671,8 +1631,8 @@ static void amd64_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
                                                     DBAM_DIMM(dimm, dbam));
 
                amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n",
-                               dimm * 2,     size0 << factor,
-                               dimm * 2 + 1, size1 << factor);
+                               dimm * 2,     size0,
+                               dimm * 2 + 1, size1);
        }
 }
 
@@ -1893,101 +1853,56 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
        return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz);
 }
 
-/*
- * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
- * ADDRESS and process.
- */
-static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-       u64 sys_addr;
-       u16 syndrome;
-
-       /* Ensure that the Error Address is VALID */
-       if (!(m->status & MCI_STATUS_ADDRV)) {
-               amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n");
-               edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-                                    0, 0, 0,
-                                    -1, -1, -1,
-                                    "HW has no ERROR_ADDRESS available",
-                                    "");
-               return;
-       }
-
-       sys_addr = get_error_address(m);
-       syndrome = extract_syndrome(m->status);
-
-       amd64_mc_err(mci, "CE ERROR_ADDRESS= 0x%llx\n", sys_addr);
-
-       pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, syndrome);
-}
-
-/* Handle any Un-correctable Errors (UEs) */
-static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m)
+static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err,
+                           u8 ecc_type)
 {
-       struct mem_ctl_info *log_mci, *src_mci = NULL;
-       int csrow;
-       u64 sys_addr;
-       u32 page, offset;
-
-       log_mci = mci;
+       enum hw_event_mc_err_type err_type;
+       const char *string;
 
-       if (!(m->status & MCI_STATUS_ADDRV)) {
-               amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n");
-               edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-                                    0, 0, 0,
-                                    -1, -1, -1,
-                                    "HW has no ERROR_ADDRESS available",
-                                    "");
+       if (ecc_type == 2)
+               err_type = HW_EVENT_ERR_CORRECTED;
+       else if (ecc_type == 1)
+               err_type = HW_EVENT_ERR_UNCORRECTED;
+       else {
+               WARN(1, "Something is rotten in the state of Denmark.\n");
                return;
        }
 
-       sys_addr = get_error_address(m);
-       error_address_to_page_and_offset(sys_addr, &page, &offset);
-
-       /*
-        * Find out which node the error address belongs to. This may be
-        * different from the node that detected the error.
-        */
-       src_mci = find_mc_by_sys_addr(mci, sys_addr);
-       if (!src_mci) {
-               amd64_mc_err(mci, "ERROR ADDRESS (0x%lx) NOT mapped to a MC\n",
-                                 (unsigned long)sys_addr);
-               edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-                                    page, offset, 0,
-                                    -1, -1, -1,
-                                    "ERROR ADDRESS NOT mapped to a MC",
-                                    "");
-               return;
+       switch (err->err_code) {
+       case DECODE_OK:
+               string = "";
+               break;
+       case ERR_NODE:
+               string = "Failed to map error addr to a node";
+               break;
+       case ERR_CSROW:
+               string = "Failed to map error addr to a csrow";
+               break;
+       case ERR_CHANNEL:
+               string = "unknown syndrome - possible error reporting race";
+               break;
+       default:
+               string = "WTF error";
+               break;
        }
 
-       log_mci = src_mci;
-
-       csrow = sys_addr_to_csrow(log_mci, sys_addr);
-       if (csrow < 0) {
-               amd64_mc_err(mci, "ERROR_ADDRESS (0x%lx) NOT mapped to CS\n",
-                                 (unsigned long)sys_addr);
-               edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-                                    page, offset, 0,
-                                    -1, -1, -1,
-                                    "ERROR ADDRESS NOT mapped to CS",
-                                    "");
-       } else {
-               edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-                                    page, offset, 0,
-                                    csrow, -1, -1,
-                                    "", "");
-       }
+       edac_mc_handle_error(err_type, mci, 1,
+                            err->page, err->offset, err->syndrome,
+                            err->csrow, err->channel, -1,
+                            string, "");
 }
 
 static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
                                            struct mce *m)
 {
-       u16 ec = EC(m->status);
-       u8 xec = XEC(m->status, 0x1f);
+       struct amd64_pvt *pvt = mci->pvt_info;
        u8 ecc_type = (m->status >> 45) & 0x3;
+       u8 xec = XEC(m->status, 0x1f);
+       u16 ec = EC(m->status);
+       u64 sys_addr;
+       struct err_info err;
 
-       /* Bail early out if this was an 'observed' error */
+       /* Bail out early if this was an 'observed' error */
        if (PP(ec) == NBSL_PP_OBS)
                return;
 
@@ -1995,10 +1910,16 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
        if (xec && xec != F10_NBSL_EXT_ERR_ECC)
                return;
 
+       memset(&err, 0, sizeof(err));
+
+       sys_addr = get_error_address(m);
+
        if (ecc_type == 2)
-               amd64_handle_ce(mci, m);
-       else if (ecc_type == 1)
-               amd64_handle_ue(mci, m);
+               err.syndrome = extract_syndrome(m->status);
+
+       pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, &err);
+
+       __log_bus_error(mci, &err, ecc_type);
 }
 
 void amd64_decode_bus_error(int node_id, struct mce *m)
@@ -2166,6 +2087,7 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr)
        u32 cs_mode, nr_pages;
        u32 dbam = dct ? pvt->dbam1 : pvt->dbam0;
 
+
        /*
         * The math on this doesn't look right on the surface because x/2*4 can
         * be simplified to x*2 but this expression makes use of the fact that
@@ -2173,13 +2095,13 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr)
         * number of bits to shift the DBAM register to extract the proper CSROW
         * field.
         */
-       cs_mode =  (dbam >> ((csrow_nr / 2) * 4)) & 0xF;
+       cs_mode = DBAM_DIMM(csrow_nr / 2, dbam);
 
        nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode) << (20 - PAGE_SHIFT);
 
-       edac_dbg(0, "  (csrow=%d) DBAM map index= %d\n", csrow_nr, cs_mode);
-       edac_dbg(0, "    nr_pages/channel= %u  channel-count = %d\n",
-                nr_pages, pvt->channel_count);
+       edac_dbg(0, "csrow: %d, channel: %d, DBAM idx: %d\n",
+                   csrow_nr, dct,  cs_mode);
+       edac_dbg(0, "nr_pages/channel: %u\n", nr_pages);
 
        return nr_pages;
 }
@@ -2190,15 +2112,14 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr)
  */
 static int init_csrows(struct mem_ctl_info *mci)
 {
+       struct amd64_pvt *pvt = mci->pvt_info;
        struct csrow_info *csrow;
        struct dimm_info *dimm;
-       struct amd64_pvt *pvt = mci->pvt_info;
-       u64 base, mask;
-       u32 val;
-       int i, j, empty = 1;
-       enum mem_type mtype;
        enum edac_type edac_mode;
+       enum mem_type mtype;
+       int i, j, empty = 1;
        int nr_pages = 0;
+       u32 val;
 
        amd64_read_pci_cfg(pvt->F3, NBCFG, &val);
 
@@ -2208,29 +2129,35 @@ static int init_csrows(struct mem_ctl_info *mci)
                 pvt->mc_node_id, val,
                 !!(val & NBCFG_CHIPKILL), !!(val & NBCFG_ECC_ENABLE));
 
+       /*
+        * We iterate over DCT0 here but we look at DCT1 in parallel, if needed.
+        */
        for_each_chip_select(i, 0, pvt) {
-               csrow = mci->csrows[i];
+               bool row_dct0 = !!csrow_enabled(i, 0, pvt);
+               bool row_dct1 = false;
+
+               if (boot_cpu_data.x86 != 0xf)
+                       row_dct1 = !!csrow_enabled(i, 1, pvt);
 
-               if (!csrow_enabled(i, 0, pvt) && !csrow_enabled(i, 1, pvt)) {
-                       edac_dbg(1, "----CSROW %d VALID for MC node %d\n",
-                                i, pvt->mc_node_id);
+               if (!row_dct0 && !row_dct1)
                        continue;
-               }
 
+               csrow = mci->csrows[i];
                empty = 0;
-               if (csrow_enabled(i, 0, pvt))
+
+               edac_dbg(1, "MC node: %d, csrow: %d\n",
+                           pvt->mc_node_id, i);
+
+               if (row_dct0)
                        nr_pages = amd64_csrow_nr_pages(pvt, 0, i);
-               if (csrow_enabled(i, 1, pvt))
-                       nr_pages += amd64_csrow_nr_pages(pvt, 1, i);
 
-               get_cs_base_and_mask(pvt, i, 0, &base, &mask);
-               /* 8 bytes of resolution */
+               /* K8 has only one DCT */
+               if (boot_cpu_data.x86 != 0xf && row_dct1)
+                       nr_pages += amd64_csrow_nr_pages(pvt, 1, i);
 
                mtype = amd64_determine_memory_type(pvt, i);
 
-               edac_dbg(1, "  for MC node %d csrow %d:\n", pvt->mc_node_id, i);
-               edac_dbg(1, "    nr_pages: %u\n",
-                        nr_pages * pvt->channel_count);
+               edac_dbg(1, "Total csrow%d pages: %u\n", i, nr_pages);
 
                /*
                 * determine whether CHIPKILL or JUST ECC or NO ECC is operating
@@ -2247,6 +2174,7 @@ static int init_csrows(struct mem_ctl_info *mci)
                        dimm->edac_mode = edac_mode;
                        dimm->nr_pages = nr_pages;
                }
+               csrow->nr_pages = nr_pages;
        }
 
        return empty;
@@ -2591,6 +2519,7 @@ static int amd64_init_one_instance(struct pci_dev *F2)
 
        mci->pvt_info = pvt;
        mci->pdev = &pvt->F2->dev;
+       mci->csbased = 1;
 
        setup_mci_misc_attrs(mci, fam_type);