From d02342151c51344034fbdeceff8effcb0a77c573 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Fri, 7 Aug 2009 10:44:22 +0000 Subject: [PATCH] myri10ge: improve parity error detection and recovery Improve myri10ge parity error detection and recovery: 1) Don't restore PCI config space to a rebooted NIC until AFTER the host is quiescent. 2) Let myri10ge_close() know the NIC is dead, so it won't waste time waiting for a dead nic to respond to MXGEFW_CMD_ETHERNET_DOWN 3) When the NIC is quiet (link down, or otherwise idle link) use a pci config space read to detect a rebooted NIC. Otherwise we might never notice that a NIC rebooted Signed-off-by: Andrew Gallatin Signed-off-by: Brice Goglin Signed-off-by: David S. Miller --- drivers/net/myri10ge/myri10ge.c | 63 ++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c index 1a34f7e11d98..75deef35b1e0 100644 --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -75,7 +75,7 @@ #include "myri10ge_mcp.h" #include "myri10ge_mcp_gen_header.h" -#define MYRI10GE_VERSION_STR "1.5.0-1.418" +#define MYRI10GE_VERSION_STR "1.5.0-1.432" MODULE_DESCRIPTION("Myricom 10G driver (10GbE)"); MODULE_AUTHOR("Maintainer: help@myri.com"); @@ -188,6 +188,7 @@ struct myri10ge_slice_state { dma_addr_t fw_stats_bus; int watchdog_tx_done; int watchdog_tx_req; + int watchdog_rx_done; #ifdef CONFIG_MYRI10GE_DCA int cached_dca_tag; int cpu; @@ -256,6 +257,7 @@ struct myri10ge_priv { u32 link_changes; u32 msg_enable; unsigned int board_number; + int rebooted; }; static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat"; @@ -2552,17 +2554,22 @@ static int myri10ge_close(struct net_device *dev) netif_carrier_off(dev); netif_tx_stop_all_queues(dev); - old_down_cnt = mgp->down_cnt; - mb(); - status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0); - if (status) - printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n", - dev->name); - - wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ); - if (old_down_cnt == mgp->down_cnt) - printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name); + if (mgp->rebooted == 0) { + old_down_cnt = mgp->down_cnt; + mb(); + status = + myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0); + if (status) + printk(KERN_ERR + "myri10ge: %s: Couldn't bring down link\n", + dev->name); + wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, + HZ); + if (old_down_cnt == mgp->down_cnt) + printk(KERN_ERR "myri10ge: %s never got down irq\n", + dev->name); + } netif_tx_disable(dev); myri10ge_free_irq(mgp); for (i = 0; i < mgp->num_slices; i++) @@ -3427,12 +3434,13 @@ static void myri10ge_watchdog(struct work_struct *work) container_of(work, struct myri10ge_priv, watchdog_work); struct myri10ge_tx_buf *tx; u32 reboot; - int status; + int status, rebooted; int i; u16 cmd, vendor; mgp->watchdog_resets++; pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd); + rebooted = 0; if ((cmd & PCI_COMMAND_MASTER) == 0) { /* Bus master DMA disabled? Check to see * if the card rebooted due to a parity error @@ -3444,9 +3452,12 @@ static void myri10ge_watchdog(struct work_struct *work) myri10ge_reset_recover ? " " : " not"); if (myri10ge_reset_recover == 0) return; - + rtnl_lock(); + mgp->rebooted = 1; + rebooted = 1; + myri10ge_close(mgp->dev); myri10ge_reset_recover--; - + mgp->rebooted = 0; /* * A rebooted nic will come back with config space as * it was after power was applied to PCIe bus. @@ -3494,8 +3505,10 @@ static void myri10ge_watchdog(struct work_struct *work) } } - rtnl_lock(); - myri10ge_close(mgp->dev); + if (!rebooted) { + rtnl_lock(); + myri10ge_close(mgp->dev); + } status = myri10ge_load_firmware(mgp, 1); if (status != 0) printk(KERN_ERR "myri10ge: %s: failed to load firmware\n", @@ -3516,12 +3529,14 @@ static void myri10ge_watchdog_timer(unsigned long arg) { struct myri10ge_priv *mgp; struct myri10ge_slice_state *ss; - int i, reset_needed; + int i, reset_needed, busy_slice_cnt; u32 rx_pause_cnt; + u16 cmd; mgp = (struct myri10ge_priv *)arg; rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause); + busy_slice_cnt = 0; for (i = 0, reset_needed = 0; i < mgp->num_slices && reset_needed == 0; ++i) { @@ -3559,8 +3574,22 @@ static void myri10ge_watchdog_timer(unsigned long arg) reset_needed = 1; } } + if (ss->watchdog_tx_done != ss->tx.done || + ss->watchdog_rx_done != ss->rx_done.cnt) { + busy_slice_cnt++; + } ss->watchdog_tx_done = ss->tx.done; ss->watchdog_tx_req = ss->tx.req; + ss->watchdog_rx_done = ss->rx_done.cnt; + } + /* if we've sent or received no traffic, poll the NIC to + * ensure it is still there. Otherwise, we risk not noticing + * an error in a timely fashion */ + if (busy_slice_cnt == 0) { + pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd); + if ((cmd & PCI_COMMAND_MASTER) == 0) { + reset_needed = 1; + } } mgp->watchdog_pause = rx_pause_cnt; -- 2.39.5