From f388c7a38d6546cc0e84679b06ddcea4f5ad70d3 Mon Sep 17 00:00:00 2001 From: Sameer Nanda Date: Thu, 3 May 2012 15:44:13 +1000 Subject: [PATCH] NMI watchdog: fix for lockup detector breakage on resume On the suspend/resume path the boot CPU does not go through an offline->online transition. This breaks the NMI detector post-resume since it depends on PMU state that is lost when the system gets suspended. Fix this by forcing a CPU offline->online transition for the lockup detector on the boot CPU during resume. To provide more context, we enable the NMI watchdog on Chrome OS. We have seen several reports of systems freezing up completely, which indicated that the NMI watchdog was not firing for some reason. Debugging further, we found a simple way of repro'ing system freezes -- issuing the command 'taskset 1 sh -c "echo nmilockup > /proc/breakme"' after the system has been suspended/resumed one or more times. With this patch in place, system freezes result in panics, as expected. These panics provide a nice stack trace for us to debug the actual issue causing the freeze. Signed-off-by: Sameer Nanda Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Rafael J. 
Wysocki" Cc: Don Zickus Cc: Mandeep Singh Baines Signed-off-by: Andrew Morton --- include/linux/sched.h | 4 ++++ kernel/power/suspend.c | 3 +++ kernel/watchdog.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index b563eeb43824..6669e7294cc9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -317,6 +317,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; void lockup_detector_init(void); +void lockup_detector_bootcpu_resume(void); #else static inline void touch_softlockup_watchdog(void) { @@ -330,6 +331,9 @@ static inline void touch_all_softlockup_watchdogs(void) static inline void lockup_detector_init(void) { } +static inline void lockup_detector_bootcpu_resume(void) +{ +} #endif #ifdef CONFIG_DETECT_HUNG_TASK diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 396d262b8fd0..0d262a85c9a1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -177,6 +177,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + /* Kick the lockup detector */ + lockup_detector_bootcpu_resume(); + Enable_cpus: enable_nonboot_cpus(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85b8c7c..85b8f3233f30 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -597,6 +597,22 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; +void lockup_detector_bootcpu_resume(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + /* + * On the suspend/resume path the boot CPU does not go through the + * offline->online transition. This breaks the NMI detector post + * resume. Force an offline->online transition for the boot CPU on + * resume. 
+ */ + cpu_callback(&cpu_nfb, CPU_DEAD, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + + return; +} + void __init lockup_detector_init(void) { void *cpu = (void *)(long)smp_processor_id(); -- 2.39.5