]> git.karo-electronics.de Git - karo-tx-linux.git/commitdiff
NMI watchdog: fix for lockup detector breakage on resume
authorSameer Nanda <snanda@chromium.org>
Thu, 3 May 2012 05:44:13 +0000 (15:44 +1000)
committerStephen Rothwell <sfr@canb.auug.org.au>
Thu, 3 May 2012 05:46:43 +0000 (15:46 +1000)
On the suspend/resume path the boot CPU does not go through an
offline->online transition.  This breaks the NMI detector post-resume
since it depends on PMU state that is lost when the system gets suspended.

Fix this by forcing a CPU offline->online transition for the lockup
detector on the boot CPU during resume.

To provide more context, we enable NMI watchdog on Chrome OS.  We have
seen several reports of systems freezing up completely which indicated
that the NMI watchdog was not firing for some reason.

Debugging further, we found a simple way of repro'ing system freezes --
issuing the command 'taskset 1 sh -c "echo nmilockup > /proc/breakme"'
after the system has been suspended/resumed one or more times.

With this patch in place, the system freezes result in panics, as expected.
 These panics provide a nice stack trace for us to debug the actual issue
causing the freeze.

Signed-off-by: Sameer Nanda <snanda@chromium.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Mandeep Singh Baines <msb@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/sched.h
kernel/power/suspend.c
kernel/watchdog.c

index b563eeb43824a9af4ef96df34bea88c9715b34ee..6669e7294cc9f7f78960133fbf7fe741ce2b8fb9 100644 (file)
@@ -317,6 +317,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
                                  size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
 void lockup_detector_init(void);
+void lockup_detector_bootcpu_resume(void);
 #else
 static inline void touch_softlockup_watchdog(void)
 {
@@ -330,6 +331,9 @@ static inline void touch_all_softlockup_watchdogs(void)
 static inline void lockup_detector_init(void)
 {
 }
+static inline void lockup_detector_bootcpu_resume(void)
+{
+}
 #endif
 
 #ifdef CONFIG_DETECT_HUNG_TASK
index 396d262b8fd01381a99de04d2d310be787b37565..0d262a85c9a10b1568c8ccada118ae48be923b4b 100644 (file)
@@ -177,6 +177,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
        arch_suspend_enable_irqs();
        BUG_ON(irqs_disabled());
 
+       /* Kick the lockup detector */
+       lockup_detector_bootcpu_resume();
+
  Enable_cpus:
        enable_nonboot_cpus();
 
index e5e1d85b8c7c23090ce59b7e5b1e868535c85ef0..85b8f3233f30a3021a67892e75f4c3f5205e5a10 100644 (file)
@@ -597,6 +597,22 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
        .notifier_call = cpu_callback
 };
 
+void lockup_detector_bootcpu_resume(void)
+{
+       void *cpu = (void *)(long)smp_processor_id();
+
+       /*
+        * On the suspend/resume path the boot CPU does not go through the
+        * offline->online transition. This breaks the NMI detector post
+        * resume. Force an offline->online transition for the boot CPU on
+        * resume.
+        */
+       cpu_callback(&cpu_nfb, CPU_DEAD, cpu);
+       cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+
+       return;
+}
+
 void __init lockup_detector_init(void)
 {
        void *cpu = (void *)(long)smp_processor_id();