Merge tag 'v3.6-rc6' into x86/mce

author Ingo Molnar <mingo@kernel.org>
Wed, 19 Sep 2012 15:01:25 +0000 (17:01 +0200)
committer Ingo Molnar <mingo@kernel.org>
Wed, 19 Sep 2012 15:01:25 +0000 (17:01 +0200)
diff --combined arch/x86/kernel/cpu/mcheck/mce.c

index 8c1beea6cabfdb5a0d920ae5d633d91db8f0cf3d,292d0258311c82d04c5ec0aeab43924d00c669b4..c311122ea838301781d8d5e41723a37cf0e68dde
--- 1/arch/x86/kernel/cpu/mcheck/mce.c
--- 2/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@@ -103,6 -103,8 +103,8 @@@ DEFINE_PER_CPU(mce_banks_t, mce_poll_ba
   
   static DEFINE_PER_CPU(struct work_struct, mce_work);
   
+ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+ 
   /*
    * CPU/chipset specific EDAC code can register a notifier call here to print
    * MCE errors in a human-readable form.
@@@ -650,14 -652,18 +652,18 @@@ EXPORT_SYMBOL_GPL(machine_check_poll)
    * Do a quick check if any of the events requires a panic.
    * This decides if we keep the events around or clear them.
    */
- static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
+ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+                         struct pt_regs *regs)
   {
         int i, ret = 0;
   
         for (i = 0; i < banks; i++) {
                 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
-               if (m->status & MCI_STATUS_VAL)
+               if (m->status & MCI_STATUS_VAL) {
                         __set_bit(i, validp);
+                       if (quirk_no_way_out)
+                               quirk_no_way_out(i, m, regs);
+               }
                 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
                         ret = 1;
         }
@@@ -1040,7 -1046,7 +1046,7 @@@ void do_machine_check(struct pt_regs *r
         *final = m;
   
         memset(valid_banks, 0, sizeof(valid_banks));
-       no_way_out = mce_no_way_out(&m, &msg, valid_banks);
+       no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
   
         barrier();
   
@@@ -1260,14 -1266,6 +1266,14 @@@ static unsigned long check_interval = 
   static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
   static DEFINE_PER_CPU(struct timer_list, mce_timer);
   
+ +static unsigned long mce_adjust_timer_default(unsigned long interval)
+ +{
+ +      return interval;
+ +}
+ +
+ +static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+ +      mce_adjust_timer_default;
+ +
   static void mce_timer_fn(unsigned long data)
   {
         struct timer_list *t = &__get_cpu_var(mce_timer);
@@@ -1278,7 -1276,6 +1284,7 @@@
         if (mce_available(__this_cpu_ptr(&cpu_info))) {
                 machine_check_poll(MCP_TIMESTAMP,
                                 &__get_cpu_var(mce_poll_banks));
+ +              mce_intel_cmci_poll();
         }
   
         /*
@@@ -1286,38 -1283,14 +1292,38 @@@
          * polling interval, otherwise increase the polling interval.
          */
         iv = __this_cpu_read(mce_next_interval);
- -      if (mce_notify_irq())
+ +      if (mce_notify_irq()) {
                 iv = max(iv / 2, (unsigned long) HZ/100);
- -      else
+ +      } else {
                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ +              iv = mce_adjust_timer(iv);
+ +      }
         __this_cpu_write(mce_next_interval, iv);
+ +      /* Might have become 0 after CMCI storm subsided */
+ +      if (iv) {
+ +              t->expires = jiffies + iv;
+ +              add_timer_on(t, smp_processor_id());
+ +      }
+ +}
   
- -      t->expires = jiffies + iv;
- -      add_timer_on(t, smp_processor_id());
+ +/*
+ + * Ensure that the timer is firing in @interval from now.
+ + */
+ +void mce_timer_kick(unsigned long interval)
+ +{
+ +      struct timer_list *t = &__get_cpu_var(mce_timer);
+ +      unsigned long when = jiffies + interval;
+ +      unsigned long iv = __this_cpu_read(mce_next_interval);
+ +
+ +      if (timer_pending(t)) {
+ +              if (time_before(when, t->expires))
+ +                      mod_timer_pinned(t, when);
+ +      } else {
+ +              t->expires = round_jiffies(when);
+ +              add_timer_on(t, smp_processor_id());
+ +      }
+ +      if (interval < iv)
+ +              __this_cpu_write(mce_next_interval, interval);
   }
   
   /* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@@ -1451,6 -1424,34 +1457,34 @@@ static void __mcheck_cpu_init_generic(v
         }
   }
   
+ /*
+  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+  * Vol 3B Table 15-20). But this confuses both the code that determines
+  * whether the machine check occurred in kernel or user mode, and also
+  * the severity assessment code. Pretend that EIPV was set, and take the
+  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+  */
+ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+ {
+       if (bank != 0)
+               return;
+       if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+               return;
+       if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+                         MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+                         MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+                         MCACOD)) !=
+                        (MCI_STATUS_UC|MCI_STATUS_EN|
+                         MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+                         MCI_STATUS_AR|MCACOD_INSTR))
+               return;
+ 
+       m->mcgstatus |= MCG_STATUS_EIPV;
+       m->ip = regs->ip;
+       m->cs = regs->cs;
+ }
+ 
   /* Add per CPU specific workarounds here */
   static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
   {
@@@ -1548,6 -1549,9 +1582,9 @@@
                  */
                 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
                         mce_bootlog = 0;
+ 
+               if (c->x86 == 6 && c->x86_model == 45)
+                       quirk_no_way_out = quirk_sandybridge_ifu;
         }
         if (monarch_timeout < 0)
                 monarch_timeout = 0;
@@@ -1581,7 -1585,6 +1618,7 @@@ static void __mcheck_cpu_init_vendor(st
         switch (c->x86_vendor) {
         case X86_VENDOR_INTEL:
                 mce_intel_feature_init(c);
+ +              mce_adjust_timer = mce_intel_adjust_timer;
                 break;
         case X86_VENDOR_AMD:
                 mce_amd_feature_init(c);
@@@ -1591,28 -1594,23 +1628,28 @@@
         }
   }
   
- -static void __mcheck_cpu_init_timer(void)
+ +static void mce_start_timer(unsigned int cpu, struct timer_list *t)
   {
- -      struct timer_list *t = &__get_cpu_var(mce_timer);
- -      unsigned long iv = check_interval * HZ;
+ +      unsigned long iv = mce_adjust_timer(check_interval * HZ);
   
- -      setup_timer(t, mce_timer_fn, smp_processor_id());
+ +      __this_cpu_write(mce_next_interval, iv);
   
- -      if (mce_ignore_ce)
+ +      if (mce_ignore_ce || !iv)
                 return;
   
- -      __this_cpu_write(mce_next_interval, iv);
- -      if (!iv)
- -              return;
         t->expires = round_jiffies(jiffies + iv);
         add_timer_on(t, smp_processor_id());
   }
   
+ +static void __mcheck_cpu_init_timer(void)
+ +{
+ +      struct timer_list *t = &__get_cpu_var(mce_timer);
+ +      unsigned int cpu = smp_processor_id();
+ +
+ +      setup_timer(t, mce_timer_fn, cpu);
+ +      mce_start_timer(cpu, t);
+ +}
+ +
   /* Handle unconfigured int18 (should never happen) */
   static void unexpected_machine_check(struct pt_regs *regs, long error_code)
   {
@@@ -2296,33 -2294,38 +2333,33 @@@ mce_cpu_callback(struct notifier_block 
         unsigned int cpu = (unsigned long)hcpu;
         struct timer_list *t = &per_cpu(mce_timer, cpu);
   
- -      switch (action) {
+ +      switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_ONLINE:
- -      case CPU_ONLINE_FROZEN:
                 mce_device_create(cpu);
                 if (threshold_cpu_callback)
                         threshold_cpu_callback(action, cpu);
                 break;
         case CPU_DEAD:
- -      case CPU_DEAD_FROZEN:
                 if (threshold_cpu_callback)
                         threshold_cpu_callback(action, cpu);
                 mce_device_remove(cpu);
+ +              mce_intel_hcpu_update(cpu);
                 break;
         case CPU_DOWN_PREPARE:
- -      case CPU_DOWN_PREPARE_FROZEN:
- -              del_timer_sync(t);
                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ +              del_timer_sync(t);
                 break;
         case CPU_DOWN_FAILED:
- -      case CPU_DOWN_FAILED_FROZEN:
- -              if (!mce_ignore_ce && check_interval) {
- -                      t->expires = round_jiffies(jiffies +
- -                                      per_cpu(mce_next_interval, cpu));
- -                      add_timer_on(t, cpu);
- -              }
                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ +              mce_start_timer(cpu, t);
                 break;
- -      case CPU_POST_DEAD:
+ +      }
+ +
+ +      if (action == CPU_POST_DEAD) {
                 /* intentionally ignoring frozen here */
                 cmci_rediscover(cpu);
- -              break;
         }
+ +
         return NOTIFY_OK;
   }
author	Ingo Molnar <mingo@kernel.org>
	Wed, 19 Sep 2012 15:01:25 +0000 (17:01 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Wed, 19 Sep 2012 15:01:25 +0000 (17:01 +0200)