perf_counter: Generic per counter interrupt throttle
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3f3ae477a7dc6d9aec2f80e31b6862f2aa3f5614..c4b543d1a86fe4dbd60cb1c040250ad0eec5e8bd 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1,10 +1,11 @@
 /*
  * Performance counter x86 architecture code
  *
- *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
- *  Copyright(C) 2009 Jaswinder Singh Rajput
- *  Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  *  For licencing details see kernel-base/COPYING
  */
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
-static bool perf_counters_initialized __read_mostly;
 static u64 perf_counter_mask __read_mostly;
 
 struct cpu_hw_counters {
        struct perf_counter     *counters[X86_PMC_IDX_MAX];
-       unsigned long           used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       unsigned long           interrupts;
-       u64                     throttle_ctrl;
+       unsigned long           used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
        unsigned long           active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       unsigned long           interrupts;
        int                     enabled;
 };
 
@@ -42,10 +41,10 @@ struct x86_pmu {
        const char      *name;
        int             version;
        int             (*handle_irq)(struct pt_regs *, int);
-       u64             (*save_disable_all)(void);
-       void            (*restore_all)(u64);
-       void            (*enable)(int, u64);
-       void            (*disable)(int, u64);
+       void            (*disable_all)(void);
+       void            (*enable_all)(void);
+       void            (*enable)(struct hw_perf_counter *, int);
+       void            (*disable)(struct hw_perf_counter *, int);
        unsigned        eventsel;
        unsigned        perfctr;
        u64             (*event_map)(int);
@@ -55,6 +54,8 @@ struct x86_pmu {
        int             num_counters_fixed;
        int             counter_bits;
        u64             counter_mask;
+       u64             max_period;
+       u64             intel_ctrl;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
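The reworked struct x86_pmu reduces the vendor differences to a table of function pointers that the generic code calls through (x86_pmu.disable_all(), x86_pmu.enable(hwc, idx) and so on in the hunks below); the per-counter hooks now take the hw counter itself rather than an (idx, config) pair. A minimal stand-alone sketch of the dispatch pattern, with all names invented for illustration:

#include <stdio.h>

/* illustrative only: a cut-down model of the x86_pmu function-pointer table */
struct hw_counter { unsigned long long config; };

struct pmu_ops {
        const char *name;
        void (*disable_all)(void);
        void (*enable)(struct hw_counter *hwc, int idx);
};

static void fake_intel_disable_all(void)
{
        printf("intel: clear the global enable MSR\n");
}

static void fake_intel_enable(struct hw_counter *hwc, int idx)
{
        printf("intel: enable counter %d, config %#llx\n", idx, hwc->config);
}

/* filled in once at init time, used everywhere else without vendor checks */
static struct pmu_ops pmu = {
        .name           = "Intel",
        .disable_all    = fake_intel_disable_all,
        .enable         = fake_intel_enable,
};

int main(void)
{
        struct hw_counter hwc = { .config = 0xc0 };

        pmu.disable_all();              /* generic code never tests the vendor */
        pmu.enable(&hwc, 0);
        return 0;
}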
@@ -86,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK                0x000000FFULL
 #define CORE_EVNTSEL_UNIT_MASK         0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK         0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK          0x00800000ULL
 #define CORE_EVNTSEL_COUNTER_MASK      0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK              \
        (CORE_EVNTSEL_EVENT_MASK |      \
         CORE_EVNTSEL_UNIT_MASK  |      \
+        CORE_EVNTSEL_EDGE_MASK  |      \
+        CORE_EVNTSEL_INV_MASK  |       \
         CORE_EVNTSEL_COUNTER_MASK)
 
        return event & CORE_EVNTSEL_MASK;
@@ -118,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event)
 {
 #define K7_EVNTSEL_EVENT_MASK  0x7000000FFULL
 #define K7_EVNTSEL_UNIT_MASK   0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK   0x000040000ULL
+#define K7_EVNTSEL_INV_MASK    0x000800000ULL
 #define K7_EVNTSEL_COUNTER_MASK        0x0FF000000ULL
 
 #define K7_EVNTSEL_MASK                        \
        (K7_EVNTSEL_EVENT_MASK |        \
         K7_EVNTSEL_UNIT_MASK  |        \
+        K7_EVNTSEL_EDGE_MASK  |        \
+        K7_EVNTSEL_INV_MASK   |        \
         K7_EVNTSEL_COUNTER_MASK)
 
        return event & K7_EVNTSEL_MASK;
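Both raw_event filters above gain the edge-detect bit (bit 18) and the invert bit (bit 23), so user-supplied raw configs may now carry those modifiers through. A stand-alone check of which bits survive the Intel mask; the #defines are copied from the hunk above, the sample config is made up:

#include <stdio.h>

#define CORE_EVNTSEL_EVENT_MASK         0x000000FFULL
#define CORE_EVNTSEL_UNIT_MASK          0x0000FF00ULL
#define CORE_EVNTSEL_EDGE_MASK          0x00040000ULL
#define CORE_EVNTSEL_INV_MASK           0x00800000ULL
#define CORE_EVNTSEL_COUNTER_MASK       0xFF000000ULL

#define CORE_EVNTSEL_MASK               \
        (CORE_EVNTSEL_EVENT_MASK |      \
         CORE_EVNTSEL_UNIT_MASK  |      \
         CORE_EVNTSEL_EDGE_MASK  |      \
         CORE_EVNTSEL_INV_MASK   |      \
         CORE_EVNTSEL_COUNTER_MASK)

int main(void)
{
        /* made-up raw config: event 0x3c, umask 0x01, USR, OS, EDGE and INV set */
        unsigned long long raw = 0x0084013cULL | 0x00030000ULL;

        /* USR/OS (bits 16/17) are stripped here; EDGE/INV now pass through */
        printf("raw %#010llx -> filtered %#010llx\n",
               raw, raw & CORE_EVNTSEL_MASK);
        return 0;
}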
@@ -133,11 +142,13 @@ static u64 amd_pmu_raw_event(u64 event)
  * Can only be executed on the CPU where the counter is active.
  * Returns the delta events processed.
  */
-static void
+static u64
 x86_perf_counter_update(struct perf_counter *counter,
                        struct hw_perf_counter *hwc, int idx)
 {
-       u64 prev_raw_count, new_raw_count, delta;
+       int shift = 64 - x86_pmu.counter_bits;
+       u64 prev_raw_count, new_raw_count;
+       s64 delta;
 
        /*
         * Careful: an NMI might modify the previous counter value.
@@ -160,15 +171,18 @@ again:
         * (counter-)time and add that to the generic counter.
         *
         * Careful, not all hw sign-extends above the physical width
-        * of the count, so we do that by clipping the delta to 32 bits:
+        * of the count.
         */
-       delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+       delta = (new_raw_count << shift) - (prev_raw_count << shift);
+       delta >>= shift;
 
        atomic64_add(delta, &counter->count);
        atomic64_sub(delta, &hwc->period_left);
+
+       return new_raw_count;
 }
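The delta computation above replaces the old hard-coded 32-bit clipping with a shift by (64 - counter_bits), so it works for any physical counter width that does not sign-extend in hardware. A stand-alone worked example of the shift pair, with an arbitrarily chosen 40-bit width and made-up raw values that straddle a wrap of the physical counter:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int counter_bits = 40;                  /* arbitrary width for the demo */
        int shift = 64 - counter_bits;

        /* previous read near the top of the 40-bit range, next read after a wrap */
        uint64_t prev_raw_count = 0xfffffffff0ULL;
        uint64_t new_raw_count  = 0x0000000010ULL;
        int64_t delta;

        /*
         * Same arithmetic as x86_perf_counter_update(): shift both values up
         * so the physical MSB lands in bit 63, subtract, then shift back down
         * arithmetically so the difference is interpreted at the right width.
         */
        delta = (new_raw_count << shift) - (prev_raw_count << shift);
        delta >>= shift;

        printf("delta = %lld (expected 32)\n", (long long)delta);
        return 0;
}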
 
-static atomic_t num_counters;
+static atomic_t active_counters;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
 static bool reserve_pmc_hardware(void)
@@ -221,12 +235,17 @@ static void release_pmc_hardware(void)
 
 static void hw_perf_counter_destroy(struct perf_counter *counter)
 {
-       if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
+       if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
                release_pmc_hardware();
                mutex_unlock(&pmc_reserve_mutex);
        }
 }
 
+static inline int x86_pmu_initialized(void)
+{
+       return x86_pmu.handle_irq != NULL;
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -236,20 +255,16 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
        struct hw_perf_counter *hwc = &counter->hw;
        int err;
 
-       /* disable temporarily */
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-               return -ENOSYS;
-
-       if (unlikely(!perf_counters_initialized))
-               return -EINVAL;
+       if (!x86_pmu_initialized())
+               return -ENODEV;
 
        err = 0;
-       if (atomic_inc_not_zero(&num_counters)) {
+       if (!atomic_inc_not_zero(&active_counters)) {
                mutex_lock(&pmc_reserve_mutex);
-               if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
+               if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
                        err = -EBUSY;
                else
-                       atomic_inc(&num_counters);
+                       atomic_inc(&active_counters);
                mutex_unlock(&pmc_reserve_mutex);
        }
        if (err)
@@ -273,20 +288,18 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
         * If privileged enough, allow NMI events:
         */
        hwc->nmi = 0;
-       if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
+       if (hw_event->nmi) {
+               if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
                hwc->nmi = 1;
+       }
+       perf_counters_lapic_init(hwc->nmi);
 
-       hwc->irq_period         = hw_event->irq_period;
-       /*
-        * Intel PMCs cannot be accessed sanely above 32 bit width,
-        * so we install an artificial 1<<31 period regardless of
-        * the generic counter period:
-        */
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-               if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
-                       hwc->irq_period = 0x7FFFFFFF;
+       if (!hwc->irq_period)
+               hwc->irq_period = x86_pmu.max_period;
 
-       atomic64_set(&hwc->period_left, hwc->irq_period);
+       atomic64_set(&hwc->period_left,
+                       min(x86_pmu.max_period, hwc->irq_period));
 
        /*
         * Raw event type provide the config in the event structure
@@ -307,22 +320,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
        return 0;
 }
 
-static u64 intel_pmu_save_disable_all(void)
+static void intel_pmu_disable_all(void)
 {
-       u64 ctrl;
-
-       rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
        wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
-       return ctrl;
 }
 
-static u64 amd_pmu_save_disable_all(void)
+static void amd_pmu_disable_all(void)
 {
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-       int enabled, idx;
+       int idx;
+
+       if (!cpuc->enabled)
+               return;
 
-       enabled = cpuc->enabled;
        cpuc->enabled = 0;
        /*
         * ensure we write the disable before we start disabling the
@@ -342,37 +352,31 @@ static u64 amd_pmu_save_disable_all(void)
                val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
                wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
        }
-
-       return enabled;
 }
 
-u64 hw_perf_save_disable(void)
+void hw_perf_disable(void)
 {
-       if (unlikely(!perf_counters_initialized))
-               return 0;
-
-       return x86_pmu.save_disable_all();
+       if (!x86_pmu_initialized())
+               return;
+       return x86_pmu.disable_all();
 }
-/*
- * Exported because of ACPI idle
- */
-EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
-static void intel_pmu_restore_all(u64 ctrl)
+static void intel_pmu_enable_all(void)
 {
-       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+       wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 }
 
-static void amd_pmu_restore_all(u64 ctrl)
+static void amd_pmu_enable_all(void)
 {
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        int idx;
 
-       cpuc->enabled = ctrl;
-       barrier();
-       if (!ctrl)
+       if (cpuc->enabled)
                return;
 
+       cpuc->enabled = 1;
+       barrier();
+
        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                u64 val;
 
@@ -386,24 +390,17 @@ static void amd_pmu_restore_all(u64 ctrl)
        }
 }
 
-void hw_perf_restore(u64 ctrl)
+void hw_perf_enable(void)
 {
-       if (unlikely(!perf_counters_initialized))
+       if (!x86_pmu_initialized())
                return;
-
-       x86_pmu.restore_all(ctrl);
+       x86_pmu.enable_all();
 }
-/*
- * Exported because of ACPI idle
- */
-EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static inline u64 intel_pmu_get_status(u64 mask)
+static inline u64 intel_pmu_get_status(void)
 {
        u64 status;
 
-       if (unlikely(!perf_counters_initialized))
-               return 0;
        rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 
        return status;
@@ -414,56 +411,22 @@ static inline void intel_pmu_ack_status(u64 ack)
        wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void intel_pmu_enable_counter(int idx, u64 config)
-{
-       wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
-                       config | ARCH_PERFMON_EVENTSEL0_ENABLE);
-}
-
-static void amd_pmu_enable_counter(int idx, u64 config)
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-       set_bit(idx, cpuc->active_mask);
-       if (cpuc->enabled)
-               config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
-       wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
-}
-
-static void hw_perf_enable(int idx, u64 config)
-{
-       if (unlikely(!perf_counters_initialized))
-               return;
-
-       x86_pmu.enable(idx, config);
-}
-
-static void intel_pmu_disable_counter(int idx, u64 config)
-{
-       wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
-}
-
-static void amd_pmu_disable_counter(int idx, u64 config)
-{
-       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-       clear_bit(idx, cpuc->active_mask);
-       wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
-
+       int err;
+       err = checking_wrmsrl(hwc->config_base + idx,
+                             hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
-static void hw_perf_disable(int idx, u64 config)
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-       if (unlikely(!perf_counters_initialized))
-               return;
-
-       x86_pmu.disable(idx, config);
+       int err;
+       err = checking_wrmsrl(hwc->config_base + idx,
+                             hwc->config);
 }
 
 static inline void
-__pmc_fixed_disable(struct perf_counter *counter,
-                   struct hw_perf_counter *hwc, unsigned int __idx)
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, mask;
@@ -477,13 +440,20 @@ __pmc_fixed_disable(struct perf_counter *counter,
 }
 
 static inline void
-__x86_pmu_disable(struct perf_counter *counter,
-                 struct hw_perf_counter *hwc, unsigned int idx)
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-               __pmc_fixed_disable(counter, hwc, idx);
-       else
-               hw_perf_disable(idx, hwc->config);
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_disable_fixed(hwc, idx);
+               return;
+       }
+
+       x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+       x86_pmu_disable_counter(hwc, idx);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -497,7 +467,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
                             struct hw_perf_counter *hwc, int idx)
 {
        s64 left = atomic64_read(&hwc->period_left);
-       s64 period = hwc->irq_period;
+       s64 period = min(x86_pmu.max_period, hwc->irq_period);
        int err;
 
        /*
@@ -512,6 +482,11 @@ x86_perf_counter_set_period(struct perf_counter *counter,
                left += period;
                atomic64_set(&hwc->period_left, left);
        }
+       /*
+        * Quirk: certain CPUs don't like it if just 1 event is left:
+        */
+       if (unlikely(left < 2))
+               left = 2;
 
        per_cpu(prev_left[idx], smp_processor_id()) = left;
 
@@ -526,8 +501,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 }
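x86_perf_counter_set_period() now clamps the sampling period to x86_pmu.max_period and never arms the counter with fewer than 2 events to go. The unchanged tail of the function (outside this hunk) writes the negated value, masked to the counter width, into the counter MSR. A stand-alone model of that arithmetic with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const int counter_bits = 48;            /* AMD-style width */
        const uint64_t counter_mask = (1ULL << counter_bits) - 1;
        const int64_t max_period = (1LL << 47) - 1;

        int64_t irq_period = 10000;             /* made-up sampling period */
        int64_t left = -3;                      /* the last period was overshot by 3 */
        int64_t period = irq_period < max_period ? irq_period : max_period;
        uint64_t armed;

        if (left <= 0)
                left += period;                 /* carry the overshoot into the next period */
        if (left < 2)
                left = 2;                       /* quirk: never leave just 1 event */

        /* the counter counts up from here and overflows after 'left' events */
        armed = (uint64_t)(-left) & counter_mask;

        printf("left=%lld armed=%#llx: overflow after %lld events\n",
               (long long)left, (unsigned long long)armed, (long long)left);
        return 0;
}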
 
 static inline void
-__pmc_fixed_enable(struct perf_counter *counter,
-                  struct hw_perf_counter *hwc, unsigned int __idx)
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
        int idx = __idx - X86_PMC_IDX_FIXED;
        u64 ctrl_val, bits, mask;
@@ -552,14 +526,24 @@ __pmc_fixed_enable(struct perf_counter *counter,
        err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
-static void
-__x86_pmu_enable(struct perf_counter *counter,
-                struct hw_perf_counter *hwc, int idx)
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+               intel_pmu_enable_fixed(hwc, idx);
+               return;
+       }
+
+       x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-       if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-               __pmc_fixed_enable(counter, hwc, idx);
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+       if (cpuc->enabled)
+               x86_pmu_enable_counter(hwc, idx);
        else
-               hw_perf_enable(idx, hwc->config);
+               x86_pmu_disable_counter(hwc, idx);
 }
 
 static int
@@ -567,7 +551,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
        unsigned int event;
 
-       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+       if (!x86_pmu.num_counters_fixed)
                return -1;
 
        if (unlikely(hwc->nmi))
@@ -600,7 +584,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
                 * Try to get the fixed counter, if that is already taken
                 * then try to get a generic counter:
                 */
-               if (test_and_set_bit(idx, cpuc->used))
+               if (test_and_set_bit(idx, cpuc->used_mask))
                        goto try_generic;
 
                hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
@@ -614,46 +598,54 @@ static int x86_pmu_enable(struct perf_counter *counter)
        } else {
                idx = hwc->idx;
                /* Try to get the previous generic counter again */
-               if (test_and_set_bit(idx, cpuc->used)) {
+               if (test_and_set_bit(idx, cpuc->used_mask)) {
 try_generic:
-                       idx = find_first_zero_bit(cpuc->used,
+                       idx = find_first_zero_bit(cpuc->used_mask,
                                                  x86_pmu.num_counters);
                        if (idx == x86_pmu.num_counters)
                                return -EAGAIN;
 
-                       set_bit(idx, cpuc->used);
+                       set_bit(idx, cpuc->used_mask);
                        hwc->idx = idx;
                }
                hwc->config_base  = x86_pmu.eventsel;
                hwc->counter_base = x86_pmu.perfctr;
        }
 
-       perf_counters_lapic_init(hwc->nmi);
-
-       __x86_pmu_disable(counter, hwc, idx);
+       x86_pmu.disable(hwc, idx);
 
        cpuc->counters[idx] = counter;
-       /*
-        * Make it visible before enabling the hw:
-        */
-       barrier();
+       set_bit(idx, cpuc->active_mask);
 
        x86_perf_counter_set_period(counter, hwc, idx);
-       __x86_pmu_enable(counter, hwc, idx);
+       x86_pmu.enable(hwc, idx);
 
        return 0;
 }
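x86_pmu_enable() hands out counter slots from the renamed used_mask bitmap: the fixed-purpose slot for the event if there is one, else the generic slot the event used last time, else the first free generic counter. A stand-alone toy of the generic-slot part of that allocation; test_and_set_bit()/find_first_zero_bit() are replaced by a trivial local substitute:

#include <stdio.h>

#define NUM_COUNTERS 4

static unsigned long used_mask;         /* toy stand-in for cpuc->used_mask */

static int test_and_set(int idx)
{
        int was_set = !!(used_mask & (1UL << idx));

        used_mask |= 1UL << idx;
        return was_set;
}

static int alloc_counter(int prev_idx)
{
        int idx;

        /* try to reuse the slot this event had last time */
        if (prev_idx >= 0 && !test_and_set(prev_idx))
                return prev_idx;

        /* otherwise take the first free generic counter */
        for (idx = 0; idx < NUM_COUNTERS; idx++)
                if (!test_and_set(idx))
                        return idx;

        return -1;                      /* -EAGAIN in the real code */
}

int main(void)
{
        printf("first:  %d\n", alloc_counter(-1));      /* 0 */
        printf("second: %d\n", alloc_counter(-1));      /* 1 */
        printf("reuse:  %d\n", alloc_counter(3));       /* 3 */
        printf("third:  %d\n", alloc_counter(-1));      /* 2 */
        return 0;
}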
 
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+       struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+       struct hw_perf_counter *hwc = &counter->hw;
+
+       if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+                               cpuc->counters[hwc->idx] != counter))
+               return;
+
+       x86_pmu.enable(hwc, hwc->idx);
+}
+
 void perf_counter_print_debug(void)
 {
        u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
        struct cpu_hw_counters *cpuc;
+       unsigned long flags;
        int cpu, idx;
 
        if (!x86_pmu.num_counters)
                return;
 
-       local_irq_disable();
+       local_irq_save(flags);
 
        cpu = smp_processor_id();
        cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -670,7 +662,7 @@ void perf_counter_print_debug(void)
                pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
                pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
        }
-       pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
+       pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
 
        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
                rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -691,19 +683,22 @@ void perf_counter_print_debug(void)
                pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
        }
-       local_irq_enable();
+       local_irq_restore(flags);
 }
 
 static void x86_pmu_disable(struct perf_counter *counter)
 {
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
        struct hw_perf_counter *hwc = &counter->hw;
-       unsigned int idx = hwc->idx;
+       int idx = hwc->idx;
 
-       __x86_pmu_disable(counter, hwc, idx);
+       /*
+        * Must be done before we disable, otherwise the nmi handler
+        * could reenable again:
+        */
+       clear_bit(idx, cpuc->active_mask);
+       x86_pmu.disable(hwc, idx);
 
-       clear_bit(idx, cpuc->used);
-       cpuc->counters[idx] = NULL;
        /*
         * Make sure the cleared pointer becomes visible before we
         * (potentially) free the counter:
@@ -715,6 +710,8 @@ static void x86_pmu_disable(struct perf_counter *counter)
         * that we are disabling:
         */
        x86_perf_counter_update(counter, hwc, idx);
+       cpuc->counters[idx] = NULL;
+       clear_bit(idx, cpuc->used_mask);
 }
 
 /*
@@ -730,45 +727,50 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
        x86_perf_counter_set_period(counter, hwc, idx);
 
        if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-               __x86_pmu_enable(counter, hwc, idx);
+               intel_pmu_enable_counter(hwc, idx);
 }
 
-/*
- * Maximum interrupt frequency of 100KHz per CPU
- */
-#define PERFMON_MAX_INTERRUPTS (100000/HZ)
-
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
 static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-       int bit, cpu = smp_processor_id();
+       struct cpu_hw_counters *cpuc;
+       int bit, cpu, loops;
        u64 ack, status;
-       struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
-       int ret = 0;
 
-       cpuc->throttle_ctrl = intel_pmu_save_disable_all();
+       cpu = smp_processor_id();
+       cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-       status = intel_pmu_get_status(cpuc->throttle_ctrl);
-       if (!status)
-               goto out;
+       perf_disable();
+       status = intel_pmu_get_status();
+       if (!status) {
+               perf_enable();
+               return 0;
+       }
 
-       ret = 1;
+       loops = 0;
 again:
+       if (++loops > 100) {
+               WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+               perf_counter_print_debug();
+               return 1;
+       }
+
        inc_irq_stat(apic_perf_irqs);
        ack = status;
        for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
                struct perf_counter *counter = cpuc->counters[bit];
 
                clear_bit(bit, (unsigned long *) &status);
-               if (!counter)
+               if (!test_bit(bit, cpuc->active_mask))
                        continue;
 
                intel_pmu_save_and_restart(counter);
                if (perf_counter_overflow(counter, nmi, regs, 0))
-                       __x86_pmu_disable(counter, &counter->hw, bit);
+                       intel_pmu_disable_counter(&counter->hw, bit);
        }
 
        intel_pmu_ack_status(ack);
@@ -776,38 +778,49 @@ again:
        /*
         * Repeat if there is more work to be done:
         */
-       status = intel_pmu_get_status(cpuc->throttle_ctrl);
+       status = intel_pmu_get_status();
        if (status)
                goto again;
-out:
-       /*
-        * Restore - do not reenable when global enable is off or throttled:
-        */
-       if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-               intel_pmu_restore_all(cpuc->throttle_ctrl);
 
-       return ret;
-}
+       perf_enable();
 
-static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; }
+       return 1;
+}
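intel_pmu_handle_irq() now drains GLOBAL_STATUS in a loop under perf_disable()/perf_enable(), acking what it handled each pass and warning if the status refuses to clear. The control flow, reduced to a stand-alone skeleton with a faked status register (nothing hardware-related here is real):

#include <stdio.h>
#include <stdint.h>

static uint64_t fake_status = 0x5;      /* pretend counters 0 and 2 overflowed */

static uint64_t get_status(void)   { return fake_status; }
static void ack_status(uint64_t a) { fake_status &= ~a; }

int main(void)
{
        uint64_t status, ack;
        int bit, loops = 0;

        status = get_status();
        while (status) {
                if (++loops > 100) {    /* guard against a status that never clears */
                        fprintf(stderr, "irq loop stuck!\n");
                        return 1;
                }
                ack = status;
                for (bit = 0; bit < 64; bit++)
                        if (status & (1ULL << bit))
                                printf("handle overflow on counter %d\n", bit);
                ack_status(ack);
                status = get_status();  /* repeat if more overflows arrived meanwhile */
        }
        return 0;
}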
 
-void perf_counter_unthrottle(void)
+static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
+       int cpu, idx, handled = 0;
        struct cpu_hw_counters *cpuc;
+       struct perf_counter *counter;
+       struct hw_perf_counter *hwc;
+       u64 val;
 
-       if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-               return;
+       cpu = smp_processor_id();
+       cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-       if (unlikely(!perf_counters_initialized))
-               return;
+       for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+               if (!test_bit(idx, cpuc->active_mask))
+                       continue;
 
-       cpuc = &__get_cpu_var(cpu_hw_counters);
-       if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-               if (printk_ratelimit())
-                       printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
-               hw_perf_restore(cpuc->throttle_ctrl);
+               counter = cpuc->counters[idx];
+               hwc = &counter->hw;
+
+               if (counter->hw_event.nmi != nmi)
+                       continue;
+
+               val = x86_perf_counter_update(counter, hwc, idx);
+               if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+                       continue;
+
+               /* counter overflow */
+               x86_perf_counter_set_period(counter, hwc, idx);
+               handled = 1;
+               inc_irq_stat(apic_perf_irqs);
+               if (perf_counter_overflow(counter, nmi, regs, 0))
+                       amd_pmu_disable_counter(hwc, idx);
        }
-       cpuc->interrupts = 0;
+
+       return handled;
 }
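amd_pmu_handle_irq() has no global overflow status register to consult, so it detects an overflow by the top bit of the raw count going clear: the counter is armed with a negated period no larger than max_period = (1<<47) - 1, which keeps bit 47 set right up until the 48-bit counter wraps. A stand-alone illustration of that test with an invented period:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const int counter_bits = 48;
        const uint64_t counter_mask = (1ULL << counter_bits) - 1;
        const uint64_t msb = 1ULL << (counter_bits - 1);

        uint64_t period = 1000;                         /* invented sampling period */
        uint64_t val = (uint64_t)(0 - period) & counter_mask;   /* armed value, bit 47 set */
        uint64_t i;

        for (i = 0; i <= period; i++) {
                if (!(val & msb)) {
                        /* same test as the handler: MSB clear means the counter wrapped */
                        printf("overflow detected after %llu events\n",
                               (unsigned long long)i);
                        break;
                }
                val = (val + 1) & counter_mask;         /* one more event counted */
        }
        return 0;
}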
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
@@ -837,8 +850,9 @@ void perf_counters_lapic_init(int nmi)
 {
        u32 apic_val;
 
-       if (!perf_counters_initialized)
+       if (!x86_pmu_initialized())
                return;
+
        /*
         * Enable the performance counter vector in the APIC LVT:
         */
@@ -858,7 +872,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
 {
        struct die_args *args = __args;
        struct pt_regs *regs;
-       int ret;
+
+       if (!atomic_read(&active_counters))
+               return NOTIFY_DONE;
 
        switch (cmd) {
        case DIE_NMI:
@@ -872,9 +888,16 @@ perf_counter_nmi_handler(struct notifier_block *self,
        regs = args->regs;
 
        apic_write(APIC_LVTPC, APIC_DM_NMI);
-       ret = x86_pmu.handle_irq(regs, 1);
+       /*
+        * Can't rely on the handled return value to say it was our NMI, two
+        * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+        *
+        * If the first NMI handles both, the latter will be empty and daze
+        * the CPU.
+        */
+       x86_pmu.handle_irq(regs, 1);
 
-       return ret ? NOTIFY_STOP : NOTIFY_OK;
+       return NOTIFY_STOP;
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
@@ -886,8 +909,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 static struct x86_pmu intel_pmu = {
        .name                   = "Intel",
        .handle_irq             = intel_pmu_handle_irq,
-       .save_disable_all       = intel_pmu_save_disable_all,
-       .restore_all            = intel_pmu_restore_all,
+       .disable_all            = intel_pmu_disable_all,
+       .enable_all             = intel_pmu_enable_all,
        .enable                 = intel_pmu_enable_counter,
        .disable                = intel_pmu_disable_counter,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
@@ -895,13 +918,19 @@ static struct x86_pmu intel_pmu = {
        .event_map              = intel_pmu_event_map,
        .raw_event              = intel_pmu_raw_event,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
+       /*
+        * Intel PMCs cannot be accessed sanely above 32 bit width,
+        * so we install an artificial 1<<31 period regardless of
+        * the generic counter period:
+        */
+       .max_period             = (1ULL << 31) - 1,
 };
 
 static struct x86_pmu amd_pmu = {
        .name                   = "AMD",
        .handle_irq             = amd_pmu_handle_irq,
-       .save_disable_all       = amd_pmu_save_disable_all,
-       .restore_all            = amd_pmu_restore_all,
+       .disable_all            = amd_pmu_disable_all,
+       .enable_all             = amd_pmu_enable_all,
        .enable                 = amd_pmu_enable_counter,
        .disable                = amd_pmu_disable_counter,
        .eventsel               = MSR_K7_EVNTSEL0,
@@ -912,6 +941,8 @@ static struct x86_pmu amd_pmu = {
        .num_counters           = 4,
        .counter_bits           = 48,
        .counter_mask           = (1ULL << 48) - 1,
+       /* use highest bit to detect overflow */
+       .max_period             = (1ULL << 47) - 1,
 };
 
 static int intel_pmu_init(void)
@@ -940,10 +971,18 @@ static int intel_pmu_init(void)
        x86_pmu = intel_pmu;
        x86_pmu.version = version;
        x86_pmu.num_counters = eax.split.num_counters;
-       x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+
+       /*
+        * Quirk: v2 perfmon does not report fixed-purpose counters, so
+        * assume at least 3 counters:
+        */
+       x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
        x86_pmu.counter_bits = eax.split.bit_width;
        x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
+       rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
        return 0;
 }
 
@@ -984,6 +1023,7 @@ void __init init_hw_perf_counters(void)
        perf_max_counters = x86_pmu.num_counters;
 
        pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
+       pr_info("... max period:      %016Lx\n", x86_pmu.max_period);
 
        if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
                x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
@@ -996,9 +1036,8 @@ void __init init_hw_perf_counters(void)
                ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
        pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
-       perf_counters_initialized = true;
 
-       perf_counters_lapic_init(0);
+       perf_counters_lapic_init(1);
        register_die_notifier(&perf_counter_nmi_notifier);
 }
 
@@ -1011,6 +1050,7 @@ static const struct pmu pmu = {
        .enable         = x86_pmu_enable,
        .disable        = x86_pmu_disable,
        .read           = x86_pmu_read,
+       .unthrottle     = x86_pmu_unthrottle,
 };
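The new .unthrottle method is the x86 half of the commit's per-counter throttle: when a counter fires too many interrupts the core stops it, and on a later tick re-enables it through this hook, replacing the global PERFMON_MAX_INTERRUPTS logic removed above. The core-side code lives outside this file; the following is only a guessed, stand-alone model of that interaction, and every name in it apart from the unthrottle idea is invented:

#include <stdio.h>

struct counter;

struct pmu {
        void (*disable)(struct counter *c);
        void (*unthrottle)(struct counter *c);
};

struct counter {
        const struct pmu *pmu;
        int interrupts;                 /* interrupts seen since the last tick */
        int throttled;
};

#define MAX_INTERRUPTS_PER_TICK 3       /* invented limit */

/* overflow path: too many interrupts in one tick -> stop the counter */
static void account_interrupt(struct counter *c)
{
        if (++c->interrupts > MAX_INTERRUPTS_PER_TICK && !c->throttled) {
                c->throttled = 1;
                c->pmu->disable(c);
        }
}

/* timer tick: reset the budget and give a throttled counter another chance */
static void tick(struct counter *c)
{
        c->interrupts = 0;
        if (c->throttled) {
                c->throttled = 0;
                c->pmu->unthrottle(c);  /* -> x86_pmu_unthrottle() on x86 */
        }
}

static void op_disable(struct counter *c)    { printf("disable (throttled)\n"); }
static void op_unthrottle(struct counter *c) { printf("unthrottle (re-enable)\n"); }

static const struct pmu ops = { op_disable, op_unthrottle };

int main(void)
{
        struct counter c = { .pmu = &ops };
        int i;

        for (i = 0; i < 5; i++)
                account_interrupt(&c);  /* an interrupt storm trips the limit */
        tick(&c);                       /* the next tick unthrottles */
        return 0;
}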
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)