perf: Reduce perf_disable() usage

[mv-sheeva.git] / arch / x86 / kernel / cpu / perf_event.c
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c

index 5db5b7d65a180f6a7f0c2cb970d63e04129add2a..846070ce49c3db27346294a6ff10210b3586f999 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -220,6 +220,7 @@ struct x86_pmu {
                                                  struct perf_event *event);
         struct event_constraint *event_constraints;
         void            (*quirks)(void);
+       int             perfctr_second_write;
  
         int             (*cpu_prepare)(int cpu);
         void            (*cpu_starting)(int cpu);
@@ -295,10 +296,10 @@ x86_perf_event_update(struct perf_event *event)
          * count to the generic event atomically:
          */
  again:
-       prev_raw_count = atomic64_read(&hwc->prev_count);
+       prev_raw_count = local64_read(&hwc->prev_count);
         rdmsrl(hwc->event_base + idx, new_raw_count);
  
-       if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
                                         new_raw_count) != prev_raw_count)
                 goto again;
  
@@ -313,8 +314,8 @@ again:
         delta = (new_raw_count << shift) - (prev_raw_count << shift);
         delta >>= shift;
  
-       atomic64_add(delta, &event->count);
-       atomic64_sub(delta, &hwc->period_left);
+       local64_add(delta, &event->count);
+       local64_sub(delta, &hwc->period_left);
  
         return new_raw_count;
  }
@@ -438,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event)
         if (!hwc->sample_period) {
                 hwc->sample_period = x86_pmu.max_period;
                 hwc->last_period = hwc->sample_period;
-               atomic64_set(&hwc->period_left, hwc->sample_period);
+               local64_set(&hwc->period_left, hwc->sample_period);
         } else {
                 /*
                  * If we have a PMU initialized but no APIC
@@ -529,7 +530,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
  /*
   * Setup the hardware configuration for a given attr_type
   */
-static int __hw_perf_event_init(struct perf_event *event)
+static int __x86_pmu_event_init(struct perf_event *event)
  {
         int err;
  
@@ -617,7 +618,7 @@ static void x86_pmu_enable_all(int added)
         }
  }
  
-static const struct pmu pmu;
+static struct pmu pmu;
  
  static inline int is_x86_event(struct perf_event *event)
  {
@@ -885,7 +886,7 @@ static int
  x86_perf_event_set_period(struct perf_event *event)
  {
         struct hw_perf_event *hwc = &event->hw;
-       s64 left = atomic64_read(&hwc->period_left);
+       s64 left = local64_read(&hwc->period_left);
         s64 period = hwc->sample_period;
         int ret = 0, idx = hwc->idx;
  
@@ -897,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event)
          */
         if (unlikely(left <= -period)) {
                 left = period;
-               atomic64_set(&hwc->period_left, left);
+               local64_set(&hwc->period_left, left);
                 hwc->last_period = period;
                 ret = 1;
         }
  
         if (unlikely(left <= 0)) {
                 left += period;
-               atomic64_set(&hwc->period_left, left);
+               local64_set(&hwc->period_left, left);
                 hwc->last_period = period;
                 ret = 1;
         }
@@ -923,10 +924,19 @@ x86_perf_event_set_period(struct perf_event *event)
          * The hw event starts counting from this event offset,
          * mark it to be able to extra future deltas:
          */
-       atomic64_set(&hwc->prev_count, (u64)-left);
+       local64_set(&hwc->prev_count, (u64)-left);
  
-       wrmsrl(hwc->event_base + idx,
+       wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
+
+       /*
+        * Due to erratum on certain cpu we need
+        * a second write to be sure the register
+        * is updated properly
+        */
+       if (x86_pmu.perfctr_second_write) {
+               wrmsrl(hwc->event_base + idx,
                         (u64)(-left) & x86_pmu.cntval_mask);
+       }
  
         perf_event_update_userpage(event);
  
@@ -959,34 +969,38 @@ static int x86_pmu_enable(struct perf_event *event)
  
         hwc = &event->hw;
  
+       perf_disable();
         n0 = cpuc->n_events;
-       n = collect_events(cpuc, event, false);
-       if (n < 0)
-               return n;
+       ret = n = collect_events(cpuc, event, false);
+       if (ret < 0)
+               goto out;
  
         /*
          * If group events scheduling transaction was started,
          * skip the schedulability test here, it will be peformed
          * at commit time(->commit_txn) as a whole
          */
-       if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
-               goto out;
+       if (cpuc->group_flag & PERF_EVENT_TXN)
+               goto done_collect;
  
         ret = x86_pmu.schedule_events(cpuc, n, assign);
         if (ret)
-               return ret;
+               goto out;
         /*
          * copy new assignment, now we know it is possible
          * will be used by hw_perf_enable()
          */
         memcpy(cpuc->assign, assign, n*sizeof(int));
  
-out:
+done_collect:
         cpuc->n_events = n;
         cpuc->n_added += n - n0;
         cpuc->n_txn += n - n0;
  
-       return 0;
+       ret = 0;
+out:
+       perf_enable();
+       return ret;
  }
  
  static int x86_pmu_start(struct perf_event *event)
@@ -1096,7 +1110,7 @@ static void x86_pmu_disable(struct perf_event *event)
          * The events never got scheduled and ->cancel_txn will truncate
          * the event_list.
          */
-       if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+       if (cpuc->group_flag & PERF_EVENT_TXN)
                 return;
  
         x86_pmu_stop(event);
@@ -1144,7 +1158,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
                 /*
                  * event overflow
                  */
-               handled         = 1;
+               handled++;
                 data.period     = event->hw.last_period;
  
                 if (!x86_perf_event_set_period(event))
@@ -1190,12 +1204,20 @@ void perf_events_lapic_init(void)
         apic_write(APIC_LVTPC, APIC_DM_NMI);
  }
  
+struct pmu_nmi_state {
+       unsigned int    marked;
+       int             handled;
+};
+
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
+
  static int __kprobes
  perf_event_nmi_handler(struct notifier_block *self,
                          unsigned long cmd, void *__args)
  {
         struct die_args *args = __args;
-       struct pt_regs *regs;
+       unsigned int this_nmi;
+       int handled;
  
         if (!atomic_read(&active_events))
                 return NOTIFY_DONE;
@@ -1204,22 +1226,47 @@ perf_event_nmi_handler(struct notifier_block *self,
         case DIE_NMI:
         case DIE_NMI_IPI:
                 break;
-
+       case DIE_NMIUNKNOWN:
+               this_nmi = percpu_read(irq_stat.__nmi_count);
+               if (this_nmi != __get_cpu_var(pmu_nmi).marked)
+                       /* let the kernel handle the unknown nmi */
+                       return NOTIFY_DONE;
+               /*
+                * This one is a PMU back-to-back nmi. Two events
+                * trigger 'simultaneously' raising two back-to-back
+                * NMIs. If the first NMI handles both, the latter
+                * will be empty and daze the CPU. So, we drop it to
+                * avoid false-positive 'unknown nmi' messages.
+                */
+               return NOTIFY_STOP;
         default:
                 return NOTIFY_DONE;
         }
  
-       regs = args->regs;
-
         apic_write(APIC_LVTPC, APIC_DM_NMI);
-       /*
-        * Can't rely on the handled return value to say it was our NMI, two
-        * events could trigger 'simultaneously' raising two back-to-back NMIs.
-        *
-        * If the first NMI handles both, the latter will be empty and daze
-        * the CPU.
-        */
-       x86_pmu.handle_irq(regs);
+
+       handled = x86_pmu.handle_irq(args->regs);
+       if (!handled)
+               return NOTIFY_DONE;
+
+       this_nmi = percpu_read(irq_stat.__nmi_count);
+       if ((handled > 1) ||
+               /* the next nmi could be a back-to-back nmi */
+           ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
+            (__get_cpu_var(pmu_nmi).handled > 1))) {
+               /*
+                * We could have two subsequent back-to-back nmis: The
+                * first handles more than one counter, the 2nd
+                * handles only one counter and the 3rd handles no
+                * counter.
+                *
+                * This is the 2nd nmi because the previous was
+                * handling more than one counter. We will mark the
+                * next (3rd) and then drop it if unhandled.
+                */
+               __get_cpu_var(pmu_nmi).marked   = this_nmi + 1;
+               __get_cpu_var(pmu_nmi).handled  = handled;
+       }
  
         return NOTIFY_STOP;
  }
@@ -1371,6 +1418,7 @@ void __init init_hw_perf_events(void)
         pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
         pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
  
+       perf_pmu_register(&pmu);
         perf_cpu_notifier(x86_pmu_notifier);
  }
  
@@ -1384,11 +1432,12 @@ static inline void x86_pmu_read(struct perf_event *event)
   * Set the flag to make pmu::enable() not perform the
   * schedulability test, it will be performed at commit time
   */
-static void x86_pmu_start_txn(const struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
-       cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+       perf_disable();
+       cpuc->group_flag |= PERF_EVENT_TXN;
         cpuc->n_txn = 0;
  }
  
@@ -1397,16 +1446,17 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
   * Clear the flag and pmu::enable() will perform the
   * schedulability test.
   */
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
+static void x86_pmu_cancel_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
-       cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+       cpuc->group_flag &= ~PERF_EVENT_TXN;
         /*
          * Truncate the collected events.
          */
         cpuc->n_added -= cpuc->n_txn;
         cpuc->n_events -= cpuc->n_txn;
+       perf_enable();
  }
  
  /*
@@ -1414,7 +1464,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
   * Perform the group schedulability test as a whole
   * Return 0 if success
   */
-static int x86_pmu_commit_txn(const struct pmu *pmu)
+static int x86_pmu_commit_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int assign[X86_PMC_IDX_MAX];
@@ -1435,27 +1485,11 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
          */
         memcpy(cpuc->assign, assign, n*sizeof(int));
  
-       /*
-        * Clear out the txn count so that ->cancel_txn() which gets
-        * run after ->commit_txn() doesn't undo things.
-        */
-       cpuc->n_txn = 0;
-
+       cpuc->group_flag &= ~PERF_EVENT_TXN;
+       perf_enable();
         return 0;
  }
  
-static const struct pmu pmu = {
-       .enable         = x86_pmu_enable,
-       .disable        = x86_pmu_disable,
-       .start          = x86_pmu_start,
-       .stop           = x86_pmu_stop,
-       .read           = x86_pmu_read,
-       .unthrottle     = x86_pmu_unthrottle,
-       .start_txn      = x86_pmu_start_txn,
-       .cancel_txn     = x86_pmu_cancel_txn,
-       .commit_txn     = x86_pmu_commit_txn,
-};
-
  /*
   * validate that we can schedule this event
   */
@@ -1530,12 +1564,22 @@ out:
         return ret;
  }
  
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+int x86_pmu_event_init(struct perf_event *event)
  {
-       const struct pmu *tmp;
+       struct pmu *tmp;
         int err;
  
-       err = __hw_perf_event_init(event);
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
+       err = __x86_pmu_event_init(event);
         if (!err) {
                 /*
                  * we temporarily connect event to its pmu
@@ -1555,27 +1599,28 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         if (err) {
                 if (event->destroy)
                         event->destroy(event);
-               return ERR_PTR(err);
         }
  
-       return &pmu;
+       return err;
  }
  
+static struct pmu pmu = {
+       .event_init     = x86_pmu_event_init,
+       .enable         = x86_pmu_enable,
+       .disable        = x86_pmu_disable,
+       .start          = x86_pmu_start,
+       .stop           = x86_pmu_stop,
+       .read           = x86_pmu_read,
+       .unthrottle     = x86_pmu_unthrottle,
+       .start_txn      = x86_pmu_start_txn,
+       .cancel_txn     = x86_pmu_cancel_txn,
+       .commit_txn     = x86_pmu_commit_txn,
+};
+
  /*
   * callchain support
   */
  
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
-
-
  static void
  backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
  {
@@ -1596,7 +1641,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
  {
         struct perf_callchain_entry *entry = data;
  
-       callchain_store(entry, addr);
+       perf_callchain_store(entry, addr);
  }
  
  static const struct stacktrace_ops backtrace_ops = {
@@ -1607,13 +1652,15 @@ static const struct stacktrace_ops backtrace_ops = {
         .walk_stack             = print_context_stack_bp,
  };
  
-#include "../dumpstack.h"
-
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->ip);
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
+
+       perf_callchain_store(entry, regs->ip);
  
         dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
  }
@@ -1642,7 +1689,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
                 if (fp < compat_ptr(regs->sp))
                         break;
  
-               callchain_store(entry, frame.return_address);
+               perf_callchain_store(entry, frame.return_address);
                 fp = compat_ptr(frame.next_frame);
         }
         return 1;
@@ -1655,19 +1702,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
  }
  #endif
  
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
         struct stack_frame frame;
         const void __user *fp;
  
-       if (!user_mode(regs))
-               regs = task_pt_regs(current);
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
  
         fp = (void __user *)regs->bp;
  
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, regs->ip);
+       perf_callchain_store(entry, regs->ip);
  
         if (perf_callchain_user32(regs, entry))
                 return;
@@ -1684,68 +1732,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
                 if ((unsigned long)fp < regs->sp)
                         break;
  
-               callchain_store(entry, frame.return_address);
+               perf_callchain_store(entry, frame.return_address);
                 fp = frame.next_frame;
         }
  }
  
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       int is_user;
-
-       if (!regs)
-               return;
-
-       is_user = user_mode(regs);
-
-       if (is_user && current->state != TASK_RUNNING)
-               return;
-
-       if (!is_user)
-               perf_callchain_kernel(regs, entry);
-
-       if (current->mm)
-               perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest os callchain now */
-               return NULL;
-       }
-
-       if (in_nmi())
-               entry = &__get_cpu_var(pmc_nmi_entry);
-       else
-               entry = &__get_cpu_var(pmc_irq_entry);
-
-       entry->nr = 0;
-
-       perf_do_callchain(regs, entry);
-
-       return entry;
-}
-
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
-       regs->ip = ip;
-       /*
-        * perf_arch_fetch_caller_regs adds another call, we need to increment
-        * the skip level
-        */
-       regs->bp = rewind_frame_pointer(skip + 1);
-       regs->cs = __KERNEL_CS;
-       /*
-        * We abuse bit 3 to pass exact information, see perf_misc_flags
-        * and the comment with PERF_EFLAGS_EXACT.
-        */
-       regs->flags = 0;
-}
-
  unsigned long perf_instruction_pointer(struct pt_regs *regs)
  {
         unsigned long ip;