perf: Reduce perf_disable() usage
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f2da20fda02ddf6fcd449a88ba399fe4ed44af2a..846070ce49c3db27346294a6ff10210b3586f999 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -530,7 +530,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
 /*
  * Setup the hardware configuration for a given attr_type
  */
-static int __hw_perf_event_init(struct perf_event *event)
+static int __x86_pmu_event_init(struct perf_event *event)
 {
        int err;
 
@@ -618,7 +618,7 @@ static void x86_pmu_enable_all(int added)
        }
 }
 
-static const struct pmu pmu;
+static struct pmu pmu;
 
 static inline int is_x86_event(struct perf_event *event)
 {
@@ -969,10 +969,11 @@ static int x86_pmu_enable(struct perf_event *event)
 
        hwc = &event->hw;
 
+       perf_disable();
        n0 = cpuc->n_events;
-       n = collect_events(cpuc, event, false);
-       if (n < 0)
-               return n;
+       ret = n = collect_events(cpuc, event, false);
+       if (ret < 0)
+               goto out;
 
        /*
         * If group events scheduling transaction was started,
@@ -980,23 +981,26 @@ static int x86_pmu_enable(struct perf_event *event)
         * at commit time(->commit_txn) as a whole
         */
        if (cpuc->group_flag & PERF_EVENT_TXN)
-               goto out;
+               goto done_collect;
 
        ret = x86_pmu.schedule_events(cpuc, n, assign);
        if (ret)
-               return ret;
+               goto out;
        /*
         * copy new assignment, now we know it is possible
         * will be used by hw_perf_enable()
         */
        memcpy(cpuc->assign, assign, n*sizeof(int));
 
-out:
+done_collect:
        cpuc->n_events = n;
        cpuc->n_added += n - n0;
        cpuc->n_txn += n - n0;
 
-       return 0;
+       ret = 0;
+out:
+       perf_enable();
+       return ret;
 }
 
 static int x86_pmu_start(struct perf_event *event)
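
The rewritten add path above now brackets the whole operation in perf_disable()/perf_enable() and unwinds through the out/done_collect labels. The following stand-alone sketch of that control flow is illustrative only; the *_stub helpers, the counters, and pmu_add_event() are invented stand-ins, not kernel code:

#include <stdio.h>

/* Hypothetical stand-ins for the kernel state and helpers used above. */
static int n_events, n_added;
static int txn_in_progress;

static void perf_disable_stub(void) { /* would disable the whole PMU */ }
static void perf_enable_stub(void)  { /* would re-enable the PMU */ }

static int collect_events_stub(void)   { return n_events + 1; }
static int schedule_events_stub(int n) { (void)n; return 0; }

static int pmu_add_event(void)
{
        int n, n0, ret;

        perf_disable_stub();

        n0 = n_events;
        ret = n = collect_events_stub();
        if (ret < 0)
                goto out;          /* nothing collected, just re-enable */

        if (txn_in_progress)
                goto done_collect; /* schedulability test deferred to ->commit_txn() */

        ret = schedule_events_stub(n);
        if (ret)
                goto out;          /* scheduling failed, counts stay untouched */

done_collect:
        n_events = n;
        n_added += n - n0;
        ret = 0;
out:
        perf_enable_stub();
        return ret;
}

int main(void)
{
        printf("add returned %d, n_events = %d\n", pmu_add_event(), n_events);
        return 0;
}
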
@@ -1154,7 +1158,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
                /*
                 * event overflow
                 */
-               handled         = 1;
+               handled++;
                data.period     = event->hw.last_period;
 
                if (!x86_perf_event_set_period(event))
@@ -1200,12 +1204,20 @@ void perf_events_lapic_init(void)
        apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
+struct pmu_nmi_state {
+       unsigned int    marked;
+       int             handled;
+};
+
+static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
+
 static int __kprobes
 perf_event_nmi_handler(struct notifier_block *self,
                         unsigned long cmd, void *__args)
 {
        struct die_args *args = __args;
-       struct pt_regs *regs;
+       unsigned int this_nmi;
+       int handled;
 
        if (!atomic_read(&active_events))
                return NOTIFY_DONE;
@@ -1214,22 +1226,47 @@ perf_event_nmi_handler(struct notifier_block *self,
        case DIE_NMI:
        case DIE_NMI_IPI:
                break;
-
+       case DIE_NMIUNKNOWN:
+               this_nmi = percpu_read(irq_stat.__nmi_count);
+               if (this_nmi != __get_cpu_var(pmu_nmi).marked)
+                       /* let the kernel handle the unknown nmi */
+                       return NOTIFY_DONE;
+               /*
+                * This one is a PMU back-to-back nmi. Two events
+                * trigger 'simultaneously' raising two back-to-back
+                * NMIs. If the first NMI handles both, the latter
+                * will be empty and daze the CPU. So, we drop it to
+                * avoid false-positive 'unknown nmi' messages.
+                */
+               return NOTIFY_STOP;
        default:
                return NOTIFY_DONE;
        }
 
-       regs = args->regs;
-
        apic_write(APIC_LVTPC, APIC_DM_NMI);
-       /*
-        * Can't rely on the handled return value to say it was our NMI, two
-        * events could trigger 'simultaneously' raising two back-to-back NMIs.
-        *
-        * If the first NMI handles both, the latter will be empty and daze
-        * the CPU.
-        */
-       x86_pmu.handle_irq(regs);
+
+       handled = x86_pmu.handle_irq(args->regs);
+       if (!handled)
+               return NOTIFY_DONE;
+
+       this_nmi = percpu_read(irq_stat.__nmi_count);
+       if ((handled > 1) ||
+               /* the next nmi could be a back-to-back nmi */
+           ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
+            (__get_cpu_var(pmu_nmi).handled > 1))) {
+               /*
+                * We could have two subsequent back-to-back nmis: The
+                * first handles more than one counter, the 2nd
+                * handles only one counter and the 3rd handles no
+                * counter.
+                *
+                * This is the 2nd nmi because the previous was
+                * handling more than one counter. We will mark the
+                * next (3rd) and then drop it if unhandled.
+                */
+               __get_cpu_var(pmu_nmi).marked   = this_nmi + 1;
+               __get_cpu_var(pmu_nmi).handled  = handled;
+       }
 
        return NOTIFY_STOP;
 }
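
The marked/handled bookkeeping above is subtle, so here is a stand-alone model of the decision logic; nmi_count, note_handled() and swallow_unknown() are made-up names for illustration, mirroring the hunk above rather than reproducing it. It walks the three-NMI case from the comment: the first NMI handles two counters, the second handles one, and the third is empty and should be swallowed instead of being reported as an unknown NMI:

#include <stdio.h>

/* Hypothetical per-CPU state, shaped like struct pmu_nmi_state above. */
static unsigned int nmi_count;   /* stand-in for irq_stat.__nmi_count */
static unsigned int marked;
static int last_handled;

/* Called after the PMU NMI handler has handled 'handled' counters. */
static void note_handled(int handled)
{
        nmi_count++;
        if (handled > 1 || (marked == nmi_count && last_handled > 1)) {
                /* the next NMI may be an empty back-to-back one: mark it */
                marked = nmi_count + 1;
                last_handled = handled;
        }
}

/* Called for an unknown NMI: returns 1 if it should be dropped silently. */
static int swallow_unknown(void)
{
        nmi_count++;
        return nmi_count == marked;
}

int main(void)
{
        note_handled(2);                 /* 1st NMI handles two counters  */
        note_handled(1);                 /* 2nd (back-to-back) handles one */
        printf("3rd NMI dropped: %d\n",  /* 3rd is empty and gets swallowed */
               swallow_unknown());
        return 0;
}
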
@@ -1381,6 +1418,7 @@ void __init init_hw_perf_events(void)
        pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
        pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
+       perf_pmu_register(&pmu);
        perf_cpu_notifier(x86_pmu_notifier);
 }
 
@@ -1394,10 +1432,11 @@ static inline void x86_pmu_read(struct perf_event *event)
  * Set the flag to make pmu::enable() not perform the
  * schedulability test, it will be performed at commit time
  */
-static void x86_pmu_start_txn(const struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
+       perf_disable();
        cpuc->group_flag |= PERF_EVENT_TXN;
        cpuc->n_txn = 0;
 }
@@ -1407,7 +1446,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
  * Clear the flag and pmu::enable() will perform the
  * schedulability test.
  */
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
+static void x86_pmu_cancel_txn(struct pmu *pmu)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -1417,6 +1456,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
         */
        cpuc->n_added -= cpuc->n_txn;
        cpuc->n_events -= cpuc->n_txn;
+       perf_enable();
 }
 
 /*
@@ -1424,7 +1464,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
  * Perform the group schedulability test as a whole
  * Return 0 if success
  */
-static int x86_pmu_commit_txn(const struct pmu *pmu)
+static int x86_pmu_commit_txn(struct pmu *pmu)
 {
        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
        int assign[X86_PMC_IDX_MAX];
@@ -1446,22 +1486,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
        memcpy(cpuc->assign, assign, n*sizeof(int));
 
        cpuc->group_flag &= ~PERF_EVENT_TXN;
-
+       perf_enable();
        return 0;
 }
 
-static const struct pmu pmu = {
-       .enable         = x86_pmu_enable,
-       .disable        = x86_pmu_disable,
-       .start          = x86_pmu_start,
-       .stop           = x86_pmu_stop,
-       .read           = x86_pmu_read,
-       .unthrottle     = x86_pmu_unthrottle,
-       .start_txn      = x86_pmu_start_txn,
-       .cancel_txn     = x86_pmu_cancel_txn,
-       .commit_txn     = x86_pmu_commit_txn,
-};
-
 /*
  * validate that we can schedule this event
  */
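
Taken together, the transaction hooks above pair the perf_disable() in x86_pmu_start_txn() with a perf_enable() in either x86_pmu_commit_txn() or x86_pmu_cancel_txn(). The sketch below only models how a caller of an API shaped like this might add an event group transactionally; the names, the 4-counter limit and add_group() are invented for the example and are not the actual perf core:

#include <stdio.h>

/* Illustrative model of the transaction API shaped like the hooks above. */
static int disable_depth;
static int txn_flag;

static void perf_disable_stub(void) { disable_depth++; }
static void perf_enable_stub(void)  { disable_depth--; }

static void start_txn(void)  { perf_disable_stub(); txn_flag = 1; }
static void cancel_txn(void) { txn_flag = 0; perf_enable_stub(); }

static int commit_txn(int n)
{
        /* group-wide schedulability test: pretend only 4 counters exist */
        if (n > 4)
                return -1;      /* caller is expected to cancel_txn() */
        txn_flag = 0;
        perf_enable_stub();
        return 0;
}

/* How a caller might add a whole group atomically. */
static int add_group(int nr_events)
{
        start_txn();
        /* individual ->enable() calls would only collect events here,
         * deferring the schedulability test because txn_flag is set */
        if (commit_txn(nr_events)) {
                cancel_txn();
                return -1;
        }
        return 0;
}

int main(void)
{
        printf("group of 3: %d, group of 9: %d, disable depth: %d\n",
               add_group(3), add_group(9), disable_depth);
        return 0;
}
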
@@ -1536,12 +1564,22 @@ out:
        return ret;
 }
 
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+int x86_pmu_event_init(struct perf_event *event)
 {
-       const struct pmu *tmp;
+       struct pmu *tmp;
        int err;
 
-       err = __hw_perf_event_init(event);
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
+       err = __x86_pmu_event_init(event);
        if (!err) {
                /*
                 * we temporarily connect event to its pmu
@@ -1561,27 +1599,28 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
        if (err) {
                if (event->destroy)
                        event->destroy(event);
-               return ERR_PTR(err);
        }
 
-       return &pmu;
+       return err;
 }
 
+static struct pmu pmu = {
+       .event_init     = x86_pmu_event_init,
+       .enable         = x86_pmu_enable,
+       .disable        = x86_pmu_disable,
+       .start          = x86_pmu_start,
+       .stop           = x86_pmu_stop,
+       .read           = x86_pmu_read,
+       .unthrottle     = x86_pmu_unthrottle,
+       .start_txn      = x86_pmu_start_txn,
+       .cancel_txn     = x86_pmu_cancel_txn,
+       .commit_txn     = x86_pmu_commit_txn,
+};
+
 /*
  * callchain support
  */
 
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
-
-
 static void
 backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
 {
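
With perf_pmu_register() and the ->event_init() hook introduced above, the x86 PMU claims only RAW/HARDWARE/HW_CACHE events and returns -ENOENT for everything else, letting a generic core offer the event to other PMUs. The toy dispatcher below illustrates that contract; struct fake_pmu, the type enum and init_event() are invented for the example and do not reflect the real perf core:

#include <errno.h>
#include <stdio.h>

/* Minimal model of the ->event_init() contract shown above. */
struct fake_pmu {
        const char *name;
        int (*event_init)(int type);
};

enum { TYPE_HARDWARE, TYPE_SOFTWARE, TYPE_TRACEPOINT };

static int hw_event_init(int type)
{
        /* like x86_pmu_event_init(): only claim the types we implement */
        return type == TYPE_HARDWARE ? 0 : -ENOENT;
}

static int sw_event_init(int type)
{
        return type == TYPE_SOFTWARE ? 0 : -ENOENT;
}

static struct fake_pmu pmus[] = {
        { "hw", hw_event_init },
        { "sw", sw_event_init },
};

/* Walk the registered PMUs until one accepts the event. */
static const char *init_event(int type)
{
        for (unsigned int i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++)
                if (pmus[i].event_init(type) == 0)
                        return pmus[i].name;
        return "none";
}

int main(void)
{
        printf("HARDWARE -> %s, TRACEPOINT -> %s\n",
               init_event(TYPE_HARDWARE), init_event(TYPE_TRACEPOINT));
        return 0;
}
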
@@ -1602,7 +1641,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
 {
        struct perf_callchain_entry *entry = data;
 
-       callchain_store(entry, addr);
+       perf_callchain_store(entry, addr);
 }
 
 static const struct stacktrace_ops backtrace_ops = {
@@ -1613,11 +1652,15 @@ static const struct stacktrace_ops backtrace_ops = {
        .walk_stack             = print_context_stack_bp,
 };
 
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->ip);
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
+
+       perf_callchain_store(entry, regs->ip);
 
        dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
 }
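
The open-coded callchain_store() helper removed above did nothing more than a bounds check and an append, and perf_callchain_store() from the generic code plays the same role here. A self-contained equivalent for illustration; the struct layout and MAX_STACK_DEPTH value are placeholders, not the kernel's definitions:

#include <stdio.h>

#define MAX_STACK_DEPTH 8   /* illustrative; the kernel uses PERF_MAX_STACK_DEPTH */

struct callchain_entry {
        unsigned int       nr;
        unsigned long long ip[MAX_STACK_DEPTH];
};

/* Same behaviour as the removed callchain_store(): bounds-check and append. */
static void callchain_store(struct callchain_entry *entry, unsigned long long ip)
{
        if (entry->nr < MAX_STACK_DEPTH)
                entry->ip[entry->nr++] = ip;
}

int main(void)
{
        struct callchain_entry entry = { 0 };

        callchain_store(&entry, 0xffffffff81000000ULL);  /* e.g. a kernel ip */
        callchain_store(&entry, 0x0000000000400123ULL);  /* e.g. a user ip */
        printf("stored %u frames, top ip %#llx\n", entry.nr, entry.ip[0]);
        return 0;
}
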
@@ -1646,7 +1689,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
                if (fp < compat_ptr(regs->sp))
                        break;
 
-               callchain_store(entry, frame.return_address);
+               perf_callchain_store(entry, frame.return_address);
                fp = compat_ptr(frame.next_frame);
        }
        return 1;
@@ -1659,19 +1702,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 }
 #endif
 
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
        struct stack_frame frame;
        const void __user *fp;
 
-       if (!user_mode(regs))
-               regs = task_pt_regs(current);
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
 
        fp = (void __user *)regs->bp;
 
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, regs->ip);
+       perf_callchain_store(entry, regs->ip);
 
        if (perf_callchain_user32(regs, entry))
                return;
@@ -1688,52 +1732,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
                if ((unsigned long)fp < regs->sp)
                        break;
 
-               callchain_store(entry, frame.return_address);
+               perf_callchain_store(entry, frame.return_address);
                fp = frame.next_frame;
        }
 }
 
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       int is_user;
-
-       if (!regs)
-               return;
-
-       is_user = user_mode(regs);
-
-       if (is_user && current->state != TASK_RUNNING)
-               return;
-
-       if (!is_user)
-               perf_callchain_kernel(regs, entry);
-
-       if (current->mm)
-               perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest os callchain now */
-               return NULL;
-       }
-
-       if (in_nmi())
-               entry = &__get_cpu_var(pmc_nmi_entry);
-       else
-               entry = &__get_cpu_var(pmc_irq_entry);
-
-       entry->nr = 0;
-
-       perf_do_callchain(regs, entry);
-
-       return entry;
-}
-
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
        unsigned long ip;