perf: Rework the PMU methods
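For orientation, here is a minimal sketch of what a PMU backend looks like against the reworked interface this patch introduces (event_init/add/del/start/stop/read, plus the PERF_EF_*/PERF_HES_* flags used throughout the hunks below). The my_pmu_* names are purely illustrative and not part of the patch; the in-tree reference implementations are perf_swevent, perf_cpu_clock, perf_task_clock and perf_tracepoint further down in this diff.

#include <linux/perf_event.h>

/*
 * Illustrative skeleton only (all my_pmu_* names are hypothetical).
 * The new method set replaces the old ->enable/->disable/->unthrottle
 * callbacks: add/del schedule the event on/off the PMU, start/stop
 * merely start or stop counting, and event_init returns -ENOENT to
 * decline events that belong to a different PMU.
 */
static void my_pmu_start(struct perf_event *event, int flags)
{
	/* PERF_EF_RELOAD asks us to re-program the counter from period_left. */
	event->hw.state = 0;
}

static void my_pmu_stop(struct perf_event *event, int flags)
{
	/* PERF_EF_UPDATE asks us to fold the hardware count into event->count. */
	event->hw.state = PERF_HES_STOPPED;
}

static int my_pmu_add(struct perf_event *event, int flags)
{
	/* Schedule the event onto the PMU; only start counting if asked to. */
	event->hw.state = PERF_HES_STOPPED;
	if (flags & PERF_EF_START)
		my_pmu_start(event, PERF_EF_RELOAD);
	return 0;
}

static void my_pmu_del(struct perf_event *event, int flags)
{
	my_pmu_stop(event, PERF_EF_UPDATE);
}

static void my_pmu_read(struct perf_event *event)
{
}

static int my_pmu_event_init(struct perf_event *event)
{
	/* Hypothetical: this PMU only claims raw events. */
	if (event->attr.type != PERF_TYPE_RAW)
		return -ENOENT;	/* not ours, let another PMU claim it */
	return 0;
}

static struct pmu my_pmu = {
	.event_init	= my_pmu_event_init,
	.add		= my_pmu_add,
	.del		= my_pmu_del,
	.start		= my_pmu_start,
	.stop		= my_pmu_stop,
	.read		= my_pmu_read,
	/*
	 * start_txn/commit_txn/cancel_txn and pmu_enable/pmu_disable are
	 * optional; perf_pmu_register() installs no-op stubs (or stubs built
	 * on pmu_disable/pmu_enable) when they are absent.
	 */
};

A backend registers itself once at boot with perf_pmu_register(&my_pmu), exactly as perf_event_init() does for the built-in software PMUs at the end of this patch; perf_init_event() then walks the registered PMUs and hands each event to the first event_init() that does not return -ENOENT.
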
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fb46fd13f31fb75fd6eca190a51b6a80646770ec..3bace4fd035519908ea25ea1e387e6058096ac83 100644
@@ -31,7 +31,6 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
-#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
@@ -72,31 +71,20 @@ static atomic64_t perf_event_id;
  */
 static DEFINE_SPINLOCK(perf_resource_lock);
 
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak struct pmu *hw_perf_event_init(struct perf_event *event)
-{
-       return NULL;
-}
-
-void __weak hw_perf_disable(void)              { barrier(); }
-void __weak hw_perf_enable(void)               { barrier(); }
-
 void __weak perf_event_print_debug(void)       { }
 
-static DEFINE_PER_CPU(int, perf_disable_count);
-
-void perf_disable(void)
+void perf_pmu_disable(struct pmu *pmu)
 {
-       if (!__get_cpu_var(perf_disable_count)++)
-               hw_perf_disable();
+       int *count = this_cpu_ptr(pmu->pmu_disable_count);
+       if (!(*count)++)
+               pmu->pmu_disable(pmu);
 }
 
-void perf_enable(void)
+void perf_pmu_enable(struct pmu *pmu)
 {
-       if (!--__get_cpu_var(perf_disable_count))
-               hw_perf_enable();
+       int *count = this_cpu_ptr(pmu->pmu_disable_count);
+       if (!--(*count))
+               pmu->pmu_enable(pmu);
 }
 
 static void get_ctx(struct perf_event_context *ctx)
@@ -156,7 +144,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
        struct perf_event_context *ctx;
 
        rcu_read_lock();
- retry:
+retry:
        ctx = rcu_dereference(task->perf_event_ctxp);
        if (ctx) {
                /*
@@ -436,7 +424,7 @@ event_sched_out(struct perf_event *event,
                event->state = PERF_EVENT_STATE_OFF;
        }
        event->tstamp_stopped = ctx->time;
-       event->pmu->disable(event);
+       event->pmu->del(event, 0);
        event->oncpu = -1;
 
        if (!is_software_event(event))
@@ -487,11 +475,6 @@ static void __perf_event_remove_from_context(void *info)
                return;
 
        raw_spin_lock(&ctx->lock);
-       /*
-        * Protect the list operation against NMI by disabling the
-        * events on a global level.
-        */
-       perf_disable();
 
        event_sched_out(event, cpuctx, ctx);
 
@@ -507,7 +490,6 @@ static void __perf_event_remove_from_context(void *info)
                            perf_max_events - perf_reserved_percpu);
        }
 
-       perf_enable();
        raw_spin_unlock(&ctx->lock);
 }
 
@@ -628,7 +610,7 @@ void perf_event_disable(struct perf_event *event)
                return;
        }
 
- retry:
+retry:
        task_oncpu_function_call(task, __perf_event_disable, event);
 
        raw_spin_lock_irq(&ctx->lock);
@@ -667,7 +649,7 @@ event_sched_in(struct perf_event *event,
         */
        smp_wmb();
 
-       if (event->pmu->enable(event)) {
+       if (event->pmu->add(event, PERF_EF_START)) {
                event->state = PERF_EVENT_STATE_INACTIVE;
                event->oncpu = -1;
                return -EAGAIN;
@@ -692,21 +674,14 @@ group_sched_in(struct perf_event *group_event,
 {
        struct perf_event *event, *partial_group = NULL;
        struct pmu *pmu = group_event->pmu;
-       bool txn = false;
 
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
 
-       /* Check if group transaction availabe */
-       if (pmu->start_txn)
-               txn = true;
-
-       if (txn)
-               pmu->start_txn(pmu);
+       pmu->start_txn(pmu);
 
        if (event_sched_in(group_event, cpuctx, ctx)) {
-               if (txn)
-                       pmu->cancel_txn(pmu);
+               pmu->cancel_txn(pmu);
                return -EAGAIN;
        }
 
@@ -720,7 +695,7 @@ group_sched_in(struct perf_event *group_event,
                }
        }
 
-       if (!txn || !pmu->commit_txn(pmu))
+       if (!pmu->commit_txn(pmu))
                return 0;
 
 group_error:
@@ -735,8 +710,7 @@ group_error:
        }
        event_sched_out(group_event, cpuctx, ctx);
 
-       if (txn)
-               pmu->cancel_txn(pmu);
+       pmu->cancel_txn(pmu);
 
        return -EAGAIN;
 }
@@ -812,12 +786,6 @@ static void __perf_install_in_context(void *info)
        ctx->is_active = 1;
        update_context_time(ctx);
 
-       /*
-        * Protect the list operation against NMI by disabling the
-        * events on a global level. NOP for non NMI based events.
-        */
-       perf_disable();
-
        add_event_to_ctx(event, ctx);
 
        if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -858,9 +826,7 @@ static void __perf_install_in_context(void *info)
        if (!err && !ctx->task && cpuctx->max_pertask)
                cpuctx->max_pertask--;
 
- unlock:
-       perf_enable();
-
+unlock:
        raw_spin_unlock(&ctx->lock);
 }
 
@@ -931,10 +897,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 
        event->state = PERF_EVENT_STATE_INACTIVE;
        event->tstamp_enabled = ctx->time - event->total_time_enabled;
-       list_for_each_entry(sub, &event->sibling_list, group_entry)
-               if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+       list_for_each_entry(sub, &event->sibling_list, group_entry) {
+               if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
                        sub->tstamp_enabled =
                                ctx->time - sub->total_time_enabled;
+               }
+       }
 }
 
 /*
@@ -979,12 +947,10 @@ static void __perf_event_enable(void *info)
        if (!group_can_go_on(event, cpuctx, 1)) {
                err = -EEXIST;
        } else {
-               perf_disable();
                if (event == leader)
                        err = group_sched_in(event, cpuctx, ctx);
                else
                        err = event_sched_in(event, cpuctx, ctx);
-               perf_enable();
        }
 
        if (err) {
@@ -1000,7 +966,7 @@ static void __perf_event_enable(void *info)
                }
        }
 
- unlock:
+unlock:
        raw_spin_unlock(&ctx->lock);
 }
 
@@ -1041,7 +1007,7 @@ void perf_event_enable(struct perf_event *event)
        if (event->state == PERF_EVENT_STATE_ERROR)
                event->state = PERF_EVENT_STATE_OFF;
 
- retry:
+retry:
        raw_spin_unlock_irq(&ctx->lock);
        task_oncpu_function_call(task, __perf_event_enable, event);
 
@@ -1061,7 +1027,7 @@ void perf_event_enable(struct perf_event *event)
        if (event->state == PERF_EVENT_STATE_OFF)
                __perf_event_mark_enabled(event, ctx);
 
- out:
+out:
        raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -1097,21 +1063,19 @@ static void ctx_sched_out(struct perf_event_context *ctx,
                goto out;
        update_context_time(ctx);
 
-       perf_disable();
        if (!ctx->nr_active)
-               goto out_enable;
+               goto out;
 
-       if (event_type & EVENT_PINNED)
+       if (event_type & EVENT_PINNED) {
                list_for_each_entry(event, &ctx->pinned_groups, group_entry)
                        group_sched_out(event, cpuctx, ctx);
+       }
 
-       if (event_type & EVENT_FLEXIBLE)
+       if (event_type & EVENT_FLEXIBLE) {
                list_for_each_entry(event, &ctx->flexible_groups, group_entry)
                        group_sched_out(event, cpuctx, ctx);
-
- out_enable:
-       perf_enable();
- out:
+       }
+out:
        raw_spin_unlock(&ctx->lock);
 }
 
@@ -1350,9 +1314,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
                if (event->cpu != -1 && event->cpu != smp_processor_id())
                        continue;
 
-               if (group_can_go_on(event, cpuctx, can_add_hw))
+               if (group_can_go_on(event, cpuctx, can_add_hw)) {
                        if (group_sched_in(event, cpuctx, ctx))
                                can_add_hw = 0;
+               }
        }
 }
 
@@ -1368,8 +1333,6 @@ ctx_sched_in(struct perf_event_context *ctx,
 
        ctx->timestamp = perf_clock();
 
-       perf_disable();
-
        /*
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
@@ -1381,8 +1344,7 @@ ctx_sched_in(struct perf_event_context *ctx,
        if (event_type & EVENT_FLEXIBLE)
                ctx_flexible_sched_in(ctx, cpuctx);
 
-       perf_enable();
- out:
+out:
        raw_spin_unlock(&ctx->lock);
 }
 
@@ -1429,8 +1391,6 @@ void perf_event_task_sched_in(struct task_struct *task)
        if (cpuctx->task_ctx == ctx)
                return;
 
-       perf_disable();
-
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
@@ -1443,8 +1403,6 @@ void perf_event_task_sched_in(struct task_struct *task)
        ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 
        cpuctx->task_ctx = ctx;
-
-       perf_enable();
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -1524,22 +1482,6 @@ do {                                     \
        return div64_u64(dividend, divisor);
 }
 
-static void perf_event_stop(struct perf_event *event)
-{
-       if (!event->pmu->stop)
-               return event->pmu->disable(event);
-
-       return event->pmu->stop(event);
-}
-
-static int perf_event_start(struct perf_event *event)
-{
-       if (!event->pmu->start)
-               return event->pmu->enable(event);
-
-       return event->pmu->start(event);
-}
-
 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
        struct hw_perf_event *hwc = &event->hw;
@@ -1559,11 +1501,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
        hwc->sample_period = sample_period;
 
        if (local64_read(&hwc->period_left) > 8*sample_period) {
-               perf_disable();
-               perf_event_stop(event);
+               event->pmu->stop(event, PERF_EF_UPDATE);
                local64_set(&hwc->period_left, 0);
-               perf_event_start(event);
-               perf_enable();
+               event->pmu->start(event, PERF_EF_RELOAD);
        }
 }
 
@@ -1592,15 +1532,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
                 */
                if (interrupts == MAX_INTERRUPTS) {
                        perf_log_throttle(event, 1);
-                       perf_disable();
-                       event->pmu->unthrottle(event);
-                       perf_enable();
+                       event->pmu->start(event, 0);
                }
 
                if (!event->attr.freq || !event->attr.sample_freq)
                        continue;
 
-               perf_disable();
                event->pmu->read(event);
                now = local64_read(&event->count);
                delta = now - hwc->freq_count_stamp;
@@ -1608,7 +1545,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 
                if (delta > 0)
                        perf_adjust_period(event, TICK_NSEC, delta);
-               perf_enable();
        }
        raw_spin_unlock(&ctx->lock);
 }
@@ -1651,7 +1587,6 @@ void perf_event_task_tick(struct task_struct *curr)
        if (!rotate)
                return;
 
-       perf_disable();
        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        if (ctx)
                task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1663,7 +1598,6 @@ void perf_event_task_tick(struct task_struct *curr)
        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
        if (ctx)
                task_ctx_sched_in(curr, EVENT_FLEXIBLE);
-       perf_enable();
 }
 
 static int event_enable_on_exec(struct perf_event *event,
@@ -1723,7 +1657,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
        raw_spin_unlock(&ctx->lock);
 
        perf_event_task_sched_in(task);
- out:
+out:
        local_irq_restore(flags);
 }
 
@@ -2062,7 +1996,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto errout;
 
- retry:
+retry:
        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                unclone_ctx(ctx);
@@ -2090,7 +2024,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
        put_task_struct(task);
        return ctx;
 
- errout:
+errout:
        put_task_struct(task);
        return ERR_PTR(err);
 }
@@ -2556,6 +2490,9 @@ int perf_event_task_disable(void)
 
 static int perf_event_index(struct perf_event *event)
 {
+       if (event->hw.state & PERF_HES_STOPPED)
+               return 0;
+
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;
 
@@ -3273,7 +3210,7 @@ again:
        if (handle->wakeup != local_read(&buffer->wakeup))
                perf_output_wakeup(handle);
 
- out:
+out:
        preempt_enable();
 }
 
@@ -4170,8 +4107,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
        struct hw_perf_event *hwc = &event->hw;
        int ret = 0;
 
-       throttle = (throttle && event->pmu->unthrottle != NULL);
-
        if (!throttle) {
                hwc->interrupts++;
        } else {
@@ -4296,7 +4231,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
        }
 }
 
-static void perf_swevent_add(struct perf_event *event, u64 nr,
+static void perf_swevent_event(struct perf_event *event, u64 nr,
                               int nmi, struct perf_sample_data *data,
                               struct pt_regs *regs)
 {
@@ -4322,6 +4257,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 static int perf_exclude_event(struct perf_event *event,
                              struct pt_regs *regs)
 {
+       if (event->hw.state & PERF_HES_STOPPED)
+               return 0;
+
        if (regs) {
                if (event->attr.exclude_user && user_mode(regs))
                        return 1;
@@ -4421,7 +4359,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 
        hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                if (perf_swevent_match(event, type, event_id, data, regs))
-                       perf_swevent_add(event, nr, nmi, data, regs);
+                       perf_swevent_event(event, nr, nmi, data, regs);
        }
 end:
        rcu_read_unlock();
@@ -4465,7 +4403,7 @@ static void perf_swevent_read(struct perf_event *event)
 {
 }
 
-static int perf_swevent_enable(struct perf_event *event)
+static int perf_swevent_add(struct perf_event *event, int flags)
 {
        struct hw_perf_event *hwc = &event->hw;
        struct perf_cpu_context *cpuctx;
@@ -4478,6 +4416,8 @@ static int perf_swevent_enable(struct perf_event *event)
                perf_swevent_set_period(event);
        }
 
+       hwc->state = !(flags & PERF_EF_START);
+
        head = find_swevent_head(cpuctx, event);
        if (WARN_ON_ONCE(!head))
                return -EINVAL;
@@ -4487,196 +4427,21 @@ static int perf_swevent_enable(struct perf_event *event)
        return 0;
 }
 
-static void perf_swevent_disable(struct perf_event *event)
+static void perf_swevent_del(struct perf_event *event, int flags)
 {
        hlist_del_rcu(&event->hlist_entry);
 }
 
-static void perf_swevent_void(struct perf_event *event)
-{
-}
-
-static int perf_swevent_int(struct perf_event *event)
-{
-       return 0;
-}
-
-static struct pmu perf_ops_generic = {
-       .enable         = perf_swevent_enable,
-       .disable        = perf_swevent_disable,
-       .start          = perf_swevent_int,
-       .stop           = perf_swevent_void,
-       .read           = perf_swevent_read,
-       .unthrottle     = perf_swevent_void, /* hwc->interrupts already reset */
-};
-
-/*
- * hrtimer based swevent callback
- */
-
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
-{
-       enum hrtimer_restart ret = HRTIMER_RESTART;
-       struct perf_sample_data data;
-       struct pt_regs *regs;
-       struct perf_event *event;
-       u64 period;
-
-       event = container_of(hrtimer, struct perf_event, hw.hrtimer);
-       event->pmu->read(event);
-
-       perf_sample_data_init(&data, 0);
-       data.period = event->hw.last_period;
-       regs = get_irq_regs();
-
-       if (regs && !perf_exclude_event(event, regs)) {
-               if (!(event->attr.exclude_idle && current->pid == 0))
-                       if (perf_event_overflow(event, 0, &data, regs))
-                               ret = HRTIMER_NORESTART;
-       }
-
-       period = max_t(u64, 10000, event->hw.sample_period);
-       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
-       return ret;
-}
-
-static void perf_swevent_start_hrtimer(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swevent_hrtimer;
-       if (hwc->sample_period) {
-               u64 period;
-
-               if (hwc->remaining) {
-                       if (hwc->remaining < 0)
-                               period = 10000;
-                       else
-                               period = hwc->remaining;
-                       hwc->remaining = 0;
-               } else {
-                       period = max_t(u64, 10000, hwc->sample_period);
-               }
-               __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL, 0);
-       }
-}
-
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->sample_period) {
-               ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
-               hwc->remaining = ktime_to_ns(remaining);
-
-               hrtimer_cancel(&hwc->hrtimer);
-       }
-}
-
-/*
- * Software event: cpu wall time clock
- */
-
-static void cpu_clock_perf_event_update(struct perf_event *event)
-{
-       int cpu = raw_smp_processor_id();
-       s64 prev;
-       u64 now;
-
-       now = cpu_clock(cpu);
-       prev = local64_xchg(&event->hw.prev_count, now);
-       local64_add(now - prev, &event->count);
-}
-
-static int cpu_clock_perf_event_enable(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int cpu = raw_smp_processor_id();
-
-       local64_set(&hwc->prev_count, cpu_clock(cpu));
-       perf_swevent_start_hrtimer(event);
-
-       return 0;
-}
-
-static void cpu_clock_perf_event_disable(struct perf_event *event)
-{
-       perf_swevent_cancel_hrtimer(event);
-       cpu_clock_perf_event_update(event);
-}
-
-static void cpu_clock_perf_event_read(struct perf_event *event)
-{
-       cpu_clock_perf_event_update(event);
-}
-
-static struct pmu perf_ops_cpu_clock = {
-       .enable         = cpu_clock_perf_event_enable,
-       .disable        = cpu_clock_perf_event_disable,
-       .read           = cpu_clock_perf_event_read,
-};
-
-/*
- * Software event: task time clock
- */
-
-static void task_clock_perf_event_update(struct perf_event *event, u64 now)
-{
-       u64 prev;
-       s64 delta;
-
-       prev = local64_xchg(&event->hw.prev_count, now);
-       delta = now - prev;
-       local64_add(delta, &event->count);
-}
-
-static int task_clock_perf_event_enable(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 now;
-
-       now = event->ctx->time;
-
-       local64_set(&hwc->prev_count, now);
-
-       perf_swevent_start_hrtimer(event);
-
-       return 0;
-}
-
-static void task_clock_perf_event_disable(struct perf_event *event)
+static void perf_swevent_start(struct perf_event *event, int flags)
 {
-       perf_swevent_cancel_hrtimer(event);
-       task_clock_perf_event_update(event, event->ctx->time);
-
+       event->hw.state = 0;
 }
 
-static void task_clock_perf_event_read(struct perf_event *event)
+static void perf_swevent_stop(struct perf_event *event, int flags)
 {
-       u64 time;
-
-       if (!in_nmi()) {
-               update_context_time(event->ctx);
-               time = event->ctx->time;
-       } else {
-               u64 now = perf_clock();
-               u64 delta = now - event->ctx->timestamp;
-               time = event->ctx->time + delta;
-       }
-
-       task_clock_perf_event_update(event, time);
+       event->hw.state = PERF_HES_STOPPED;
 }
 
-static struct pmu perf_ops_task_clock = {
-       .enable         = task_clock_perf_event_enable,
-       .disable        = task_clock_perf_event_disable,
-       .read           = task_clock_perf_event_read,
-};
-
 /* Deref the hlist from the update side */
 static inline struct swevent_hlist *
 swevent_hlist_deref(struct perf_cpu_context *cpuctx)
@@ -4747,7 +4512,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
                rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
        }
        cpuctx->hlist_refcount++;
- exit:
+exit:
        mutex_unlock(&cpuctx->hlist_mutex);
 
        return err;
@@ -4772,7 +4537,7 @@ static int swevent_hlist_get(struct perf_event *event)
        put_online_cpus();
 
        return 0;
- fail:
+fail:
        for_each_possible_cpu(cpu) {
                if (cpu == failed_cpu)
                        break;
@@ -4783,17 +4548,62 @@ static int swevent_hlist_get(struct perf_event *event)
        return err;
 }
 
-#ifdef CONFIG_EVENT_TRACING
+atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+       u64 event_id = event->attr.config;
+
+       WARN_ON(event->parent);
+
+       atomic_dec(&perf_swevent_enabled[event_id]);
+       swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+       int event_id = event->attr.config;
+
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       switch (event_id) {
+       case PERF_COUNT_SW_CPU_CLOCK:
+       case PERF_COUNT_SW_TASK_CLOCK:
+               return -ENOENT;
+
+       default:
+               break;
+       }
+
+       if (event_id >= PERF_COUNT_SW_MAX)
+               return -ENOENT;
 
-static struct pmu perf_ops_tracepoint = {
-       .enable         = perf_trace_enable,
-       .disable        = perf_trace_disable,
-       .start          = perf_swevent_int,
-       .stop           = perf_swevent_void,
+       if (!event->parent) {
+               int err;
+
+               err = swevent_hlist_get(event);
+               if (err)
+                       return err;
+
+               atomic_inc(&perf_swevent_enabled[event_id]);
+               event->destroy = sw_perf_event_destroy;
+       }
+
+       return 0;
+}
+
+static struct pmu perf_swevent = {
+       .event_init     = perf_swevent_init,
+       .add            = perf_swevent_add,
+       .del            = perf_swevent_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
-       .unthrottle     = perf_swevent_void,
 };
 
+#ifdef CONFIG_EVENT_TRACING
+
 static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
 {
@@ -4837,7 +4647,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 
        hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                if (perf_tp_event_match(event, &data, regs))
-                       perf_swevent_add(event, count, 1, &data, regs);
+                       perf_swevent_event(event, count, 1, &data, regs);
        }
 
        perf_swevent_put_recursion_context(rctx);
@@ -4849,10 +4659,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
        perf_trace_destroy(event);
 }
 
-static struct pmu *tp_perf_event_init(struct perf_event *event)
+static int perf_tp_event_init(struct perf_event *event)
 {
        int err;
 
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -ENOENT;
+
        /*
         * Raw tracepoint data is a severe data leak, only allow root to
         * have these.
@@ -4860,15 +4673,29 @@ static struct pmu *tp_perf_event_init(struct perf_event *event)
        if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
                        perf_paranoid_tracepoint_raw() &&
                        !capable(CAP_SYS_ADMIN))
-               return ERR_PTR(-EPERM);
+               return -EPERM;
 
        err = perf_trace_init(event);
        if (err)
-               return NULL;
+               return err;
 
        event->destroy = tp_perf_event_destroy;
 
-       return &perf_ops_tracepoint;
+       return 0;
+}
+
+static struct pmu perf_tracepoint = {
+       .event_init     = perf_tp_event_init,
+       .add            = perf_trace_add,
+       .del            = perf_trace_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
+       .read           = perf_swevent_read,
+};
+
+static inline void perf_tp_register(void)
+{
+       perf_pmu_register(&perf_tracepoint);
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4896,9 +4723,8 @@ static void perf_event_free_filter(struct perf_event *event)
 
 #else
 
-static struct pmu *tp_perf_event_init(struct perf_event *event)
+static inline void perf_tp_register(void)
 {
-       return NULL;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4913,105 +4739,319 @@ static void perf_event_free_filter(struct perf_event *event)
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-static void bp_perf_event_destroy(struct perf_event *event)
+void perf_bp_event(struct perf_event *bp, void *data)
 {
-       release_bp_slot(event);
+       struct perf_sample_data sample;
+       struct pt_regs *regs = data;
+
+       perf_sample_data_init(&sample, bp->attr.bp_addr);
+
+       if (!bp->hw.state && !perf_exclude_event(bp, regs))
+               perf_swevent_event(bp, 1, 1, &sample, regs);
 }
+#endif
 
-static struct pmu *bp_perf_event_init(struct perf_event *bp)
+/*
+ * hrtimer based swevent callback
+ */
+
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 {
-       int err;
+       enum hrtimer_restart ret = HRTIMER_RESTART;
+       struct perf_sample_data data;
+       struct pt_regs *regs;
+       struct perf_event *event;
+       u64 period;
 
-       err = register_perf_hw_breakpoint(bp);
-       if (err)
-               return ERR_PTR(err);
+       event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+       event->pmu->read(event);
 
-       bp->destroy = bp_perf_event_destroy;
+       perf_sample_data_init(&data, 0);
+       data.period = event->hw.last_period;
+       regs = get_irq_regs();
 
-       return &perf_ops_bp;
+       if (regs && !perf_exclude_event(event, regs)) {
+               if (!(event->attr.exclude_idle && current->pid == 0))
+                       if (perf_event_overflow(event, 0, &data, regs))
+                               ret = HRTIMER_NORESTART;
+       }
+
+       period = max_t(u64, 10000, event->hw.sample_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+       return ret;
 }
 
-void perf_bp_event(struct perf_event *bp, void *data)
+static void perf_swevent_start_hrtimer(struct perf_event *event)
 {
-       struct perf_sample_data sample;
-       struct pt_regs *regs = data;
+       struct hw_perf_event *hwc = &event->hw;
 
-       perf_sample_data_init(&sample, bp->attr.bp_addr);
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swevent_hrtimer;
+       if (hwc->sample_period) {
+               s64 period = local64_read(&hwc->period_left);
+
+               if (period) {
+                       if (period < 0)
+                               period = 10000;
+
+                       local64_set(&hwc->period_left, 0);
+               } else {
+                       period = max_t(u64, 10000, hwc->sample_period);
+               }
+               __hrtimer_start_range_ns(&hwc->hrtimer,
+                               ns_to_ktime(period), 0,
+                               HRTIMER_MODE_REL, 0);
+       }
+}
 
-       if (!perf_exclude_event(bp, regs))
-               perf_swevent_add(bp, 1, 1, &sample, regs);
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->sample_period) {
+               ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+               local64_set(&hwc->period_left, ktime_to_ns(remaining));
+
+               hrtimer_cancel(&hwc->hrtimer);
+       }
 }
-#else
-static struct pmu *bp_perf_event_init(struct perf_event *bp)
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
 {
-       return NULL;
+       s64 prev;
+       u64 now;
+
+       now = local_clock();
+       prev = local64_xchg(&event->hw.prev_count, now);
+       local64_add(now - prev, &event->count);
 }
 
-void perf_bp_event(struct perf_event *bp, void *regs)
+static void cpu_clock_event_start(struct perf_event *event, int flags)
 {
+       local64_set(&event->hw.prev_count, local_clock());
+       perf_swevent_start_hrtimer(event);
 }
-#endif
 
-atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+       perf_swevent_cancel_hrtimer(event);
+       cpu_clock_event_update(event);
+}
 
-static void sw_perf_event_destroy(struct perf_event *event)
+static int cpu_clock_event_add(struct perf_event *event, int flags)
 {
-       u64 event_id = event->attr.config;
+       if (flags & PERF_EF_START)
+               cpu_clock_event_start(event, flags);
 
-       WARN_ON(event->parent);
+       return 0;
+}
 
-       atomic_dec(&perf_swevent_enabled[event_id]);
-       swevent_hlist_put(event);
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+       cpu_clock_event_stop(event, flags);
 }
 
-static struct pmu *sw_perf_event_init(struct perf_event *event)
+static void cpu_clock_event_read(struct perf_event *event)
 {
-       struct pmu *pmu = NULL;
-       u64 event_id = event->attr.config;
+       cpu_clock_event_update(event);
+}
 
-       /*
-        * Software events (currently) can't in general distinguish
-        * between user, kernel and hypervisor events.
-        * However, context switches and cpu migrations are considered
-        * to be kernel events, and page faults are never hypervisor
-        * events.
-        */
-       switch (event_id) {
-       case PERF_COUNT_SW_CPU_CLOCK:
-               pmu = &perf_ops_cpu_clock;
+static int cpu_clock_event_init(struct perf_event *event)
+{
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
 
-               break;
-       case PERF_COUNT_SW_TASK_CLOCK:
-               /*
-                * If the user instantiates this as a per-cpu event,
-                * use the cpu_clock event instead.
-                */
-               if (event->ctx->task)
-                       pmu = &perf_ops_task_clock;
-               else
-                       pmu = &perf_ops_cpu_clock;
+       if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+               return -ENOENT;
 
-               break;
-       case PERF_COUNT_SW_PAGE_FAULTS:
-       case PERF_COUNT_SW_PAGE_FAULTS_MIN:
-       case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
-       case PERF_COUNT_SW_CONTEXT_SWITCHES:
-       case PERF_COUNT_SW_CPU_MIGRATIONS:
-       case PERF_COUNT_SW_ALIGNMENT_FAULTS:
-       case PERF_COUNT_SW_EMULATION_FAULTS:
-               if (!event->parent) {
-                       int err;
-
-                       err = swevent_hlist_get(event);
-                       if (err)
-                               return ERR_PTR(err);
+       return 0;
+}
 
-                       atomic_inc(&perf_swevent_enabled[event_id]);
-                       event->destroy = sw_perf_event_destroy;
+static struct pmu perf_cpu_clock = {
+       .event_init     = cpu_clock_event_init,
+       .add            = cpu_clock_event_add,
+       .del            = cpu_clock_event_del,
+       .start          = cpu_clock_event_start,
+       .stop           = cpu_clock_event_stop,
+       .read           = cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+       u64 prev;
+       s64 delta;
+
+       prev = local64_xchg(&event->hw.prev_count, now);
+       delta = now - prev;
+       local64_add(delta, &event->count);
+}
+
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+       local64_set(&event->hw.prev_count, event->ctx->time);
+       perf_swevent_start_hrtimer(event);
+}
+
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+       perf_swevent_cancel_hrtimer(event);
+       task_clock_event_update(event, event->ctx->time);
+}
+
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               task_clock_event_start(event, flags);
+
+       return 0;
+}
+
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+       task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+       u64 time;
+
+       if (!in_nmi()) {
+               update_context_time(event->ctx);
+               time = event->ctx->time;
+       } else {
+               u64 now = perf_clock();
+               u64 delta = now - event->ctx->timestamp;
+               time = event->ctx->time + delta;
+       }
+
+       task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+               return -ENOENT;
+
+       return 0;
+}
+
+static struct pmu perf_task_clock = {
+       .event_init     = task_clock_event_init,
+       .add            = task_clock_event_add,
+       .del            = task_clock_event_del,
+       .start          = task_clock_event_start,
+       .stop           = task_clock_event_stop,
+       .read           = task_clock_event_read,
+};
+
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
+static void perf_pmu_nop_void(struct pmu *pmu)
+{
+}
+
+static int perf_pmu_nop_int(struct pmu *pmu)
+{
+       return 0;
+}
+
+static void perf_pmu_start_txn(struct pmu *pmu)
+{
+       perf_pmu_disable(pmu);
+}
+
+static int perf_pmu_commit_txn(struct pmu *pmu)
+{
+       perf_pmu_enable(pmu);
+       return 0;
+}
+
+static void perf_pmu_cancel_txn(struct pmu *pmu)
+{
+       perf_pmu_enable(pmu);
+}
+
+int perf_pmu_register(struct pmu *pmu)
+{
+       int ret;
+
+       mutex_lock(&pmus_lock);
+       ret = -ENOMEM;
+       pmu->pmu_disable_count = alloc_percpu(int);
+       if (!pmu->pmu_disable_count)
+               goto unlock;
+
+       if (!pmu->start_txn) {
+               if (pmu->pmu_enable) {
+                       /*
+                        * If we have pmu_enable/pmu_disable calls, install
+                        * transaction stubs that use that to try and batch
+                        * hardware accesses.
+                        */
+                       pmu->start_txn  = perf_pmu_start_txn;
+                       pmu->commit_txn = perf_pmu_commit_txn;
+                       pmu->cancel_txn = perf_pmu_cancel_txn;
+               } else {
+                       pmu->start_txn  = perf_pmu_nop_void;
+                       pmu->commit_txn = perf_pmu_nop_int;
+                       pmu->cancel_txn = perf_pmu_nop_void;
+               }
+       }
+
+       if (!pmu->pmu_enable) {
+               pmu->pmu_enable  = perf_pmu_nop_void;
+               pmu->pmu_disable = perf_pmu_nop_void;
+       }
+
+       list_add_rcu(&pmu->entry, &pmus);
+       ret = 0;
+unlock:
+       mutex_unlock(&pmus_lock);
+
+       return ret;
+}
+
+void perf_pmu_unregister(struct pmu *pmu)
+{
+       mutex_lock(&pmus_lock);
+       list_del_rcu(&pmu->entry);
+       mutex_unlock(&pmus_lock);
+
+       synchronize_srcu(&pmus_srcu);
+
+       free_percpu(pmu->pmu_disable_count);
+}
+
+struct pmu *perf_init_event(struct perf_event *event)
+{
+       struct pmu *pmu = NULL;
+       int idx;
+
+       idx = srcu_read_lock(&pmus_srcu);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               int ret = pmu->event_init(event);
+               if (!ret)
+                       break;
+               if (ret != -ENOENT) {
+                       pmu = ERR_PTR(ret);
+                       break;
                }
-               pmu = &perf_ops_generic;
-               break;
        }
+       srcu_read_unlock(&pmus_srcu, idx);
 
        return pmu;
 }
@@ -5092,29 +5132,8 @@ perf_event_alloc(struct perf_event_attr *attr,
        if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                goto done;
 
-       switch (attr->type) {
-       case PERF_TYPE_RAW:
-       case PERF_TYPE_HARDWARE:
-       case PERF_TYPE_HW_CACHE:
-               pmu = hw_perf_event_init(event);
-               break;
+       pmu = perf_init_event(event);
 
-       case PERF_TYPE_SOFTWARE:
-               pmu = sw_perf_event_init(event);
-               break;
-
-       case PERF_TYPE_TRACEPOINT:
-               pmu = tp_perf_event_init(event);
-               break;
-
-       case PERF_TYPE_BREAKPOINT:
-               pmu = bp_perf_event_init(event);
-               break;
-
-
-       default:
-               break;
-       }
 done:
        err = 0;
        if (!pmu)
@@ -5979,22 +5998,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
        return NOTIFY_OK;
 }
 
-/*
- * This has to have a higher priority than migration_notifier in sched.c.
- */
-static struct notifier_block __cpuinitdata perf_cpu_nb = {
-       .notifier_call          = perf_cpu_notify,
-       .priority               = 20,
-};
-
 void __init perf_event_init(void)
 {
        perf_event_init_all_cpus();
-       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
-                       (void *)(long)smp_processor_id());
-       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
-                       (void *)(long)smp_processor_id());
-       register_cpu_notifier(&perf_cpu_nb);
+       init_srcu_struct(&pmus_srcu);
+       perf_pmu_register(&perf_swevent);
+       perf_pmu_register(&perf_cpu_clock);
+       perf_pmu_register(&perf_task_clock);
+       perf_tp_register();
+       perf_cpu_notifier(perf_cpu_notify);
 }
 
 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,