#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
-#include <linux/hw_breakpoint.h>
#include <asm/irq_regs.h>
*/
static DEFINE_SPINLOCK(perf_resource_lock);
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
-{
- return NULL;
-}
-
void __weak hw_perf_disable(void) { barrier(); }
void __weak hw_perf_enable(void) { barrier(); }
struct perf_event_context *ctx;
rcu_read_lock();
- retry:
+retry:
ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx) {
/*
static inline u64 perf_clock(void)
{
- return cpu_clock(raw_smp_processor_id());
+ return local_clock();
}
/*
}
}
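+/*
+ * A CPU-bound event (event->cpu != -1) only counts while we are running
+ * on that CPU; an event with cpu == -1 matches everywhere.  This
+ * predicate backs the timestamp fix in event_sched_out() below.
+ */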
+static inline int
+event_filter_match(struct perf_event *event)
+{
+ return event->cpu == -1 || event->cpu == smp_processor_id();
+}
+
static void
event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 delta;
+ /*
+ * An event which could not be activated because of
+ * filter mismatch still needs to have its timings
+	 * maintained, otherwise bogus information is returned
+ * via read() for time_enabled, time_running:
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE
+ && !event_filter_match(event)) {
+ delta = ctx->time - event->tstamp_stopped;
+ event->tstamp_running += delta;
+ event->tstamp_stopped = ctx->time;
+ }
+
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
struct perf_event_context *ctx)
{
struct perf_event *event;
-
- if (group_event->state != PERF_EVENT_STATE_ACTIVE)
- return;
+ int state = group_event->state;
event_sched_out(group_event, cpuctx, ctx);
list_for_each_entry(event, &group_event->sibling_list, group_entry)
event_sched_out(event, cpuctx, ctx);
- if (group_event->attr.exclusive)
+ if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
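+
+/*
+ * Sketch of why the tstamp update in event_sched_out() works (assuming
+ * the usual update_event_times() rules, which are not part of this hunk):
+ * for an INACTIVE event, time_running = tstamp_stopped - tstamp_running.
+ * Advancing tstamp_stopped to ctx->time and tstamp_running by the same
+ * delta keeps time_running constant while the event is filtered out,
+ * instead of letting it accrue phantom run time.
+ */
+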
return;
raw_spin_lock(&ctx->lock);
- /*
- * Protect the list operation against NMI by disabling the
- * events on a global level.
- */
- perf_disable();
event_sched_out(event, cpuctx, ctx);
perf_max_events - perf_reserved_percpu);
}
- perf_enable();
raw_spin_unlock(&ctx->lock);
}
return;
}
- retry:
+retry:
task_oncpu_function_call(task, __perf_event_disable, event);
raw_spin_lock_irq(&ctx->lock);
struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- const struct pmu *pmu = group_event->pmu;
+ struct pmu *pmu = group_event->pmu;
bool txn = false;
- int ret;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
}
}
- if (!txn)
+ if (!txn || !pmu->commit_txn(pmu))
return 0;
- ret = pmu->commit_txn(pmu);
- if (!ret) {
- pmu->cancel_txn(pmu);
- return 0;
- }
-
group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
ctx->is_active = 1;
update_context_time(ctx);
- /*
- * Protect the list operation against NMI by disabling the
- * events on a global level. NOP for non NMI based events.
- */
- perf_disable();
-
add_event_to_ctx(event, ctx);
if (event->cpu != -1 && event->cpu != smp_processor_id())
if (!err && !ctx->task && cpuctx->max_pertask)
cpuctx->max_pertask--;
- unlock:
- perf_enable();
-
+unlock:
raw_spin_unlock(&ctx->lock);
}
event->state = PERF_EVENT_STATE_INACTIVE;
event->tstamp_enabled = ctx->time - event->total_time_enabled;
- list_for_each_entry(sub, &event->sibling_list, group_entry)
- if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
sub->tstamp_enabled =
ctx->time - sub->total_time_enabled;
+ }
+ }
}
/*
if (!group_can_go_on(event, cpuctx, 1)) {
err = -EEXIST;
} else {
- perf_disable();
if (event == leader)
err = group_sched_in(event, cpuctx, ctx);
else
err = event_sched_in(event, cpuctx, ctx);
- perf_enable();
}
if (err) {
}
}
- unlock:
+unlock:
raw_spin_unlock(&ctx->lock);
}
if (event->state == PERF_EVENT_STATE_ERROR)
event->state = PERF_EVENT_STATE_OFF;
- retry:
+retry:
raw_spin_unlock_irq(&ctx->lock);
task_oncpu_function_call(task, __perf_event_enable, event);
if (event->state == PERF_EVENT_STATE_OFF)
__perf_event_mark_enabled(event, ctx);
- out:
+out:
raw_spin_unlock_irq(&ctx->lock);
}
goto out;
update_context_time(ctx);
- perf_disable();
if (!ctx->nr_active)
- goto out_enable;
+ goto out;
- if (event_type & EVENT_PINNED)
+ if (event_type & EVENT_PINNED) {
list_for_each_entry(event, &ctx->pinned_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
+ }
- if (event_type & EVENT_FLEXIBLE)
+ if (event_type & EVENT_FLEXIBLE) {
list_for_each_entry(event, &ctx->flexible_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
-
- out_enable:
- perf_enable();
- out:
+ }
+out:
raw_spin_unlock(&ctx->lock);
}
* In order to keep per-task stats reliable we need to flip the event
* values when we flip the contexts.
*/
- value = atomic64_read(&next_event->count);
- value = atomic64_xchg(&event->count, value);
- atomic64_set(&next_event->count, value);
+ value = local64_read(&next_event->count);
+ value = local64_xchg(&event->count, value);
+ local64_set(&next_event->count, value);
swap(event->total_time_enabled, next_event->total_time_enabled);
swap(event->total_time_running, next_event->total_time_running);
if (event->cpu != -1 && event->cpu != smp_processor_id())
continue;
- if (group_can_go_on(event, cpuctx, can_add_hw))
+ if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
+ }
}
}
ctx->timestamp = perf_clock();
- perf_disable();
-
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
if (event_type & EVENT_FLEXIBLE)
ctx_flexible_sched_in(ctx, cpuctx);
- perf_enable();
- out:
+out:
raw_spin_unlock(&ctx->lock);
}
if (cpuctx->task_ctx == ctx)
return;
- perf_disable();
-
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
cpuctx->task_ctx = ctx;
-
- perf_enable();
}
#define MAX_INTERRUPTS (~0ULL)
hwc->sample_period = sample_period;
- if (atomic64_read(&hwc->period_left) > 8*sample_period) {
- perf_disable();
+ if (local64_read(&hwc->period_left) > 8*sample_period) {
perf_event_stop(event);
- atomic64_set(&hwc->period_left, 0);
+ local64_set(&hwc->period_left, 0);
perf_event_start(event);
- perf_enable();
}
}
*/
if (interrupts == MAX_INTERRUPTS) {
perf_log_throttle(event, 1);
- perf_disable();
event->pmu->unthrottle(event);
- perf_enable();
}
if (!event->attr.freq || !event->attr.sample_freq)
continue;
- perf_disable();
event->pmu->read(event);
- now = atomic64_read(&event->count);
+ now = local64_read(&event->count);
delta = now - hwc->freq_count_stamp;
hwc->freq_count_stamp = now;
if (delta > 0)
perf_adjust_period(event, TICK_NSEC, delta);
- perf_enable();
}
raw_spin_unlock(&ctx->lock);
}
if (!rotate)
return;
- perf_disable();
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (ctx)
task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
if (ctx)
task_ctx_sched_in(curr, EVENT_FLEXIBLE);
- perf_enable();
}
static int event_enable_on_exec(struct perf_event *event,
raw_spin_unlock(&ctx->lock);
perf_event_task_sched_in(task);
- out:
+out:
local_irq_restore(flags);
}
event->pmu->read(event);
}
+static inline u64 perf_event_count(struct perf_event *event)
+{
+ return local64_read(&event->count) + atomic64_read(&event->child_count);
+}
+
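+/*
+ * The externally visible count is the event's own local64 counter plus
+ * everything folded in from exited children; the readers below (read(),
+ * the mmap'ed user page, sample output) all go through this helper.
+ */
+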
static u64 perf_event_read(struct perf_event *event)
{
/*
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
- return atomic64_read(&event->count);
+ return perf_event_count(event);
+}
+
+/*
+ * Callchain support
+ */
+
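+/*
+ * One perf_callchain_entry per recursion context (task, softirq, hardirq,
+ * NMI) per possible CPU: cpu_entries[] is indexed by CPU number and each
+ * element points at an array of PERF_NR_CONTEXTS entries, see
+ * alloc_callchain_buffers().
+ */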
+struct callchain_cpus_entries {
+ struct rcu_head rcu_head;
+ struct perf_callchain_entry *cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+static struct callchain_cpus_entries *callchain_cpus_entries;
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+ struct callchain_cpus_entries *entries;
+ int cpu;
+
+ entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+
+ kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+ struct callchain_cpus_entries *entries;
+
+ entries = callchain_cpus_entries;
+ rcu_assign_pointer(callchain_cpus_entries, NULL);
+ call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+ int cpu;
+ int size;
+ struct callchain_cpus_entries *entries;
+
+ /*
+ * We can't use the percpu allocation API for data that can be
+	 * accessed from NMI. Use a manual per-CPU allocation for now,
+ * until that gets sorted out.
+ */
+ size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
+ num_possible_cpus();
+
+ entries = kzalloc(size, GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+ for_each_possible_cpu(cpu) {
+ entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!entries->cpu_entries[cpu])
+ goto fail;
+ }
+
+ rcu_assign_pointer(callchain_cpus_entries, entries);
+
+ return 0;
+
+fail:
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+ kfree(entries);
+
+ return -ENOMEM;
+}
+
+static int get_callchain_buffers(void)
+{
+ int err = 0;
+ int count;
+
+ mutex_lock(&callchain_mutex);
+
+ count = atomic_inc_return(&nr_callchain_events);
+ if (WARN_ON_ONCE(count < 1)) {
+ err = -EINVAL;
+ goto exit;
+ }
+
+ if (count > 1) {
+ /* If the allocation failed, give up */
+ if (!callchain_cpus_entries)
+ err = -ENOMEM;
+ goto exit;
+ }
+
+ err = alloc_callchain_buffers();
+ if (err)
+ release_callchain_buffers();
+exit:
+ mutex_unlock(&callchain_mutex);
+
+ return err;
+}
+
+static void put_callchain_buffers(void)
+{
+ if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+ release_callchain_buffers();
+ mutex_unlock(&callchain_mutex);
+ }
+}
+
+static int get_recursion_context(int *recursion)
+{
+ int rctx;
+
+ if (in_nmi())
+ rctx = 3;
+ else if (in_irq())
+ rctx = 2;
+ else if (in_softirq())
+ rctx = 1;
+ else
+ rctx = 0;
+
+ if (recursion[rctx])
+ return -1;
+
+ recursion[rctx]++;
+ barrier();
+
+ return rctx;
+}
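+
+/*
+ * Usage sketch (illustrative only): callers bracket buffer use with
+ *
+ *	rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+ *	if (rctx >= 0) {
+ *		... use the per-context buffer ...
+ *		put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+ *	}
+ *
+ * so an NMI hitting in the middle of a softirq-level callchain gets its
+ * own slot (rctx 3) rather than corrupting the one in use.
+ */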
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+ barrier();
+ recursion[rctx]--;
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+ int cpu;
+ struct callchain_cpus_entries *entries;
+
+ *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+ if (*rctx == -1)
+ return NULL;
+
+ entries = rcu_dereference(callchain_cpus_entries);
+ if (!entries)
+ return NULL;
+
+ cpu = smp_processor_id();
+
+ return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+ put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ int rctx;
+ struct perf_callchain_entry *entry;
+
+ entry = get_callchain_entry(&rctx);
+ if (rctx == -1)
+ return NULL;
+
+ if (!entry)
+ goto exit_put;
+
+ entry->nr = 0;
+
+ if (!user_mode(regs)) {
+ perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+ perf_callchain_kernel(entry, regs);
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_user(entry, regs);
+ }
+
+exit_put:
+ put_callchain_entry(rctx);
+
+ return entry;
}
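+
+/*
+ * Callers are expected to sample under rcu_read_lock() (see the change to
+ * perf_event_output() further down), since the buffers are freed via
+ * call_rcu().  A NULL return means either recursion or that the buffers
+ * are gone; the sample then simply loses its callchain.
+ */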
/*
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto errout;
- retry:
+retry:
ctx = perf_lock_task_context(task, &flags);
if (ctx) {
unclone_ctx(ctx);
put_task_struct(task);
return ctx;
- errout:
+errout:
put_task_struct(task);
return ERR_PTR(err);
}
}
static void perf_pending_sync(struct perf_event *event);
-static void perf_mmap_data_put(struct perf_mmap_data *data);
+static void perf_buffer_put(struct perf_buffer *buffer);
static void free_event(struct perf_event *event)
{
if (!event->parent) {
atomic_dec(&nr_events);
- if (event->attr.mmap)
+ if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
}
- if (event->data) {
- perf_mmap_data_put(event->data);
- event->data = NULL;
+ if (event->buffer) {
+ perf_buffer_put(event->buffer);
+ event->buffer = NULL;
}
if (event->destroy)
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
unsigned int events = POLL_HUP;
rcu_read_lock();
- data = rcu_dereference(event->data);
- if (data)
- events = atomic_xchg(&data->poll, 0);
+ buffer = rcu_dereference(event->buffer);
+ if (buffer)
+ events = atomic_xchg(&buffer->poll, 0);
rcu_read_unlock();
poll_wait(file, &event->waitq, wait);
static void perf_event_reset(struct perf_event *event)
{
(void)perf_event_read(event);
- atomic64_set(&event->count, 0);
+ local64_set(&event->count, 0);
perf_event_update_userpage(event);
}
void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
rcu_read_lock();
- data = rcu_dereference(event->data);
- if (!data)
+ buffer = rcu_dereference(event->buffer);
+ if (!buffer)
goto unlock;
- userpg = data->user_page;
+ userpg = buffer->user_page;
/*
* Disable preemption so as to not let the corresponding user-space
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = atomic64_read(&event->count);
+ userpg->offset = perf_event_count(event);
if (event->state == PERF_EVENT_STATE_ACTIVE)
- userpg->offset -= atomic64_read(&event->hw.prev_count);
+ userpg->offset -= local64_read(&event->hw.prev_count);
userpg->time_enabled = event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
rcu_read_unlock();
}
+static unsigned long perf_data_size(struct perf_buffer *buffer);
+
+static void
+perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
+{
+ long max_size = perf_data_size(buffer);
+
+ if (watermark)
+ buffer->watermark = min(max_size, watermark);
+
+ if (!buffer->watermark)
+ buffer->watermark = max_size / 2;
+
+ if (flags & PERF_BUFFER_WRITABLE)
+ buffer->writable = 1;
+
+ atomic_set(&buffer->refcount, 1);
+}
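+
+/*
+ * Example (illustrative): with nr_pages = 4 and no explicit watermark the
+ * wakeup threshold defaults to half the buffer, i.e. two pages (8KiB with
+ * 4KiB pages).  PERF_BUFFER_WRITABLE marks the buffer writable from user
+ * space, so the output path must honour the tail pointer (see
+ * perf_output_space()).
+ */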
+
#ifndef CONFIG_PERF_USE_VMALLOC
/*
*/
static struct page *
-perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
{
- if (pgoff > data->nr_pages)
+ if (pgoff > buffer->nr_pages)
return NULL;
if (pgoff == 0)
- return virt_to_page(data->user_page);
+ return virt_to_page(buffer->user_page);
- return virt_to_page(data->data_pages[pgoff - 1]);
+ return virt_to_page(buffer->data_pages[pgoff - 1]);
}
static void *perf_mmap_alloc_page(int cpu)
return page_address(page);
}
-static struct perf_mmap_data *
-perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+static struct perf_buffer *
+perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
{
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
unsigned long size;
int i;
- size = sizeof(struct perf_mmap_data);
+ size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- data = kzalloc(size, GFP_KERNEL);
- if (!data)
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
goto fail;
- data->user_page = perf_mmap_alloc_page(event->cpu);
- if (!data->user_page)
+ buffer->user_page = perf_mmap_alloc_page(cpu);
+ if (!buffer->user_page)
goto fail_user_page;
for (i = 0; i < nr_pages; i++) {
- data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
- if (!data->data_pages[i])
+ buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
+ if (!buffer->data_pages[i])
goto fail_data_pages;
}
- data->nr_pages = nr_pages;
+ buffer->nr_pages = nr_pages;
+
+ perf_buffer_init(buffer, watermark, flags);
- return data;
+ return buffer;
fail_data_pages:
for (i--; i >= 0; i--)
- free_page((unsigned long)data->data_pages[i]);
+ free_page((unsigned long)buffer->data_pages[i]);
- free_page((unsigned long)data->user_page);
+ free_page((unsigned long)buffer->user_page);
fail_user_page:
- kfree(data);
+ kfree(buffer);
fail:
return NULL;
__free_page(page);
}
-static void perf_mmap_data_free(struct perf_mmap_data *data)
+static void perf_buffer_free(struct perf_buffer *buffer)
{
int i;
- perf_mmap_free_page((unsigned long)data->user_page);
- for (i = 0; i < data->nr_pages; i++)
- perf_mmap_free_page((unsigned long)data->data_pages[i]);
- kfree(data);
+ perf_mmap_free_page((unsigned long)buffer->user_page);
+ for (i = 0; i < buffer->nr_pages; i++)
+ perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
+ kfree(buffer);
}
-static inline int page_order(struct perf_mmap_data *data)
+static inline int page_order(struct perf_buffer *buffer)
{
return 0;
}
* Required for architectures that have d-cache aliasing issues.
*/
-static inline int page_order(struct perf_mmap_data *data)
+static inline int page_order(struct perf_buffer *buffer)
{
- return data->page_order;
+ return buffer->page_order;
}
static struct page *
-perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
{
- if (pgoff > (1UL << page_order(data)))
+ if (pgoff > (1UL << page_order(buffer)))
return NULL;
- return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
+ return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
}
static void perf_mmap_unmark_page(void *addr)
page->mapping = NULL;
}
-static void perf_mmap_data_free_work(struct work_struct *work)
+static void perf_buffer_free_work(struct work_struct *work)
{
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
void *base;
int i, nr;
- data = container_of(work, struct perf_mmap_data, work);
- nr = 1 << page_order(data);
+ buffer = container_of(work, struct perf_buffer, work);
+ nr = 1 << page_order(buffer);
- base = data->user_page;
+ base = buffer->user_page;
for (i = 0; i < nr + 1; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
- kfree(data);
+ kfree(buffer);
}
-static void perf_mmap_data_free(struct perf_mmap_data *data)
+static void perf_buffer_free(struct perf_buffer *buffer)
{
- schedule_work(&data->work);
+ schedule_work(&buffer->work);
}
-static struct perf_mmap_data *
-perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+static struct perf_buffer *
+perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
{
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
unsigned long size;
void *all_buf;
- size = sizeof(struct perf_mmap_data);
+ size = sizeof(struct perf_buffer);
size += sizeof(void *);
- data = kzalloc(size, GFP_KERNEL);
- if (!data)
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
goto fail;
- INIT_WORK(&data->work, perf_mmap_data_free_work);
+ INIT_WORK(&buffer->work, perf_buffer_free_work);
all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
if (!all_buf)
goto fail_all_buf;
- data->user_page = all_buf;
- data->data_pages[0] = all_buf + PAGE_SIZE;
- data->page_order = ilog2(nr_pages);
- data->nr_pages = 1;
+ buffer->user_page = all_buf;
+ buffer->data_pages[0] = all_buf + PAGE_SIZE;
+ buffer->page_order = ilog2(nr_pages);
+ buffer->nr_pages = 1;
- return data;
+ perf_buffer_init(buffer, watermark, flags);
+
+ return buffer;
fail_all_buf:
- kfree(data);
+ kfree(buffer);
fail:
return NULL;
#endif
-static unsigned long perf_data_size(struct perf_mmap_data *data)
+static unsigned long perf_data_size(struct perf_buffer *buffer)
{
- return data->nr_pages << (PAGE_SHIFT + page_order(data));
+ return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
}
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct perf_event *event = vma->vm_file->private_data;
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
int ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
}
rcu_read_lock();
- data = rcu_dereference(event->data);
- if (!data)
+ buffer = rcu_dereference(event->buffer);
+ if (!buffer)
goto unlock;
if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
goto unlock;
- vmf->page = perf_mmap_to_page(data, vmf->pgoff);
+ vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
if (!vmf->page)
goto unlock;
return ret;
}
-static void
-perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
-{
- long max_size = perf_data_size(data);
-
- if (event->attr.watermark) {
- data->watermark = min_t(long, max_size,
- event->attr.wakeup_watermark);
- }
-
- if (!data->watermark)
- data->watermark = max_size / 2;
-
- atomic_set(&data->refcount, 1);
- rcu_assign_pointer(event->data, data);
-}
-
-static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
+static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
{
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
- data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
- perf_mmap_data_free(data);
+ buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
+ perf_buffer_free(buffer);
}
-static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
+static struct perf_buffer *perf_buffer_get(struct perf_event *event)
{
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
rcu_read_lock();
- data = rcu_dereference(event->data);
- if (data) {
- if (!atomic_inc_not_zero(&data->refcount))
- data = NULL;
+ buffer = rcu_dereference(event->buffer);
+ if (buffer) {
+ if (!atomic_inc_not_zero(&buffer->refcount))
+ buffer = NULL;
}
rcu_read_unlock();
- return data;
+ return buffer;
}
-static void perf_mmap_data_put(struct perf_mmap_data *data)
+static void perf_buffer_put(struct perf_buffer *buffer)
{
- if (!atomic_dec_and_test(&data->refcount))
+ if (!atomic_dec_and_test(&buffer->refcount))
return;
- call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
+ call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
}
static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
- unsigned long size = perf_data_size(event->data);
+ unsigned long size = perf_data_size(event->buffer);
struct user_struct *user = event->mmap_user;
- struct perf_mmap_data *data = event->data;
+ struct perf_buffer *buffer = event->buffer;
atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
vma->vm_mm->locked_vm -= event->mmap_locked;
- rcu_assign_pointer(event->data, NULL);
+ rcu_assign_pointer(event->buffer, NULL);
mutex_unlock(&event->mmap_mutex);
- perf_mmap_data_put(data);
+ perf_buffer_put(buffer);
free_uid(user);
}
}
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
unsigned long locked, lock_limit;
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra, extra;
- int ret = 0;
+ int ret = 0, flags = 0;
/*
* Don't allow mmap() of inherited per-task counters. This would
nr_pages = (vma_size / PAGE_SIZE) - 1;
/*
- * If we have data pages ensure they're a power-of-two number, so we
+	 * If we have buffer pages, ensure they're a power-of-two number, so we
* can do bitmasks instead of modulo.
*/
if (nr_pages != 0 && !is_power_of_2(nr_pages))
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->mmap_mutex);
- if (event->data) {
- if (event->data->nr_pages == nr_pages)
- atomic_inc(&event->data->refcount);
+ if (event->buffer) {
+ if (event->buffer->nr_pages == nr_pages)
+ atomic_inc(&event->buffer->refcount);
else
ret = -EINVAL;
goto unlock;
goto unlock;
}
- WARN_ON(event->data);
+ WARN_ON(event->buffer);
+
+ if (vma->vm_flags & VM_WRITE)
+ flags |= PERF_BUFFER_WRITABLE;
- data = perf_mmap_data_alloc(event, nr_pages);
- if (!data) {
+ buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
+ event->cpu, flags);
+ if (!buffer) {
ret = -ENOMEM;
goto unlock;
}
-
- perf_mmap_data_init(event, data);
- if (vma->vm_flags & VM_WRITE)
- event->data->writable = 1;
+ rcu_assign_pointer(event->buffer, buffer);
atomic_long_add(user_extra, &user->locked_vm);
event->mmap_locked = extra;
__perf_pending_run();
}
-/*
- * Callchain support -- arch specific
- */
-
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
- return NULL;
-}
-
-__weak
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
-{
-}
-
-
/*
* We assume there is only KVM supporting the callbacks.
* Later on, we might change it to a list if there is
/*
* Output
*/
-static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
unsigned long offset, unsigned long head)
{
unsigned long mask;
- if (!data->writable)
+ if (!buffer->writable)
return true;
- mask = perf_data_size(data) - 1;
+ mask = perf_data_size(buffer) - 1;
offset = (offset - tail) & mask;
head = (head - tail) & mask;
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->data->poll, POLL_IN);
+ atomic_set(&handle->buffer->poll, POLL_IN);
if (handle->nmi) {
handle->event->pending_wakeup = 1;
*/
static void perf_output_get_handle(struct perf_output_handle *handle)
{
- struct perf_mmap_data *data = handle->data;
+ struct perf_buffer *buffer = handle->buffer;
preempt_disable();
- local_inc(&data->nest);
- handle->wakeup = local_read(&data->wakeup);
+ local_inc(&buffer->nest);
+ handle->wakeup = local_read(&buffer->wakeup);
}
static void perf_output_put_handle(struct perf_output_handle *handle)
{
- struct perf_mmap_data *data = handle->data;
+ struct perf_buffer *buffer = handle->buffer;
unsigned long head;
again:
- head = local_read(&data->head);
+ head = local_read(&buffer->head);
/*
* IRQ/NMI can happen here, which means we can miss a head update.
*/
- if (!local_dec_and_test(&data->nest))
+ if (!local_dec_and_test(&buffer->nest))
goto out;
/*
* Publish the known good head. Rely on the full barrier implied
- * by atomic_dec_and_test() order the data->head read and this
+	 * by local_dec_and_test() to order the buffer->head read and this
* write.
*/
- data->user_page->data_head = head;
+ buffer->user_page->data_head = head;
/*
* Now check if we missed an update, rely on the (compiler)
- * barrier in atomic_dec_and_test() to re-read data->head.
+	 * barrier in local_dec_and_test() to re-read buffer->head.
*/
- if (unlikely(head != local_read(&data->head))) {
- local_inc(&data->nest);
+ if (unlikely(head != local_read(&buffer->head))) {
+ local_inc(&buffer->nest);
goto again;
}
- if (handle->wakeup != local_read(&data->wakeup))
+ if (handle->wakeup != local_read(&buffer->wakeup))
perf_output_wakeup(handle);
- out:
+out:
preempt_enable();
}
buf += size;
handle->size -= size;
if (!handle->size) {
- struct perf_mmap_data *data = handle->data;
+ struct perf_buffer *buffer = handle->buffer;
handle->page++;
- handle->page &= data->nr_pages - 1;
- handle->addr = data->data_pages[handle->page];
- handle->size = PAGE_SIZE << page_order(data);
+ handle->page &= buffer->nr_pages - 1;
+ handle->addr = buffer->data_pages[handle->page];
+ handle->size = PAGE_SIZE << page_order(buffer);
}
} while (len);
}
struct perf_event *event, unsigned int size,
int nmi, int sample)
{
- struct perf_mmap_data *data;
+ struct perf_buffer *buffer;
unsigned long tail, offset, head;
int have_lost;
struct {
if (event->parent)
event = event->parent;
- data = rcu_dereference(event->data);
- if (!data)
+ buffer = rcu_dereference(event->buffer);
+ if (!buffer)
goto out;
- handle->data = data;
+ handle->buffer = buffer;
handle->event = event;
handle->nmi = nmi;
handle->sample = sample;
- if (!data->nr_pages)
+ if (!buffer->nr_pages)
goto out;
- have_lost = local_read(&data->lost);
+ have_lost = local_read(&buffer->lost);
if (have_lost)
size += sizeof(lost_event);
* tail pointer. So that all reads will be completed before the
* write is issued.
*/
- tail = ACCESS_ONCE(data->user_page->data_tail);
+ tail = ACCESS_ONCE(buffer->user_page->data_tail);
smp_rmb();
- offset = head = local_read(&data->head);
+ offset = head = local_read(&buffer->head);
head += size;
- if (unlikely(!perf_output_space(data, tail, offset, head)))
+ if (unlikely(!perf_output_space(buffer, tail, offset, head)))
goto fail;
- } while (local_cmpxchg(&data->head, offset, head) != offset);
+ } while (local_cmpxchg(&buffer->head, offset, head) != offset);
- if (head - local_read(&data->wakeup) > data->watermark)
- local_add(data->watermark, &data->wakeup);
+ if (head - local_read(&buffer->wakeup) > buffer->watermark)
+ local_add(buffer->watermark, &buffer->wakeup);
- handle->page = offset >> (PAGE_SHIFT + page_order(data));
- handle->page &= data->nr_pages - 1;
- handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
- handle->addr = data->data_pages[handle->page];
+ handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
+ handle->page &= buffer->nr_pages - 1;
+ handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
+ handle->addr = buffer->data_pages[handle->page];
handle->addr += handle->size;
- handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
+ handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
if (have_lost) {
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.header.size = sizeof(lost_event);
lost_event.id = event->id;
- lost_event.lost = local_xchg(&data->lost, 0);
+ lost_event.lost = local_xchg(&buffer->lost, 0);
perf_output_put(handle, lost_event);
}
return 0;
fail:
- local_inc(&data->lost);
+ local_inc(&buffer->lost);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
void perf_output_end(struct perf_output_handle *handle)
{
struct perf_event *event = handle->event;
- struct perf_mmap_data *data = handle->data;
+ struct perf_buffer *buffer = handle->buffer;
int wakeup_events = event->attr.wakeup_events;
if (handle->sample && wakeup_events) {
- int events = local_inc_return(&data->events);
+ int events = local_inc_return(&buffer->events);
if (events >= wakeup_events) {
- local_sub(wakeup_events, &data->events);
- local_inc(&data->wakeup);
+ local_sub(wakeup_events, &buffer->events);
+ local_inc(&buffer->wakeup);
}
}
u64 values[4];
int n = 0;
- values[n++] = atomic64_read(&event->count);
+ values[n++] = perf_event_count(event);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
if (leader != event)
leader->pmu->read(leader);
- values[n++] = atomic64_read(&leader->count);
+ values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (sub != event)
sub->pmu->read(sub);
- values[n++] = atomic64_read(&sub->count);
+ values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
struct perf_output_handle handle;
struct perf_event_header header;
+ /* protect the callchain buffers */
+ rcu_read_lock();
+
perf_prepare_sample(&header, data, event, regs);
if (perf_output_begin(&handle, event, header.size, nmi, 1))
- return;
+ goto exit;
perf_output_sample(&handle, &header, data, event);
perf_output_end(&handle);
+
+exit:
+ rcu_read_unlock();
}
/*
/*
* task tracking -- fork/exit
*
- * enabled by: attr.comm | attr.mmap | attr.task
+ * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
*/
struct perf_task_event {
if (event->cpu != -1 && event->cpu != smp_processor_id())
return 0;
- if (event->attr.comm || event->attr.mmap || event->attr.task)
+ if (event->attr.comm || event->attr.mmap ||
+ event->attr.mmap_data || event->attr.task)
return 1;
return 0;
}
static int perf_event_mmap_match(struct perf_event *event,
- struct perf_mmap_event *mmap_event)
+ struct perf_mmap_event *mmap_event,
+ int executable)
{
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
if (event->cpu != -1 && event->cpu != smp_processor_id())
return 0;
- if (event->attr.mmap)
+ if ((!executable && event->attr.mmap_data) ||
+ (executable && event->attr.mmap))
return 1;
return 0;
}
static void perf_event_mmap_ctx(struct perf_event_context *ctx,
- struct perf_mmap_event *mmap_event)
+ struct perf_mmap_event *mmap_event,
+ int executable)
{
struct perf_event *event;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_mmap_match(event, mmap_event))
+ if (perf_event_mmap_match(event, mmap_event, executable))
perf_event_mmap_output(event, mmap_event);
}
}
if (!vma->vm_mm) {
name = strncpy(tmp, "[vdso]", sizeof(tmp));
goto got_name;
+ } else if (vma->vm_start <= vma->vm_mm->start_brk &&
+ vma->vm_end >= vma->vm_mm->brk) {
+ name = strncpy(tmp, "[heap]", sizeof(tmp));
+ goto got_name;
+ } else if (vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack) {
+ name = strncpy(tmp, "[stack]", sizeof(tmp));
+ goto got_name;
}
name = strncpy(tmp, "//anon", sizeof(tmp));
rcu_read_lock();
cpuctx = &get_cpu_var(perf_cpu_context);
- perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
+ perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
ctx = rcu_dereference(current->perf_event_ctxp);
if (ctx)
- perf_event_mmap_ctx(ctx, mmap_event);
+ perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
put_cpu_var(perf_cpu_context);
rcu_read_unlock();
kfree(buf);
}
-void __perf_event_mmap(struct vm_area_struct *vma)
+void perf_event_mmap(struct vm_area_struct *vma)
{
struct perf_mmap_event mmap_event;
hwc->last_period = hwc->sample_period;
again:
- old = val = atomic64_read(&hwc->period_left);
+ old = val = local64_read(&hwc->period_left);
if (val < 0)
return 0;
nr = div64_u64(period + val, period);
offset = nr * period;
val -= offset;
- if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+ if (local64_cmpxchg(&hwc->period_left, old, val) != old)
goto again;
return nr;
{
struct hw_perf_event *hwc = &event->hw;
- atomic64_add(nr, &event->count);
+ local64_add(nr, &event->count);
if (!regs)
return;
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
return perf_swevent_overflow(event, 1, nmi, data, regs);
- if (atomic64_add_negative(nr, &hwc->period_left))
+ if (local64_add_negative(nr, &hwc->period_left))
return;
perf_swevent_overflow(event, 0, nmi, data, regs);
int perf_swevent_get_recursion_context(void)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- int rctx;
-
- if (in_nmi())
- rctx = 3;
- else if (in_irq())
- rctx = 2;
- else if (in_softirq())
- rctx = 1;
- else
- rctx = 0;
-
- if (cpuctx->recursion[rctx])
- return -1;
-
- cpuctx->recursion[rctx]++;
- barrier();
- return rctx;
+ return get_recursion_context(cpuctx->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
-void perf_swevent_put_recursion_context(int rctx)
+inline void perf_swevent_put_recursion_context(int rctx)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- barrier();
- cpuctx->recursion[rctx]--;
-}
-EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
+ put_recursion_context(cpuctx->recursion, rctx);
+}
void __perf_sw_event(u32 event_id, u64 nr, int nmi,
struct pt_regs *regs, u64 addr)
return 0;
}
-static const struct pmu perf_ops_generic = {
- .enable = perf_swevent_enable,
- .disable = perf_swevent_disable,
- .start = perf_swevent_int,
- .stop = perf_swevent_void,
- .read = perf_swevent_read,
- .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
-};
-
-/*
- * hrtimer based swevent callback
- */
-
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct perf_cpu_context *cpuctx)
{
- enum hrtimer_restart ret = HRTIMER_RESTART;
- struct perf_sample_data data;
- struct pt_regs *regs;
- struct perf_event *event;
- u64 period;
-
- event = container_of(hrtimer, struct perf_event, hw.hrtimer);
- event->pmu->read(event);
-
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
- regs = get_irq_regs();
-
- if (regs && !perf_exclude_event(event, regs)) {
- if (!(event->attr.exclude_idle && current->pid == 0))
- if (perf_event_overflow(event, 0, &data, regs))
- ret = HRTIMER_NORESTART;
- }
-
- period = max_t(u64, 10000, event->hw.sample_period);
- hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
- return ret;
-}
-
-static void perf_swevent_start_hrtimer(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hwc->hrtimer.function = perf_swevent_hrtimer;
- if (hwc->sample_period) {
- u64 period;
-
- if (hwc->remaining) {
- if (hwc->remaining < 0)
- period = 10000;
- else
- period = hwc->remaining;
- hwc->remaining = 0;
- } else {
- period = max_t(u64, 10000, hwc->sample_period);
- }
- __hrtimer_start_range_ns(&hwc->hrtimer,
- ns_to_ktime(period), 0,
- HRTIMER_MODE_REL, 0);
- }
-}
-
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->sample_period) {
- ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
- hwc->remaining = ktime_to_ns(remaining);
-
- hrtimer_cancel(&hwc->hrtimer);
- }
-}
-
-/*
- * Software event: cpu wall time clock
- */
-
-static void cpu_clock_perf_event_update(struct perf_event *event)
-{
- int cpu = raw_smp_processor_id();
- s64 prev;
- u64 now;
-
- now = cpu_clock(cpu);
- prev = atomic64_xchg(&event->hw.prev_count, now);
- atomic64_add(now - prev, &event->count);
-}
-
-static int cpu_clock_perf_event_enable(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- int cpu = raw_smp_processor_id();
-
- atomic64_set(&hwc->prev_count, cpu_clock(cpu));
- perf_swevent_start_hrtimer(event);
-
- return 0;
-}
-
-static void cpu_clock_perf_event_disable(struct perf_event *event)
-{
- perf_swevent_cancel_hrtimer(event);
- cpu_clock_perf_event_update(event);
-}
-
-static void cpu_clock_perf_event_read(struct perf_event *event)
-{
- cpu_clock_perf_event_update(event);
-}
-
-static const struct pmu perf_ops_cpu_clock = {
- .enable = cpu_clock_perf_event_enable,
- .disable = cpu_clock_perf_event_disable,
- .read = cpu_clock_perf_event_read,
-};
-
-/*
- * Software event: task time clock
- */
-
-static void task_clock_perf_event_update(struct perf_event *event, u64 now)
-{
- u64 prev;
- s64 delta;
-
- prev = atomic64_xchg(&event->hw.prev_count, now);
- delta = now - prev;
- atomic64_add(delta, &event->count);
-}
-
-static int task_clock_perf_event_enable(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- u64 now;
-
- now = event->ctx->time;
-
- atomic64_set(&hwc->prev_count, now);
-
- perf_swevent_start_hrtimer(event);
-
- return 0;
-}
-
-static void task_clock_perf_event_disable(struct perf_event *event)
-{
- perf_swevent_cancel_hrtimer(event);
- task_clock_perf_event_update(event, event->ctx->time);
-
-}
-
-static void task_clock_perf_event_read(struct perf_event *event)
-{
- u64 time;
-
- if (!in_nmi()) {
- update_context_time(event->ctx);
- time = event->ctx->time;
- } else {
- u64 now = perf_clock();
- u64 delta = now - event->ctx->timestamp;
- time = event->ctx->time + delta;
- }
-
- task_clock_perf_event_update(event, time);
-}
-
-static const struct pmu perf_ops_task_clock = {
- .enable = task_clock_perf_event_enable,
- .disable = task_clock_perf_event_disable,
- .read = task_clock_perf_event_read,
-};
-
-/* Deref the hlist from the update side */
-static inline struct swevent_hlist *
-swevent_hlist_deref(struct perf_cpu_context *cpuctx)
-{
- return rcu_dereference_protected(cpuctx->swevent_hlist,
- lockdep_is_held(&cpuctx->hlist_mutex));
-}
+ return rcu_dereference_protected(cpuctx->swevent_hlist,
+ lockdep_is_held(&cpuctx->hlist_mutex));
+}
static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
{
rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
}
cpuctx->hlist_refcount++;
- exit:
+exit:
mutex_unlock(&cpuctx->hlist_mutex);
return err;
put_online_cpus();
return 0;
- fail:
+fail:
for_each_possible_cpu(cpu) {
if (cpu == failed_cpu)
break;
return err;
}
-#ifdef CONFIG_EVENT_TRACING
+atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
-static const struct pmu perf_ops_tracepoint = {
- .enable = perf_trace_enable,
- .disable = perf_trace_disable,
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+ u64 event_id = event->attr.config;
+
+ WARN_ON(event->parent);
+
+ atomic_dec(&perf_swevent_enabled[event_id]);
+ swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+	u64 event_id = event->attr.config;
+
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ switch (event_id) {
+ case PERF_COUNT_SW_CPU_CLOCK:
+ case PERF_COUNT_SW_TASK_CLOCK:
+ return -ENOENT;
+
+ default:
+ break;
+ }
+
+	if (event_id >= PERF_COUNT_SW_MAX)
+ return -ENOENT;
+
+ if (!event->parent) {
+ int err;
+
+ err = swevent_hlist_get(event);
+ if (err)
+ return err;
+
+ atomic_inc(&perf_swevent_enabled[event_id]);
+ event->destroy = sw_perf_event_destroy;
+ }
+
+ return 0;
+}
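+
+/*
+ * Returning -ENOENT from event_init tells perf_init_event() that the
+ * event is not ours and the next registered pmu should be tried.
+ * cpu-clock and task-clock are software event ids, but they are handled
+ * by dedicated pmus further down, hence the explicit -ENOENT above.
+ */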
+
+static struct pmu perf_swevent = {
+ .event_init = perf_swevent_init,
+ .enable = perf_swevent_enable,
+ .disable = perf_swevent_disable,
.start = perf_swevent_int,
.stop = perf_swevent_void,
.read = perf_swevent_read,
- .unthrottle = perf_swevent_void,
+ .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
};
+#ifdef CONFIG_EVENT_TRACING
+
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
}
void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
- struct pt_regs *regs, struct hlist_head *head)
+ struct pt_regs *regs, struct hlist_head *head, int rctx)
{
struct perf_sample_data data;
struct perf_event *event;
perf_sample_data_init(&data, addr);
data.raw = &raw;
- rcu_read_lock();
hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_add(event, count, 1, &data, regs);
}
- rcu_read_unlock();
+
+ perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
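+
+/*
+ * The recursion context is acquired by the tracepoint glue (the
+ * perf_trace_buf_*() helpers, which live outside this file) and handed in
+ * as rctx; perf_tp_event() releases it once every matching event has been
+ * delivered.  The RCU read side that used to be taken here is now the
+ * caller's responsibility.
+ */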
perf_trace_destroy(event);
}
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static int perf_tp_event_init(struct perf_event *event)
{
int err;
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
/*
* Raw tracepoint data is a severe data leak, only allow root to
* have these.
if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
perf_paranoid_tracepoint_raw() &&
!capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
+ return -EPERM;
err = perf_trace_init(event);
if (err)
- return NULL;
+ return err;
event->destroy = tp_perf_event_destroy;
- return &perf_ops_tracepoint;
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .event_init = perf_tp_event_init,
+ .enable = perf_trace_enable,
+ .disable = perf_trace_disable,
+ .start = perf_swevent_int,
+ .stop = perf_swevent_void,
+ .read = perf_swevent_read,
+ .unthrottle = perf_swevent_void,
+};
+
+static inline void perf_tp_register(void)
+{
+ perf_pmu_register(&perf_tracepoint);
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
#else
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static inline void perf_tp_register(void)
{
- return NULL;
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
#endif /* CONFIG_EVENT_TRACING */
#ifdef CONFIG_HAVE_HW_BREAKPOINT
-static void bp_perf_event_destroy(struct perf_event *event)
+void perf_bp_event(struct perf_event *bp, void *data)
{
- release_bp_slot(event);
+ struct perf_sample_data sample;
+ struct pt_regs *regs = data;
+
+ perf_sample_data_init(&sample, bp->attr.bp_addr);
+
+ if (!perf_exclude_event(bp, regs))
+ perf_swevent_add(bp, 1, 1, &sample, regs);
}
+#endif
+
+/*
+ * hrtimer based swevent callback
+ */
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
- int err;
+ enum hrtimer_restart ret = HRTIMER_RESTART;
+ struct perf_sample_data data;
+ struct pt_regs *regs;
+ struct perf_event *event;
+ u64 period;
- err = register_perf_hw_breakpoint(bp);
- if (err)
- return ERR_PTR(err);
+ event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+ event->pmu->read(event);
+
+ perf_sample_data_init(&data, 0);
+ data.period = event->hw.last_period;
+ regs = get_irq_regs();
- bp->destroy = bp_perf_event_destroy;
+ if (regs && !perf_exclude_event(event, regs)) {
+ if (!(event->attr.exclude_idle && current->pid == 0))
+ if (perf_event_overflow(event, 0, &data, regs))
+ ret = HRTIMER_NORESTART;
+ }
+
+ period = max_t(u64, 10000, event->hw.sample_period);
+ hrtimer_forward_now(hrtimer, ns_to_ktime(period));
- return &perf_ops_bp;
+ return ret;
}
-void perf_bp_event(struct perf_event *bp, void *data)
+static void perf_swevent_start_hrtimer(struct perf_event *event)
{
- struct perf_sample_data sample;
- struct pt_regs *regs = data;
+ struct hw_perf_event *hwc = &event->hw;
- perf_sample_data_init(&sample, bp->attr.bp_addr);
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swevent_hrtimer;
+ if (hwc->sample_period) {
+ u64 period;
- if (!perf_exclude_event(bp, regs))
- perf_swevent_add(bp, 1, 1, &sample, regs);
+ if (hwc->remaining) {
+ if (hwc->remaining < 0)
+ period = 10000;
+ else
+ period = hwc->remaining;
+ hwc->remaining = 0;
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
+ ns_to_ktime(period), 0,
+ HRTIMER_MODE_REL, 0);
+ }
}
-#else
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
- return NULL;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->sample_period) {
+ ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+ hwc->remaining = ktime_to_ns(remaining);
+
+ hrtimer_cancel(&hwc->hrtimer);
+ }
}
-void perf_bp_event(struct perf_event *bp, void *regs)
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
{
+ int cpu = raw_smp_processor_id();
+ s64 prev;
+ u64 now;
+
+ now = cpu_clock(cpu);
+ prev = local64_xchg(&event->hw.prev_count, now);
+ local64_add(now - prev, &event->count);
}
-#endif
-atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+static int cpu_clock_event_enable(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int cpu = raw_smp_processor_id();
-static void sw_perf_event_destroy(struct perf_event *event)
+ local64_set(&hwc->prev_count, cpu_clock(cpu));
+ perf_swevent_start_hrtimer(event);
+
+ return 0;
+}
+
+static void cpu_clock_event_disable(struct perf_event *event)
{
- u64 event_id = event->attr.config;
+ perf_swevent_cancel_hrtimer(event);
+ cpu_clock_event_update(event);
+}
- WARN_ON(event->parent);
+static void cpu_clock_event_read(struct perf_event *event)
+{
+ cpu_clock_event_update(event);
+}
- atomic_dec(&perf_swevent_enabled[event_id]);
- swevent_hlist_put(event);
+static int cpu_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+ return -ENOENT;
+
+ return 0;
}
-static const struct pmu *sw_perf_event_init(struct perf_event *event)
+static struct pmu perf_cpu_clock = {
+ .event_init = cpu_clock_event_init,
+ .enable = cpu_clock_event_enable,
+ .disable = cpu_clock_event_disable,
+ .read = cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
{
- const struct pmu *pmu = NULL;
- u64 event_id = event->attr.config;
+ u64 prev;
+ s64 delta;
- /*
- * Software events (currently) can't in general distinguish
- * between user, kernel and hypervisor events.
- * However, context switches and cpu migrations are considered
- * to be kernel events, and page faults are never hypervisor
- * events.
- */
- switch (event_id) {
- case PERF_COUNT_SW_CPU_CLOCK:
- pmu = &perf_ops_cpu_clock;
+ prev = local64_xchg(&event->hw.prev_count, now);
+ delta = now - prev;
+ local64_add(delta, &event->count);
+}
- break;
- case PERF_COUNT_SW_TASK_CLOCK:
- /*
- * If the user instantiates this as a per-cpu event,
- * use the cpu_clock event instead.
- */
- if (event->ctx->task)
- pmu = &perf_ops_task_clock;
- else
- pmu = &perf_ops_cpu_clock;
+static int task_clock_event_enable(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 now;
- break;
- case PERF_COUNT_SW_PAGE_FAULTS:
- case PERF_COUNT_SW_PAGE_FAULTS_MIN:
- case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
- case PERF_COUNT_SW_CONTEXT_SWITCHES:
- case PERF_COUNT_SW_CPU_MIGRATIONS:
- case PERF_COUNT_SW_ALIGNMENT_FAULTS:
- case PERF_COUNT_SW_EMULATION_FAULTS:
- if (!event->parent) {
- int err;
-
- err = swevent_hlist_get(event);
- if (err)
- return ERR_PTR(err);
+ now = event->ctx->time;
+
+ local64_set(&hwc->prev_count, now);
+
+ perf_swevent_start_hrtimer(event);
- atomic_inc(&perf_swevent_enabled[event_id]);
- event->destroy = sw_perf_event_destroy;
+ return 0;
+}
+
+static void task_clock_event_disable(struct perf_event *event)
+{
+ perf_swevent_cancel_hrtimer(event);
+ task_clock_event_update(event, event->ctx->time);
+
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+ u64 time;
+
+ if (!in_nmi()) {
+ update_context_time(event->ctx);
+ time = event->ctx->time;
+ } else {
+ u64 now = perf_clock();
+ u64 delta = now - event->ctx->timestamp;
+ time = event->ctx->time + delta;
+ }
+
+ task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+ if (event->attr.type != PERF_TYPE_SOFTWARE)
+ return -ENOENT;
+
+ if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+ return -ENOENT;
+
+ return 0;
+}
+
+static struct pmu perf_task_clock = {
+ .event_init = task_clock_event_init,
+ .enable = task_clock_event_enable,
+ .disable = task_clock_event_disable,
+ .read = task_clock_event_read,
+};
+
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
+int perf_pmu_register(struct pmu *pmu)
+{
+ mutex_lock(&pmus_lock);
+ list_add_rcu(&pmu->entry, &pmus);
+ mutex_unlock(&pmus_lock);
+
+ return 0;
+}
+
+void perf_pmu_unregister(struct pmu *pmu)
+{
+ mutex_lock(&pmus_lock);
+ list_del_rcu(&pmu->entry);
+ mutex_unlock(&pmus_lock);
+
+ synchronize_srcu(&pmus_srcu);
+}
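+
+/*
+ * Minimal client sketch (hypothetical pmu, for illustration only):
+ *
+ *	static struct pmu my_pmu = {
+ *		.event_init	= my_event_init,
+ *		.enable		= my_enable,
+ *		.disable	= my_disable,
+ *		.read		= my_read,
+ *	};
+ *	perf_pmu_register(&my_pmu);
+ *
+ * perf_pmu_unregister() relies on synchronize_srcu() to guarantee that no
+ * concurrent perf_init_event() is still walking the list entry when the
+ * pmu goes away.
+ */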
+
+struct pmu *perf_init_event(struct perf_event *event)
+{
+ struct pmu *pmu = NULL;
+ int idx;
+
+ idx = srcu_read_lock(&pmus_srcu);
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ int ret = pmu->event_init(event);
+ if (!ret)
+			break;
+
+ if (ret != -ENOENT) {
+ pmu = ERR_PTR(ret);
+ break;
}
- pmu = &perf_ops_generic;
- break;
}
+ srcu_read_unlock(&pmus_srcu, idx);
return pmu;
}
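+
+/*
+ * Dispatch protocol: the first pmu whose event_init() returns 0 claims
+ * the event; -ENOENT means "not mine, keep looking"; any other error
+ * aborts the search and is propagated to the caller as an ERR_PTR.
+ */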
perf_overflow_handler_t overflow_handler,
gfp_t gfpflags)
{
- const struct pmu *pmu;
+ struct pmu *pmu;
struct perf_event *event;
struct hw_perf_event *hwc;
long err;
hwc->sample_period = 1;
hwc->last_period = hwc->sample_period;
- atomic64_set(&hwc->period_left, hwc->sample_period);
+ local64_set(&hwc->period_left, hwc->sample_period);
/*
* we currently do not support PERF_FORMAT_GROUP on inherited events
if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
goto done;
- switch (attr->type) {
- case PERF_TYPE_RAW:
- case PERF_TYPE_HARDWARE:
- case PERF_TYPE_HW_CACHE:
- pmu = hw_perf_event_init(event);
- break;
-
- case PERF_TYPE_SOFTWARE:
- pmu = sw_perf_event_init(event);
- break;
-
- case PERF_TYPE_TRACEPOINT:
- pmu = tp_perf_event_init(event);
- break;
-
- case PERF_TYPE_BREAKPOINT:
- pmu = bp_perf_event_init(event);
- break;
-
+ pmu = perf_init_event(event);
- default:
- break;
- }
done:
err = 0;
if (!pmu)
if (!event->parent) {
atomic_inc(&nr_events);
- if (event->attr.mmap)
+ if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+ err = get_callchain_buffers();
+ if (err) {
+ free_event(event);
+ return ERR_PTR(err);
+ }
+ }
}
return event;
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct perf_mmap_data *data = NULL, *old_data = NULL;
+ struct perf_buffer *buffer = NULL, *old_buffer = NULL;
int ret = -EINVAL;
if (!output_event)
if (output_event) {
/* get the buffer we want to redirect to */
- data = perf_mmap_data_get(output_event);
- if (!data)
+ buffer = perf_buffer_get(output_event);
+ if (!buffer)
goto unlock;
}
- old_data = event->data;
- rcu_assign_pointer(event->data, data);
+ old_buffer = event->buffer;
+ rcu_assign_pointer(event->buffer, buffer);
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
- if (old_data)
- perf_mmap_data_put(old_data);
+ if (old_buffer)
+ perf_buffer_put(old_buffer);
out:
return ret;
}
hwc->sample_period = sample_period;
hwc->last_period = sample_period;
- atomic64_set(&hwc->period_left, sample_period);
+ local64_set(&hwc->period_left, sample_period);
}
child_event->overflow_handler = parent_event->overflow_handler;
if (child_event->attr.inherit_stat)
perf_event_read_event(child_event, child);
- child_val = atomic64_read(&child_event->count);
+ child_val = perf_event_count(child_event);
/*
* Add back the child's count to the parent's count:
*/
- atomic64_add(child_val, &parent_event->count);
+ atomic64_add(child_val, &parent_event->child_count);
atomic64_add(child_event->total_time_enabled,
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
{
unsigned int cpu = (long)hcpu;
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
perf_event_init_cpu(cpu);
break;
+ case CPU_UP_CANCELED:
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
perf_event_exit_cpu(cpu);
break;
return NOTIFY_OK;
}
-/*
- * This has to have a higher priority than migration_notifier in sched.c.
- */
-static struct notifier_block __cpuinitdata perf_cpu_nb = {
- .notifier_call = perf_cpu_notify,
- .priority = 20,
-};
-
void __init perf_event_init(void)
{
perf_event_init_all_cpus();
- perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
- perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
- (void *)(long)smp_processor_id());
- register_cpu_notifier(&perf_cpu_nb);
+ init_srcu_struct(&pmus_srcu);
+ perf_pmu_register(&perf_swevent);
+ perf_pmu_register(&perf_cpu_clock);
+ perf_pmu_register(&perf_task_clock);
+ perf_tp_register();
+ perf_cpu_notifier(perf_cpu_notify);
}
static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,