git.karo-electronics.de Git - mv-sheeva.git/commitdiff
Merge branch 'perf/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic...
author    Ingo Molnar <mingo@elte.hu>
          Thu, 20 May 2010 12:38:55 +0000 (14:38 +0200)
committer Ingo Molnar <mingo@elte.hu>
          Thu, 20 May 2010 12:38:55 +0000 (14:38 +0200)
include/linux/ftrace_event.h
include/trace/ftrace.h
kernel/perf_event.c

index c8091001b9433b4dcff6315dc46540dceecc4277,39e71b0a3bfdb0aeaf9b546c111f19726a954136..a9775dd7f7fe8412034657aa7f0531d3ac093865
@@@ -58,6 -58,7 +58,7 @@@ struct trace_iterator 
        /* The below is zeroed out in pipe_read */
        struct trace_seq        seq;
        struct trace_entry      *ent;
+       unsigned long           lost_events;
        int                     leftover;
        int                     cpu;
        u64                     ts;
@@@ -132,7 -133,6 +133,7 @@@ struct ftrace_event_call 
        void                    *data;
  
        int                     perf_refcount;
 +      void                    *perf_data;
        int                     (*perf_event_enable)(struct ftrace_event_call *);
        void                    (*perf_event_disable)(struct ftrace_event_call *);
  };
@@@ -191,7 -191,7 +192,7 @@@ struct perf_event
  
  DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
  
 -extern int perf_trace_enable(int event_id);
 +extern int perf_trace_enable(int event_id, void *data);
  extern void perf_trace_disable(int event_id);
  extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
                                     char *filter_str);
@@@ -202,12 -202,11 +203,12 @@@ perf_trace_buf_prepare(int size, unsign
  
  static inline void
  perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
 -                     u64 count, unsigned long irq_flags, struct pt_regs *regs)
 +                     u64 count, unsigned long irq_flags, struct pt_regs *regs,
 +                     void *event)
  {
        struct trace_entry *entry = raw_data;
  
 -      perf_tp_event(entry->type, addr, count, raw_data, size, regs);
 +      perf_tp_event(entry->type, addr, count, raw_data, size, regs, event);
        perf_swevent_put_recursion_context(rctx);
        local_irq_restore(irq_flags);
  }
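
The ftrace_event.h hunks above thread a per-call perf event pointer through the tracepoint submit path: perf_trace_enable() now receives the perf event as data, the ftrace_event_call caches it in ->perf_data, and perf_trace_buf_submit() forwards it into perf_tp_event() so a sample can be delivered to that one event instead of being fanned out to every software event on the CPU. A minimal sketch of the resulting call, with a hypothetical probe function standing in for the generated glue (the names below are illustrative, not part of the patch):

#include <linux/ftrace_event.h>

/*
 * Sketch only: how the event pointer added in this merge flows from the
 * tracepoint glue into perf_tp_event().  my_probe_submit() and the
 * my_event_call argument are illustrative names, not from the patch.
 */
static void my_probe_submit(struct ftrace_event_call *my_event_call,
			    void *raw_data, int size, int rctx,
			    u64 addr, u64 count,
			    unsigned long irq_flags, struct pt_regs *regs)
{
	/* ->perf_data was filled in by perf_trace_enable(event_id, event) */
	perf_trace_buf_submit(raw_data, size, rctx, addr, count,
			      irq_flags, regs, my_event_call->perf_data);
}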
diff --combined include/trace/ftrace.h
index 0a29df092922123bfee483b3918ee2acd00d115b,16253db38d73274e329a20519023b342db7e7bd1..1016b2162935934a9207e42efe13d222ae321e11
   *
   *    field = (typeof(field))entry;
   *
-  *    p = get_cpu_var(ftrace_event_seq);
+  *    p = &get_cpu_var(ftrace_event_seq);
   *    trace_seq_init(p);
-  *    ret = trace_seq_printf(s, <TP_printk> "\n");
+  *    ret = trace_seq_printf(s, "%s: ", <call>);
+  *    if (ret)
+  *            ret = trace_seq_printf(s, <TP_printk> "\n");
   *    put_cpu();
   *    if (!ret)
   *            return TRACE_TYPE_PARTIAL_LINE;
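
The comment fragment above describes the generated stage-2 output callback: it grabs the per-CPU ftrace_event_seq, prints the event name followed by the TP_printk format, and returns TRACE_TYPE_PARTIAL_LINE if the sequence buffer filled up. A rough instantiation for a hypothetical tracepoint named my_event carrying a single pid field might read as follows; this is an illustrative sketch of the pseudo-code, not the literal macro expansion:

/* Hypothetical stage-1 record for "my_event"; an assumption for the sketch. */
struct ftrace_raw_my_event {
	struct trace_entry	ent;
	int			pid;
};

/*
 * Illustrative stage-2 output callback for the hypothetical "my_event"
 * tracepoint; the shape follows the commented pseudo-code above, the
 * names are assumptions.
 */
static enum print_line_t
ftrace_raw_output_my_event(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct ftrace_raw_my_event *field;
	struct trace_entry *entry = iter->ent;
	struct trace_seq *p;
	int ret;

	field = (typeof(field))entry;

	p = &get_cpu_var(ftrace_event_seq);
	trace_seq_init(p);
	ret = trace_seq_printf(s, "%s: ", "my_event");
	if (ret)
		ret = trace_seq_printf(s, "pid=%d\n", field->pid);
	put_cpu();
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}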
@@@ -450,38 -452,38 +452,38 @@@ perf_trace_disable_##name(struct ftrace
   *
   * static void ftrace_raw_event_<call>(proto)
   * {
+  *    struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
   *    struct ring_buffer_event *event;
   *    struct ftrace_raw_<call> *entry; <-- defined in stage 1
   *    struct ring_buffer *buffer;
   *    unsigned long irq_flags;
+  *    int __data_size;
   *    int pc;
   *
   *    local_save_flags(irq_flags);
   *    pc = preempt_count();
   *
+  *    __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+  *
   *    event = trace_current_buffer_lock_reserve(&buffer,
   *                              event_<call>.id,
-  *                              sizeof(struct ftrace_raw_<call>),
+  *                              sizeof(*entry) + __data_size,
   *                              irq_flags, pc);
   *    if (!event)
   *            return;
   *    entry   = ring_buffer_event_data(event);
   *
-  *    <assign>;  <-- Here we assign the entries by the __field and
-  *                    __array macros.
+  *    { <assign>; }  <-- Here we assign the entries by the __field and
+  *                       __array macros.
   *
-  *    trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
+  *    if (!filter_current_check_discard(buffer, event_call, entry, event))
+  *            trace_current_buffer_unlock_commit(buffer,
+  *                                               event, irq_flags, pc);
   * }
   *
   * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
   * {
-  *    int ret;
-  *
-  *    ret = register_trace_<call>(ftrace_raw_event_<call>);
-  *    if (!ret)
-  *            pr_info("event trace: Could not activate trace point "
-  *                    "probe to <call>");
-  *    return ret;
+  *    return register_trace_<call>(ftrace_raw_event_<call>);
   * }
   *
   * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
   *    .trace                  = ftrace_raw_output_<call>, <-- stage 2
   * };
   *
+  * static const char print_fmt_<call>[] = <TP_printk>;
+  *
   * static struct ftrace_event_call __used
   * __attribute__((__aligned__(4)))
   * __attribute__((section("_ftrace_events"))) event_<call> = {
   *    .raw_init               = trace_event_raw_init,
   *    .regfunc                = ftrace_reg_event_<call>,
   *    .unregfunc              = ftrace_unreg_event_<call>,
+  *    .print_fmt              = print_fmt_<call>,
+  *    .define_fields          = ftrace_define_fields_<call>,
   * }
   *
   */
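
Putting the stage-3 comment into concrete terms, the generated probe for the same hypothetical my_event tracepoint (reusing the ftrace_raw_my_event record from the earlier sketch) would look roughly like the code below. It is an illustration of the commented pseudo-code, not the literal TRACE_EVENT() expansion; every my_event identifier is an assumption.

/*
 * Illustrative stage-3 probe for the hypothetical "my_event" tracepoint.
 * event_my_event, ftrace_get_offsets_my_event and the offsets struct are
 * the names the macros would generate for such an event; assumptions here.
 */
static void ftrace_raw_event_my_event(int pid)
{
	struct ftrace_data_offsets_my_event __maybe_unused __data_offsets;
	struct ring_buffer_event *event;
	struct ftrace_raw_my_event *entry;
	struct ring_buffer *buffer;
	unsigned long irq_flags;
	int __data_size;
	int pc;

	local_save_flags(irq_flags);
	pc = preempt_count();

	/* dynamic arrays/strings contribute to __data_size; none here */
	__data_size = ftrace_get_offsets_my_event(&__data_offsets, pid);

	event = trace_current_buffer_lock_reserve(&buffer,
						  event_my_event.id,
						  sizeof(*entry) + __data_size,
						  irq_flags, pc);
	if (!event)
		return;
	entry = ring_buffer_event_data(event);

	entry->pid = pid;	/* the <assign> block for __field(int, pid) */

	if (!filter_current_check_discard(buffer, &event_my_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event,
						   irq_flags, pc);
}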
@@@ -569,7 -575,6 +575,6 @@@ ftrace_raw_event_id_##call(struct ftrac
                return;                                                 \
        entry   = ring_buffer_event_data(event);                        \
                                                                        \
-                                                                       \
        tstruct                                                         \
                                                                        \
        { assign; }                                                     \
@@@ -785,8 -790,7 +790,8 @@@ perf_trace_templ_##call(struct ftrace_e
        { assign; }                                                     \
                                                                        \
        perf_trace_buf_submit(entry, __entry_size, rctx, __addr,        \
 -                             __count, irq_flags, __regs);             \
 +                             __count, irq_flags, __regs,              \
 +                            event_call->perf_data);                   \
  }
  
  #undef DEFINE_EVENT
diff --combined kernel/perf_event.c
index 7e3bcf1a29f074738cdb971f4d15acbba5cc2d96,511677bc1c6a7c08d5df57ccd35f5b213b894204..2a060be3b07fb98075cf995ce0c292af2c924b83
@@@ -2320,19 -2320,6 +2320,19 @@@ perf_mmap_to_page(struct perf_mmap_dat
        return virt_to_page(data->data_pages[pgoff - 1]);
  }
  
 +static void *perf_mmap_alloc_page(int cpu)
 +{
 +      struct page *page;
 +      int node;
 +
 +      node = (cpu == -1) ? cpu : cpu_to_node(cpu);
 +      page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
 +      if (!page)
 +              return NULL;
 +
 +      return page_address(page);
 +}
 +
  static struct perf_mmap_data *
  perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
  {
        if (!data)
                goto fail;
  
 -      data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
 +      data->user_page = perf_mmap_alloc_page(event->cpu);
        if (!data->user_page)
                goto fail_user_page;
  
        for (i = 0; i < nr_pages; i++) {
 -              data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
 +              data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
                if (!data->data_pages[i])
                        goto fail_data_pages;
        }
@@@ -2519,6 -2506,8 +2519,6 @@@ perf_mmap_data_init(struct perf_event *
  {
        long max_size = perf_data_size(data);
  
 -      atomic_set(&data->lock, -1);
 -
        if (event->attr.watermark) {
                data->watermark = min_t(long, max_size,
                                        event->attr.wakeup_watermark);
@@@ -2591,14 -2580,6 +2591,14 @@@ static int perf_mmap(struct file *file
        long user_extra, extra;
        int ret = 0;
  
 +      /*
 +       * Don't allow mmap() of inherited per-task counters. This would
 +       * create a performance issue due to all children writing to the
 +       * same buffer.
 +       */
 +      if (event->cpu == -1 && event->attr.inherit)
 +              return -EINVAL;
 +
        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;
  
@@@ -2904,57 -2885,82 +2904,57 @@@ static void perf_output_wakeup(struct p
  }
  
  /*
 - * Curious locking construct.
 - *
   * We need to ensure a later event_id doesn't publish a head when a former
 - * event_id isn't done writing. However since we need to deal with NMIs we
 + * event isn't done writing. However since we need to deal with NMIs we
   * cannot fully serialize things.
   *
 - * What we do is serialize between CPUs so we only have to deal with NMI
 - * nesting on a single CPU.
 - *
   * We only publish the head (and generate a wakeup) when the outer-most
 - * event_id completes.
 + * event completes.
   */
 -static void perf_output_lock(struct perf_output_handle *handle)
 +static void perf_output_get_handle(struct perf_output_handle *handle)
  {
        struct perf_mmap_data *data = handle->data;
 -      int cur, cpu = get_cpu();
 -
 -      handle->locked = 0;
 -
 -      for (;;) {
 -              cur = atomic_cmpxchg(&data->lock, -1, cpu);
 -              if (cur == -1) {
 -                      handle->locked = 1;
 -                      break;
 -              }
 -              if (cur == cpu)
 -                      break;
  
 -              cpu_relax();
 -      }
 +      preempt_disable();
 +      local_inc(&data->nest);
 +      handle->wakeup = local_read(&data->wakeup);
  }
  
 -static void perf_output_unlock(struct perf_output_handle *handle)
 +static void perf_output_put_handle(struct perf_output_handle *handle)
  {
        struct perf_mmap_data *data = handle->data;
        unsigned long head;
 -      int cpu;
 -
 -      data->done_head = data->head;
 -
 -      if (!handle->locked)
 -              goto out;
  
  again:
 -      /*
 -       * The xchg implies a full barrier that ensures all writes are done
 -       * before we publish the new head, matched by a rmb() in userspace when
 -       * reading this position.
 -       */
 -      while ((head = atomic_long_xchg(&data->done_head, 0)))
 -              data->user_page->data_head = head;
 +      head = local_read(&data->head);
  
        /*
 -       * NMI can happen here, which means we can miss a done_head update.
 +       * IRQ/NMI can happen here, which means we can miss a head update.
         */
  
 -      cpu = atomic_xchg(&data->lock, -1);
 -      WARN_ON_ONCE(cpu != smp_processor_id());
 +      if (!local_dec_and_test(&data->nest))
 +              return;
  
        /*
 -       * Therefore we have to validate we did not indeed do so.
 +       * Publish the known good head. Rely on the full barrier implied
 +       * by atomic_dec_and_test() order the data->head read and this
 +       * write.
         */
 -      if (unlikely(atomic_long_read(&data->done_head))) {
 -              /*
 -               * Since we had it locked, we can lock it again.
 -               */
 -              while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
 -                      cpu_relax();
 +      data->user_page->data_head = head;
  
 +      /*
 +       * Now check if we missed an update, rely on the (compiler)
 +       * barrier in atomic_dec_and_test() to re-read data->head.
 +       */
 +      if (unlikely(head != local_read(&data->head))) {
 +              local_inc(&data->nest);
                goto again;
        }
  
 -      if (atomic_xchg(&data->wakeup, 0))
 +      if (handle->wakeup != local_read(&data->wakeup))
                perf_output_wakeup(handle);
 -out:
 -      put_cpu();
 +
 +      preempt_enable();
  }
  
  void perf_output_copy(struct perf_output_handle *handle,
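
The rewrite above replaces the old cross-CPU spinning lock with a per-buffer nesting counter: writers on a CPU only ever nest (task context, then IRQ, then NMI), so the outermost writer is the one that publishes data_head, and it re-checks head after publishing in case a nested writer advanced it between the read and the decrement. A condensed sketch of that publish step, using the same perf_mmap_data fields as the hunk (the helper name is illustrative, not a kernel API):

/*
 * Condensed sketch of the nest/publish protocol used by
 * perf_output_put_handle() above; not a kernel API.
 */
static void sketch_publish_head(struct perf_mmap_data *data)
{
	unsigned long head;

again:
	head = local_read(&data->head);

	/* We are nested inside another writer; the outermost one publishes. */
	if (!local_dec_and_test(&data->nest))
		return;

	/* Outermost writer: expose the head we observed to userspace. */
	data->user_page->data_head = head;

	/* A nested writer may have advanced head after our read above. */
	if (head != local_read(&data->head)) {
		local_inc(&data->nest);
		goto again;
	}
}

Because the buffer is only written from its own CPU, disabling preemption in perf_output_get_handle() is enough to keep the nest count consistent; no cross-CPU serialization is needed.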
@@@ -3030,13 -3036,13 +3030,13 @@@ int perf_output_begin(struct perf_outpu
        handle->sample  = sample;
  
        if (!data->nr_pages)
 -              goto fail;
 +              goto out;
  
 -      have_lost = atomic_read(&data->lost);
 +      have_lost = local_read(&data->lost);
        if (have_lost)
                size += sizeof(lost_event);
  
 -      perf_output_lock(handle);
 +      perf_output_get_handle(handle);
  
        do {
                /*
                 */
                tail = ACCESS_ONCE(data->user_page->data_tail);
                smp_rmb();
 -              offset = head = atomic_long_read(&data->head);
 +              offset = head = local_read(&data->head);
                head += size;
                if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
 -      } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 +      } while (local_cmpxchg(&data->head, offset, head) != offset);
  
        handle->offset  = offset;
        handle->head    = head;
  
        if (head - tail > data->watermark)
 -              atomic_set(&data->wakeup, 1);
 +              local_inc(&data->wakeup);
  
        if (have_lost) {
                lost_event.header.type = PERF_RECORD_LOST;
                lost_event.header.misc = 0;
                lost_event.header.size = sizeof(lost_event);
                lost_event.id          = event->id;
 -              lost_event.lost        = atomic_xchg(&data->lost, 0);
 +              lost_event.lost        = local_xchg(&data->lost, 0);
  
                perf_output_put(handle, lost_event);
        }
        return 0;
  
  fail:
 -      atomic_inc(&data->lost);
 -      perf_output_unlock(handle);
 +      local_inc(&data->lost);
 +      perf_output_put_handle(handle);
  out:
        rcu_read_unlock();
  
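
Reservation in perf_output_begin() follows from the same single-CPU assumption: head only needs local_t atomicity against local interrupts, while data_tail is re-read on every attempt because userspace advances it concurrently. A stripped-down sketch of the reservation loop in the hunk above (the helper and its error convention are illustrative):

/*
 * Stripped-down sketch of the space reservation loop in
 * perf_output_begin(); field names mirror the hunk, the helper itself
 * is illustrative.
 */
static int sketch_reserve(struct perf_mmap_data *data, unsigned int size,
			  unsigned long *out_offset)
{
	unsigned long tail, offset, head;

	do {
		/* data_tail is advanced by userspace as it consumes records */
		tail = ACCESS_ONCE(data->user_page->data_tail);
		smp_rmb();
		offset = head = local_read(&data->head);
		head += size;
		if (!perf_output_space(data, tail, offset, head))
			return -ENOSPC;	/* caller accounts a lost record */
	} while (local_cmpxchg(&data->head, offset, head) != offset);

	*out_offset = offset;
	return 0;
}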
@@@ -3087,14 -3093,14 +3087,14 @@@ void perf_output_end(struct perf_output
        int wakeup_events = event->attr.wakeup_events;
  
        if (handle->sample && wakeup_events) {
 -              int events = atomic_inc_return(&data->events);
 +              int events = local_inc_return(&data->events);
                if (events >= wakeup_events) {
 -                      atomic_sub(wakeup_events, &data->events);
 -                      atomic_set(&data->wakeup, 1);
 +                      local_sub(wakeup_events, &data->events);
 +                      local_inc(&data->wakeup);
                }
        }
  
 -      perf_output_unlock(handle);
 +      perf_output_put_handle(handle);
        rcu_read_unlock();
  }
  
@@@ -3430,13 -3436,22 +3430,13 @@@ static void perf_event_task_output(stru
  {
        struct perf_output_handle handle;
        struct task_struct *task = task_event->task;
 -      unsigned long flags;
        int size, ret;
  
 -      /*
 -       * If this CPU attempts to acquire an rq lock held by a CPU spinning
 -       * in perf_output_lock() from interrupt context, it's game over.
 -       */
 -      local_irq_save(flags);
 -
        size  = task_event->event_id.header.size;
        ret = perf_output_begin(&handle, event, size, 0, 0);
  
 -      if (ret) {
 -              local_irq_restore(flags);
 +      if (ret)
                return;
 -      }
  
        task_event->event_id.pid = perf_event_pid(event, task);
        task_event->event_id.ppid = perf_event_pid(event, current);
        perf_output_put(&handle, task_event->event_id);
  
        perf_output_end(&handle);
 -      local_irq_restore(flags);
  }
  
  static int perf_event_task_match(struct perf_event *event)
@@@ -4050,19 -4066,46 +4050,46 @@@ static inline u64 swevent_hash(u64 type
        return hash_64(val, SWEVENT_HLIST_BITS);
  }
  
- static struct hlist_head *
- find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+ static inline struct hlist_head *
+ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
  {
-       u64 hash;
-       struct swevent_hlist *hlist;
+       u64 hash = swevent_hash(type, event_id);
+       return &hlist->heads[hash];
+ }
  
-       hash = swevent_hash(type, event_id);
+ /* For the read side: events when they trigger */
+ static inline struct hlist_head *
+ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+ {
+       struct swevent_hlist *hlist;
  
        hlist = rcu_dereference(ctx->swevent_hlist);
        if (!hlist)
                return NULL;
  
-       return &hlist->heads[hash];
+       return __find_swevent_head(hlist, type, event_id);
+ }
+ /* For the event head insertion and removal in the hlist */
+ static inline struct hlist_head *
+ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+ {
+       struct swevent_hlist *hlist;
+       u32 event_id = event->attr.config;
+       u64 type = event->attr.type;
+       /*
+        * Event scheduling is always serialized against hlist allocation
+        * and release. Which makes the protected version suitable here.
+        * The context lock guarantees that.
+        */
+       hlist = rcu_dereference_protected(ctx->swevent_hlist,
+                                         lockdep_is_held(&event->ctx->lock));
+       if (!hlist)
+               return NULL;
+       return __find_swevent_head(hlist, type, event_id);
  }
  
  static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
  
        rcu_read_lock();
  
-       head = find_swevent_head(cpuctx, type, event_id);
+       head = find_swevent_head_rcu(cpuctx, type, event_id);
  
        if (!head)
                goto end;
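
The hlist lookup is split by calling context: the firing path runs under rcu_read_lock() and uses the plain RCU dereference, while insertion and removal happen with the context lock held, so the lockdep-annotated rcu_dereference_protected() variant applies there. A usage sketch under those assumptions (both callers below are illustrative, not code from the patch):

/* Firing path: protected by rcu_read_lock() only. */
static void sketch_swevent_fire(struct perf_cpu_context *cpuctx,
				u64 type, u32 event_id)
{
	struct hlist_head *head;

	rcu_read_lock();
	head = find_swevent_head_rcu(cpuctx, type, event_id);
	if (head) {
		/* iterate the events hashed on this head and add samples */
	}
	rcu_read_unlock();
}

/* Scheduling path: ctx->lock held, so the protected lookup is legitimate. */
static int sketch_swevent_schedule_in(struct perf_cpu_context *cpuctx,
				      struct perf_event *event)
{
	struct hlist_head *head = find_swevent_head(cpuctx, event);

	if (WARN_ON_ONCE(!head))
		return -EINVAL;

	hlist_add_head_rcu(&event->hlist_entry, head);
	return 0;
}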
@@@ -4162,7 -4205,7 +4189,7 @@@ static int perf_swevent_enable(struct p
                perf_swevent_set_period(event);
        }
  
-       head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+       head = find_swevent_head(cpuctx, event);
        if (WARN_ON_ONCE(!head))
                return -EINVAL;
  
@@@ -4350,6 -4393,14 +4377,14 @@@ static const struct pmu perf_ops_task_c
        .read           = task_clock_perf_event_read,
  };
  
+ /* Deref the hlist from the update side */
+ static inline struct swevent_hlist *
+ swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+ {
+       return rcu_dereference_protected(cpuctx->swevent_hlist,
+                                        lockdep_is_held(&cpuctx->hlist_mutex));
+ }
  static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
  {
        struct swevent_hlist *hlist;
  
  static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
  {
-       struct swevent_hlist *hlist;
+       struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
  
-       if (!cpuctx->swevent_hlist)
+       if (!hlist)
                return;
  
-       hlist = cpuctx->swevent_hlist;
        rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
        call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
  }
@@@ -4402,7 -4452,7 +4436,7 @@@ static int swevent_hlist_get_cpu(struc
  
        mutex_lock(&cpuctx->hlist_mutex);
  
-       if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+       if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
                struct swevent_hlist *hlist;
  
                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@@ -4452,9 -4502,8 +4486,9 @@@ static int swevent_hlist_get(struct per
  #ifdef CONFIG_EVENT_TRACING
  
  void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 -                 int entry_size, struct pt_regs *regs)
 +                 int entry_size, struct pt_regs *regs, void *event)
  {
 +      const int type = PERF_TYPE_TRACEPOINT;
        struct perf_sample_data data;
        struct perf_raw_record raw = {
                .size = entry_size,
        perf_sample_data_init(&data, addr);
        data.raw = &raw;
  
 -      /* Trace events already protected against recursion */
 -      do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 -                       &data, regs);
 +      if (!event) {
 +              do_perf_sw_event(type, event_id, count, 1, &data, regs);
 +              return;
 +      }
 +
 +      if (perf_swevent_match(event, type, event_id, &data, regs))
 +              perf_swevent_add(event, count, 1, &data, regs);
  }
  EXPORT_SYMBOL_GPL(perf_tp_event);
  
@@@ -4503,7 -4548,7 +4537,7 @@@ static const struct pmu *tp_perf_event_
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
  
 -      if (perf_trace_enable(event->attr.config))
 +      if (perf_trace_enable(event->attr.config, event))
                return NULL;
  
        event->destroy = tp_perf_event_destroy;
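
Taken together, the merge lets a tracepoint sample reach its perf event without a hash lookup: tp_perf_event_init() now hands the event itself to perf_trace_enable(), the tracing side remembers it in the call's perf_data, and perf_tp_event() uses it directly when the probe fires. The sketch below summarises that flow; the body of perf_trace_enable() lives in kernel/trace code not shown in this diff, so the lookup helper and the perf_data store are assumptions drawn from the header changes, not quoted code.

/* Hypothetical lookup of an ftrace_event_call by event id. */
static struct ftrace_event_call *sketch_find_event_call(int event_id);

/*
 * Summary sketch of the direct-delivery path added by this merge;
 * illustrative only.
 */
static int sketch_perf_trace_enable(int event_id, void *data)
{
	struct ftrace_event_call *call = sketch_find_event_call(event_id);

	if (!call)
		return -EINVAL;

	call->perf_data = data;		/* the perf_event being enabled */
	return call->perf_event_enable(call);
}

/*
 * When the tracepoint later fires:
 *   perf_trace_templ_<call>()
 *     -> perf_trace_buf_submit(entry, ..., event_call->perf_data)
 *       -> perf_tp_event(type, addr, count, raw, size, regs, event)
 *            event != NULL: match and add just that one event
 *            event == NULL: fall back to the do_perf_sw_event() fan-out
 */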