git.karo-electronics.de Git - mv-sheeva.git/commitdiff
Merge branch 'perf/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic...
author    Ingo Molnar <mingo@elte.hu>
          Thu, 20 May 2010 12:38:55 +0000 (14:38 +0200)
committer Ingo Molnar <mingo@elte.hu>
          Thu, 20 May 2010 12:38:55 +0000 (14:38 +0200)
include/linux/ftrace_event.h
include/trace/ftrace.h
kernel/perf_event.c

index c8091001b9433b4dcff6315dc46540dceecc4277,39e71b0a3bfdb0aeaf9b546c111f19726a954136..a9775dd7f7fe8412034657aa7f0531d3ac093865
@@@ -58,6 -58,7 +58,7 @@@ struct trace_iterator 
        /* The below is zeroed out in pipe_read */
        struct trace_seq        seq;
        struct trace_entry      *ent;
+       unsigned long           lost_events;
        int                     leftover;
        int                     cpu;
        u64                     ts;
@@@ -132,7 -133,6 +133,7 @@@ struct ftrace_event_call 
        void                    *data;
  
        int                     perf_refcount;
 +      void                    *perf_data;
        int                     (*perf_event_enable)(struct ftrace_event_call *);
        void                    (*perf_event_disable)(struct ftrace_event_call *);
  };
@@@ -191,7 -191,7 +192,7 @@@ struct perf_event
  
  DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
  
 -extern int perf_trace_enable(int event_id);
 +extern int perf_trace_enable(int event_id, void *data);
  extern void perf_trace_disable(int event_id);
  extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
                                     char *filter_str);
@@@ -202,12 -202,11 +203,12 @@@ perf_trace_buf_prepare(int size, unsign
  
  static inline void
  perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
 -                     u64 count, unsigned long irq_flags, struct pt_regs *regs)
 +                     u64 count, unsigned long irq_flags, struct pt_regs *regs,
 +                     void *event)
  {
        struct trace_entry *entry = raw_data;
  
 -      perf_tp_event(entry->type, addr, count, raw_data, size, regs);
 +      perf_tp_event(entry->type, addr, count, raw_data, size, regs, event);
        perf_swevent_put_recursion_context(rctx);
        local_irq_restore(irq_flags);
  }
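
The ftrace_event.h hunks above thread a per-call perf event pointer through the tracepoint submit path: perf_trace_enable() now receives the perf event as data, the ftrace_event_call caches it in ->perf_data, and perf_trace_buf_submit() forwards it into perf_tp_event() so a sample can be delivered to that one event instead of being fanned out to every software event on the CPU. A minimal sketch of the resulting call, with a hypothetical probe function standing in for the generated glue (the names below are illustrative, not part of the patch):

#include <linux/ftrace_event.h>

/*
 * Sketch only: how the event pointer added in this merge flows from the
 * tracepoint glue into perf_tp_event().  my_probe_submit() and the
 * my_event_call argument are illustrative names, not from the patch.
 */
static void my_probe_submit(struct ftrace_event_call *my_event_call,
			    void *raw_data, int size, int rctx,
			    u64 addr, u64 count,
			    unsigned long irq_flags, struct pt_regs *regs)
{
	/* ->perf_data was filled in by perf_trace_enable(event_id, event) */
	perf_trace_buf_submit(raw_data, size, rctx, addr, count,
			      irq_flags, regs, my_event_call->perf_data);
}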
diff --combined include/trace/ftrace.h
index 0a29df092922123bfee483b3918ee2acd00d115b,16253db38d73274e329a20519023b342db7e7bd1..1016b2162935934a9207e42efe13d222ae321e11
   *
   *    field = (typeof(field))entry;
   *
-  *    p = get_cpu_var(ftrace_event_seq);
+  *    p = &get_cpu_var(ftrace_event_seq);
   *    trace_seq_init(p);
-  *    ret = trace_seq_printf(s, <TP_printk> "\n");
+  *    ret = trace_seq_printf(s, "%s: ", <call>);
+  *    if (ret)
+  *            ret = trace_seq_printf(s, <TP_printk> "\n");
   *    put_cpu();
   *    if (!ret)
   *            return TRACE_TYPE_PARTIAL_LINE;
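
The comment fragment above describes the generated stage-2 output callback: it grabs the per-CPU ftrace_event_seq, prints the event name followed by the TP_printk format, and returns TRACE_TYPE_PARTIAL_LINE if the sequence buffer filled up. A rough instantiation for a hypothetical tracepoint named my_event carrying a single pid field might read as follows; this is an illustrative sketch of the pseudo-code, not the literal macro expansion:

/* Hypothetical stage-1 record for "my_event"; an assumption for the sketch. */
struct ftrace_raw_my_event {
	struct trace_entry	ent;
	int			pid;
};

/*
 * Illustrative stage-2 output callback for the hypothetical "my_event"
 * tracepoint; the shape follows the commented pseudo-code above, the
 * names are assumptions.
 */
static enum print_line_t
ftrace_raw_output_my_event(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;
	struct ftrace_raw_my_event *field;
	struct trace_entry *entry = iter->ent;
	struct trace_seq *p;
	int ret;

	field = (typeof(field))entry;

	p = &get_cpu_var(ftrace_event_seq);
	trace_seq_init(p);
	ret = trace_seq_printf(s, "%s: ", "my_event");
	if (ret)
		ret = trace_seq_printf(s, "pid=%d\n", field->pid);
	put_cpu();
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}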
@@@ -450,38 -452,38 +452,38 @@@ perf_trace_disable_##name(struct ftrace
   *
   * static void ftrace_raw_event_<call>(proto)
   * {
+  *    struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
   *    struct ring_buffer_event *event;
   *    struct ftrace_raw_<call> *entry; <-- defined in stage 1
   *    struct ring_buffer *buffer;
   *    unsigned long irq_flags;
+  *    int __data_size;
   *    int pc;
   *
   *    local_save_flags(irq_flags);
   *    pc = preempt_count();
   *
+  *    __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+  *
   *    event = trace_current_buffer_lock_reserve(&buffer,
   *                              event_<call>.id,
-  *                              sizeof(struct ftrace_raw_<call>),
+  *                              sizeof(*entry) + __data_size,
   *                              irq_flags, pc);
   *    if (!event)
   *            return;
   *    entry   = ring_buffer_event_data(event);
   *
-  *    <assign>;  <-- Here we assign the entries by the __field and
-  *                    __array macros.
+  *    { <assign>; }  <-- Here we assign the entries by the __field and
+  *                       __array macros.
   *
-  *    trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
+  *    if (!filter_current_check_discard(buffer, event_call, entry, event))
+  *            trace_current_buffer_unlock_commit(buffer,
+  *                                               event, irq_flags, pc);
   * }
   *
   * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
   * {
-  *    int ret;
-  *
-  *    ret = register_trace_<call>(ftrace_raw_event_<call>);
-  *    if (!ret)
-  *            pr_info("event trace: Could not activate trace point "
-  *                    "probe to <call>");
-  *    return ret;
+  *    return register_trace_<call>(ftrace_raw_event_<call>);
   * }
   *
   * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
   *    .trace                  = ftrace_raw_output_<call>, <-- stage 2
   * };
   *
+  * static const char print_fmt_<call>[] = <TP_printk>;
+  *
   * static struct ftrace_event_call __used
   * __attribute__((__aligned__(4)))
   * __attribute__((section("_ftrace_events"))) event_<call> = {
   *    .raw_init               = trace_event_raw_init,
   *    .regfunc                = ftrace_reg_event_<call>,
   *    .unregfunc              = ftrace_unreg_event_<call>,
+  *    .print_fmt              = print_fmt_<call>,
+  *    .define_fields          = ftrace_define_fields_<call>,
   * }
   *
   */
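
Putting the stage-3 comment into concrete terms, the generated probe for the same hypothetical my_event tracepoint (reusing the ftrace_raw_my_event record from the earlier sketch) would look roughly like the code below. It is an illustration of the commented pseudo-code, not the literal TRACE_EVENT() expansion; every my_event identifier is an assumption.

/*
 * Illustrative stage-3 probe for the hypothetical "my_event" tracepoint.
 * event_my_event, ftrace_get_offsets_my_event and the offsets struct are
 * the names the macros would generate for such an event; assumptions here.
 */
static void ftrace_raw_event_my_event(int pid)
{
	struct ftrace_data_offsets_my_event __maybe_unused __data_offsets;
	struct ring_buffer_event *event;
	struct ftrace_raw_my_event *entry;
	struct ring_buffer *buffer;
	unsigned long irq_flags;
	int __data_size;
	int pc;

	local_save_flags(irq_flags);
	pc = preempt_count();

	/* dynamic arrays/strings contribute to __data_size; none here */
	__data_size = ftrace_get_offsets_my_event(&__data_offsets, pid);

	event = trace_current_buffer_lock_reserve(&buffer,
						  event_my_event.id,
						  sizeof(*entry) + __data_size,
						  irq_flags, pc);
	if (!event)
		return;
	entry = ring_buffer_event_data(event);

	entry->pid = pid;	/* the <assign> block for __field(int, pid) */

	if (!filter_current_check_discard(buffer, &event_my_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event,
						   irq_flags, pc);
}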
@@@ -569,7 -575,6 +575,6 @@@ ftrace_raw_event_id_##call(struct ftrac
                return;                                                 \
        entry   = ring_buffer_event_data(event);                        \
                                                                        \
-                                                                       \
        tstruct                                                         \
                                                                        \
        { assign; }                                                     \
@@@ -785,8 -790,7 +790,8 @@@ perf_trace_templ_##call(struct ftrace_e
        { assign; }                                                     \
                                                                        \
        perf_trace_buf_submit(entry, __entry_size, rctx, __addr,        \
 -                             __count, irq_flags, __regs);             \
 +                             __count, irq_flags, __regs,              \
 +                            event_call->perf_data);                   \
  }
  
  #undef DEFINE_EVENT
diff --combined kernel/perf_event.c
index 7e3bcf1a29f074738cdb971f4d15acbba5cc2d96,511677bc1c6a7c08d5df57ccd35f5b213b894204..2a060be3b07fb98075cf995ce0c292af2c924b83
@@@ -2320,19 -2320,6 +2320,19 @@@ perf_mmap_to_page(struct perf_mmap_dat
        return virt_to_page(data->data_pages[pgoff - 1]);
  }
  
 +static void *perf_mmap_alloc_page(int cpu)
 +{
 +      struct page *page;
 +      int node;
 +
 +      node = (cpu == -1) ? cpu : cpu_to_node(cpu);
 +      page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
 +      if (!page)
 +              return NULL;
 +
 +      return page_address(page);
 +}
 +
  static struct perf_mmap_data *
  perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
  {
        if (!data)
                goto fail;
  
 -      data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
 +      data->user_page = perf_mmap_alloc_page(event->cpu);
        if (!data->user_page)
                goto fail_user_page;
  
        for (i = 0; i < nr_pages; i++) {
 -              data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
 +              data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
                if (!data->data_pages[i])
                        goto fail_data_pages;
        }
@@@ -2519,6 -2506,8 +2519,6 @@@ perf_mmap_data_init(struct perf_event *
  {
        long max_size = perf_data_size(data);
  
 -      atomic_set(&data->lock, -1);
 -
        if (event->attr.watermark) {
                data->watermark = min_t(long, max_size,
                                        event->attr.wakeup_watermark);
@@@ -2591,14 -2580,6 +2591,14 @@@ static int perf_mmap(struct file *file
        long user_extra, extra;
        int ret = 0;
  
 +      /*
 +       * Don't allow mmap() of inherited per-task counters. This would
 +       * create a performance issue due to all children writing to the
 +       * same buffer.
 +       */
 +      if (event->cpu == -1 && event->attr.inherit)
 +              return -EINVAL;
 +
        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;
  
@@@ -2904,57 -2885,82 +2904,57 @@@ static void perf_output_wakeup(struct p
  }
  
  /*
 - * Curious locking construct.
 - *
   * We need to ensure a later event_id doesn't publish a head when a former
 - * event_id isn't done writing. However since we need to deal with NMIs we
 + * event isn't done writing. However since we need to deal with NMIs we
   * cannot fully serialize things.
   *
 - * What we do is serialize between CPUs so we only have to deal with NMI
 - * nesting on a single CPU.
 - *
   * We only publish the head (and generate a wakeup) when the outer-most
 - * event_id completes.
 + * event completes.
   */
 -static void perf_output_lock(struct perf_output_handle *handle)
 +static void perf_output_get_handle(struct perf_output_handle *handle)
  {
        struct perf_mmap_data *data = handle->data;
 -      int cur, cpu = get_cpu();
 -
 -      handle->locked = 0;
 -
 -      for (;;) {
 -              cur = atomic_cmpxchg(&data->lock, -1, cpu);
 -              if (cur == -1) {
 -                      handle->locked = 1;
 -                      break;
 -              }
 -              if (cur == cpu)
 -                      break;
  
 -              cpu_relax();
 -      }
 +      preempt_disable();
 +      local_inc(&data->nest);
 +      handle->wakeup = local_read(&data->wakeup);
  }
  
 -static void perf_output_unlock(struct perf_output_handle *handle)
 +static void perf_output_put_handle(struct perf_output_handle *handle)
  {
        struct perf_mmap_data *data = handle->data;
        unsigned long head;
 -      int cpu;
 -
 -      data->done_head = data->head;
 -
 -      if (!handle->locked)
 -              goto out;
  
  again:
 -      /*
 -       * The xchg implies a full barrier that ensures all writes are done
 -       * before we publish the new head, matched by a rmb() in userspace when
 -       * reading this position.
 -       */
 -      while ((head = atomic_long_xchg(&data->done_head, 0)))
 -              data->user_page->data_head = head;
 +      head = local_read(&data->head);
  
        /*
 -       * NMI can happen here, which means we can miss a done_head update.
 +       * IRQ/NMI can happen here, which means we can miss a head update.
         */
  
 -      cpu = atomic_xchg(&data->lock, -1);
 -      WARN_ON_ONCE(cpu != smp_processor_id());
 +      if (!local_dec_and_test(&data->nest))
 +              return;
  
        /*
 -       * Therefore we have to validate we did not indeed do so.
 +       * Publish the known good head. Rely on the full barrier implied
 +       * by atomic_dec_and_test() order the data->head read and this
 +       * write.
         */
 -      if (unlikely(atomic_long_read(&data->done_head))) {
 -              /*
 -               * Since we had it locked, we can lock it again.
 -               */
 -              while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
 -                      cpu_relax();
 +      data->user_page->data_head = head;
  
 +      /*
 +       * Now check if we missed an update, rely on the (compiler)
 +       * barrier in atomic_dec_and_test() to re-read data->head.
 +       */
 +      if (unlikely(head != local_read(&data->head))) {
 +              local_inc(&data->nest);
                goto again;
        }
  
 -      if (atomic_xchg(&data->wakeup, 0))
 +      if (handle->wakeup != local_read(&data->wakeup))
                perf_output_wakeup(handle);
 -out:
 -      put_cpu();
 +
 +      preempt_enable();
  }
  
  void perf_output_copy(struct perf_output_handle *handle,
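
The rewrite above replaces the old cross-CPU spinning lock with a per-buffer nesting counter: writers on a CPU only ever nest (task context, then IRQ, then NMI), so the outermost writer is the one that publishes data_head, and it re-checks head after publishing in case a nested writer advanced it between the read and the decrement. A condensed sketch of that publish step, using the same perf_mmap_data fields as the hunk (the helper name is illustrative, not a kernel API):

/*
 * Condensed sketch of the nest/publish protocol used by
 * perf_output_put_handle() above; not a kernel API.
 */
static void sketch_publish_head(struct perf_mmap_data *data)
{
	unsigned long head;

again:
	head = local_read(&data->head);

	/* We are nested inside another writer; the outermost one publishes. */
	if (!local_dec_and_test(&data->nest))
		return;

	/* Outermost writer: expose the head we observed to userspace. */
	data->user_page->data_head = head;

	/* A nested writer may have advanced head after our read above. */
	if (head != local_read(&data->head)) {
		local_inc(&data->nest);
		goto again;
	}
}

Because the buffer is only written from its own CPU, disabling preemption in perf_output_get_handle() is enough to keep the nest count consistent; no cross-CPU serialization is needed.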
@@@ -3030,13 -3036,13 +3030,13 @@@ int perf_output_begin(struct perf_outpu
        handle->sample  = sample;
  
        if (!data->nr_pages)
 -              goto fail;
 +              goto out;
  
 -      have_lost = atomic_read(&data->lost);
 +      have_lost = local_read(&data->lost);
        if (have_lost)
                size += sizeof(lost_event);
  
 -      perf_output_lock(handle);
 +      perf_output_get_handle(handle);
  
        do {
                /*
                 */
                tail = ACCESS_ONCE(data->user_page->data_tail);
                smp_rmb();
 -              offset = head = atomic_long_read(&data->head);
 +              offset = head = local_read(&data->head);
                head += size;
                if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
 -      } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 +      } while (local_cmpxchg(&data->head, offset, head) != offset);
  
        handle->offset  = offset;
        handle->head    = head;
  
        if (head - tail > data->watermark)
 -              atomic_set(&data->wakeup, 1);
 +              local_inc(&data->wakeup);
  
        if (have_lost) {
                lost_event.header.type = PERF_RECORD_LOST;
                lost_event.header.misc = 0;
                lost_event.header.size = sizeof(lost_event);
                lost_event.id          = event->id;
 -              lost_event.lost        = atomic_xchg(&data->lost, 0);
 +              lost_event.lost        = local_xchg(&data->lost, 0);
  
                perf_output_put(handle, lost_event);
        }
        return 0;
  
  fail:
 -      atomic_inc(&data->lost);
 -      perf_output_unlock(handle);
 +      local_inc(&data->lost);
 +      perf_output_put_handle(handle);
  out:
        rcu_read_unlock();
  
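
Reservation in perf_output_begin() follows from the same single-CPU assumption: head only needs local_t atomicity against local interrupts, while data_tail is re-read on every attempt because userspace advances it concurrently. A stripped-down sketch of the reservation loop in the hunk above (the helper and its error convention are illustrative):

/*
 * Stripped-down sketch of the space reservation loop in
 * perf_output_begin(); field names mirror the hunk, the helper itself
 * is illustrative.
 */
static int sketch_reserve(struct perf_mmap_data *data, unsigned int size,
			  unsigned long *out_offset)
{
	unsigned long tail, offset, head;

	do {
		/* data_tail is advanced by userspace as it consumes records */
		tail = ACCESS_ONCE(data->user_page->data_tail);
		smp_rmb();
		offset = head = local_read(&data->head);
		head += size;
		if (!perf_output_space(data, tail, offset, head))
			return -ENOSPC;	/* caller accounts a lost record */
	} while (local_cmpxchg(&data->head, offset, head) != offset);

	*out_offset = offset;
	return 0;
}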
@@@ -3087,14 -3093,14 +3087,14 @@@ void perf_output_end(struct perf_output
        int wakeup_events = event->attr.wakeup_events;
  
        if (handle->sample && wakeup_events) {
 -              int events = atomic_inc_return(&data->events);
 +              int events = local_inc_return(&data->events);
                if (events >= wakeup_events) {
 -                      atomic_sub(wakeup_events, &data->events);
 -                      atomic_set(&data->wakeup, 1);
 +                      local_sub(wakeup_events, &data->events);
 +                      local_inc(&data->wakeup);
                }
        }
  
 -      perf_output_unlock(handle);
 +      perf_output_put_handle(handle);
        rcu_read_unlock();
  }
  
@@@ -3430,13 -3436,22 +3430,13 @@@ static void perf_event_task_output(stru
  {
        struct perf_output_handle handle;
        struct task_struct *task = task_event->task;
 -      unsigned long flags;
        int size, ret;
  
 -      /*
 -       * If this CPU attempts to acquire an rq lock held by a CPU spinning
 -       * in perf_output_lock() from interrupt context, it's game over.
 -       */
 -      local_irq_save(flags);
 -
        size  = task_event->event_id.header.size;
        ret = perf_output_begin(&handle, event, size, 0, 0);
  
 -      if (ret) {
 -              local_irq_restore(flags);
 +      if (ret)
                return;
 -      }
  
        task_event->event_id.pid = perf_event_pid(event, task);
        task_event->event_id.ppid = perf_event_pid(event, current);
        perf_output_put(&handle, task_event->event_id);
  
        perf_output_end(&handle);
 -      local_irq_restore(flags);
  }
  
  static int perf_event_task_match(struct perf_event *event)
@@@ -4050,19 -4066,46 +4050,46 @@@ static inline u64 swevent_hash(u64 type
        return hash_64(val, SWEVENT_HLIST_BITS);
  }
  
- static struct hlist_head *
- find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+ static inline struct hlist_head *
+ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
  {
-       u64 hash;
-       struct swevent_hlist *hlist;
+       u64 hash = swevent_hash(type, event_id);
+       return &hlist->heads[hash];
+ }
  
-       hash = swevent_hash(type, event_id);
+ /* For the read side: events when they trigger */
+ static inline struct hlist_head *
+ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+ {
+       struct swevent_hlist *hlist;
  
        hlist = rcu_dereference(ctx->swevent_hlist);
        if (!hlist)
                return NULL;
  
-       return &hlist->heads[hash];
+       return __find_swevent_head(hlist, type, event_id);
+ }
+ /* For the event head insertion and removal in the hlist */
+ static inline struct hlist_head *
+ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+ {
+       struct swevent_hlist *hlist;
+       u32 event_id = event->attr.config;
+       u64 type = event->attr.type;
+       /*
+        * Event scheduling is always serialized against hlist allocation
+        * and release. Which makes the protected version suitable here.
+        * The context lock guarantees that.
+        */
+       hlist = rcu_dereference_protected(ctx->swevent_hlist,
+                                         lockdep_is_held(&event->ctx->lock));
+       if (!hlist)
+               return NULL;
+       return __find_swevent_head(hlist, type, event_id);
  }
  
  static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
  
        rcu_read_lock();
  
-       head = find_swevent_head(cpuctx, type, event_id);
+       head = find_swevent_head_rcu(cpuctx, type, event_id);
  
        if (!head)
                goto end;
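
The hlist lookup is split by calling context: the firing path runs under rcu_read_lock() and uses the plain RCU dereference, while insertion and removal happen with the context lock held, so the lockdep-annotated rcu_dereference_protected() variant applies there. A usage sketch under those assumptions (both callers below are illustrative, not code from the patch):

/* Firing path: protected by rcu_read_lock() only. */
static void sketch_swevent_fire(struct perf_cpu_context *cpuctx,
				u64 type, u32 event_id)
{
	struct hlist_head *head;

	rcu_read_lock();
	head = find_swevent_head_rcu(cpuctx, type, event_id);
	if (head) {
		/* iterate the events hashed on this head and add samples */
	}
	rcu_read_unlock();
}

/* Scheduling path: ctx->lock held, so the protected lookup is legitimate. */
static int sketch_swevent_schedule_in(struct perf_cpu_context *cpuctx,
				      struct perf_event *event)
{
	struct hlist_head *head = find_swevent_head(cpuctx, event);

	if (WARN_ON_ONCE(!head))
		return -EINVAL;

	hlist_add_head_rcu(&event->hlist_entry, head);
	return 0;
}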
@@@ -4162,7 -4205,7 +4189,7 @@@ static int perf_swevent_enable(struct p
                perf_swevent_set_period(event);
        }
  
-       head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+       head = find_swevent_head(cpuctx, event);
        if (WARN_ON_ONCE(!head))
                return -EINVAL;
  
@@@ -4350,6 -4393,14 +4377,14 @@@ static const struct pmu perf_ops_task_c
        .read           = task_clock_perf_event_read,
  };
  
+ /* Deref the hlist from the update side */
+ static inline struct swevent_hlist *
+ swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+ {
+       return rcu_dereference_protected(cpuctx->swevent_hlist,
+                                        lockdep_is_held(&cpuctx->hlist_mutex));
+ }
  static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
  {
        struct swevent_hlist *hlist;
  
  static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
  {
-       struct swevent_hlist *hlist;
+       struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
  
-       if (!cpuctx->swevent_hlist)
+       if (!hlist)
                return;
  
-       hlist = cpuctx->swevent_hlist;
        rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
        call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
  }
@@@ -4402,7 -4452,7 +4436,7 @@@ static int swevent_hlist_get_cpu(struc
  
        mutex_lock(&cpuctx->hlist_mutex);
  
-       if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+       if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
                struct swevent_hlist *hlist;
  
                hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@@ -4452,9 -4502,8 +4486,9 @@@ static int swevent_hlist_get(struct per
  #ifdef CONFIG_EVENT_TRACING
  
  void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 -                 int entry_size, struct pt_regs *regs)
 +                 int entry_size, struct pt_regs *regs, void *event)
  {
 +      const int type = PERF_TYPE_TRACEPOINT;
        struct perf_sample_data data;
        struct perf_raw_record raw = {
                .size = entry_size,
        perf_sample_data_init(&data, addr);
        data.raw = &raw;
  
 -      /* Trace events already protected against recursion */
 -      do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 -                       &data, regs);
 +      if (!event) {
 +              do_perf_sw_event(type, event_id, count, 1, &data, regs);
 +              return;
 +      }
 +
 +      if (perf_swevent_match(event, type, event_id, &data, regs))
 +              perf_swevent_add(event, count, 1, &data, regs);
  }
  EXPORT_SYMBOL_GPL(perf_tp_event);
  
@@@ -4503,7 -4548,7 +4537,7 @@@ static const struct pmu *tp_perf_event_
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
  
 -      if (perf_trace_enable(event->attr.config))
 +      if (perf_trace_enable(event->attr.config, event))
                return NULL;
  
        event->destroy = tp_perf_event_destroy;
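
Taken together, the merge lets a tracepoint sample reach its perf event without a hash lookup: tp_perf_event_init() now hands the event itself to perf_trace_enable(), the tracing side remembers it in the call's perf_data, and perf_tp_event() uses it directly when the probe fires. The sketch below summarises that flow; the body of perf_trace_enable() lives in kernel/trace code not shown in this diff, so the lookup helper and the perf_data store are assumptions drawn from the header changes, not quoted code.

/* Hypothetical lookup of an ftrace_event_call by event id. */
static struct ftrace_event_call *sketch_find_event_call(int event_id);

/*
 * Summary sketch of the direct-delivery path added by this merge;
 * illustrative only.
 */
static int sketch_perf_trace_enable(int event_id, void *data)
{
	struct ftrace_event_call *call = sketch_find_event_call(event_id);

	if (!call)
		return -EINVAL;

	call->perf_data = data;		/* the perf_event being enabled */
	return call->perf_event_enable(call);
}

/*
 * When the tracepoint later fires:
 *   perf_trace_templ_<call>()
 *     -> perf_trace_buf_submit(entry, ..., event_call->perf_data)
 *       -> perf_tp_event(type, addr, count, raw, size, regs, event)
 *            event != NULL: match and add just that one event
 *            event == NULL: fall back to the do_perf_sw_event() fan-out
 */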