*
* field = (typeof(field))entry;
*
- * p = get_cpu_var(ftrace_event_seq);
+ * p = &get_cpu_var(ftrace_event_seq);
* trace_seq_init(p);
- * ret = trace_seq_printf(s, <TP_printk> "\n");
+ * ret = trace_seq_printf(s, "%s: ", <call>);
+ * if (ret)
+ * ret = trace_seq_printf(s, <TP_printk> "\n");
* put_cpu();
* if (!ret)
* return TRACE_TYPE_PARTIAL_LINE;
*
* static void ftrace_raw_event_<call>(proto)
* {
+ * struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
* struct ring_buffer_event *event;
* struct ftrace_raw_<call> *entry; <-- defined in stage 1
* struct ring_buffer *buffer;
* unsigned long irq_flags;
+ * int __data_size;
* int pc;
*
* local_save_flags(irq_flags);
* pc = preempt_count();
*
+ * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+ *
* event = trace_current_buffer_lock_reserve(&buffer,
* event_<call>.id,
- * sizeof(struct ftrace_raw_<call>),
+ * sizeof(*entry) + __data_size,
* irq_flags, pc);
* if (!event)
* return;
* entry = ring_buffer_event_data(event);
*
- * <assign>; <-- Here we assign the entries by the __field and
- * __array macros.
+ * { <assign>; } <-- Here we assign the entries by the __field and
+ * __array macros.
*
- * trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
+ * if (!filter_current_check_discard(buffer, event_call, entry, event))
+ * trace_current_buffer_unlock_commit(buffer,
+ * event, irq_flags, pc);
* }
*
* static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
* {
- * int ret;
- *
- * ret = register_trace_<call>(ftrace_raw_event_<call>);
- * if (!ret)
- * pr_info("event trace: Could not activate trace point "
- * "probe to <call>");
- * return ret;
+ * return register_trace_<call>(ftrace_raw_event_<call>);
* }
*
* static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
* .trace = ftrace_raw_output_<call>, <-- stage 2
* };
*
+ * static const char print_fmt_<call>[] = <TP_printk>;
+ *
* static struct ftrace_event_call __used
* __attribute__((__aligned__(4)))
* __attribute__((section("_ftrace_events"))) event_<call> = {
* .raw_init = trace_event_raw_init,
* .regfunc = ftrace_reg_event_<call>,
* .unregfunc = ftrace_unreg_event_<call>,
+ * .print_fmt = print_fmt_<call>,
+ * .define_fields = ftrace_define_fields_<call>,
* }
*
*/
return; \
entry = ring_buffer_event_data(event); \
\
- \
tstruct \
\
{ assign; } \
{ assign; } \
\
perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
- __count, irq_flags, __regs); \
+ __count, irq_flags, __regs, \
+ event_call->perf_data); \
}
#undef DEFINE_EVENT
return virt_to_page(data->data_pages[pgoff - 1]);
}
+static void *perf_mmap_alloc_page(int cpu)
+{
+ struct page *page;
+ int node;
+
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
static struct perf_mmap_data *
perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
{
if (!data)
goto fail;
- data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+ data->user_page = perf_mmap_alloc_page(event->cpu);
if (!data->user_page)
goto fail_user_page;
for (i = 0; i < nr_pages; i++) {
- data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
if (!data->data_pages[i])
goto fail_data_pages;
}
{
long max_size = perf_data_size(data);
- atomic_set(&data->lock, -1);
-
if (event->attr.watermark) {
data->watermark = min_t(long, max_size,
event->attr.wakeup_watermark);
long user_extra, extra;
int ret = 0;
+ /*
+ * Don't allow mmap() of inherited per-task counters. This would
+ * create a performance issue due to all children writing to the
+ * same buffer.
+ */
+ if (event->cpu == -1 && event->attr.inherit)
+ return -EINVAL;
+
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
}
/*
- * Curious locking construct.
- *
* We need to ensure a later event_id doesn't publish a head when a former
- * event_id isn't done writing. However since we need to deal with NMIs we
+ * event isn't done writing. However since we need to deal with NMIs we
* cannot fully serialize things.
*
- * What we do is serialize between CPUs so we only have to deal with NMI
- * nesting on a single CPU.
- *
* We only publish the head (and generate a wakeup) when the outer-most
- * event_id completes.
+ * event completes.
*/
-static void perf_output_lock(struct perf_output_handle *handle)
+static void perf_output_get_handle(struct perf_output_handle *handle)
{
struct perf_mmap_data *data = handle->data;
- int cur, cpu = get_cpu();
-
- handle->locked = 0;
-
- for (;;) {
- cur = atomic_cmpxchg(&data->lock, -1, cpu);
- if (cur == -1) {
- handle->locked = 1;
- break;
- }
- if (cur == cpu)
- break;
- cpu_relax();
- }
+ preempt_disable();
+ local_inc(&data->nest);
+ handle->wakeup = local_read(&data->wakeup);
}
-static void perf_output_unlock(struct perf_output_handle *handle)
+static void perf_output_put_handle(struct perf_output_handle *handle)
{
struct perf_mmap_data *data = handle->data;
unsigned long head;
- int cpu;
-
- data->done_head = data->head;
-
- if (!handle->locked)
- goto out;
again:
- /*
- * The xchg implies a full barrier that ensures all writes are done
- * before we publish the new head, matched by a rmb() in userspace when
- * reading this position.
- */
- while ((head = atomic_long_xchg(&data->done_head, 0)))
- data->user_page->data_head = head;
+ head = local_read(&data->head);
/*
- * NMI can happen here, which means we can miss a done_head update.
+ * IRQ/NMI can happen here, which means we can miss a head update.
*/
- cpu = atomic_xchg(&data->lock, -1);
- WARN_ON_ONCE(cpu != smp_processor_id());
+ if (!local_dec_and_test(&data->nest))
+ return;
/*
- * Therefore we have to validate we did not indeed do so.
+ * Publish the known good head. Rely on the full barrier implied
+ * by atomic_dec_and_test() order the data->head read and this
+ * write.
*/
- if (unlikely(atomic_long_read(&data->done_head))) {
- /*
- * Since we had it locked, we can lock it again.
- */
- while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
- cpu_relax();
+ data->user_page->data_head = head;
+ /*
+ * Now check if we missed an update, rely on the (compiler)
+ * barrier in atomic_dec_and_test() to re-read data->head.
+ */
+ if (unlikely(head != local_read(&data->head))) {
+ local_inc(&data->nest);
goto again;
}
- if (atomic_xchg(&data->wakeup, 0))
+ if (handle->wakeup != local_read(&data->wakeup))
perf_output_wakeup(handle);
-out:
- put_cpu();
+
+ preempt_enable();
}
void perf_output_copy(struct perf_output_handle *handle,
handle->sample = sample;
if (!data->nr_pages)
- goto fail;
+ goto out;
- have_lost = atomic_read(&data->lost);
+ have_lost = local_read(&data->lost);
if (have_lost)
size += sizeof(lost_event);
- perf_output_lock(handle);
+ perf_output_get_handle(handle);
do {
/*
*/
tail = ACCESS_ONCE(data->user_page->data_tail);
smp_rmb();
- offset = head = atomic_long_read(&data->head);
+ offset = head = local_read(&data->head);
head += size;
if (unlikely(!perf_output_space(data, tail, offset, head)))
goto fail;
- } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
+ } while (local_cmpxchg(&data->head, offset, head) != offset);
handle->offset = offset;
handle->head = head;
if (head - tail > data->watermark)
- atomic_set(&data->wakeup, 1);
+ local_inc(&data->wakeup);
if (have_lost) {
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.header.size = sizeof(lost_event);
lost_event.id = event->id;
- lost_event.lost = atomic_xchg(&data->lost, 0);
+ lost_event.lost = local_xchg(&data->lost, 0);
perf_output_put(handle, lost_event);
}
return 0;
fail:
- atomic_inc(&data->lost);
- perf_output_unlock(handle);
+ local_inc(&data->lost);
+ perf_output_put_handle(handle);
out:
rcu_read_unlock();
int wakeup_events = event->attr.wakeup_events;
if (handle->sample && wakeup_events) {
- int events = atomic_inc_return(&data->events);
+ int events = local_inc_return(&data->events);
if (events >= wakeup_events) {
- atomic_sub(wakeup_events, &data->events);
- atomic_set(&data->wakeup, 1);
+ local_sub(wakeup_events, &data->events);
+ local_inc(&data->wakeup);
}
}
- perf_output_unlock(handle);
+ perf_output_put_handle(handle);
rcu_read_unlock();
}
{
struct perf_output_handle handle;
struct task_struct *task = task_event->task;
- unsigned long flags;
int size, ret;
- /*
- * If this CPU attempts to acquire an rq lock held by a CPU spinning
- * in perf_output_lock() from interrupt context, it's game over.
- */
- local_irq_save(flags);
-
size = task_event->event_id.header.size;
ret = perf_output_begin(&handle, event, size, 0, 0);
- if (ret) {
- local_irq_restore(flags);
+ if (ret)
return;
- }
task_event->event_id.pid = perf_event_pid(event, task);
task_event->event_id.ppid = perf_event_pid(event, current);
perf_output_put(&handle, task_event->event_id);
perf_output_end(&handle);
- local_irq_restore(flags);
}
static int perf_event_task_match(struct perf_event *event)
return hash_64(val, SWEVENT_HLIST_BITS);
}
- static struct hlist_head *
- find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+ static inline struct hlist_head *
+ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
{
- u64 hash;
- struct swevent_hlist *hlist;
+ u64 hash = swevent_hash(type, event_id);
+
+ return &hlist->heads[hash];
+ }
- hash = swevent_hash(type, event_id);
+ /* For the read side: events when they trigger */
+ static inline struct hlist_head *
+ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+ {
+ struct swevent_hlist *hlist;
hlist = rcu_dereference(ctx->swevent_hlist);
if (!hlist)
return NULL;
- return &hlist->heads[hash];
+ return __find_swevent_head(hlist, type, event_id);
+ }
+
+ /* For the event head insertion and removal in the hlist */
+ static inline struct hlist_head *
+ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+ {
+ struct swevent_hlist *hlist;
+ u32 event_id = event->attr.config;
+ u64 type = event->attr.type;
+
+ /*
+ * Event scheduling is always serialized against hlist allocation
+ * and release. Which makes the protected version suitable here.
+ * The context lock guarantees that.
+ */
+ hlist = rcu_dereference_protected(ctx->swevent_hlist,
+ lockdep_is_held(&event->ctx->lock));
+ if (!hlist)
+ return NULL;
+
+ return __find_swevent_head(hlist, type, event_id);
}
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
rcu_read_lock();
- head = find_swevent_head(cpuctx, type, event_id);
+ head = find_swevent_head_rcu(cpuctx, type, event_id);
if (!head)
goto end;
perf_swevent_set_period(event);
}
- head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+ head = find_swevent_head(cpuctx, event);
if (WARN_ON_ONCE(!head))
return -EINVAL;
.read = task_clock_perf_event_read,
};
+ /* Deref the hlist from the update side */
+ static inline struct swevent_hlist *
+ swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+ {
+ return rcu_dereference_protected(cpuctx->swevent_hlist,
+ lockdep_is_held(&cpuctx->hlist_mutex));
+ }
+
static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
{
struct swevent_hlist *hlist;
static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
{
- struct swevent_hlist *hlist;
+ struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
- if (!cpuctx->swevent_hlist)
+ if (!hlist)
return;
- hlist = cpuctx->swevent_hlist;
rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
}
mutex_lock(&cpuctx->hlist_mutex);
- if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+ if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
struct swevent_hlist *hlist;
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
#ifdef CONFIG_EVENT_TRACING
void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
- int entry_size, struct pt_regs *regs)
+ int entry_size, struct pt_regs *regs, void *event)
{
+ const int type = PERF_TYPE_TRACEPOINT;
struct perf_sample_data data;
struct perf_raw_record raw = {
.size = entry_size,
perf_sample_data_init(&data, addr);
data.raw = &raw;
- /* Trace events already protected against recursion */
- do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
- &data, regs);
+ if (!event) {
+ do_perf_sw_event(type, event_id, count, 1, &data, regs);
+ return;
+ }
+
+ if (perf_swevent_match(event, type, event_id, &data, regs))
+ perf_swevent_add(event, count, 1, &data, regs);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (perf_trace_enable(event->attr.config))
+ if (perf_trace_enable(event->attr.config, event))
return NULL;
event->destroy = tp_perf_event_destroy;