* If group events scheduling transaction was started,
* skip the schedulability test here, it will be performed
* at commit time (->commit_txn) as a whole.
+ *
+ * If commit fails, we'll call ->del() on all events
+ * for which ->add() was called.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto done_collect;
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
+ if (x86_pmu.add) {
+ /*
+ * This is before x86_pmu_enable() will call x86_pmu_start(),
+ * so we enable LBRs before an event needs them etc..
+ */
+ x86_pmu.add(event);
+ }
+
ret = 0;
out:
return ret;
event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
/*
- * If we're called during a txn, we don't need to do anything.
+ * If we're called during a txn, we only need to undo x86_pmu.add.
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
*
* an event added during that same TXN.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
- return;
+ goto do_del;
/*
* Not a TXN, therefore cleanup properly.
--cpuc->n_events;
perf_event_update_userpage(event);
+
+do_del:
+ if (x86_pmu.del) {
+ /*
+ * This is after x86_pmu_stop(); so we disable LBRs after any
+ * event can need them etc..
+ */
+ x86_pmu.del(event);
+ }
}
int x86_pmu_handle_irq(struct pt_regs *regs)
cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
cpuc->intel_cp_status &= ~(1ull << hwc->idx);
- /*
- * must disable before any actual event
- * because any event may be combined with LBR
- */
- if (needs_branch_stack(event))
- intel_pmu_lbr_disable(event);
-
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc);
return;
intel_pmu_pebs_disable(event);
}
+static void intel_pmu_del_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_del(event);
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_del(event);
+}
+
static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
{
int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
intel_pmu_enable_bts(hwc->config);
return;
}
- /*
- * must enabled before any actual event
- * because any event may be combined with LBR
- */
- if (needs_branch_stack(event))
- intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
+static void intel_pmu_add_event(struct perf_event *event)
+{
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_add(event);
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_add(event);
+}
+
/*
* Save and restart an expired event. Called by NMI contexts,
* so it has to be careful about preempting normal event ops:
.enable_all = intel_pmu_enable_all,
.enable = intel_pmu_enable_event,
.disable = intel_pmu_disable_event,
+ .add = intel_pmu_add_event,
+ .del = intel_pmu_del_event,
.hw_config = intel_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
return &emptyconstraint;
}
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
+{
+ return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+ struct debug_store *ds = cpuc->ds;
+ u64 threshold;
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs) {
+ threshold = ds->pebs_absolute_maximum -
+ x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+ } else {
+ threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+ }
+
+ ds->pebs_interrupt_threshold = threshold;
+}
+
+static void
+pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
+{
+ if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+ if (!needed_cb)
+ perf_sched_cb_inc(pmu);
+ else
+ perf_sched_cb_dec(pmu);
+
+ pebs_update_threshold(cpuc);
+ }
+}
+
+void intel_pmu_pebs_add(struct perf_event *event)
{
- return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs++;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs++;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
}
void intel_pmu_pebs_enable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
- bool first_pebs;
- u64 threshold;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
- first_pebs = !pebs_is_enabled(cpuc);
cpuc->pebs_enabled |= 1ULL << hwc->idx;
if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
cpuc->pebs_enabled |= 1ULL << 63;
/*
- * When the event is constrained enough we can use a larger
- * threshold and run the event with less frequent PMI.
+ * Use auto-reload if possible to save a MSR write in the PMI.
+ * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
*/
- if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
- threshold = ds->pebs_absolute_maximum -
- x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
- if (first_pebs)
- perf_sched_cb_inc(event->ctx->pmu);
- } else {
- threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
- /*
- * If not all events can use larger buffer,
- * roll back to threshold = 1
- */
- if (!first_pebs &&
- (ds->pebs_interrupt_threshold > threshold))
- perf_sched_cb_dec(event->ctx->pmu);
- }
-
- /* Use auto-reload if possible to save a MSR write in the PMI */
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
ds->pebs_event_reset[hwc->idx] =
(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
}
+}
- if (first_pebs || ds->pebs_interrupt_threshold > threshold)
- ds->pebs_interrupt_threshold = threshold;
+void intel_pmu_pebs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs--;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs--;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
}
void intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- struct debug_store *ds = cpuc->ds;
- bool large_pebs = ds->pebs_interrupt_threshold >
- ds->pebs_buffer_base + x86_pmu.pebs_record_size;
- if (large_pebs)
+ if (cpuc->n_pebs == cpuc->n_large_pebs)
intel_pmu_drain_pebs_buffer();
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled &= ~(1ULL << 63);
- if (large_pebs && !pebs_is_enabled(cpuc))
- perf_sched_cb_dec(event->ctx->pmu);
-
if (cpuc->enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
/*
*/
task_ctx = ctx ? ctx->task_ctx_data : NULL;
if (task_ctx) {
- if (sched_in) {
+ if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
- cpuc->lbr_context = ctx;
- } else {
+ else
__intel_pmu_lbr_save(task_ctx);
- }
return;
}
/*
- * When sampling the branck stack in system-wide, it may be
- * necessary to flush the stack on context switch. This happens
- * when the branch stack does not tag its entries with the pid
- * of the current task. Otherwise it becomes impossible to
- * associate a branch entry with a task. This ambiguity is more
- * likely to appear when the branch stack supports priv level
- * filtering and the user sets it to monitor only at the user
- * level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch
- * stack with branch from multiple tasks.
- */
- if (sched_in) {
+ * Since a context switch can flip the address space and LBR entries
+ * are not tagged with an identifier, we need to wipe the LBR, even for
+ * per-cpu events. You simply cannot resolve the branches from the old
+ * address space.
+ */
+ if (sched_in)
intel_pmu_lbr_reset();
- cpuc->lbr_context = ctx;
- }
}
static inline bool branch_user_callstack(unsigned br_sel)
return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
}
-void intel_pmu_lbr_enable(struct perf_event *event)
+void intel_pmu_lbr_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
if (!x86_pmu.lbr_nr)
return;
- /*
- * Reset the LBR stack if we changed task context to
- * avoid data leaks.
- */
- if (event->ctx->task && cpuc->lbr_context != event->ctx) {
- intel_pmu_lbr_reset();
- cpuc->lbr_context = event->ctx;
- }
cpuc->br_sel = event->hw.branch_reg.reg;
- if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
- event->ctx->task_ctx_data) {
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
task_ctx = event->ctx->task_ctx_data;
task_ctx->lbr_callstack_users++;
}
- cpuc->lbr_users++;
+ /*
+ * Request pmu::sched_task() callback, which will fire inside the
+ * regular perf event scheduling, so that call will:
+ *
+ * - restore or wipe; when LBR-callstack,
+ * - wipe; otherwise,
+ *
+ * when this is from __perf_event_task_sched_in().
+ *
+ * However, if this is from perf_install_in_context(), no such callback
+ * will follow and we'll need to reset the LBR here if this is the
+ * first LBR event.
+ *
+ * The problem is, we cannot tell these cases apart... but we can
+ * exclude the biggest chunk of cases by looking at
+ * event->total_time_running. An event that has accrued runtime cannot
+ * be 'new'. Conversely, a new event can get installed through the
+ * context switch path for the first time.
+ */
perf_sched_cb_inc(event->ctx->pmu);
+ if (!cpuc->lbr_users++ && !event->total_time_running)
+ intel_pmu_lbr_reset();
}
-void intel_pmu_lbr_disable(struct perf_event *event)
+void intel_pmu_lbr_del(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
-
- if (cpuc->enabled && !cpuc->lbr_users) {
- __intel_pmu_lbr_disable();
- /* avoid stale pointer */
- cpuc->lbr_context = NULL;
- }
}
void intel_pmu_lbr_enable_all(bool pmi)
*/
struct debug_store *ds;
u64 pebs_enabled;
+ int n_pebs;
+ int n_large_pebs;
/*
* Intel LBR bits
*/
int lbr_users;
- void *lbr_context;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account *lbr_sel;
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ void (*add)(struct perf_event *);
+ void (*del)(struct perf_event *);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+void intel_pmu_pebs_add(struct perf_event *event);
+
+void intel_pmu_pebs_del(struct perf_event *event);
+
void intel_pmu_pebs_enable(struct perf_event *event);
void intel_pmu_pebs_disable(struct perf_event *event);
void intel_pmu_lbr_reset(void);
-void intel_pmu_lbr_enable(struct perf_event *event);
+void intel_pmu_lbr_add(struct perf_event *event);
-void intel_pmu_lbr_disable(struct perf_event *event);
+void intel_pmu_lbr_del(struct perf_event *event);
void intel_pmu_lbr_enable_all(bool pmi);
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
#endif
+
+ struct list_head sched_cb_entry;
+ int sched_cb_usage;
};
struct perf_output_handle {
struct perf_event *event;
int state = group_event->state;
+ perf_pmu_disable(ctx->pmu);
+
event_sched_out(group_event, cpuctx, ctx);
/*
list_for_each_entry(event, &group_event->sibling_list, group_entry)
event_sched_out(event, cpuctx, ctx);
+ perf_pmu_enable(ctx->pmu);
+
if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
this_cpu_dec(perf_sched_cb_usages);
+
+ if (!--cpuctx->sched_cb_usage)
+ list_del(&cpuctx->sched_cb_entry);
}
+
void perf_sched_cb_inc(struct pmu *pmu)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ if (!cpuctx->sched_cb_usage++)
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
this_cpu_inc(perf_sched_cb_usages);
}
/*
* This function provides the context switch callback to the lower code
* layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
*/
static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
- unsigned long flags;
if (prev == next)
return;
- local_irq_save(flags);
+ list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+ pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
- rcu_read_lock();
-
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- if (pmu->sched_task) {
- cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
- perf_pmu_disable(pmu);
+ if (WARN_ON_ONCE(!pmu->sched_task))
+ continue;
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(pmu);
- perf_pmu_enable(pmu);
+ pmu->sched_task(cpuctx->task_ctx, sched_in);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ perf_pmu_enable(pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
-
- rcu_read_unlock();
-
- local_irq_restore(flags);
}
static void perf_event_switch(struct task_struct *task,
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
* Returns 0 on success, -EFAULT on failure.
*/
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *page, struct page *kpage)
+ struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
- err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+ err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
false);
if (err)
return err;
/* For try_to_free_swap() and munlock_vma_page() below */
- lock_page(page);
+ lock_page(old_page);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
- ptep = page_check_address(page, mm, addr, &ptl, 0);
+ ptep = page_check_address(old_page, mm, addr, &ptl, 0);
if (!ptep) {
- mem_cgroup_cancel_charge(kpage, memcg, false);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
- get_page(kpage);
- page_add_new_anon_rmap(kpage, vma, addr, false);
- mem_cgroup_commit_charge(kpage, memcg, false, false);
- lru_cache_add_active_or_unevictable(kpage, vma);
+ get_page(new_page);
+ page_add_new_anon_rmap(new_page, vma, addr, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
+ lru_cache_add_active_or_unevictable(new_page, vma);
- if (!PageAnon(page)) {
- dec_mm_counter(mm, mm_counter_file(page));
+ if (!PageAnon(old_page)) {
+ dec_mm_counter(mm, mm_counter_file(old_page));
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+ set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(page, false);
- if (!page_mapped(page))
- try_to_free_swap(page);
+ page_remove_rmap(old_page, false);
+ if (!page_mapped(old_page))
+ try_to_free_swap(old_page);
pte_unmap_unlock(ptep, ptl);
if (vma->vm_flags & VM_LOCKED)
- munlock_vma_page(page);
- put_page(page);
+ munlock_vma_page(old_page);
+ put_page(old_page);
err = 0;
unlock:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- unlock_page(page);
+ unlock_page(old_page);
return err;
}