Merge reason: Pick up the latest -rc.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
W: http://oops.ghostprotocols.net:81/blog/
P: 1024D/9224DF01 D5DF E3BB E3C8 BCBB F8AD 841A B6AB 4681 9224 DF01
D: IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks
-S: R. Brasílio Itiberê, 4270/1010 - Água Verde
-S: 80240-060 - Curitiba - Paraná
S: Brazil
N: Karsten Merker
M: Peter Zijlstra <a.p.zijlstra@chello.nl>
M: Paul Mackerras <paulus@samba.org>
M: Ingo Molnar <mingo@elte.hu>
-M: Arnaldo Carvalho de Melo <acme@redhat.com>
+M: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
S: Supported
F: kernel/perf_event*.c
F: include/linux/perf_event.h
#ifndef __ASM_ALPHA_PERF_EVENT_H
#define __ASM_ALPHA_PERF_EVENT_H
-#ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
-#else
-static inline void init_hw_perf_events(void) { }
-#endif
-
#endif /* __ASM_ALPHA_PERF_EVENT_H */
wrent(entInt, 0);
alpha_mv.init_irq();
-
- init_hw_perf_events();
}
/*
#include <linux/kernel.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
+#include <linux/init.h>
#include <asm/hwrpb.h>
#include <asm/atomic.h>
/*
* Init call to initialise performance events at kernel startup.
*/
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
{
pr_info("Performance events: ");
if (!supported_cpu()) {
pr_cont("No support for your CPU.\n");
- return;
+ return 0;
}
pr_cont("Supported CPU type!\n");
alpha_pmu = &ev67_pmu;
perf_pmu_register(&pmu);
-}
+ return 0;
+}
+early_initcall(init_hw_perf_events);
return 0;
}
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
/*
* Callchain handling code.
return 0;
}
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
#endif /* defined(CONFIG_CPU_MIPS32)... */
return register_fsl_emb_pmu(&e500_pmu);
}
-arch_initcall(init_e500_pmu);
+early_initcall(init_e500_pmu);
return register_power_pmu(&mpc7450_pmu);
}
-arch_initcall(init_mpc7450_pmu);
+early_initcall(init_mpc7450_pmu);
return register_power_pmu(&power4_pmu);
}
-arch_initcall(init_power4_pmu);
+early_initcall(init_power4_pmu);
return register_power_pmu(&power5p_pmu);
}
-arch_initcall(init_power5p_pmu);
+early_initcall(init_power5p_pmu);
return register_power_pmu(&power5_pmu);
}
-arch_initcall(init_power5_pmu);
+early_initcall(init_power5_pmu);
return register_power_pmu(&power6_pmu);
}
-arch_initcall(init_power6_pmu);
+early_initcall(init_power6_pmu);
return register_power_pmu(&power7_pmu);
}
-arch_initcall(init_power7_pmu);
+early_initcall(init_power7_pmu);
return register_power_pmu(&ppc970_pmu);
}
-arch_initcall(init_ppc970_pmu);
+early_initcall(init_ppc970_pmu);
return register_sh_pmu(&sh7750_pmu);
}
-arch_initcall(sh7750_pmu_init);
+early_initcall(sh7750_pmu_init);
return register_sh_pmu(&sh4a_pmu);
}
-arch_initcall(sh4a_pmu_init);
+early_initcall(sh4a_pmu_init);
#ifdef CONFIG_PERF_EVENTS
#include <asm/ptrace.h>
-extern void init_hw_perf_events(void);
-
#define perf_arch_fetch_caller_regs(regs, ip) \
do { \
unsigned long _pstate, _asi, _pil, _i7, _fp; \
(regs)->u_regs[UREG_I6] = _fp; \
(regs)->u_regs[UREG_I7] = _i7; \
} while (0)
-#else
-static inline void init_hw_perf_events(void) { }
#endif
#endif
atomic_set(&nmi_active, -1);
}
}
- if (!err)
- init_hw_perf_events();
return err;
}
return false;
}
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
{
pr_info("Performance events: ");
if (!supported_pmu()) {
pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
- return;
+ return 0;
}
pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
perf_pmu_register(&pmu);
register_die_notifier(&perf_event_nmi_notifier);
+
+ return 0;
}
+early_initcall(init_hw_perf_event);
void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
* On the local CPU you need to be protected again NMI or MCE handlers seeing an
* inconsistent instruction while you patch.
*/
+struct text_poke_param {
+ void *addr;
+ const void *opcode;
+ size_t len;
+};
+
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+extern void text_poke_smp_batch(struct text_poke_param *params, int n);
#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
#define IDEAL_NOP_SIZE_5 5
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_registers(struct pt_regs *regs);
extern void show_trace(struct task_struct *t, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp);
+ unsigned long *sp);
extern void __show_regs(struct pt_regs *regs, int all);
extern void show_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
#ifdef ARCH_HAS_NMI_WATCHDOG
-/**
- * do_nmi_callback
- *
- * Check to see if a callback exists and execute it. Return 1
- * if the handler exists and was handled successfully.
- */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
-
extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_LOCKUP_DETECTOR)
-extern int nmi_watchdog_enabled;
-#endif
extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
extern int reserve_evntsel_nmi(unsigned int);
extern void release_evntsel_nmi(unsigned int);
-extern void setup_apic_nmi_watchdog(void *);
-extern void stop_apic_nmi_watchdog(void *);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
-extern void cpu_nmi_set_wd_enabled(void);
-
-extern atomic_t nmi_active;
-extern unsigned int nmi_watchdog;
-#define NMI_NONE 0
-#define NMI_IO_APIC 1
-#define NMI_LOCAL_APIC 2
-#define NMI_INVALID 3
-
struct ctl_table;
extern int proc_nmi_enabled(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
void arch_trigger_all_cpu_backtrace(void);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
-
-static inline void localise_nmi_watchdog(void)
-{
- if (nmi_watchdog == NMI_IO_APIC)
- nmi_watchdog = NMI_LOCAL_APIC;
-}
-
-/* check if nmi_watchdog is active (ie was specified at boot) */
-static inline int nmi_watchdog_active(void)
-{
- /*
- * actually it should be:
- * return (nmi_watchdog == NMI_LOCAL_APIC ||
- * nmi_watchdog == NMI_IO_APIC)
- * but since they are power of two we could use a
- * cheaper way --cvg
- */
- return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
-}
#endif
-void lapic_watchdog_stop(void);
-int lapic_watchdog_init(unsigned nmi_hz);
-int lapic_wd_event(unsigned nmi_hz);
-unsigned lapic_adjust_nmi_hz(unsigned hz);
-void disable_lapic_nmi_watchdog(void);
-void enable_lapic_nmi_watchdog(void);
void stop_nmi(void);
void restart_nmi(void);
#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
#ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
extern void perf_events_lapic_init(void);
#define PERF_EVENT_INDEX_OFFSET 0
}
#else
-static inline void init_hw_perf_events(void) { }
static inline void perf_events_lapic_init(void) { }
#endif
};
/*
- * P4 PEBS specifics (Replay Event only)
- *
- * Format (bits):
- * 0-6: metric from P4_PEBS_METRIC enum
- * 7 : reserved
- * 8 : reserved
- * 9-11 : reserved
- *
* Note we have UOP and PEBS bits reserved for now
* just in case if we will need them once
*/
P4_PEBS_METRIC__max
};
+/*
+ * Notes on internal configuration of ESCR+CCCR tuples
+ *
+ * Since P4 has quite the different architecture of
+ * performance registers in compare with "architectural"
+ * once and we have on 64 bits to keep configuration
+ * of performance event, the following trick is used.
+ *
+ * 1) Since both ESCR and CCCR registers have only low
+ * 32 bits valuable, we pack them into a single 64 bit
+ * configuration. Low 32 bits of such config correspond
+ * to low 32 bits of CCCR register and high 32 bits
+ * correspond to low 32 bits of ESCR register.
+ *
+ * 2) The meaning of every bit of such config field can
+ * be found in Intel SDM but it should be noted that
+ * we "borrow" some reserved bits for own usage and
+ * clean them or set to a proper value when we do
+ * a real write to hardware registers.
+ *
+ * 3) The format of bits of config is the following
+ * and should be either 0 or set to some predefined
+ * values:
+ *
+ * Low 32 bits
+ * -----------
+ * 0-6: P4_PEBS_METRIC enum
+ * 7-11: reserved
+ * 12: reserved (Enable)
+ * 13-15: reserved (ESCR select)
+ * 16-17: Active Thread
+ * 18: Compare
+ * 19: Complement
+ * 20-23: Threshold
+ * 24: Edge
+ * 25: reserved (FORCE_OVF)
+ * 26: reserved (OVF_PMI_T0)
+ * 27: reserved (OVF_PMI_T1)
+ * 28-29: reserved
+ * 30: reserved (Cascade)
+ * 31: reserved (OVF)
+ *
+ * High 32 bits
+ * ------------
+ * 0: reserved (T1_USR)
+ * 1: reserved (T1_OS)
+ * 2: reserved (T0_USR)
+ * 3: reserved (T0_OS)
+ * 4: Tag Enable
+ * 5-8: Tag Value
+ * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
+ * 25-30: enum P4_EVENTS
+ * 31: reserved (HT thread)
+ */
+
#endif /* PERF_EVENT_P4_H */
setup_IO_APIC();
else {
nr_ioapics = 0;
- localise_nmi_watchdog();
}
#endif
}
#define _ASM_X86_STACKTRACE_H
#include <linux/uaccess.h>
+#include <linux/ptrace.h>
extern int kstack_depth_to_print;
};
void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
+ unsigned long *stack,
const struct stacktrace_ops *ops, void *data);
#ifdef CONFIG_X86_32
#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
#endif
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+ unsigned long bp;
+
+ if (regs)
+ return regs->bp;
+
+ if (task == current) {
+ /* Grab bp right from our regs */
+ get_bp(bp);
+ return bp;
+ }
+
+ /* bp is the last reg pushed by switch_to */
+ return *(unsigned long *)task->thread.sp;
+}
+#else
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+ return 0;
+}
+#endif
+
extern void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
+ unsigned long *stack, char *log_lvl);
extern void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
+ unsigned long *sp, char *log_lvl);
extern unsigned int code_bytes;
unsigned long long native_sched_clock(void);
extern int recalibrate_cpu_khz(void);
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-extern int timer_ack;
-#else
-# define timer_ack (0)
-#endif
-
extern int no_timer_check;
/* Accelerators for sched_clock()
static int wrote_text;
struct text_poke_params {
- void *addr;
- const void *opcode;
- size_t len;
+ struct text_poke_param *params;
+ int nparams;
};
static int __kprobes stop_machine_text_poke(void *data)
{
struct text_poke_params *tpp = data;
+ struct text_poke_param *p;
+ int i;
if (atomic_dec_and_test(&stop_machine_first)) {
- text_poke(tpp->addr, tpp->opcode, tpp->len);
+ for (i = 0; i < tpp->nparams; i++) {
+ p = &tpp->params[i];
+ text_poke(p->addr, p->opcode, p->len);
+ }
smp_wmb(); /* Make sure other cpus see that this has run */
wrote_text = 1;
} else {
smp_mb(); /* Load wrote_text before following execution */
}
- flush_icache_range((unsigned long)tpp->addr,
- (unsigned long)tpp->addr + tpp->len);
+ for (i = 0; i < tpp->nparams; i++) {
+ p = &tpp->params[i];
+ flush_icache_range((unsigned long)p->addr,
+ (unsigned long)p->addr + p->len);
+ }
+
return 0;
}
void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
{
struct text_poke_params tpp;
+ struct text_poke_param p;
- tpp.addr = addr;
- tpp.opcode = opcode;
- tpp.len = len;
+ p.addr = addr;
+ p.opcode = opcode;
+ p.len = len;
+ tpp.params = &p;
+ tpp.nparams = 1;
atomic_set(&stop_machine_first, 1);
wrote_text = 0;
/* Use __stop_machine() because the caller already got online_cpus. */
return addr;
}
+/**
+ * text_poke_smp_batch - Update instructions on a live kernel on SMP
+ * @params: an array of text_poke parameters
+ * @n: the number of elements in params.
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. Since the
+ * stop_machine() is heavy task, it is better to aggregate text_poke requests
+ * and do it once if possible.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+{
+ struct text_poke_params tpp = {.params = params, .nparams = n};
+
+ atomic_set(&stop_machine_first, 1);
+ wrote_text = 0;
+ stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+}
+
#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
#ifdef CONFIG_X86_64
#
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
-endif
-obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
+obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/dmi.h>
-#include <linux/nmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
* PIT/HPET going. Otherwise register lapic as a dummy
* device.
*/
- if (nmi_watchdog != NMI_IO_APIC)
- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
- else
- pr_warning("APIC timer registered as dummy,"
- " due to nmi_watchdog=%d!\n", nmi_watchdog);
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
}
#endif
- setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
setup_IO_APIC();
else {
nr_ioapics = 0;
- localise_nmi_watchdog();
}
-#else
- localise_nmi_watchdog();
#endif
x86_init.timers.setup_percpu_clockev();
-#ifdef CONFIG_X86_64
- check_nmi_watchdog();
-#endif
-
return 0;
}
#include <linux/nmi.h>
#include <linux/module.h>
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(void)
{
return (u64)(cpu_khz) * 1000 * 60;
}
+#endif
-#ifdef ARCH_HAS_NMI_WATCHDOG
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+#ifdef arch_trigger_all_cpu_backtrace
void arch_trigger_all_cpu_backtrace(void)
{
int i;
#endif
/* STUB calls to mimic old nmi_watchdog behaviour */
-#if defined(CONFIG_X86_LOCAL_APIC)
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-void acpi_nmi_enable(void) { return; }
-void acpi_nmi_disable(void) { return; }
-#endif
-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
int unknown_nmi_panic;
-void cpu_nmi_set_wd_enabled(void) { return; }
-void stop_apic_nmi_watchdog(void *unused) { return; }
-void setup_apic_nmi_watchdog(void *unused) { return; }
-int __init check_nmi_watchdog(void) { return 0; }
#include <asm/dma.h>
#include <asm/timer.h>
#include <asm/i8259.h>
-#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
#include <asm/setup.h>
"edge");
}
-static void __init setup_nmi(void)
-{
- /*
- * Dirty trick to enable the NMI watchdog ...
- * We put the 8259A master into AEOI mode and
- * unmask on all local APICs LVT0 as NMI.
- *
- * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
- * is from Maciej W. Rozycki - so we do not have to EOI from
- * the NMI handler or the timer interrupt.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
- enable_NMI_through_LVT0();
-
- apic_printk(APIC_VERBOSE, " done.\n");
-}
-
/*
* This looks a bit hackish but it's about the only one way of sending
* a few INTA cycles to 8259As and any associated glue logic. ICR does
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
legacy_pic->init(1);
-#ifdef CONFIG_X86_32
- {
- unsigned int ver;
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
- }
-#endif
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
unmask_ioapic(cfg);
}
if (timer_irq_works()) {
- if (nmi_watchdog == NMI_IO_APIC) {
- setup_nmi();
- legacy_pic->unmask(0);
- }
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
goto out;
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
- if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->mask(0);
- setup_nmi();
- legacy_pic->unmask(0);
- }
goto out;
}
/*
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
- if (nmi_watchdog == NMI_IO_APIC) {
- apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
- "through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = NMI_NONE;
- }
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
-
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
+++ /dev/null
-/*
- * NMI watchdog support on APIC systems
- *
- * Started by Ingo Molnar <mingo@redhat.com>
- *
- * Fixes:
- * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
- * Mikael Pettersson : Power Management for local APIC NMI watchdog.
- * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
- * Pavel Machek and
- * Mikael Pettersson : PM converted to driver model. Disable/enable API.
- */
-
-#include <asm/apic.h>
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-#include <linux/smp.h>
-
-#include <asm/i8259.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/timer.h>
-
-#include <asm/mce.h>
-
-#include <asm/mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- * be enabled
- * 0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-
-static int panic_on_timeout;
-
-static unsigned int nmi_hz = HZ;
-static DEFINE_PER_CPU(short, wd_enabled);
-static int endflag __initdata;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
- return per_cpu(irq_stat, cpu).__nmi_count;
-}
-
-static inline int mce_in_progress(void)
-{
-#if defined(CONFIG_X86_MCE)
- return atomic_read(&mce_entry) > 0;
-#endif
- return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
- return per_cpu(irq_stat, cpu).apic_timer_irqs +
- per_cpu(irq_stat, cpu).irq0_irqs;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
- local_irq_enable_in_hardirq();
- /*
- * Intentionally don't use cpu_relax here. This is
- * to make sure that the performance counter really ticks,
- * even if there is a simulator or similar that catches the
- * pause instruction. On a real HT machine this is fine because
- * all other CPUs are busy with "useless" delay loops and don't
- * care if they get somewhat less cycles.
- */
- while (endflag == 0)
- mb();
-}
-#endif
-
-static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
-{
- printk(KERN_CONT "\n");
-
- printk(KERN_WARNING
- "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
- cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
-
- printk(KERN_WARNING
- "Please report this to bugzilla.kernel.org,\n");
- printk(KERN_WARNING
- "and attach the output of the 'dmesg' command.\n");
-
- per_cpu(wd_enabled, cpu) = 0;
- atomic_dec(&nmi_active);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-int __init check_nmi_watchdog(void)
-{
- unsigned int *prev_nmi_count;
- int cpu;
-
- if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
- return 0;
-
- prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
- if (!prev_nmi_count)
- goto error;
-
- printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
- if (nmi_watchdog == NMI_LOCAL_APIC)
- smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
-#endif
-
- for_each_possible_cpu(cpu)
- prev_nmi_count[cpu] = get_nmi_count(cpu);
- local_irq_enable();
- mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
-
- for_each_online_cpu(cpu) {
- if (!per_cpu(wd_enabled, cpu))
- continue;
- if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
- report_broken_nmi(cpu, prev_nmi_count);
- }
- endflag = 1;
- if (!atomic_read(&nmi_active)) {
- kfree(prev_nmi_count);
- atomic_set(&nmi_active, -1);
- goto error;
- }
- printk("OK.\n");
-
- /*
- * now that we know it works we can reduce NMI frequency to
- * something more reasonable; makes a difference in some configs
- */
- if (nmi_watchdog == NMI_LOCAL_APIC)
- nmi_hz = lapic_adjust_nmi_hz(1);
-
- kfree(prev_nmi_count);
- return 0;
-error:
- if (nmi_watchdog == NMI_IO_APIC) {
- if (!timer_through_8259)
- legacy_pic->mask(0);
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
- }
-
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
- return -1;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
- unsigned int nmi;
-
- if (!strncmp(str, "panic", 5)) {
- panic_on_timeout = 1;
- str = strchr(str, ',');
- if (!str)
- return 1;
- ++str;
- }
-
- if (!strncmp(str, "lapic", 5))
- nmi_watchdog = NMI_LOCAL_APIC;
- else if (!strncmp(str, "ioapic", 6))
- nmi_watchdog = NMI_IO_APIC;
- else {
- get_option(&str, &nmi);
- if (nmi >= NMI_INVALID)
- return 0;
- nmi_watchdog = nmi;
- }
-
- return 1;
-}
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-/*
- * Suspend/resume support
- */
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- nmi_pm_active = atomic_read(&nmi_active);
- stop_apic_nmi_watchdog(NULL);
- BUG_ON(atomic_read(&nmi_active) != 0);
- return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- if (nmi_pm_active > 0) {
- setup_apic_nmi_watchdog(NULL);
- touch_nmi_watchdog();
- }
- return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
- .name = "lapic_nmi",
- .resume = lapic_nmi_resume,
- .suspend = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
- .id = 0,
- .cls = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
- int error;
-
- /*
- * should really be a BUG_ON but b/c this is an
- * init call, it just doesn't work. -dcz
- */
- if (nmi_watchdog != NMI_LOCAL_APIC)
- return 0;
-
- if (atomic_read(&nmi_active) < 0)
- return 0;
-
- error = sysdev_class_register(&nmi_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic_nmi);
- return error;
-}
-
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_enable, NULL, 1);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
-}
-
-/*
- * This function is called as soon the LAPIC NMI watchdog driver has everything
- * in place and it's ready to check if the NMIs belong to the NMI watchdog
- */
-void cpu_nmi_set_wd_enabled(void)
-{
- __get_cpu_var(wd_enabled) = 1;
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
- if (__get_cpu_var(wd_enabled))
- return;
-
- /* cheap hack to support suspend/resume */
- /* if cpu0 is not active neither should the other cpus */
- if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
- return;
-
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- if (lapic_watchdog_init(nmi_hz) < 0) {
- __get_cpu_var(wd_enabled) = 0;
- return;
- }
- /* FALL THROUGH */
- case NMI_IO_APIC:
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
- /* only support LOCAL and IO APICs for now */
- if (!nmi_watchdog_active())
- return;
- if (__get_cpu_var(wd_enabled) == 0)
- return;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- lapic_watchdog_stop();
- else
- __acpi_nmi_disable(NULL);
- __get_cpu_var(wd_enabled) = 0;
- atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up here too!]
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(long, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
- if (nmi_watchdog_active()) {
- unsigned cpu;
-
- /*
- * Tell other CPUs to reset their alert counters. We cannot
- * do it ourselves because the alert count increase is not
- * atomic.
- */
- for_each_present_cpu(cpu) {
- if (per_cpu(nmi_touch, cpu) != 1)
- per_cpu(nmi_touch, cpu) = 1;
- }
- }
-
- /*
- * Tickle the softlockup detector too:
- */
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
- /*
- * Since current_thread_info()-> is always on the stack, and we
- * always switch the stack NMI-atomically, it's safe to use
- * smp_processor_id().
- */
- unsigned int sum;
- int touched = 0;
- int cpu = smp_processor_id();
- int rc = 0;
-
- sum = get_timer_irqs(cpu);
-
- if (__get_cpu_var(nmi_touch)) {
- __get_cpu_var(nmi_touch) = 0;
- touched = 1;
- }
-
- /* We can be called before check_nmi_watchdog, hence NULL check. */
- if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
-
- raw_spin_lock(&lock);
- printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
- show_regs(regs);
- dump_stack();
- raw_spin_unlock(&lock);
- cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
-
- rc = 1;
- }
-
- /* Could check oops_in_progress here too, but it's safer not to */
- if (mce_in_progress())
- touched = 1;
-
- /* if the none of the timers isn't firing, this cpu isn't doing much */
- if (!touched && __get_cpu_var(last_irq_sum) == sum) {
- /*
- * Ayiee, looks like this CPU is stuck ...
- * wait a few IRQs (5 seconds) before doing the oops ...
- */
- __this_cpu_inc(alert_counter);
- if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
- /*
- * die_nmi will return ONLY if NOTIFY_STOP happens..
- */
- die_nmi("BUG: NMI Watchdog detected LOCKUP",
- regs, panic_on_timeout);
- } else {
- __get_cpu_var(last_irq_sum) = sum;
- __this_cpu_write(alert_counter, 0);
- }
-
- /* see if the nmi watchdog went off */
- if (!__get_cpu_var(wd_enabled))
- return rc;
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- rc |= lapic_wd_event(nmi_hz);
- break;
- case NMI_IO_APIC:
- /*
- * don't know how to accurately check for this.
- * just assume it was a watchdog timer interrupt
- * This matches the old behaviour.
- */
- rc = 1;
- break;
- }
- return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static void enable_ioapic_nmi_watchdog_single(void *unused)
-{
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- __acpi_nmi_enable(NULL);
-}
-
-static void enable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
- touch_nmi_watchdog();
-}
-
-static void disable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-}
-
-static int __init setup_unknown_nmi_panic(char *str)
-{
- unknown_nmi_panic = 1;
- return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
- unsigned char reason = get_nmi_reason();
- char buf[64];
-
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(buf, regs, 1); /* Always panic here */
- return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- int old_state;
-
- nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
- old_state = nmi_watchdog_enabled;
- proc_dointvec(table, write, buffer, length, ppos);
- if (!!old_state == !!nmi_watchdog_enabled)
- return 0;
-
- if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
- printk(KERN_WARNING
- "NMI watchdog is permanently disabled\n");
- return -EIO;
- }
-
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- if (nmi_watchdog_enabled)
- enable_lapic_nmi_watchdog();
- else
- disable_lapic_nmi_watchdog();
- } else if (nmi_watchdog == NMI_IO_APIC) {
- if (nmi_watchdog_enabled)
- enable_ioapic_nmi_watchdog();
- else
- disable_ioapic_nmi_watchdog();
- } else {
- printk(KERN_WARNING
- "NMI watchdog doesn't know what hardware to touch\n");
- return -EIO;
- }
- return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
- if (unknown_nmi_panic)
- return unknown_nmi_panic_callback(regs, cpu);
-#endif
- return 0;
-}
-
-void arch_trigger_all_cpu_backtrace(void)
-{
- int i;
-
- cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
-
- /* Wait for up to 10 seconds for all CPUs to do the backtrace */
- for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(to_cpumask(backtrace_mask)))
- break;
- mdelay(1);
- }
-}
#else
vgetcpu_set_mode();
#endif
- init_hw_perf_events();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
{
int i;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- disable_lapic_nmi_watchdog();
-
for (i = 0; i < x86_pmu.num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
goto perfctr_fail;
for (i--; i >= 0; i--)
release_perfctr_nmi(x86_pmu.perfctr + i);
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
-
return false;
}
release_perfctr_nmi(x86_pmu.perfctr + i);
release_evntsel_nmi(x86_pmu.eventsel + i);
}
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
}
#else
struct hw_perf_event *hwc = &event->hw;
u64 config;
- if (!hwc->sample_period) {
+ if (!is_sampling_event(event)) {
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
pr_info("no hardware sampling interrupt available.\n");
}
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
{
struct event_constraint *c;
int err;
err = amd_pmu_init();
break;
default:
- return;
+ return 0;
}
if (err != 0) {
pr_cont("no PMU driver, software events only.\n");
- return;
+ return 0;
}
pmu_check_apic();
/* sanity check that the hardware exists or is emulated */
if (!check_hw_exists()) {
pr_cont("Broken PMU hardware detected, software events only.\n");
- return;
+ return 0;
}
pr_cont("%s PMU driver.\n", x86_pmu.name);
perf_pmu_register(&pmu);
perf_cpu_notifier(x86_pmu_notifier);
+
+ return 0;
}
+early_initcall(init_hw_perf_events);
static inline void x86_pmu_read(struct perf_event *event)
{
perf_callchain_store(entry, regs->ip);
- dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
+ dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
}
#ifdef CONFIG_COMPAT
#include <asm/apic.h>
#include <asm/perf_event.h>
-struct nmi_watchdog_ctlblk {
- unsigned int cccr_msr;
- unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
- unsigned int evntsel_msr; /* the MSR to select the events to handle */
-};
-
-/* Interface defining a CPU specific perfctr watchdog */
-struct wd_ops {
- int (*reserve)(void);
- void (*unreserve)(void);
- int (*setup)(unsigned nmi_hz);
- void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
- void (*stop)(void);
- unsigned perfctr;
- unsigned evntsel;
- u64 checkbit;
-};
-
-static const struct wd_ops *wd_ops;
-
/*
* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
* offset from MSR_P4_BSU_ESCR0.
static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-
/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
clear_bit(counter, evntsel_nmi_owner);
}
EXPORT_SYMBOL(release_evntsel_nmi);
-
-void disable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- if (atomic_read(&nmi_active) <= 0)
- return;
-
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-
- if (wd_ops)
- wd_ops->unreserve();
-
- BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- /* are we already enabled */
- if (atomic_read(&nmi_active) != 0)
- return;
-
- /* are we lapic aware */
- if (!wd_ops)
- return;
- if (!wd_ops->reserve()) {
- printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
- return;
- }
-
- on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
- touch_nmi_watchdog();
-}
-
-/*
- * Activate the NMI watchdog via the local APIC.
- */
-
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
- u64 counter_val;
- unsigned int retval = hz;
-
- /*
- * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
- * are writable, with higher bits sign extending from bit 31.
- * So, we can only program the counter with 31 bit values and
- * 32nd bit should be 1, for 33.. to be 1.
- * Find the appropriate nmi_hz
- */
- counter_val = (u64)cpu_khz * 1000;
- do_div(counter_val, retval);
- if (counter_val > 0x7fffffffULL) {
- u64 count = (u64)cpu_khz * 1000;
- do_div(count, 0x7fffffffUL);
- retval = count + 1;
- }
- return retval;
-}
-
-static void write_watchdog_counter(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if (descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsrl(perfctr_msr, 0 - count);
-}
-
-static void write_watchdog_counter32(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if (descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsr(perfctr_msr, (u32)(-count), 0);
-}
-
-/*
- * AMD K7/K8/Family10h/Family11h support.
- * AMD keeps this interface nicely stable so there is not much variety
- */
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = K7_EVNTSEL_INT
- | K7_EVNTSEL_OS
- | K7_EVNTSEL_USR
- | K7_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void single_msr_stop_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int single_msr_reserve(void)
-{
- if (!reserve_perfctr_nmi(wd_ops->perfctr))
- return 0;
-
- if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
- release_perfctr_nmi(wd_ops->perfctr);
- return 0;
- }
- return 1;
-}
-
-static void single_msr_unreserve(void)
-{
- release_evntsel_nmi(wd_ops->evntsel);
- release_perfctr_nmi(wd_ops->perfctr);
-}
-
-static void __kprobes
-single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops k7_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_k7_watchdog,
- .rearm = single_msr_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_K7_PERFCTR0,
- .evntsel = MSR_K7_EVNTSEL0,
- .checkbit = 1ULL << 47,
-};
-
-/*
- * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
- */
-#define P6_EVNTSEL0_ENABLE (1 << 22)
-#define P6_EVNTSEL_INT (1 << 20)
-#define P6_EVNTSEL_OS (1 << 17)
-#define P6_EVNTSEL_USR (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-static int setup_p6_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- /* KVM doesn't implement this MSR */
- if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
- return 0;
-
- evntsel = P6_EVNTSEL_INT
- | P6_EVNTSEL_OS
- | P6_EVNTSEL_USR
- | P6_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /*
- * P6 based Pentium M need to re-unmask
- * the apic vector but it doesn't hurt
- * other P6 variant.
- * ArchPerfom/Core Duo also needs this
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- /* P6/ARCH_PERFMON has 32 bit counter write */
- write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p6_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_p6_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_P6_PERFCTR0,
- .evntsel = MSR_P6_EVNTSEL0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Intel P4 performance counters.
- * By far the most complicated of all.
- */
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
-#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
-#define P4_ESCR_OS (1 << 3)
-#define P4_ESCR_USR (1 << 2)
-#define P4_CCCR_OVF_PMI0 (1 << 26)
-#define P4_CCCR_OVF_PMI1 (1 << 27)
-#define P4_CCCR_THRESHOLD(N) ((N) << 20)
-#define P4_CCCR_COMPLEMENT (1 << 19)
-#define P4_CCCR_COMPARE (1 << 18)
-#define P4_CCCR_REQUIRED (3 << 16)
-#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
-#define P4_CCCR_ENABLE (1 << 12)
-#define P4_CCCR_OVF (1 << 31)
-
-#define P4_CONTROLS 18
-static unsigned int p4_controls[18] = {
- MSR_P4_BPU_CCCR0,
- MSR_P4_BPU_CCCR1,
- MSR_P4_BPU_CCCR2,
- MSR_P4_BPU_CCCR3,
- MSR_P4_MS_CCCR0,
- MSR_P4_MS_CCCR1,
- MSR_P4_MS_CCCR2,
- MSR_P4_MS_CCCR3,
- MSR_P4_FLAME_CCCR0,
- MSR_P4_FLAME_CCCR1,
- MSR_P4_FLAME_CCCR2,
- MSR_P4_FLAME_CCCR3,
- MSR_P4_IQ_CCCR0,
- MSR_P4_IQ_CCCR1,
- MSR_P4_IQ_CCCR2,
- MSR_P4_IQ_CCCR3,
- MSR_P4_IQ_CCCR4,
- MSR_P4_IQ_CCCR5,
-};
-/*
- * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- * CRU_ESCR0 (with any non-null event selector) through a complemented
- * max threshold. [IA32-Vol3, Section 14.9.9]
- */
-static int setup_p4_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr, cccr_msr;
- unsigned int evntsel, cccr_val;
- unsigned int misc_enable, dummy;
- unsigned int ht_num;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
- return 0;
-
-#ifdef CONFIG_SMP
- /* detect which hyperthread we are on */
- if (smp_num_siblings == 2) {
- unsigned int ebx, apicid;
-
- ebx = cpuid_ebx(1);
- apicid = (ebx >> 24) & 0xff;
- ht_num = apicid & 1;
- } else
-#endif
- ht_num = 0;
-
- /*
- * performance counters are shared resources
- * assign each hyperthread its own set
- * (re-use the ESCR0 register, seems safe
- * and keeps the cccr_val the same)
- */
- if (!ht_num) {
- /* logical cpu 0 */
- perfctr_msr = MSR_P4_IQ_PERFCTR0;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR0;
- cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-
- /*
- * If we're on the kdump kernel or other situation, we may
- * still have other performance counter registers set to
- * interrupt and they'll keep interrupting forever because
- * of the P4_CCCR_OVF quirk. So we need to ACK all the
- * pending interrupts and disable all the registers here,
- * before reenabling the NMI delivery. Refer to p4_rearm()
- * about the P4_CCCR_OVF quirk.
- */
- if (reset_devices) {
- unsigned int low, high;
- int i;
-
- for (i = 0; i < P4_CONTROLS; i++) {
- rdmsr(p4_controls[i], low, high);
- low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
- wrmsr(p4_controls[i], low, high);
- }
- }
- } else {
- /* logical cpu 1 */
- perfctr_msr = MSR_P4_IQ_PERFCTR1;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR1;
-
- /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
- if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
- cccr_val = P4_CCCR_OVF_PMI0;
- else
- cccr_val = P4_CCCR_OVF_PMI1;
- cccr_val |= P4_CCCR_ESCR_SELECT(4);
- }
-
- evntsel = P4_ESCR_EVENT_SELECT(0x3F)
- | P4_ESCR_OS
- | P4_ESCR_USR;
-
- cccr_val |= P4_CCCR_THRESHOLD(15)
- | P4_CCCR_COMPLEMENT
- | P4_CCCR_COMPARE
- | P4_CCCR_REQUIRED;
-
- wrmsr(evntsel_msr, evntsel, 0);
- wrmsr(cccr_msr, cccr_val, 0);
- write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = cccr_msr;
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
- return 1;
-}
-
-static void stop_p4_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- wrmsr(wd->cccr_msr, 0, 0);
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int p4_reserve(void)
-{
- if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
- return 0;
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
- goto fail1;
-#endif
- if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
- goto fail2;
- /* RED-PEN why is ESCR1 not reserved here? */
- return 1;
- fail2:
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
- fail1:
-#endif
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
- return 0;
-}
-
-static void p4_unreserve(void)
-{
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
-#endif
- release_evntsel_nmi(MSR_P4_CRU_ESCR0);
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
-}
-
-static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- unsigned dummy;
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- rdmsrl(wd->cccr_msr, dummy);
- dummy &= ~P4_CCCR_OVF;
- wrmsrl(wd->cccr_msr, dummy);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p4_wd_ops = {
- .reserve = p4_reserve,
- .unreserve = p4_unreserve,
- .setup = setup_p4_watchdog,
- .rearm = p4_rearm,
- .stop = stop_p4_watchdog,
- /* RED-PEN this is wrong for the other sibling */
- .perfctr = MSR_P4_BPU_PERFCTR0,
- .evntsel = MSR_P4_BSU_ESCR0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Watchdog using the Intel architected PerfMon.
- * Used for Core2 and hopefully all future Intel CPUs.
- */
-#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static struct wd_ops intel_arch_wd_ops;
-
-static int setup_intel_arch_watchdog(unsigned nmi_hz)
-{
- unsigned int ebx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- /*
- * Check whether the Architectural PerfMon supports
- * Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebx indicates event present.
- */
- cpuid(10, &(eax.full), &ebx, &unused, &unused);
- if ((eax.split.mask_length <
- (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
- (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- return 0;
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = ARCH_PERFMON_EVENTSEL_INT
- | ARCH_PERFMON_EVENTSEL_OS
- | ARCH_PERFMON_EVENTSEL_USR
- | ARCH_PERFMON_NMI_EVENT_SEL
- | ARCH_PERFMON_NMI_EVENT_UMASK;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
- intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
- return 1;
-}
-
-static struct wd_ops intel_arch_wd_ops __read_mostly = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
-};
-
-static void probe_nmi_watchdog(void)
-{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 == 6 ||
- (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
- wd_ops = &k7_wd_ops;
- return;
- case X86_VENDOR_INTEL:
- /* Work around where perfctr1 doesn't have a working enable
- * bit as described in the following errata:
- * AE49 Core Duo and Intel Core Solo 65 nm
- * AN49 Intel Pentium Dual-Core
- * AF49 Dual-Core Intel Xeon Processor LV
- */
- if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
- ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
- boot_cpu_data.x86_mask == 4))) {
- intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
- intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
- }
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- wd_ops = &intel_arch_wd_ops;
- break;
- }
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 13)
- return;
-
- wd_ops = &p6_wd_ops;
- break;
- case 15:
- wd_ops = &p4_wd_ops;
- break;
- default:
- return;
- }
- break;
- }
-}
-
-/* Interface to nmi.c */
-
-int lapic_watchdog_init(unsigned nmi_hz)
-{
- if (!wd_ops) {
- probe_nmi_watchdog();
- if (!wd_ops) {
- printk(KERN_INFO "NMI watchdog: CPU not supported\n");
- return -1;
- }
-
- if (!wd_ops->reserve()) {
- printk(KERN_ERR
- "NMI watchdog: cannot reserve perfctrs\n");
- return -1;
- }
- }
-
- if (!(wd_ops->setup(nmi_hz))) {
- printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
- raw_smp_processor_id());
- return -1;
- }
-
- return 0;
-}
-
-void lapic_watchdog_stop(void)
-{
- if (wd_ops)
- wd_ops->stop();
-}
-
-unsigned lapic_adjust_nmi_hz(unsigned hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
- wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
- hz = adjust_for_32bit_ctr(hz);
- return hz;
-}
-
-int __kprobes lapic_wd_event(unsigned nmi_hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- u64 ctr;
-
- rdmsrl(wd->perfctr_msr, ctr);
- if (ctr & wd_ops->checkbit) /* perfctr still running? */
- return 0;
-
- wd_ops->rearm(wd, nmi_hz);
- return 1;
-}
void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
+ unsigned long *stack, char *log_lvl)
{
printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+ dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
}
void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
+ unsigned long *stack)
{
- show_trace_log_lvl(task, regs, stack, bp, "");
+ show_trace_log_lvl(task, regs, stack, "");
}
void show_stack(struct task_struct *task, unsigned long *sp)
{
- show_stack_log_lvl(task, NULL, sp, 0, "");
+ show_stack_log_lvl(task, NULL, sp, "");
}
/*
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
+ show_trace(NULL, NULL, &stack);
}
EXPORT_SYMBOL(dump_stack);
#include <asm/stacktrace.h>
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+ struct pt_regs *regs, unsigned long *stack,
const struct stacktrace_ops *ops, void *data)
{
int graph = 0;
+ unsigned long bp;
if (!task)
task = current;
stack = (unsigned long *)task->thread.sp;
}
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
-
+ bp = stack_frame(task, regs);
for (;;) {
struct thread_info *context;
void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, char *log_lvl)
{
unsigned long *stack;
int i;
touch_nmi_watchdog();
}
printk(KERN_CONT "\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
}
u8 *ip;
printk(KERN_EMERG "Stack:\n");
- show_stack_log_lvl(NULL, regs, ®s->sp,
- 0, KERN_EMERG);
+ show_stack_log_lvl(NULL, regs, ®s->sp, KERN_EMERG);
printk(KERN_EMERG "Code: ");
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
*/
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+ struct pt_regs *regs, unsigned long *stack,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
unsigned used = 0;
struct thread_info *tinfo;
int graph = 0;
+ unsigned long bp;
if (!task)
task = current;
stack = (unsigned long *)task->thread.sp;
}
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
-
+ bp = stack_frame(task, regs);
/*
* Print function call entries in all stacks, starting at the
* current stack address. If the stacks consist of nested
void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, char *log_lvl)
{
unsigned long *irq_stack_end;
unsigned long *irq_stack;
preempt_enable();
printk(KERN_CONT "\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
}
void show_registers(struct pt_regs *regs)
printk(KERN_EMERG "Stack:\n");
show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- regs->bp, KERN_EMERG);
+ KERN_EMERG);
printk(KERN_EMERG "Code: ");
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ /* This is possible if op is under delayed unoptimizing */
+ if (kprobe_disabled(&op->kp))
+ return;
+
preempt_disable();
if (kprobe_running()) {
kprobes_inc_nmissed_count(&op->kp);
return 0;
}
-/* Replace a breakpoint (int3) with a relative jump. */
-int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+#define MAX_OPTIMIZE_PROBES 256
+static struct text_poke_param *jump_poke_params;
+static struct jump_poke_buffer {
+ u8 buf[RELATIVEJUMP_SIZE];
+} *jump_poke_bufs;
+
+static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
+ u8 *insn_buf,
+ struct optimized_kprobe *op)
{
- unsigned char jmp_code[RELATIVEJUMP_SIZE];
s32 rel = (s32)((long)op->optinsn.insn -
((long)op->kp.addr + RELATIVEJUMP_SIZE));
memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
RELATIVE_ADDR_SIZE);
- jmp_code[0] = RELATIVEJUMP_OPCODE;
- *(s32 *)(&jmp_code[1]) = rel;
+ insn_buf[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&insn_buf[1]) = rel;
+
+ tprm->addr = op->kp.addr;
+ tprm->opcode = insn_buf;
+ tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must call with locking kprobe_mutex and text_mutex.
+ */
+void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+{
+ struct optimized_kprobe *op, *tmp;
+ int c = 0;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ WARN_ON(kprobe_disabled(&op->kp));
+ /* Setup param */
+ setup_optimize_kprobe(&jump_poke_params[c],
+ jump_poke_bufs[c].buf, op);
+ list_del_init(&op->list);
+ if (++c >= MAX_OPTIMIZE_PROBES)
+ break;
+ }
/*
* text_poke_smp doesn't support NMI/MCE code modifying.
* However, since kprobes itself also doesn't support NMI/MCE
* code probing, it's not a problem.
*/
- text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
- return 0;
+ text_poke_smp_batch(jump_poke_params, c);
+}
+
+static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
+ u8 *insn_buf,
+ struct optimized_kprobe *op)
+{
+ /* Set int3 to first byte for kprobes */
+ insn_buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+ tprm->addr = op->kp.addr;
+ tprm->opcode = insn_buf;
+ tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must call with locking kprobe_mutex.
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list)
+{
+ struct optimized_kprobe *op, *tmp;
+ int c = 0;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ /* Setup param */
+ setup_unoptimize_kprobe(&jump_poke_params[c],
+ jump_poke_bufs[c].buf, op);
+ list_move(&op->list, done_list);
+ if (++c >= MAX_OPTIMIZE_PROBES)
+ break;
+ }
+
+ /*
+ * text_poke_smp doesn't support NMI/MCE code modifying.
+ * However, since kprobes itself also doesn't support NMI/MCE
+ * code probing, it's not a problem.
+ */
+ text_poke_smp_batch(jump_poke_params, c);
}
/* Replace a relative jump with a breakpoint (int3). */
}
return 0;
}
+
+static int __kprobes init_poke_params(void)
+{
+ /* Allocate code buffer and parameter array */
+ jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
+ MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+ if (!jump_poke_bufs)
+ return -ENOMEM;
+
+ jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
+ MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+ if (!jump_poke_params) {
+ kfree(jump_poke_bufs);
+ jump_poke_bufs = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+#else /* !CONFIG_OPTPROBES */
+static int __kprobes init_poke_params(void)
+{
+ return 0;
+}
#endif
int __init arch_init_kprobes(void)
{
- return 0;
+ return init_poke_params();
}
int __kprobes arch_trampoline_kprobe(struct kprobe *p)
void show_regs(struct pt_regs *regs)
{
show_registers(regs);
- show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
- regs->bp);
+ show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs));
}
void show_regs_common(void)
*/
smp_store_cpu_info(cpuid);
+ /*
+ * This must be done before setting cpu_online_mask
+ * or calling notify_cpu_starting.
+ */
+ set_cpu_sibling_map(raw_smp_processor_id());
+ wmb();
+
notify_cpu_starting(cpuid);
/*
*/
check_tsc_sync_target();
- if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->mask(0);
- enable_NMI_through_LVT0();
- legacy_pic->unmask(0);
- }
-
- /* This must be done before setting cpu_online_mask */
- set_cpu_sibling_map(raw_smp_processor_id());
- wmb();
-
/*
* We need to hold call_lock, so there is no inconsistency
* between the time smp_call_function() determines number of
printk(KERN_INFO "SMP mode deactivated.\n");
smpboot_clear_io_apic();
- localise_nmi_watchdog();
-
connect_bsp_APIC();
setup_local_APIC();
end_local_APIC_setup();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
- check_nmi_watchdog();
mtrr_aps_init();
}
if (cpu == 0)
return -EBUSY;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
cpu_disable_common();
*/
void save_stack_trace(struct stack_trace *trace)
{
- dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
+ dump_trace(current, NULL, NULL, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL_GPL(save_stack_trace);
-void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
+void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
{
- dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
+ dump_trace(current, regs, NULL, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
- dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
+ dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
#include <asm/hpet.h>
#include <asm/time.h>
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-int timer_ack;
-#endif
-
#ifdef CONFIG_X86_64
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
#endif
/* Keep nmi watchdog up to date */
inc_irq_stat(irq0_irqs);
- /* Optimized out for !IO_APIC and x86_64 */
- if (timer_ack) {
- /*
- * Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to deassert NMI lines for the watchdog if run
- * on an 82489DX-based system.
- */
- raw_spin_lock(&i8259A_lock);
- outb(0x0c, PIC_MASTER_OCW3);
- /* Ack the IRQ; AEOI will end it automatically. */
- inb(PIC_MASTER_POLL);
- raw_spin_unlock(&i8259A_lock);
- }
-
global_clock_event->event_handler(global_clock_event);
/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
== NOTIFY_STOP)
return;
-#ifndef CONFIG_LOCKUP_DETECTOR
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog_tick(regs, reason))
- return;
- if (!do_nmi_callback(regs, cpu))
-#endif /* !CONFIG_LOCKUP_DETECTOR */
unknown_nmi_error(reason, regs);
#else
unknown_nmi_error(reason, regs);
void stop_nmi(void)
{
- acpi_nmi_disable();
ignore_nmis++;
}
void restart_nmi(void)
{
ignore_nmis--;
- acpi_nmi_enable();
}
/* May run on IST stack. */
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
- save_stack_trace_bp(&e->trace, regs->bp);
+ save_stack_trace_regs(&e->trace, regs);
/* Round address down to nearest 16 bytes */
shadow_copy = kmemcheck_shadow_lookup(address
if (!user_mode_vm(regs)) {
unsigned long stack = kernel_stack_pointer(regs);
if (depth)
- dump_trace(NULL, regs, (unsigned long *)stack, 0,
+ dump_trace(NULL, regs, (unsigned long *)stack,
&backtrace_ops, &depth);
return;
}
int __init op_nmi_timer_init(struct oprofile_operations *ops)
{
- if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
- return -ENODEV;
-
ops->start = timer_start;
ops->stop = timer_stop;
ops->cpu_type = "timer";
* as possible (without an NMI being received in the middle of
* this) - so disable NMIs and initialize the device:
*/
- acpi_nmi_disable();
status = acpi_ns_evaluate(info);
- acpi_nmi_enable();
if (ACPI_SUCCESS(status)) {
walk_info->num_INI++;
* If nmi_watchdog is turned off then we can turn on
* our nmi decoding capability.
*/
- if (!nmi_watchdog_active())
- hpwdt_nmi_decoding = 1;
- else
- dev_warn(&dev->dev, "NMI decoding is disabled. To enable this "
- "functionality you must reboot with nmi_watchdog=0 "
- "and load the hpwdt driver with priority=1.\n");
+ hpwdt_nmi_decoding = 1;
}
#else
static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
TRACE_EVENT_FL_ENABLED_BIT,
TRACE_EVENT_FL_FILTERED_BIT,
TRACE_EVENT_FL_RECORDED_CMD_BIT,
+ TRACE_EVENT_FL_CAP_ANY_BIT,
};
enum {
TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT),
TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT),
+ TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
};
struct ftrace_event_call {
#endif
};
+#define __TRACE_EVENT_FLAGS(name, value) \
+ static int __init trace_init_flags_##name(void) \
+ { \
+ event_##name.flags = value; \
+ return 0; \
+ } \
+ early_initcall(trace_init_flags_##name);
+
#define PERF_MAX_TRACE_SIZE 2048
#define MAX_FILTER_PRED 32
extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op);
extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
-extern int arch_optimize_kprobe(struct optimized_kprobe *op);
+extern void arch_optimize_kprobes(struct list_head *oplist);
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list);
extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
extern kprobe_opcode_t *get_optinsn_slot(void);
extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty);
*/
#ifdef ARCH_HAS_NMI_WATCHDOG
#include <asm/nmi.h>
-extern void touch_nmi_watchdog(void);
-extern void acpi_nmi_disable(void);
-extern void acpi_nmi_enable(void);
-#else
+#endif
#ifndef CONFIG_HARDLOCKUP_DETECTOR
static inline void touch_nmi_watchdog(void)
{
#else
extern void touch_nmi_watchdog(void);
#endif
-static inline void acpi_nmi_disable(void) { }
-static inline void acpi_nmi_enable(void) { }
-#endif
/*
* Create trigger_all_cpu_backtrace() out of the arch-provided
u64 shadow_ctx_time;
struct perf_event_attr attr;
+ u16 header_size;
+ u16 read_size;
struct hw_perf_event hw;
struct perf_event_context *ctx;
struct perf_sample_data *data,
struct pt_regs *regs);
+static inline bool is_sampling_event(struct perf_event *event)
+{
+ return event->attr.sample_period != 0;
+}
+
/*
* Return 1 for a software event, 0 for a hardware event
*/
size_t *lenp, loff_t *ppos);
extern unsigned int softlockup_panic;
extern int softlockup_thresh;
+void lockup_detector_init(void);
#else
static inline void touch_softlockup_watchdog(void)
{
static inline void touch_all_softlockup_watchdogs(void)
{
}
+static inline void lockup_detector_init(void)
+{
+}
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
#define __LINUX_STACKTRACE_H
struct task_struct;
+struct pt_regs;
#ifdef CONFIG_STACKTRACE
struct task_struct;
};
extern void save_stack_trace(struct stack_trace *trace);
-extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp);
+extern void save_stack_trace_regs(struct stack_trace *trace,
+ struct pt_regs *regs);
extern void save_stack_trace_tsk(struct task_struct *tsk,
struct stack_trace *trace);
#define SYSCALL_TRACE_ENTER_EVENT(sname) \
static struct syscall_metadata \
__attribute__((__aligned__(4))) __syscall_meta_##sname; \
- static struct ftrace_event_call \
- __attribute__((__aligned__(4))) event_enter_##sname; \
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) \
.class = &event_class_syscall_enter, \
.event.funcs = &enter_syscall_print_funcs, \
.data = (void *)&__syscall_meta_##sname,\
- }
+ }; \
+ __TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
#define SYSCALL_TRACE_EXIT_EVENT(sname) \
static struct syscall_metadata \
__attribute__((__aligned__(4))) __syscall_meta_##sname; \
- static struct ftrace_event_call \
- __attribute__((__aligned__(4))) event_exit_##sname; \
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) \
.class = &event_class_syscall_exit, \
.event.funcs = &exit_syscall_print_funcs, \
.data = (void *)&__syscall_meta_##sname,\
- }
+ }; \
+ __TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY)
#define SYSCALL_METADATA(sname, nb) \
SYSCALL_TRACE_ENTER_EVENT(sname); \
PARAMS(void *__data, proto), \
PARAMS(__data, args))
+#define TRACE_EVENT_FLAGS(event, flag)
+
#endif /* DECLARE_TRACE */
#ifndef TRACE_EVENT
assign, print, reg, unreg) \
DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define TRACE_EVENT_FLAGS(event, flag)
+
#endif /* ifdef TRACE_EVENT (see note above) */
syscall_regfunc, syscall_unregfunc
);
+TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY)
+
TRACE_EVENT_FN(sys_exit,
TP_PROTO(struct pt_regs *regs, long ret),
syscall_regfunc, syscall_unregfunc
);
+TRACE_EVENT_FLAGS(sys_exit, TRACE_EVENT_FL_CAP_ANY)
+
#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */
#endif /* _TRACE_EVENTS_SYSCALLS_H */
TRACE_EVENT(name, PARAMS(proto), PARAMS(args), \
PARAMS(tstruct), PARAMS(assign), PARAMS(print)) \
+#undef TRACE_EVENT_FLAGS
+#define TRACE_EVENT_FLAGS(name, value) \
+ __TRACE_EVENT_FLAGS(name, value)
+
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+#undef TRACE_EVENT_FLAGS
+#define TRACE_EVENT_FLAGS(event, flag)
+
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
/*
smp_prepare_cpus(setup_max_cpus);
do_pre_smp_initcalls();
+ lockup_detector_init();
smp_init();
sched_init_smp();
return p->pre_handler == aggr_pre_handler;
}
+/* Return true(!0) if the kprobe is unused */
+static inline int kprobe_unused(struct kprobe *p)
+{
+ return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
+ list_empty(&p->list);
+}
+
/*
* Keep all fields in the kprobe consistent
*/
-static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
- memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
- memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+ memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
+ memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
}
#ifdef CONFIG_OPTPROBES
}
}
+/* Free optimized instructions and optimized_kprobe */
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ arch_remove_optimized_kprobe(op);
+ arch_remove_kprobe(p);
+ kfree(op);
+}
+
/* Return true(!0) if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
return 0;
}
+/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
+static inline int kprobe_disarmed(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
+ if (!kprobe_aggrprobe(p))
+ return kprobe_disabled(p);
+
+ op = container_of(p, struct optimized_kprobe, kp);
+
+ return kprobe_disabled(p) && list_empty(&op->list);
+}
+
+/* Return true(!0) if the probe is queued on (un)optimizing lists */
+static int __kprobes kprobe_queued(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ if (kprobe_aggrprobe(p)) {
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!list_empty(&op->list))
+ return 1;
+ }
+ return 0;
+}
+
/*
* Return an optimized kprobe whose optimizing code replaces
* instructions including addr (exclude breakpoint).
/* Optimization staging list, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
+static LIST_HEAD(unoptimizing_list);
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
+static DECLARE_COMPLETION(optimizer_comp);
#define OPTIMIZE_DELAY 5
-/* Kprobe jump optimizer */
-static __kprobes void kprobe_optimizer(struct work_struct *work)
+/*
+ * Optimize (replace a breakpoint with a jump) kprobes listed on
+ * optimizing_list.
+ */
+static __kprobes void do_optimize_kprobes(void)
{
- struct optimized_kprobe *op, *tmp;
-
- /* Lock modules while optimizing kprobes */
- mutex_lock(&module_mutex);
- mutex_lock(&kprobe_mutex);
- if (kprobes_all_disarmed || !kprobes_allow_optimization)
- goto end;
-
- /*
- * Wait for quiesence period to ensure all running interrupts
- * are done. Because optprobe may modify multiple instructions
- * there is a chance that Nth instruction is interrupted. In that
- * case, running interrupt can return to 2nd-Nth byte of jump
- * instruction. This wait is for avoiding it.
- */
- synchronize_sched();
+ /* Optimization never be done when disarmed */
+ if (kprobes_all_disarmed || !kprobes_allow_optimization ||
+ list_empty(&optimizing_list))
+ return;
/*
* The optimization/unoptimization refers online_cpus via
*/
get_online_cpus();
mutex_lock(&text_mutex);
- list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
- WARN_ON(kprobe_disabled(&op->kp));
- if (arch_optimize_kprobe(op) < 0)
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- list_del_init(&op->list);
+ arch_optimize_kprobes(&optimizing_list);
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+}
+
+/*
+ * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
+ * if need) kprobes listed on unoptimizing_list.
+ */
+static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ /* Unoptimization must be done anytime */
+ if (list_empty(&unoptimizing_list))
+ return;
+
+ /* Ditto to do_optimize_kprobes */
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+ arch_unoptimize_kprobes(&unoptimizing_list, free_list);
+ /* Loop free_list for disarming */
+ list_for_each_entry_safe(op, tmp, free_list, list) {
+ /* Disarm probes if marked disabled */
+ if (kprobe_disabled(&op->kp))
+ arch_disarm_kprobe(&op->kp);
+ if (kprobe_unused(&op->kp)) {
+ /*
+ * Remove unused probes from hash list. After waiting
+ * for synchronization, these probes are reclaimed.
+ * (reclaiming is done by do_free_cleaned_kprobes.)
+ */
+ hlist_del_rcu(&op->kp.hlist);
+ } else
+ list_del_init(&op->list);
}
mutex_unlock(&text_mutex);
put_online_cpus();
-end:
+}
+
+/* Reclaim all kprobes on the free_list */
+static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ list_for_each_entry_safe(op, tmp, free_list, list) {
+ BUG_ON(!kprobe_unused(&op->kp));
+ list_del_init(&op->list);
+ free_aggr_kprobe(&op->kp);
+ }
+}
+
+/* Start optimizer after OPTIMIZE_DELAY passed */
+static __kprobes void kick_kprobe_optimizer(void)
+{
+ if (!delayed_work_pending(&optimizing_work))
+ schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+}
+
+/* Kprobe jump optimizer */
+static __kprobes void kprobe_optimizer(struct work_struct *work)
+{
+ LIST_HEAD(free_list);
+
+ /* Lock modules while optimizing kprobes */
+ mutex_lock(&module_mutex);
+ mutex_lock(&kprobe_mutex);
+
+ /*
+ * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
+ * kprobes before waiting for quiesence period.
+ */
+ do_unoptimize_kprobes(&free_list);
+
+ /*
+ * Step 2: Wait for quiesence period to ensure all running interrupts
+ * are done. Because optprobe may modify multiple instructions
+ * there is a chance that Nth instruction is interrupted. In that
+ * case, running interrupt can return to 2nd-Nth byte of jump
+ * instruction. This wait is for avoiding it.
+ */
+ synchronize_sched();
+
+ /* Step 3: Optimize kprobes after quiesence period */
+ do_optimize_kprobes();
+
+ /* Step 4: Free cleaned kprobes after quiesence period */
+ do_free_cleaned_kprobes(&free_list);
+
mutex_unlock(&kprobe_mutex);
mutex_unlock(&module_mutex);
+
+ /* Step 5: Kick optimizer again if needed */
+ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
+ kick_kprobe_optimizer();
+ else
+ /* Wake up all waiters */
+ complete_all(&optimizer_comp);
+}
+
+/* Wait for completing optimization and unoptimization */
+static __kprobes void wait_for_kprobe_optimizer(void)
+{
+ if (delayed_work_pending(&optimizing_work))
+ wait_for_completion(&optimizer_comp);
}
/* Optimize kprobe if p is ready to be optimized */
/* Check if it is already optimized. */
if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
return;
-
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- list_add(&op->list, &optimizing_list);
- if (!delayed_work_pending(&optimizing_work))
- schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+
+ if (!list_empty(&op->list))
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ else {
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
+ }
+}
+
+/* Short cut to direct unoptimizing */
+static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ get_online_cpus();
+ arch_unoptimize_kprobe(op);
+ put_online_cpus();
+ if (kprobe_disabled(&op->kp))
+ arch_disarm_kprobe(&op->kp);
}
/* Unoptimize a kprobe if p is optimized */
-static __kprobes void unoptimize_kprobe(struct kprobe *p)
+static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
{
struct optimized_kprobe *op;
- if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
- op = container_of(p, struct optimized_kprobe, kp);
- if (!list_empty(&op->list))
- /* Dequeue from the optimization queue */
+ if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
+ return; /* This is not an optprobe nor optimized */
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!kprobe_optimized(p)) {
+ /* Unoptimized or unoptimizing case */
+ if (force && !list_empty(&op->list)) {
+ /*
+ * Only if this is unoptimizing kprobe and forced,
+ * forcibly unoptimize it. (No need to unoptimize
+ * unoptimized kprobe again :)
+ */
list_del_init(&op->list);
- else
- /* Replace jump with break */
- arch_unoptimize_kprobe(op);
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ force_unoptimize_kprobe(op);
+ }
+ return;
+ }
+
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ if (!list_empty(&op->list)) {
+ /* Dequeue from the optimization queue */
+ list_del_init(&op->list);
+ return;
+ }
+ /* Optimized kprobe case */
+ if (force)
+ /* Forcibly update the code: this is a special case */
+ force_unoptimize_kprobe(op);
+ else {
+ list_add(&op->list, &unoptimizing_list);
+ kick_kprobe_optimizer();
}
}
+/* Cancel unoptimizing for reusing */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+ struct optimized_kprobe *op;
+
+ BUG_ON(!kprobe_unused(ap));
+ /*
+ * Unused kprobe MUST be on the way of delayed unoptimizing (means
+ * there is still a relative jump) and disabled.
+ */
+ op = container_of(ap, struct optimized_kprobe, kp);
+ if (unlikely(list_empty(&op->list)))
+ printk(KERN_WARNING "Warning: found a stray unused "
+ "aggrprobe@%p\n", ap->addr);
+ /* Enable the probe again */
+ ap->flags &= ~KPROBE_FLAG_DISABLED;
+ /* Optimize it again (remove from op->list) */
+ BUG_ON(!kprobe_optready(ap));
+ optimize_kprobe(ap);
+}
+
/* Remove optimized instructions */
static void __kprobes kill_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
- if (!list_empty(&op->list)) {
- /* Dequeue from the optimization queue */
+ if (!list_empty(&op->list))
+ /* Dequeue from the (un)optimization queue */
list_del_init(&op->list);
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- }
- /* Don't unoptimize, because the target code will be freed. */
+
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ /* Don't touch the code, because it is already freed. */
arch_remove_optimized_kprobe(op);
}
arch_prepare_optimized_kprobe(op);
}
-/* Free optimized instructions and optimized_kprobe */
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
-{
- struct optimized_kprobe *op;
-
- op = container_of(p, struct optimized_kprobe, kp);
- arch_remove_optimized_kprobe(op);
- kfree(op);
-}
-
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
/* If failed to setup optimizing, fallback to kprobe */
- free_aggr_kprobe(ap);
+ arch_remove_optimized_kprobe(op);
+ kfree(op);
return;
}
return;
kprobes_allow_optimization = false;
- printk(KERN_INFO "Kprobes globally unoptimized\n");
- get_online_cpus(); /* For avoiding text_mutex deadlock */
- mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
if (!kprobe_disabled(p))
- unoptimize_kprobe(p);
+ unoptimize_kprobe(p, false);
}
}
-
- mutex_unlock(&text_mutex);
- put_online_cpus();
- /* Allow all currently running kprobes to complete */
- synchronize_sched();
+ /* Wait for unoptimizing completion */
+ wait_for_kprobe_optimizer();
+ printk(KERN_INFO "Kprobes globally unoptimized\n");
}
int sysctl_kprobes_optimization;
}
#endif /* CONFIG_SYSCTL */
+/* Put a breakpoint for a probe. Must be called with text_mutex locked */
static void __kprobes __arm_kprobe(struct kprobe *p)
{
- struct kprobe *old_p;
+ struct kprobe *_p;
/* Check collision with other optimized kprobes */
- old_p = get_optimized_kprobe((unsigned long)p->addr);
- if (unlikely(old_p))
- unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
+ _p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(_p))
+ /* Fallback to unoptimized kprobe */
+ unoptimize_kprobe(_p, true);
arch_arm_kprobe(p);
optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
}
-static void __kprobes __disarm_kprobe(struct kprobe *p)
+/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
{
- struct kprobe *old_p;
+ struct kprobe *_p;
- unoptimize_kprobe(p); /* Try to unoptimize */
- arch_disarm_kprobe(p);
+ unoptimize_kprobe(p, false); /* Try to unoptimize */
- /* If another kprobe was blocked, optimize it. */
- old_p = get_optimized_kprobe((unsigned long)p->addr);
- if (unlikely(old_p))
- optimize_kprobe(old_p);
+ if (!kprobe_queued(p)) {
+ arch_disarm_kprobe(p);
+ /* If another kprobe was blocked, optimize it. */
+ _p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(_p) && reopt)
+ optimize_kprobe(_p);
+ }
+ /* TODO: reoptimize others after unoptimized this probe */
}
#else /* !CONFIG_OPTPROBES */
#define optimize_kprobe(p) do {} while (0)
-#define unoptimize_kprobe(p) do {} while (0)
+#define unoptimize_kprobe(p, f) do {} while (0)
#define kill_optimized_kprobe(p) do {} while (0)
#define prepare_optimized_kprobe(p) do {} while (0)
#define try_to_optimize_kprobe(p) do {} while (0)
#define __arm_kprobe(p) arch_arm_kprobe(p)
-#define __disarm_kprobe(p) arch_disarm_kprobe(p)
+#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
+#define kprobe_disarmed(p) kprobe_disabled(p)
+#define wait_for_kprobe_optimizer() do {} while (0)
+
+/* There should be no unused kprobes can be reused without optimization */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+ printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+ BUG_ON(kprobe_unused(ap));
+}
static __kprobes void free_aggr_kprobe(struct kprobe *p)
{
+ arch_remove_kprobe(p);
kfree(p);
}
/* Disarm a kprobe with text_mutex */
static void __kprobes disarm_kprobe(struct kprobe *kp)
{
- get_online_cpus(); /* For avoiding text_mutex deadlock */
+ /* Ditto */
mutex_lock(&text_mutex);
- __disarm_kprobe(kp);
+ __disarm_kprobe(kp, true);
mutex_unlock(&text_mutex);
- put_online_cpus();
}
/*
BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
if (p->break_handler || p->post_handler)
- unoptimize_kprobe(ap); /* Fall back to normal kprobe */
+ unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
if (p->break_handler) {
if (ap->break_handler)
* This is the second or subsequent kprobe at the address - handle
* the intricacies
*/
-static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
struct kprobe *p)
{
int ret = 0;
- struct kprobe *ap = old_p;
+ struct kprobe *ap = orig_p;
- if (!kprobe_aggrprobe(old_p)) {
- /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
- ap = alloc_aggr_kprobe(old_p);
+ if (!kprobe_aggrprobe(orig_p)) {
+ /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
+ ap = alloc_aggr_kprobe(orig_p);
if (!ap)
return -ENOMEM;
- init_aggr_kprobe(ap, old_p);
- }
+ init_aggr_kprobe(ap, orig_p);
+ } else if (kprobe_unused(ap))
+ /* This probe is going to die. Rescue it */
+ reuse_unused_kprobe(ap);
if (kprobe_gone(ap)) {
/*
return add_new_kprobe(ap, p);
}
-/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
-static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
-{
- struct kprobe *kp;
-
- list_for_each_entry_rcu(kp, &p->list, list) {
- if (!kprobe_disabled(kp))
- /*
- * There is an active probe on the list.
- * We can't disable aggr_kprobe.
- */
- return 0;
- }
- p->flags |= KPROBE_FLAG_DISABLED;
- return 1;
-}
-
static int __kprobes in_kprobes_functions(unsigned long addr)
{
struct kprobe_blackpoint *kb;
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
{
- struct kprobe *old_p, *list_p;
+ struct kprobe *ap, *list_p;
- old_p = get_kprobe(p->addr);
- if (unlikely(!old_p))
+ ap = get_kprobe(p->addr);
+ if (unlikely(!ap))
return NULL;
- if (p != old_p) {
- list_for_each_entry_rcu(list_p, &old_p->list, list)
+ if (p != ap) {
+ list_for_each_entry_rcu(list_p, &ap->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
goto valid;
return NULL;
}
valid:
- return old_p;
+ return ap;
}
/* Return error if the kprobe is being re-registered */
static inline int check_kprobe_rereg(struct kprobe *p)
{
int ret = 0;
- struct kprobe *old_p;
mutex_lock(&kprobe_mutex);
- old_p = __get_valid_kprobe(p);
- if (old_p)
+ if (__get_valid_kprobe(p))
ret = -EINVAL;
mutex_unlock(&kprobe_mutex);
+
return ret;
}
}
EXPORT_SYMBOL_GPL(register_kprobe);
+/* Check if all probes on the aggrprobe are disabled */
+static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &ap->list, list)
+ if (!kprobe_disabled(kp))
+ /*
+ * There is an active probe on the list.
+ * We can't disable this ap.
+ */
+ return 0;
+
+ return 1;
+}
+
+/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
+static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
+{
+ struct kprobe *orig_p;
+
+ /* Get an original kprobe for return */
+ orig_p = __get_valid_kprobe(p);
+ if (unlikely(orig_p == NULL))
+ return NULL;
+
+ if (!kprobe_disabled(p)) {
+ /* Disable probe if it is a child probe */
+ if (p != orig_p)
+ p->flags |= KPROBE_FLAG_DISABLED;
+
+ /* Try to disarm and disable this/parent probe */
+ if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+ disarm_kprobe(orig_p);
+ orig_p->flags |= KPROBE_FLAG_DISABLED;
+ }
+ }
+
+ return orig_p;
+}
+
/*
* Unregister a kprobe without a scheduler synchronization.
*/
static int __kprobes __unregister_kprobe_top(struct kprobe *p)
{
- struct kprobe *old_p, *list_p;
+ struct kprobe *ap, *list_p;
- old_p = __get_valid_kprobe(p);
- if (old_p == NULL)
+ /* Disable kprobe. This will disarm it if needed. */
+ ap = __disable_kprobe(p);
+ if (ap == NULL)
return -EINVAL;
- if (old_p == p ||
- (kprobe_aggrprobe(old_p) &&
- list_is_singular(&old_p->list))) {
+ if (ap == p)
/*
- * Only probe on the hash list. Disarm only if kprobes are
- * enabled and not gone - otherwise, the breakpoint would
- * already have been removed. We save on flushing icache.
+ * This probe is an independent(and non-optimized) kprobe
+ * (not an aggrprobe). Remove from the hash list.
*/
- if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
- disarm_kprobe(old_p);
- hlist_del_rcu(&old_p->hlist);
- } else {
+ goto disarmed;
+
+ /* Following process expects this probe is an aggrprobe */
+ WARN_ON(!kprobe_aggrprobe(ap));
+
+ if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+ /*
+ * !disarmed could be happen if the probe is under delayed
+ * unoptimizing.
+ */
+ goto disarmed;
+ else {
+ /* If disabling probe has special handlers, update aggrprobe */
if (p->break_handler && !kprobe_gone(p))
- old_p->break_handler = NULL;
+ ap->break_handler = NULL;
if (p->post_handler && !kprobe_gone(p)) {
- list_for_each_entry_rcu(list_p, &old_p->list, list) {
+ list_for_each_entry_rcu(list_p, &ap->list, list) {
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
- old_p->post_handler = NULL;
+ ap->post_handler = NULL;
}
noclean:
+ /*
+ * Remove from the aggrprobe: this path will do nothing in
+ * __unregister_kprobe_bottom().
+ */
list_del_rcu(&p->list);
- if (!kprobe_disabled(old_p)) {
- try_to_disable_aggr_kprobe(old_p);
- if (!kprobes_all_disarmed) {
- if (kprobe_disabled(old_p))
- disarm_kprobe(old_p);
- else
- /* Try to optimize this probe again */
- optimize_kprobe(old_p);
- }
- }
+ if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+ /*
+ * Try to optimize this probe again, because post
+ * handler may have been changed.
+ */
+ optimize_kprobe(ap);
}
return 0;
+
+disarmed:
+ BUG_ON(!kprobe_disarmed(ap));
+ hlist_del_rcu(&ap->hlist);
+ return 0;
}
static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
{
- struct kprobe *old_p;
+ struct kprobe *ap;
if (list_empty(&p->list))
+ /* This is an independent kprobe */
arch_remove_kprobe(p);
else if (list_is_singular(&p->list)) {
- /* "p" is the last child of an aggr_kprobe */
- old_p = list_entry(p->list.next, struct kprobe, list);
+ /* This is the last child of an aggrprobe */
+ ap = list_entry(p->list.next, struct kprobe, list);
list_del(&p->list);
- arch_remove_kprobe(old_p);
- free_aggr_kprobe(old_p);
+ free_aggr_kprobe(ap);
}
+ /* Otherwise, do nothing. */
}
int __kprobes register_kprobes(struct kprobe **kps, int num)
int __kprobes disable_kprobe(struct kprobe *kp)
{
int ret = 0;
- struct kprobe *p;
mutex_lock(&kprobe_mutex);
- /* Check whether specified probe is valid. */
- p = __get_valid_kprobe(kp);
- if (unlikely(p == NULL)) {
+ /* Disable this kprobe */
+ if (__disable_kprobe(kp) == NULL)
ret = -EINVAL;
- goto out;
- }
- /* If the probe is already disabled (or gone), just return */
- if (kprobe_disabled(kp))
- goto out;
-
- kp->flags |= KPROBE_FLAG_DISABLED;
- if (p != kp)
- /* When kp != p, p is always enabled. */
- try_to_disable_aggr_kprobe(p);
-
- if (!kprobes_all_disarmed && kprobe_disabled(p))
- disarm_kprobe(p);
-out:
mutex_unlock(&kprobe_mutex);
return ret;
}
mutex_lock(&kprobe_mutex);
/* If kprobes are already disarmed, just return */
- if (kprobes_all_disarmed)
- goto already_disabled;
+ if (kprobes_all_disarmed) {
+ mutex_unlock(&kprobe_mutex);
+ return;
+ }
kprobes_all_disarmed = true;
printk(KERN_INFO "Kprobes globally disabled\n");
- /*
- * Here we call get_online_cpus() for avoiding text_mutex deadlock,
- * because disarming may also unoptimize kprobes.
- */
- get_online_cpus();
mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
- __disarm_kprobe(p);
+ __disarm_kprobe(p, false);
}
}
-
mutex_unlock(&text_mutex);
- put_online_cpus();
mutex_unlock(&kprobe_mutex);
- /* Allow all currently running kprobes to complete */
- synchronize_sched();
- return;
-already_disabled:
- mutex_unlock(&kprobe_mutex);
- return;
+ /* Wait for disarming all kprobes by optimizer */
+ wait_for_kprobe_optimizer();
}
/*
ctx->nr_stat++;
}
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += event->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+ event->read_size = size;
+}
+
+static void perf_event__header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ perf_event__read_size(event);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ size += sizeof(data->ip);
+
+ if (sample_type & PERF_SAMPLE_TID)
+ size += sizeof(data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ size += sizeof(data->time);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ size += sizeof(data->addr);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ size += sizeof(data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ size += sizeof(data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ size += sizeof(data->cpu_entry);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ size += sizeof(data->period);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ size += event->read_size;
+
+ event->header_size = size;
+}
+
static void perf_group_attach(struct perf_event *event)
{
- struct perf_event *group_leader = event->group_leader;
+ struct perf_event *group_leader = event->group_leader, *pos;
/*
* We can have double attach due to group movement in perf_event_open.
list_add_tail(&event->group_entry, &group_leader->sibling_list);
group_leader->nr_siblings++;
+
+ perf_event__header_size(group_leader);
+
+ list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+ perf_event__header_size(pos);
}
/*
if (event->group_leader != event) {
list_del_init(&event->group_entry);
event->group_leader->nr_siblings--;
- return;
+ goto out;
}
if (!list_empty(&event->group_entry))
/* Inherit group flags from the previous leader */
sibling->group_flags = event->group_flags;
}
+
+out:
+ perf_event__header_size(event->group_leader);
+
+ list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+ perf_event__header_size(tmp);
}
static inline int
/*
* not supported on inherited events
*/
- if (event->attr.inherit)
+ if (event->attr.inherit || !is_sampling_event(event))
return -EINVAL;
atomic_add(refresh, &event->event_limit);
return perf_event_release_kernel(event);
}
-static int perf_event_read_size(struct perf_event *event)
-{
- int entry = sizeof(u64); /* value */
- int size = 0;
- int nr = 1;
-
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- size += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- size += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_ID)
- entry += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
- nr += event->group_leader->nr_siblings;
- size += sizeof(u64);
- }
-
- size += entry * nr;
-
- return size;
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
if (event->state == PERF_EVENT_STATE_ERROR)
return 0;
- if (count < perf_event_read_size(event))
+ if (count < event->read_size)
return -ENOSPC;
WARN_ON_ONCE(event->ctx->parent_ctx);
int ret = 0;
u64 value;
- if (!event->attr.sample_period)
+ if (!is_sampling_event(event))
return -EINVAL;
if (copy_from_user(&value, arg, sizeof(value)))
data->type = sample_type;
header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header);
+ header->size = sizeof(*header) + event->header_size;
header->misc = 0;
header->misc |= perf_misc_flags(regs);
- if (sample_type & PERF_SAMPLE_IP) {
+ if (sample_type & PERF_SAMPLE_IP)
data->ip = perf_instruction_pointer(regs);
- header->size += sizeof(data->ip);
- }
-
if (sample_type & PERF_SAMPLE_TID) {
/* namespace issues */
data->tid_entry.pid = perf_event_pid(event, current);
data->tid_entry.tid = perf_event_tid(event, current);
-
- header->size += sizeof(data->tid_entry);
}
- if (sample_type & PERF_SAMPLE_TIME) {
+ if (sample_type & PERF_SAMPLE_TIME)
data->time = perf_clock();
- header->size += sizeof(data->time);
- }
-
- if (sample_type & PERF_SAMPLE_ADDR)
- header->size += sizeof(data->addr);
-
- if (sample_type & PERF_SAMPLE_ID) {
+ if (sample_type & PERF_SAMPLE_ID)
data->id = primary_event_id(event);
- header->size += sizeof(data->id);
- }
-
- if (sample_type & PERF_SAMPLE_STREAM_ID) {
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
data->stream_id = event->id;
- header->size += sizeof(data->stream_id);
- }
-
if (sample_type & PERF_SAMPLE_CPU) {
data->cpu_entry.cpu = raw_smp_processor_id();
data->cpu_entry.reserved = 0;
-
- header->size += sizeof(data->cpu_entry);
}
- if (sample_type & PERF_SAMPLE_PERIOD)
- header->size += sizeof(data->period);
-
- if (sample_type & PERF_SAMPLE_READ)
- header->size += perf_event_read_size(event);
-
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
.header = {
.type = PERF_RECORD_READ,
.misc = 0,
- .size = sizeof(read_event) + perf_event_read_size(event),
+ .size = sizeof(read_event) + event->read_size,
},
.pid = perf_event_pid(event, task),
.tid = perf_event_tid(event, task),
struct hw_perf_event *hwc = &event->hw;
int ret = 0;
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
if (!throttle) {
hwc->interrupts++;
} else {
if (!regs)
return;
- if (!hwc->sample_period)
+ if (!is_sampling_event(event))
return;
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
struct hw_perf_event *hwc = &event->hw;
struct hlist_head *head;
- if (hwc->sample_period) {
+ if (is_sampling_event(event)) {
hwc->last_period = hwc->sample_period;
perf_swevent_set_period(event);
}
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
- /*
- * Raw tracepoint data is a severe data leak, only allow root to
- * have these.
- */
- if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
- perf_paranoid_tracepoint_raw() &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
err = perf_trace_init(event);
if (err)
return err;
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
+ s64 period;
+
+ if (!is_sampling_event(event))
+ return;
hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hwc->hrtimer.function = perf_swevent_hrtimer;
- if (hwc->sample_period) {
- s64 period = local64_read(&hwc->period_left);
- if (period) {
- if (period < 0)
- period = 10000;
+ period = local64_read(&hwc->period_left);
+ if (period) {
+ if (period < 0)
+ period = 10000;
- local64_set(&hwc->period_left, 0);
- } else {
- period = max_t(u64, 10000, hwc->sample_period);
- }
- __hrtimer_start_range_ns(&hwc->hrtimer,
+ local64_set(&hwc->period_left, 0);
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
ns_to_ktime(period), 0,
HRTIMER_MODE_REL_PINNED, 0);
- }
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (hwc->sample_period) {
+ if (is_sampling_event(event)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
list_add_tail(&event->owner_entry, ¤t->perf_event_list);
mutex_unlock(¤t->perf_event_mutex);
+ /*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(event);
+
/*
* Drop the reference on the group_event after placing the
* new event on the sibling_list. This ensures destruction
.extra2 = &one,
},
#endif
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
- {
- .procname = "unknown_nmi_panic",
- .data = &unknown_nmi_panic,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "nmi_watchdog",
- .data = &nmi_watchdog_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_nmi_enabled,
- },
-#endif
#if defined(CONFIG_X86)
{
.procname = "panic_on_unrecovered_nmi",
/* Count the events in use (per event id, not per instance) */
static int total_ref_count;
+static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
+{
+ /* No tracing, just counting, so no obvious leak */
+ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
+ return 0;
+
+ /* Some events are ok to be traced by non-root users... */
+ if (p_event->attach_state == PERF_ATTACH_TASK) {
+ if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
+ return 0;
+ }
+
+ /*
+ * ...otherwise raw tracepoint data can be a severe data leak,
+ * only allow root to have these.
+ */
+ if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return 0;
+}
+
static int perf_trace_event_init(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
struct hlist_head __percpu *list;
- int ret = -ENOMEM;
+ int ret;
int cpu;
+ ret = perf_trace_event_perm(tp_event, p_event);
+ if (ret)
+ return ret;
+
p_event->tp_event = tp_event;
if (tp_event->perf_refcount++ > 0)
return 0;
+ ret = -ENOMEM;
+
list = alloc_percpu(struct hlist_head);
if (!list)
goto fail;
.notifier_call = cpu_callback
};
-static int __init spawn_watchdog_task(void)
+void __init lockup_detector_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err;
if (no_watchdog)
- return 0;
+ return;
err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
WARN_ON(notifier_to_errno(err));
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
- return 0;
+ return;
}
-early_initcall(spawn_watchdog_task);
-I DEFINE_TRACE,EXPORT_TRACEPOINT_SYMBOL,EXPORT_TRACEPOINT_SYMBOL_GPL \
--extra=+f --c-kinds=-px \
--regex-asm='/^ENTRY\(([^)]*)\).*/\1/' \
- --regex-c='/^SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/sys_\1/'
+ --regex-c='/^SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/sys_\1/' \
+ --regex-c++='/^TRACE_EVENT\(([^,)]*).*/trace_\1/' \
+ --regex-c++='/^DEFINE_EVENT\(([^,)]*).*/trace_\1/'
all_kconfigs | xargs $1 -a \
--langdef=kconfig --language-force=kconfig \
--input=::
Input file name. (default: perf.data)
+-d::
+--dsos=<dso[,dso...]>::
+ Only consider symbols in these dsos.
+-s::
+--symbol=<symbol>::
+ Symbol to annotate.
+
+-f::
+--force::
+ Don't complain, do it.
+
+-v::
+--verbose::
+ Be more verbose. (Show symbol address, etc)
+
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
+-k::
+--vmlinux=<file>::
+ vmlinux pathname.
+
+-m::
+--modules::
+ Load module symbols. WARNING: use only with -k and LIVE kernel.
+
+-l::
+--print-line::
+ Print matching source lines (may be slow).
+
+-P::
+--full-paths::
+ Don't shorten the displayed pathnames.
+
--stdio:: Use the stdio interface.
--tui:: Use the TUI interface Use of --tui requires a tty, if one is not
present, as when piping to other commands, the stdio interface is
used. This interfaces starts by centering on the line with more
- samples, TAB/UNTAB cycles thru the lines with more samples.
+ samples, TAB/UNTAB cycles through the lines with more samples.
SEE ALSO
--------
OPTIONS
-------
+-H::
+--with-hits::
+ Show only DSOs with hits.
-i::
--input=::
Input file name. (default: perf.data)
OPTIONS
-------
+-M::
+--displacement::
+ Show position displacement relative to baseline.
+
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
+-m::
+--modules::
+ Load module symbols. WARNING: use only with -k and LIVE kernel
+
-d::
--dsos=::
Only consider symbols in these dsos. CSV that understands
--field-separator=::
Use a special separator character and don't pad with spaces, replacing
- all occurances of this separator in symbol names (and other output)
+ all occurrences of this separator in symbol names (and other output)
with a '.' character, that thus it's the only non valid separator.
-v::
Be verbose, for instance, show the raw counts in addition to the
diff.
+-f::
+--force::
+ Don't complain, do it.
+
+
SEE ALSO
--------
linkperf:perf-record[1]
a performance counter profile of guest os in realtime
of an arbitrary workload.
- 'perf kvm record <command>' to record the performance couinter profile
+ 'perf kvm record <command>' to record the performance counter profile
of an arbitrary workload and save it into a perf data file. If both
--host and --guest are input, the perf data file name is perf.data.kvm.
If there is no --host but --guest, the file name is perf.data.guest.
OPTIONS
-------
+-i::
+--input=::
+ Input file name.
+-o::
+--output::
+ Output file name.
--host=::
Collect host side performance profile.
--guest=::
'perf lock report' reports statistical data.
+OPTIONS
+-------
+
+-i::
+--input=<file>::
+ Input file name.
+
+-v::
+--verbose::
+ Be more verbose (show symbol address, etc).
+
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
SEE ALSO
--------
linkperf:perf[1]
LINE SYNTAX
-----------
-Line range is descripted by following syntax.
+Line range is described by following syntax.
"FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]"
be passed as follows: '\mem:addr[:[r][w][x]]'.
If you want to profile read-write accesses in 0x1000, just set
'mem:0x1000:rw'.
+
+--filter=<filter>::
+ Event filter.
+
-a::
- System-wide collection.
+--all-cpus::
+ System-wide collection from all CPUs.
-l::
Scale counter values.
-p::
--pid=::
- Record events on existing pid.
+ Record events on existing process ID.
+
+-t::
+--tid=::
+ Record events on existing thread ID.
-r::
--realtime=::
-C::
--cpu::
-Collect samples only on the list of cpus provided. Multiple CPUs can be provided as a
-comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Collect samples only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
In per-thread mode with inheritance mode on (default), samples are captured only when
the thread executes on the designated CPUs. Default is to monitor all CPUs.
-i::
--input=::
Input file name. (default: perf.data)
+
+-v::
+--verbose::
+ Be more verbose. (show symbol address, etc)
+
-d::
--dsos=::
Only consider symbols in these dsos. CSV that understands
-n::
--show-nr-samples::
Show the number of samples for each symbol
+
+--showcpuutilization::
+ Show sample percentage for different cpu modes.
+
-T::
--threads::
Show per-thread event counters
Only consider these symbols. CSV that understands
file://filename entries.
+-U::
+--hide-unresolved::
+ Only display entries resolved to a symbol.
+
-s::
--sort=::
Sort by key(s): pid, comm, dso, symbol, parent.
+-p::
+--parent=<regex>::
+ regex filter to identify parent, see: '--sort parent'
+
+-x::
+--exclude-other::
+ Only display entries with parent-match.
+
-w::
---field-width=::
+--column-widths=<width[,width...]>::
Force each column width to the provided list, for large terminal
readability.
--field-separator=::
Use a special separator character and don't pad with spaces, replacing
- all occurances of this separator in symbol names (and other output)
+ all occurrences of this separator in symbol names (and other output)
with a '.' character, that thus it's the only non valid separator.
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
-g [type,min]::
--call-graph::
- Display callchains using type and min percent threshold.
+ Display call chains using type and min percent threshold.
type can be either:
- - flat: single column, linear exposure of callchains.
+ - flat: single column, linear exposure of call chains.
- graph: use a graph tree, displaying absolute overhead rates.
- fractal: like graph, but displays relative rates. Each branch of
the tree is considered as a new profiled object. +
Default: fractal,0.5.
+--pretty=<key>::
+ Pretty printing style. key: normal, raw
+
--stdio:: Use the stdio interface.
--tui:: Use the TUI interface, that is integrated with annotate and allows
requires a tty, if one is not present, as when piping to other
commands, the stdio interface is used.
+-k::
+--vmlinux=<file>::
+ vmlinux pathname
+
+-m::
+--modules::
+ Load module symbols. WARNING: This should only be used with -k and
+ a LIVE kernel.
+
+-f::
+--force::
+ Don't complain, do it.
+
SEE ALSO
--------
linkperf:perf-stat[1]
SYNOPSIS
--------
[verse]
-'perf sched' {record|latency|replay|trace}
+'perf sched' {record|latency|map|replay|trace}
DESCRIPTION
-----------
-There are four variants of perf sched:
+There are five variants of perf sched:
'perf sched record <command>' to record the scheduling events
of an arbitrary workload.
of the workload as it occurred when it was recorded - and can repeat
it a number of times, measuring its performance.)
+ 'perf sched map' to print a textual context-switching outline of
+ workload captured via perf sched record. Columns stand for
+ individual CPUs, and the two-letter shortcuts stand for tasks that
+ are running on a CPU. A '*' denotes the CPU that had the event, and
+ a dot signals an idle CPU.
+
OPTIONS
-------
+-i::
+--input=<file>::
+ Input file name. (default: perf.data)
+
+-v::
+--verbose::
+ Be more verbose. (show symbol address, etc)
+
-D::
--dump-raw-trace=::
Display verbose dump of the sched data.
-perf-trace-perl(1)
+perf-script-perl(1)
==================
NAME
----
-perf-trace-perl - Process trace data with a Perl script
+perf-script-perl - Process trace data with a Perl script
SYNOPSIS
--------
[verse]
-'perf trace' [-s [Perl]:script[.pl] ]
+'perf script' [-s [Perl]:script[.pl] ]
DESCRIPTION
-----------
-This perf trace option is used to process perf trace data using perf's
+This perf script option is used to process perf script data using perf's
built-in Perl interpreter. It reads and processes the input file and
displays the results of the trace analysis implemented in the given
Perl script, if any.
STARTER SCRIPTS
---------------
-You can avoid reading the rest of this document by running 'perf trace
+You can avoid reading the rest of this document by running 'perf script
-g perl' in the same directory as an existing perf.data trace file.
That will generate a starter script containing a handler for each of
the event types in the trace file; it simply prints every available
You can also look at the existing scripts in
~/libexec/perf-core/scripts/perl for typical examples showing how to
do basic things like aggregate event data, print results, etc. Also,
-the check-perf-trace.pl script, while not interesting for its results,
+the check-perf-script.pl script, while not interesting for its results,
attempts to exercise all of the main scripting features.
EVENT HANDLERS
--------------
-When perf trace is invoked using a trace script, a user-defined
+When perf script is invoked using a trace script, a user-defined
'handler function' is called for each event in the trace. If there's
no handler function defined for a given event type, the event is
ignored (or passed to a 'trace_handled' function, see below) and the
SCRIPT LAYOUT
-------------
-Every perf trace Perl script should start by setting up a Perl module
+Every perf script Perl script should start by setting up a Perl module
search path and 'use'ing a few support modules (see module
descriptions below):
----
- use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
- use lib "./Perf-Trace-Util/lib";
+ use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/perf-script-Util/lib";
+ use lib "./perf-script-Util/lib";
use Perf::Trace::Core;
use Perf::Trace::Context;
use Perf::Trace::Util;
----
The remaining sections provide descriptions of each of the available
-built-in perf trace Perl modules and their associated functions.
+built-in perf script Perl modules and their associated functions.
AVAILABLE MODULES AND FUNCTIONS
-------------------------------
The following sections describe the functions and variables available
via the various Perf::Trace::* Perl modules. To use the functions and
variables from the given module, add the corresponding 'use
-Perf::Trace::XXX' line to your perf trace script.
+Perf::Trace::XXX' line to your perf script script.
Perf::Trace::Core Module
~~~~~~~~~~~~~~~~~~~~~~~~
Perf::Trace::Util Module
~~~~~~~~~~~~~~~~~~~~~~~~
-Various utility functions for use with perf trace:
+Various utility functions for use with perf script:
nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair
nsecs_secs($nsecs) - returns whole secs portion given nsecs
SEE ALSO
--------
-linkperf:perf-trace[1]
+linkperf:perf-script[1]
-perf-trace-python(1)
+perf-script-python(1)
====================
NAME
----
-perf-trace-python - Process trace data with a Python script
+perf-script-python - Process trace data with a Python script
SYNOPSIS
--------
[verse]
-'perf trace' [-s [Python]:script[.py] ]
+'perf script' [-s [Python]:script[.py] ]
DESCRIPTION
-----------
-This perf trace option is used to process perf trace data using perf's
+This perf script option is used to process perf script data using perf's
built-in Python interpreter. It reads and processes the input file and
displays the results of the trace analysis implemented in the given
Python script, if any.
This section shows the process, start to finish, of creating a working
Python script that aggregates and extracts useful information from a
-raw perf trace stream. You can avoid reading the rest of this
+raw perf script stream. You can avoid reading the rest of this
document if an example is enough for you; the rest of the document
provides more details on each step and lists the library functions
available to script writers.
This example actually details the steps that were used to create the
-'syscall-counts' script you see when you list the available perf trace
-scripts via 'perf trace -l'. As such, this script also shows how to
-integrate your script into the list of general-purpose 'perf trace'
+'syscall-counts' script you see when you list the available perf script
+scripts via 'perf script -l'. As such, this script also shows how to
+integrate your script into the list of general-purpose 'perf script'
scripts listed by that command.
The syscall-counts script is a simple script, but demonstrates all the
called perf.data.
Once we have a perf.data file containing our data, we can use the -g
-'perf trace' option to generate a Python script that will contain a
+'perf script' option to generate a Python script that will contain a
callback handler for each event type found in the perf.data trace
stream (for more details, see the STARTER SCRIPTS section).
----
-# perf trace -g python
-generated Python script: perf-trace.py
+# perf script -g python
+generated Python script: perf-script.py
The output file created also in the current directory is named
-perf-trace.py. Here's the file in its entirety:
+perf-script.py. Here's the file in its entirety:
-# perf trace event handlers, generated by perf trace -g python
+# perf script event handlers, generated by perf script -g python
# Licensed under the terms of the GNU GPL License version 2
# The common_* event handler fields are the most useful fields common to
# all events. They don't necessarily correspond to the 'common_*' fields
# in the format files. Those fields not available as handler params can
# be retrieved using Python functions of the form common_*(context).
-# See the perf-trace-python Documentation for the list of available functions.
+# See the perf-script-python Documentation for the list of available functions.
import os
import sys
sys.path.append(os.environ['PERF_EXEC_PATH'] + \
- '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+ '/scripts/python/perf-script-Util/lib/Perf/Trace')
from perf_trace_context import *
from Core import *
----
At the top is a comment block followed by some import statements and a
-path append which every perf trace script should include.
+path append which every perf script script should include.
Following that are a couple generated functions, trace_begin() and
trace_end(), which are called at the beginning and the end of the
script and run it to see the default output:
----
-# mv perf-trace.py syscall-counts.py
-# perf trace -s syscall-counts.py
+# mv perf-script.py syscall-counts.py
+# perf script -s syscall-counts.py
raw_syscalls__sys_enter 1 00840.847582083 7506 perf id=1, args=
raw_syscalls__sys_enter 1 00840.847595764 7506 perf id=1, args=
import sys
sys.path.append(os.environ['PERF_EXEC_PATH'] + \
- '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+ '/scripts/python/perf-script-Util/lib/Perf/Trace')
from perf_trace_context import *
from Core import *
import sys
sys.path.append(os.environ['PERF_EXEC_PATH'] + \
- '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+ '/scripts/python/perf-script-Util/lib/Perf/Trace')
from perf_trace_context import *
from Core import *
The script can be run just as before:
- # perf trace -s syscall-counts.py
+ # perf script -s syscall-counts.py
So those are the essential steps in writing and running a script. The
process can be generalized to any tracepoint or set of tracepoints
'perf list' and/or look in /sys/kernel/debug/tracing events for
detailed event and field info, record the corresponding trace data
using 'perf record', passing it the list of interesting events,
-generate a skeleton script using 'perf trace -g python' and modify the
+generate a skeleton script using 'perf script -g python' and modify the
code to aggregate and display it for your particular needs.
After you've done that you may end up with a general-purpose script
that you want to keep around and have available for future use. By
writing a couple of very simple shell scripts and putting them in the
right place, you can have your script listed alongside the other
-scripts listed by the 'perf trace -l' command e.g.:
+scripts listed by the 'perf script -l' command e.g.:
----
-root@tropicana:~# perf trace -l
+root@tropicana:~# perf script -l
List of available trace scripts:
workqueue-stats workqueue stats (ins/exe/create/destroy)
wakeup-latency system-wide min/max/avg wakeup latency
The 'report' script is also a shell script with the same base name as
your script, but with -report appended. It should also be located in
the perf/scripts/python/bin directory. In that script, you write the
-'perf trace -s' command-line needed for running your script:
+'perf script -s' command-line needed for running your script:
----
# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report
#!/bin/bash
# description: system-wide syscall counts
-perf trace -s ~/libexec/perf-core/scripts/python/syscall-counts.py
+perf script -s ~/libexec/perf-core/scripts/python/syscall-counts.py
----
Note that the location of the Python script given in the shell script
drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 .
drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 ..
drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin
--rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-trace.py
-drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 Perf-Trace-Util
+-rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-script.py
+drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 perf-script-Util
-rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py
----
Once you've done that (don't forget to do a new 'make install',
-otherwise your script won't show up at run-time), 'perf trace -l'
+otherwise your script won't show up at run-time), 'perf script -l'
should show a new entry for your script:
----
-root@tropicana:~# perf trace -l
+root@tropicana:~# perf script -l
List of available trace scripts:
workqueue-stats workqueue stats (ins/exe/create/destroy)
wakeup-latency system-wide min/max/avg wakeup latency
syscall-counts system-wide syscall counts
----
-You can now perform the record step via 'perf trace record':
+You can now perform the record step via 'perf script record':
- # perf trace record syscall-counts
+ # perf script record syscall-counts
-and display the output using 'perf trace report':
+and display the output using 'perf script report':
- # perf trace report syscall-counts
+ # perf script report syscall-counts
STARTER SCRIPTS
---------------
You can quickly get started writing a script for a particular set of
-trace data by generating a skeleton script using 'perf trace -g
+trace data by generating a skeleton script using 'perf script -g
python' in the same directory as an existing perf.data trace file.
That will generate a starter script containing a handler for each of
the event types in the trace file; it simply prints every available
You can also look at the existing scripts in
~/libexec/perf-core/scripts/python for typical examples showing how to
do basic things like aggregate event data, print results, etc. Also,
-the check-perf-trace.py script, while not interesting for its results,
+the check-perf-script.py script, while not interesting for its results,
attempts to exercise all of the main scripting features.
EVENT HANDLERS
--------------
-When perf trace is invoked using a trace script, a user-defined
+When perf script is invoked using a trace script, a user-defined
'handler function' is called for each event in the trace. If there's
no handler function defined for a given event type, the event is
ignored (or passed to a 'trace_handled' function, see below) and the
SCRIPT LAYOUT
-------------
-Every perf trace Python script should start by setting up a Python
+Every perf script Python script should start by setting up a Python
module search path and 'import'ing a few support modules (see module
descriptions below):
import sys
sys.path.append(os.environ['PERF_EXEC_PATH'] + \
- '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+ '/scripts/python/perf-script-Util/lib/Perf/Trace')
from perf_trace_context import *
from Core import *
----
The remaining sections provide descriptions of each of the available
-built-in perf trace Python modules and their associated functions.
+built-in perf script Python modules and their associated functions.
AVAILABLE MODULES AND FUNCTIONS
-------------------------------
The following sections describe the functions and variables available
-via the various perf trace Python modules. To use the functions and
+via the various perf script Python modules. To use the functions and
variables from the given module, add the corresponding 'from XXXX
-import' line to your perf trace script.
+import' line to your perf script script.
Core.py Module
~~~~~~~~~~~~~~
Util.py Module
~~~~~~~~~~~~~~
-Various utility functions for use with perf trace:
+Various utility functions for use with perf script:
nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair
nsecs_secs(nsecs) - returns whole secs portion given nsecs
SEE ALSO
--------
-linkperf:perf-trace[1]
+linkperf:perf-script[1]
-perf-trace(1)
+perf-script(1)
=============
NAME
----
-perf-trace - Read perf.data (created by perf record) and display trace output
+perf-script - Read perf.data (created by perf record) and display trace output
SYNOPSIS
--------
[verse]
-'perf trace' [<options>]
-'perf trace' [<options>] record <script> [<record-options>] <command>
-'perf trace' [<options>] report <script> [script-args]
-'perf trace' [<options>] <script> <required-script-args> [<record-options>] <command>
-'perf trace' [<options>] <top-script> [script-args]
+'perf script' [<options>]
+'perf script' [<options>] record <script> [<record-options>] <command>
+'perf script' [<options>] report <script> [script-args]
+'perf script' [<options>] <script> <required-script-args> [<record-options>] <command>
+'perf script' [<options>] <top-script> [script-args]
DESCRIPTION
-----------
This command reads the input file and displays the trace recorded.
-There are several variants of perf trace:
+There are several variants of perf script:
- 'perf trace' to see a detailed trace of the workload that was
+ 'perf script' to see a detailed trace of the workload that was
recorded.
You can also run a set of pre-canned scripts that aggregate and
summarize the raw trace data in various ways (the list of scripts is
- available via 'perf trace -l'). The following variants allow you to
+ available via 'perf script -l'). The following variants allow you to
record and run those scripts:
- 'perf trace record <script> <command>' to record the events required
- for 'perf trace report'. <script> is the name displayed in the
- output of 'perf trace --list' i.e. the actual script name minus any
+ 'perf script record <script> <command>' to record the events required
+ for 'perf script report'. <script> is the name displayed in the
+ output of 'perf script --list' i.e. the actual script name minus any
language extension. If <command> is not specified, the events are
recorded using the -a (system-wide) 'perf record' option.
- 'perf trace report <script> [args]' to run and display the results
+ 'perf script report <script> [args]' to run and display the results
of <script>. <script> is the name displayed in the output of 'perf
trace --list' i.e. the actual script name minus any language
- extension. The perf.data output from a previous run of 'perf trace
+ extension. The perf.data output from a previous run of 'perf script
record <script>' is used and should be present for this command to
succeed. [args] refers to the (mainly optional) args expected by
the script.
- 'perf trace <script> <required-script-args> <command>' to both
+ 'perf script <script> <required-script-args> <command>' to both
record the events required for <script> and to run the <script>
using 'live-mode' i.e. without writing anything to disk. <script>
- is the name displayed in the output of 'perf trace --list' i.e. the
+ is the name displayed in the output of 'perf script --list' i.e. the
actual script name minus any language extension. If <command> is
not specified, the events are recorded using the -a (system-wide)
'perf record' option. If <script> has any required args, they
should be specified before <command>. This mode doesn't allow for
optional script args to be specified; if optional script args are
- desired, they can be specified using separate 'perf trace record'
- and 'perf trace report' commands, with the stdout of the record step
+ desired, they can be specified using separate 'perf script record'
+ and 'perf script report' commands, with the stdout of the record step
piped to the stdin of the report script, using the '-o -' and '-i -'
options of the corresponding commands.
- 'perf trace <top-script>' to both record the events required for
+ 'perf script <top-script>' to both record the events required for
<top-script> and to run the <top-script> using 'live-mode'
i.e. without writing anything to disk. <top-script> is the name
- displayed in the output of 'perf trace --list' i.e. the actual
+ displayed in the output of 'perf script --list' i.e. the actual
script name minus any language extension; a <top-script> is defined
as any script name ending with the string 'top'.
- [<record-options>] can be passed to the record steps of 'perf trace
+ [<record-options>] can be passed to the record steps of 'perf script
record' and 'live-mode' variants; this isn't possible however for
- <top-script> 'live-mode' or 'perf trace report' variants.
+ <top-script> 'live-mode' or 'perf script report' variants.
See the 'SEE ALSO' section for links to language-specific
information on how to write and run your own trace scripts.
Any command you can specify in a shell.
-D::
---dump-raw-trace=::
+--dump-raw-script=::
Display verbose dump of the trace data.
-L::
-g::
--gen-script=::
- Generate perf-trace.[ext] starter script for given language,
+ Generate perf-script.[ext] starter script for given language,
using current perf.data.
-a::
normally don't - this option allows the latter to be run in
system-wide mode.
+-i::
+--input=::
+ Input file name.
+
+-d::
+--debug-mode::
+ Do various checks like samples ordering and lost events.
SEE ALSO
--------
-linkperf:perf-record[1], linkperf:perf-trace-perl[1],
-linkperf:perf-trace-python[1]
+linkperf:perf-record[1], linkperf:perf-script-perl[1],
+linkperf:perf-script-python[1]
SYNOPSIS
--------
[verse]
-'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] <command>
-'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] -- <command> [<options>]
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] -- <command> [<options>]
DESCRIPTION
-----------
child tasks do not inherit counters
-p::
--pid=<pid>::
- stat events on existing pid
+ stat events on existing process id
+
+-t::
+--tid=<tid>::
+ stat events on existing thread id
+
-a::
- system-wide collection
+--all-cpus::
+ system-wide collection from all CPUs
-c::
- scale counter values
+--scale::
+ scale/normalize counter values
+
+-r::
+--repeat=<n>::
+ repeat command and print average + stddev (max: 100)
-B::
+--big-num::
print large numbers with thousands' separators according to locale
-C::
--cpu=::
-Count only on the list of cpus provided. Multiple CPUs can be provided as a
-comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Count only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
In per-thread mode, this option is ignored. The -a option is still necessary
to activate system-wide monitoring. Default is to count on all CPUs.
+-A::
+--no-aggr::
+Do not aggregate counts across all monitored CPUs in system-wide mode (-a).
+This option is only valid in system-wide mode.
+
+-n::
+--null::
+ null run - don't start any counters
+
+-v::
+--verbose::
+ be more verbose (show counter open errors, etc)
+
+-x SEP::
+--field-separator SEP::
+print counts using a CSV-style output to make it easy to import directly into
+spreadsheets. Columns are separated by the string specified in SEP.
+
EXAMPLES
--------
DESCRIPTION
-----------
-This command does assorted sanity tests, initially thru linked routines but
+This command does assorted sanity tests, initially through linked routines but
also will look for a directory with more tests in the form of scripts.
OPTIONS
DESCRIPTION
-----------
-This command generates and displays a performance counter profile in realtime.
+This command generates and displays a performance counter profile in real time.
OPTIONS
-C <cpu-list>::
--cpu=<cpu>::
-Monitor only on the list of cpus provided. Multiple CPUs can be provided as a
-comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
Default is to monitor all CPUS.
-d <seconds>::
--count-filter=<count>::
Only display functions with more events than this.
+-g::
+--group::
+ Put the counters into a counter group.
+
-F <freq>::
--freq=<freq>::
Profile at this frequency.
-p <pid>::
--pid=<pid>::
- Profile events on existing pid.
+ Profile events on existing Process ID.
+
+-t <tid>::
+--tid=<tid>::
+ Profile events on existing thread ID.
-r <priority>::
--realtime=<priority>::
--sym-annotate=<symbol>::
Annotate this symbol.
+-K::
+--hide_kernel_symbols::
+ Hide kernel symbols.
+
+-U::
+--hide_user_symbols::
+ Hide user symbols.
+
+-D::
+--dump-symtab::
+ Dump the symbol table used for profiling.
+
-v::
--verbose::
Be more verbose (show counter open errors, etc).
lib/rbtree.c
include/linux/swab.h
arch/*/include/asm/unistd*.h
+arch/*/lib/memcpy*.S
include/linux/poison.h
include/linux/magic.h
include/linux/hw_breakpoint.h
ARCH := x86
endif
ifeq ($(ARCH),x86_64)
+ RAW_ARCH := x86_64
ARCH := x86
+ ARCH_CFLAGS := -DARCH_X86_64
+ ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
endif
# CFLAGS and LDFLAGS are for the users to override from the command line.
LIB_H += util/include/linux/rbtree.h
LIB_H += util/include/linux/string.h
LIB_H += util/include/linux/types.h
+LIB_H += util/include/linux/linkage.h
LIB_H += util/include/asm/asm-offsets.h
LIB_H += util/include/asm/bug.h
LIB_H += util/include/asm/byteorder.h
LIB_H += util/include/asm/system.h
LIB_H += util/include/asm/uaccess.h
LIB_H += util/include/dwarf-regs.h
+LIB_H += util/include/asm/dwarf2.h
+LIB_H += util/include/asm/cpufeature.h
LIB_H += perf.h
LIB_H += util/cache.h
LIB_H += util/callchain.h
LIB_H += util/probe-event.h
LIB_H += util/pstack.h
LIB_H += util/cpumap.h
+LIB_H += $(ARCH_INCLUDE)
LIB_OBJS += $(OUTPUT)util/abspath.o
LIB_OBJS += $(OUTPUT)util/alias.o
# Benchmark modules
BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+ifeq ($(RAW_ARCH),x86_64)
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+endif
BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
BUILTIN_OBJS += $(OUTPUT)builtin-stat.o
BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o
BUILTIN_OBJS += $(OUTPUT)builtin-top.o
-BUILTIN_OBJS += $(OUTPUT)builtin-trace.o
+BUILTIN_OBJS += $(OUTPUT)builtin-script.o
BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
-include config.mak
ifndef NO_DWARF
-FLAGS_DWARF=$(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
+FLAGS_DWARF=$(ALL_CFLAGS) -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y)
msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
NO_DWARF := 1
ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
else
- BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT
+ BASIC_CFLAGS += -DDWARF_SUPPORT
EXTLIBS += -lelf -ldw
LIB_OBJS += $(OUTPUT)util/probe-finder.o
endif # PERF_HAVE_DWARF_REGS
LIB_OBJS += $(COMPAT_OBJS)
ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_CFLAGS += $(ARCH_CFLAGS)
ALL_LDFLAGS += $(BASIC_LDFLAGS)
export TAR INSTALL DESTDIR SHELL_PATH
--- /dev/null
+
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) \
+ extern void *fn(void *, const void *, size_t);
+
+#include "mem-memcpy-x86-64-asm-def.h"
+
+#undef MEMCPY_FN
+
+#endif
+
--- /dev/null
+
+MEMCPY_FN(__memcpy,
+ "x86-64-unrolled",
+ "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
--- /dev/null
+
+#include "../../../arch/x86/lib/memcpy_64.S"
#include "../util/parse-options.h"
#include "../util/header.h"
#include "bench.h"
+#include "mem-memcpy-arch.h"
#include <stdio.h>
#include <stdlib.h>
static const char *length_str = "1MB";
static const char *routine = "default";
-static bool use_clock = false;
+static bool use_clock;
static int clock_fd;
+static bool only_prefault;
+static bool no_prefault;
static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
"Specify routine to copy"),
OPT_BOOLEAN('c', "clock", &use_clock,
"Use CPU clock for measuring"),
+ OPT_BOOLEAN('o', "only-prefault", &only_prefault,
+ "Show only the result with page faults before memcpy()"),
+ OPT_BOOLEAN('n', "no-prefault", &no_prefault,
+ "Show only the result without page faults before memcpy()"),
OPT_END()
};
+typedef void *(*memcpy_t)(void *, const void *, size_t);
+
struct routine {
const char *name;
const char *desc;
- void * (*fn)(void *dst, const void *src, size_t len);
+ memcpy_t fn;
};
struct routine routines[] = {
{ "default",
"Default memcpy() provided by glibc",
memcpy },
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
+#include "mem-memcpy-x86-64-asm-def.h"
+#undef MEMCPY_FN
+
+#endif
+
{ NULL,
NULL,
NULL }
(double)ts->tv_usec / (double)1000000;
}
+static void alloc_mem(void **dst, void **src, size_t length)
+{
+ *dst = zalloc(length);
+ if (!dst)
+ die("memory allocation failed - maybe length is too large?\n");
+
+ *src = zalloc(length);
+ if (!src)
+ die("memory allocation failed - maybe length is too large?\n");
+}
+
+static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
+{
+ u64 clock_start = 0ULL, clock_end = 0ULL;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ clock_start = get_clock();
+ fn(dst, src, len);
+ clock_end = get_clock();
+
+ free(src);
+ free(dst);
+ return clock_end - clock_start;
+}
+
+static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
+{
+ struct timeval tv_start, tv_end, tv_diff;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ BUG_ON(gettimeofday(&tv_start, NULL));
+ fn(dst, src, len);
+ BUG_ON(gettimeofday(&tv_end, NULL));
+
+ timersub(&tv_end, &tv_start, &tv_diff);
+
+ free(src);
+ free(dst);
+ return (double)((double)len / timeval2double(&tv_diff));
+}
+
+#define pf (no_prefault ? 0 : 1)
+
+#define print_bps(x) do { \
+ if (x < K) \
+ printf(" %14lf B/Sec", x); \
+ else if (x < K * K) \
+ printf(" %14lfd KB/Sec", x / K); \
+ else if (x < K * K * K) \
+ printf(" %14lf MB/Sec", x / K / K); \
+ else \
+ printf(" %14lf GB/Sec", x / K / K / K); \
+ } while (0)
+
int bench_mem_memcpy(int argc, const char **argv,
const char *prefix __used)
{
int i;
- void *dst, *src;
- size_t length;
- double bps = 0.0;
- struct timeval tv_start, tv_end, tv_diff;
- u64 clock_start, clock_end, clock_diff;
+ size_t len;
+ double result_bps[2];
+ u64 result_clock[2];
- clock_start = clock_end = clock_diff = 0ULL;
argc = parse_options(argc, argv, options,
bench_mem_memcpy_usage, 0);
- tv_diff.tv_sec = 0;
- tv_diff.tv_usec = 0;
- length = (size_t)perf_atoll((char *)length_str);
+ if (use_clock)
+ init_clock();
+
+ len = (size_t)perf_atoll((char *)length_str);
- if ((s64)length <= 0) {
+ result_clock[0] = result_clock[1] = 0ULL;
+ result_bps[0] = result_bps[1] = 0.0;
+
+ if ((s64)len <= 0) {
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}
+ /* same to without specifying either of prefault and no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
for (i = 0; routines[i].name; i++) {
if (!strcmp(routines[i].name, routine))
break;
return 1;
}
- dst = zalloc(length);
- if (!dst)
- die("memory allocation failed - maybe length is too large?\n");
-
- src = zalloc(length);
- if (!src)
- die("memory allocation failed - maybe length is too large?\n");
-
- if (bench_format == BENCH_FORMAT_DEFAULT) {
- printf("# Copying %s Bytes from %p to %p ...\n\n",
- length_str, src, dst);
- }
-
- if (use_clock) {
- init_clock();
- clock_start = get_clock();
- } else {
- BUG_ON(gettimeofday(&tv_start, NULL));
- }
-
- routines[i].fn(dst, src, length);
+ if (bench_format == BENCH_FORMAT_DEFAULT)
+ printf("# Copying %s Bytes ...\n\n", length_str);
- if (use_clock) {
- clock_end = get_clock();
- clock_diff = clock_end - clock_start;
+ if (!only_prefault && !no_prefault) {
+ /* show both of results */
+ if (use_clock) {
+ result_clock[0] =
+ do_memcpy_clock(routines[i].fn, len, false);
+ result_clock[1] =
+ do_memcpy_clock(routines[i].fn, len, true);
+ } else {
+ result_bps[0] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, false);
+ result_bps[1] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, true);
+ }
} else {
- BUG_ON(gettimeofday(&tv_end, NULL));
- timersub(&tv_end, &tv_start, &tv_diff);
- bps = (double)((double)length / timeval2double(&tv_diff));
+ if (use_clock) {
+ result_clock[pf] =
+ do_memcpy_clock(routines[i].fn,
+ len, only_prefault);
+ } else {
+ result_bps[pf] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, only_prefault);
+ }
}
switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
- if (use_clock) {
- printf(" %14lf Clock/Byte\n",
- (double)clock_diff / (double)length);
- } else {
- if (bps < K)
- printf(" %14lf B/Sec\n", bps);
- else if (bps < K * K)
- printf(" %14lfd KB/Sec\n", bps / 1024);
- else if (bps < K * K * K)
- printf(" %14lf MB/Sec\n", bps / 1024 / 1024);
- else {
- printf(" %14lf GB/Sec\n",
- bps / 1024 / 1024 / 1024);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte\n",
+ (double)result_clock[0]
+ / (double)len);
+ printf(" %14lf Clock/Byte (with prefault)\n",
+ (double)result_clock[1]
+ / (double)len);
+ } else {
+ print_bps(result_bps[0]);
+ printf("\n");
+ print_bps(result_bps[1]);
+ printf(" (with prefault)\n");
}
+ } else {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte",
+ (double)result_clock[pf]
+ / (double)len);
+ } else
+ print_bps(result_bps[pf]);
+
+ printf("%s\n", only_prefault ? " (with prefault)" : "");
}
break;
case BENCH_FORMAT_SIMPLE:
- if (use_clock) {
- printf("%14lf\n",
- (double)clock_diff / (double)length);
- } else
- printf("%lf\n", bps);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf("%lf %lf\n",
+ (double)result_clock[0] / (double)len,
+ (double)result_clock[1] / (double)len);
+ } else {
+ printf("%lf %lf\n",
+ result_bps[0], result_bps[1]);
+ }
+ } else {
+ if (use_clock) {
+ printf("%lf\n", (double)result_clock[pf]
+ / (double)len);
+ } else
+ printf("%lf\n", result_bps[pf]);
+ }
break;
default:
/* reaching this means there's some disaster: */
static const struct option options[] = {
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
- OPT_BOOLEAN('m', "displacement", &show_displacement,
+ OPT_BOOLEAN('M', "displacement", &show_displacement,
"Show position displacement relative to baseline"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
usage_with_options(report_usage, report_options);
}
__cmd_report();
- } else if (!strcmp(argv[0], "trace")) {
- /* Aliased to 'perf trace' */
- return cmd_trace(argc, argv, prefix);
+ } else if (!strcmp(argv[0], "script")) {
+ /* Aliased to 'perf script' */
+ return cmd_script(argc, argv, prefix);
} else if (!strcmp(argv[0], "info")) {
if (argc) {
argc = parse_options(argc, argv,
static bool no_samples = false;
static bool sample_address = false;
static bool no_buildid = false;
+static bool no_buildid_cache = false;
static long samples = 0;
static u64 bytes_written = 0;
goto try_again;
}
printf("\n");
- error("perfcounter syscall returned with %d (%s)\n",
+ error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
fd[nr_cpu][counter][thread_index], strerror(err));
#if defined(__i386__) || defined(__x86_64__)
if (!pipe_output) {
session->header.data_size += bytes_written;
- process_buildids();
+ if (!no_buildid)
+ process_buildids();
perf_header__write(&session->header, output, true);
perf_session__delete(session);
symbol__exit();
return -1;
}
+ if (!no_buildid)
+ perf_header__set_feat(&session->header, HEADER_BUILD_ID);
+
if (!file_new) {
err = perf_header__read(session, output);
if (err < 0)
"Sample addresses"),
OPT_BOOLEAN('n', "no-samples", &no_samples,
"don't sample"),
- OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid,
+ OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
"do not update the buildid cache"),
+ OPT_BOOLEAN('B', "no-buildid", &no_buildid,
+ "do not collect buildids in perf.data"),
OPT_END()
};
}
symbol__init();
- if (no_buildid)
+
+ if (no_buildid_cache || no_buildid)
disable_buildid_cache();
if (!nr_counters) {
usage_with_options(sched_usage, sched_options);
/*
- * Aliased to 'perf trace' for now:
+ * Aliased to 'perf script' for now:
*/
- if (!strcmp(argv[0], "trace"))
- return cmd_trace(argc, argv, prefix);
+ if (!strcmp(argv[0], "script"))
+ return cmd_script(argc, argv, prefix);
symbol__init();
if (!strncmp(argv[0], "rec", 3)) {
static int cleanup_scripting(void)
{
- pr_debug("\nperf trace script stopped\n");
+ pr_debug("\nperf script stopped\n");
return scripting_ops->stop_script();
}
session_done = 1;
}
-static int __cmd_trace(struct perf_session *session)
+static int __cmd_script(struct perf_session *session)
{
int ret;
fprintf(stderr, "\n");
fprintf(stderr, "Scripting language extensions (used in "
- "perf trace -s [spec:]script.[spec]):\n\n");
+ "perf script -s [spec:]script.[spec]):\n\n");
list_for_each_entry(s, &script_specs, node)
fprintf(stderr, " %-42s [%s]\n", s->spec, s->ops->name);
return 0;
}
-#define for_each_lang(scripts_dir, lang_dirent, lang_next) \
+/* Helper function for filesystems that return a dent->d_type DT_UNKNOWN */
+static int is_directory(const char *base_path, const struct dirent *dent)
+{
+ char path[PATH_MAX];
+ struct stat st;
+
+ sprintf(path, "%s/%s", base_path, dent->d_name);
+ if (stat(path, &st))
+ return 0;
+
+ return S_ISDIR(st.st_mode);
+}
+
+#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\
while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \
lang_next) \
- if (lang_dirent.d_type == DT_DIR && \
+ if ((lang_dirent.d_type == DT_DIR || \
+ (lang_dirent.d_type == DT_UNKNOWN && \
+ is_directory(scripts_path, &lang_dirent))) && \
(strcmp(lang_dirent.d_name, ".")) && \
(strcmp(lang_dirent.d_name, "..")))
-#define for_each_script(lang_dir, script_dirent, script_next) \
+#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\
while (!readdir_r(lang_dir, &script_dirent, &script_next) && \
script_next) \
- if (script_dirent.d_type != DT_DIR)
+ if (script_dirent.d_type != DT_DIR && \
+ (script_dirent.d_type != DT_UNKNOWN || \
+ !is_directory(lang_path, &script_dirent)))
#define RECORD_SUFFIX "-record"
if (!scripts_dir)
return -1;
- for_each_lang(scripts_dir, lang_dirent, lang_next) {
+ for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
lang_dirent.d_name);
lang_dir = opendir(lang_path);
if (!lang_dir)
continue;
- for_each_script(lang_dir, script_dirent, script_next) {
+ for_each_script(lang_path, lang_dir, script_dirent, script_next) {
script_root = strdup(script_dirent.d_name);
str = ends_with(script_root, REPORT_SUFFIX);
if (str) {
if (!scripts_dir)
return NULL;
- for_each_lang(scripts_dir, lang_dirent, lang_next) {
+ for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
lang_dirent.d_name);
lang_dir = opendir(lang_path);
if (!lang_dir)
continue;
- for_each_script(lang_dir, script_dirent, script_next) {
+ for_each_script(lang_path, lang_dir, script_dirent, script_next) {
__script_root = strdup(script_dirent.d_name);
str = ends_with(__script_root, suffix);
if (str) {
return n_args;
}
-static const char * const trace_usage[] = {
- "perf trace [<options>]",
- "perf trace [<options>] record <script> [<record-options>] <command>",
- "perf trace [<options>] report <script> [script-args]",
- "perf trace [<options>] <script> [<record-options>] <command>",
- "perf trace [<options>] <top-script> [script-args]",
+static const char * const script_usage[] = {
+ "perf script [<options>]",
+ "perf script [<options>] record <script> [<record-options>] <command>",
+ "perf script [<options>] report <script> [script-args]",
+ "perf script [<options>] <script> [<record-options>] <command>",
+ "perf script [<options>] <top-script> [script-args]",
NULL
};
"script file name (lang:script name, script name, or *)",
parse_scriptname),
OPT_STRING('g', "gen-script", &generate_script_lang, "lang",
- "generate perf-trace.xx script in specified language"),
+ "generate perf-script.xx script in specified language"),
OPT_STRING('i', "input", &input_name, "file",
"input file name"),
OPT_BOOLEAN('d', "debug-mode", &debug_mode,
return argc != 0;
}
-int cmd_trace(int argc, const char **argv, const char *prefix __used)
+int cmd_script(int argc, const char **argv, const char *prefix __used)
{
char *rec_script_path = NULL;
char *rep_script_path = NULL;
setup_scripting();
- argc = parse_options(argc, argv, options, trace_usage,
+ argc = parse_options(argc, argv, options, script_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
if (!rep_script_path) {
fprintf(stderr,
"Please specify a valid report script"
- "(see 'perf trace -l' for listing)\n");
+ "(see 'perf script -l' for listing)\n");
return -1;
}
}
if (!rec_script_path && !rep_script_path) {
fprintf(stderr, " Couldn't find script %s\n\n See perf"
- " trace -l for available scripts.\n", argv[0]);
- usage_with_options(trace_usage, options);
+ " script -l for available scripts.\n", argv[0]);
+ usage_with_options(script_usage, options);
}
if (is_top_script(argv[0])) {
rec_args = (argc - 1) - rep_args;
if (rec_args < 0) {
fprintf(stderr, " %s script requires options."
- "\n\n See perf trace -l for available "
+ "\n\n See perf script -l for available "
"scripts and options.\n", argv[0]);
- usage_with_options(trace_usage, options);
+ usage_with_options(script_usage, options);
}
}
return -1;
}
- err = scripting_ops->generate_script("perf-trace");
+ err = scripting_ops->generate_script("perf-script");
goto out;
}
err = scripting_ops->start_script(script_name, argc, argv);
if (err)
goto out;
- pr_debug("perf trace started with script %s\n\n", script_name);
+ pr_debug("perf script started with script %s\n\n", script_name);
}
- err = __cmd_trace(session);
+ err = __cmd_script(session);
perf_session__delete(session);
cleanup_scripting();
#include <math.h>
#include <locale.h>
+#define DEFAULT_SEPARATOR " "
+
static struct perf_event_attr default_attrs[] = {
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
static int run_count = 1;
static bool no_inherit = false;
static bool scale = true;
+static bool no_aggr = false;
static pid_t target_pid = -1;
static pid_t target_tid = -1;
static pid_t *all_tids = NULL;
static int thread_num = 0;
static pid_t child_pid = -1;
static bool null_run = false;
-static bool big_num = false;
+static bool big_num = true;
+static int big_num_opt = -1;
static const char *cpu_list;
+static const char *csv_sep = NULL;
+static bool csv_output = false;
static int *fd[MAX_NR_CPUS][MAX_COUNTERS];
static int event_scaled[MAX_COUNTERS];
+static struct {
+ u64 val;
+ u64 ena;
+ u64 run;
+} cpu_counts[MAX_NR_CPUS][MAX_COUNTERS];
+
static volatile int done = 0;
struct stats
}
struct stats event_res_stats[MAX_COUNTERS][3];
-struct stats runtime_nsecs_stats;
+struct stats runtime_nsecs_stats[MAX_NR_CPUS];
+struct stats runtime_cycles_stats[MAX_NR_CPUS];
+struct stats runtime_branches_stats[MAX_NR_CPUS];
struct stats walltime_nsecs_stats;
-struct stats runtime_cycles_stats;
-struct stats runtime_branches_stats;
#define MATCH_EVENT(t, c, counter) \
(attrs[counter].type == PERF_TYPE_##t && \
attrs[counter].config == PERF_COUNT_##c)
#define ERR_PERF_OPEN \
-"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
+"counter %d, sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information."
-static int create_perf_stat_counter(int counter)
+static int create_perf_stat_counter(int counter, bool *perm_err)
{
struct perf_event_attr *attr = attrs + counter;
int thread;
for (cpu = 0; cpu < nr_cpus; cpu++) {
fd[cpu][counter][0] = sys_perf_event_open(attr,
-1, cpumap[cpu], -1, 0);
- if (fd[cpu][counter][0] < 0)
- pr_debug(ERR_PERF_OPEN, counter,
+ if (fd[cpu][counter][0] < 0) {
+ if (errno == EPERM || errno == EACCES)
+ *perm_err = true;
+ error(ERR_PERF_OPEN, counter,
fd[cpu][counter][0], strerror(errno));
- else
+ } else {
++ncreated;
+ }
}
} else {
attr->inherit = !no_inherit;
for (thread = 0; thread < thread_num; thread++) {
fd[0][counter][thread] = sys_perf_event_open(attr,
all_tids[thread], -1, -1, 0);
- if (fd[0][counter][thread] < 0)
- pr_debug(ERR_PERF_OPEN, counter,
+ if (fd[0][counter][thread] < 0) {
+ if (errno == EPERM || errno == EACCES)
+ *perm_err = true;
+ error(ERR_PERF_OPEN, counter,
fd[0][counter][thread],
strerror(errno));
- else
+ } else {
++ncreated;
+ }
}
}
/*
* Read out the results of a single counter:
+ * aggregate counts across CPUs in system-wide mode
*/
-static void read_counter(int counter)
+static void read_counter_aggr(int counter)
{
u64 count[3], single_count[3];
int cpu;
* Save the full runtime - to allow normalization during printout:
*/
if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
- update_stats(&runtime_nsecs_stats, count[0]);
+ update_stats(&runtime_nsecs_stats[0], count[0]);
if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
- update_stats(&runtime_cycles_stats, count[0]);
+ update_stats(&runtime_cycles_stats[0], count[0]);
if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
- update_stats(&runtime_branches_stats, count[0]);
+ update_stats(&runtime_branches_stats[0], count[0]);
+}
+
+/*
+ * Read out the results of a single counter:
+ * do not aggregate counts across CPUs in system-wide mode
+ */
+static void read_counter(int counter)
+{
+ u64 count[3];
+ int cpu;
+ size_t res, nv;
+
+ count[0] = count[1] = count[2] = 0;
+
+ nv = scale ? 3 : 1;
+
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+
+ if (fd[cpu][counter][0] < 0)
+ continue;
+
+ res = read(fd[cpu][counter][0], count, nv * sizeof(u64));
+
+ assert(res == nv * sizeof(u64));
+
+ close(fd[cpu][counter][0]);
+ fd[cpu][counter][0] = -1;
+
+ if (scale) {
+ if (count[2] == 0) {
+ count[0] = 0;
+ } else if (count[2] < count[1]) {
+ count[0] = (unsigned long long)
+ ((double)count[0] * count[1] / count[2] + 0.5);
+ }
+ }
+ cpu_counts[cpu][counter].val = count[0]; /* scaled count */
+ cpu_counts[cpu][counter].ena = count[1];
+ cpu_counts[cpu][counter].run = count[2];
+
+ if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
+ update_stats(&runtime_nsecs_stats[cpu], count[0]);
+ if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
+ update_stats(&runtime_cycles_stats[cpu], count[0]);
+ if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
+ update_stats(&runtime_branches_stats[cpu], count[0]);
+ }
}
static int run_perf_stat(int argc __used, const char **argv)
int status = 0;
int counter, ncreated = 0;
int child_ready_pipe[2], go_pipe[2];
+ bool perm_err = false;
const bool forks = (argc > 0);
char buf;
}
for (counter = 0; counter < nr_counters; counter++)
- ncreated += create_perf_stat_counter(counter);
-
- if (ncreated == 0) {
- pr_err("No permission to collect %sstats.\n"
- "Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n",
- system_wide ? "system-wide " : "");
+ ncreated += create_perf_stat_counter(counter, &perm_err);
+
+ if (ncreated < nr_counters) {
+ if (perm_err)
+ error("You may not have permission to collect %sstats.\n"
+ "\t Consider tweaking"
+ " /proc/sys/kernel/perf_event_paranoid or running as root.",
+ system_wide ? "system-wide " : "");
+ die("Not all events could be opened.\n");
if (child_pid != -1)
kill(child_pid, SIGTERM);
return -1;
update_stats(&walltime_nsecs_stats, t1 - t0);
- for (counter = 0; counter < nr_counters; counter++)
- read_counter(counter);
-
+ if (no_aggr) {
+ for (counter = 0; counter < nr_counters; counter++)
+ read_counter(counter);
+ } else {
+ for (counter = 0; counter < nr_counters; counter++)
+ read_counter_aggr(counter);
+ }
return WEXITSTATUS(status);
}
100 * stddev_stats(&event_res_stats[counter][0]) / avg);
}
-static void nsec_printout(int counter, double avg)
+static void nsec_printout(int cpu, int counter, double avg)
{
double msecs = avg / 1e6;
+ char cpustr[16] = { '\0', };
+ const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-24s";
+
+ if (no_aggr)
+ sprintf(cpustr, "CPU%*d%s",
+ csv_output ? 0 : -4,
+ cpumap[cpu], csv_sep);
- fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter));
+ fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(counter));
+
+ if (csv_output)
+ return;
if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
fprintf(stderr, " # %10.3f CPUs ",
}
}
-static void abs_printout(int counter, double avg)
+static void abs_printout(int cpu, int counter, double avg)
{
double total, ratio = 0.0;
+ char cpustr[16] = { '\0', };
+ const char *fmt;
+
+ if (csv_output)
+ fmt = "%s%.0f%s%s";
+ else if (big_num)
+ fmt = "%s%'18.0f%s%-24s";
+ else
+ fmt = "%s%18.0f%s%-24s";
- if (big_num)
- fprintf(stderr, " %'18.0f %-24s", avg, event_name(counter));
+ if (no_aggr)
+ sprintf(cpustr, "CPU%*d%s",
+ csv_output ? 0 : -4,
+ cpumap[cpu], csv_sep);
else
- fprintf(stderr, " %18.0f %-24s", avg, event_name(counter));
+ cpu = 0;
+
+ fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(counter));
+
+ if (csv_output)
+ return;
if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
- total = avg_stats(&runtime_cycles_stats);
+ total = avg_stats(&runtime_cycles_stats[cpu]);
if (total)
ratio = avg / total;
fprintf(stderr, " # %10.3f IPC ", ratio);
} else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
- runtime_branches_stats.n != 0) {
- total = avg_stats(&runtime_branches_stats);
+ runtime_branches_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_branches_stats[cpu]);
if (total)
ratio = avg * 100 / total;
fprintf(stderr, " # %10.3f %% ", ratio);
- } else if (runtime_nsecs_stats.n != 0) {
- total = avg_stats(&runtime_nsecs_stats);
+ } else if (runtime_nsecs_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_nsecs_stats[cpu]);
if (total)
ratio = 1000.0 * avg / total;
/*
* Print out the results of a single counter:
+ * aggregated counts in system-wide mode
*/
-static void print_counter(int counter)
+static void print_counter_aggr(int counter)
{
double avg = avg_stats(&event_res_stats[counter][0]);
int scaled = event_scaled[counter];
if (scaled == -1) {
- fprintf(stderr, " %18s %-24s\n",
- "<not counted>", event_name(counter));
+ fprintf(stderr, "%*s%s%-24s\n",
+ csv_output ? 0 : 18,
+ "<not counted>", csv_sep, event_name(counter));
return;
}
if (nsec_counter(counter))
- nsec_printout(counter, avg);
+ nsec_printout(-1, counter, avg);
else
- abs_printout(counter, avg);
+ abs_printout(-1, counter, avg);
+
+ if (csv_output) {
+ fputc('\n', stderr);
+ return;
+ }
print_noise(counter, avg);
fprintf(stderr, "\n");
}
+/*
+ * Print out the results of a single counter:
+ * does not use aggregated count in system-wide
+ */
+static void print_counter(int counter)
+{
+ u64 ena, run, val;
+ int cpu;
+
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ val = cpu_counts[cpu][counter].val;
+ ena = cpu_counts[cpu][counter].ena;
+ run = cpu_counts[cpu][counter].run;
+ if (run == 0 || ena == 0) {
+ fprintf(stderr, "CPU%*d%s%*s%s%-24s",
+ csv_output ? 0 : -4,
+ cpumap[cpu], csv_sep,
+ csv_output ? 0 : 18,
+ "<not counted>", csv_sep,
+ event_name(counter));
+
+ fprintf(stderr, "\n");
+ continue;
+ }
+
+ if (nsec_counter(counter))
+ nsec_printout(cpu, counter, val);
+ else
+ abs_printout(cpu, counter, val);
+
+ if (!csv_output) {
+ print_noise(counter, 1.0);
+
+ if (run != ena) {
+ fprintf(stderr, " (scaled from %.2f%%)",
+ 100.0 * run / ena);
+ }
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
static void print_stat(int argc, const char **argv)
{
int i, counter;
fflush(stdout);
- fprintf(stderr, "\n");
- fprintf(stderr, " Performance counter stats for ");
- if(target_pid == -1 && target_tid == -1) {
- fprintf(stderr, "\'%s", argv[0]);
- for (i = 1; i < argc; i++)
- fprintf(stderr, " %s", argv[i]);
- } else if (target_pid != -1)
- fprintf(stderr, "process id \'%d", target_pid);
- else
- fprintf(stderr, "thread id \'%d", target_tid);
-
- fprintf(stderr, "\'");
- if (run_count > 1)
- fprintf(stderr, " (%d runs)", run_count);
- fprintf(stderr, ":\n\n");
+ if (!csv_output) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, " Performance counter stats for ");
+ if(target_pid == -1 && target_tid == -1) {
+ fprintf(stderr, "\'%s", argv[0]);
+ for (i = 1; i < argc; i++)
+ fprintf(stderr, " %s", argv[i]);
+ } else if (target_pid != -1)
+ fprintf(stderr, "process id \'%d", target_pid);
+ else
+ fprintf(stderr, "thread id \'%d", target_tid);
+
+ fprintf(stderr, "\'");
+ if (run_count > 1)
+ fprintf(stderr, " (%d runs)", run_count);
+ fprintf(stderr, ":\n\n");
+ }
- for (counter = 0; counter < nr_counters; counter++)
- print_counter(counter);
+ if (no_aggr) {
+ for (counter = 0; counter < nr_counters; counter++)
+ print_counter(counter);
+ } else {
+ for (counter = 0; counter < nr_counters; counter++)
+ print_counter_aggr(counter);
+ }
- fprintf(stderr, "\n");
- fprintf(stderr, " %18.9f seconds time elapsed",
- avg_stats(&walltime_nsecs_stats)/1e9);
- if (run_count > 1) {
- fprintf(stderr, " ( +- %7.3f%% )",
+ if (!csv_output) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, " %18.9f seconds time elapsed",
+ avg_stats(&walltime_nsecs_stats)/1e9);
+ if (run_count > 1) {
+ fprintf(stderr, " ( +- %7.3f%% )",
100*stddev_stats(&walltime_nsecs_stats) /
avg_stats(&walltime_nsecs_stats));
+ }
+ fprintf(stderr, "\n\n");
}
- fprintf(stderr, "\n\n");
}
static volatile int signr = -1;
NULL
};
+static int stat__set_big_num(const struct option *opt __used,
+ const char *s __used, int unset)
+{
+ big_num_opt = unset ? 0 : 1;
+ return 0;
+}
+
static const struct option options[] = {
OPT_CALLBACK('e', "event", NULL, "event",
"event selector. use 'perf list' to list available events",
"repeat command and print average + stddev (max: 100)"),
OPT_BOOLEAN('n', "null", &null_run,
"null run - dont start any counters"),
- OPT_BOOLEAN('B', "big-num", &big_num,
- "print large numbers with thousands\' separators"),
+ OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL,
+ "print large numbers with thousands\' separators",
+ stat__set_big_num),
OPT_STRING('C', "cpu", &cpu_list, "cpu",
"list of cpus to monitor in system-wide"),
+ OPT_BOOLEAN('A', "no-aggr", &no_aggr,
+ "disable CPU count aggregation"),
+ OPT_STRING('x', "field-separator", &csv_sep, "separator",
+ "print counts with custom separator"),
OPT_END()
};
argc = parse_options(argc, argv, options, stat_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
+
+ if (csv_sep)
+ csv_output = true;
+ else
+ csv_sep = DEFAULT_SEPARATOR;
+
+ /*
+ * let the spreadsheet do the pretty-printing
+ */
+ if (csv_output) {
+ /* User explicitely passed -B? */
+ if (big_num_opt == 1) {
+ fprintf(stderr, "-B option not supported with -x\n");
+ usage_with_options(stat_usage, options);
+ } else /* Nope, so disable big number formatting */
+ big_num = false;
+ } else if (big_num_opt == 0) /* User passed --no-big-num */
+ big_num = false;
+
if (!argc && target_pid == -1 && target_tid == -1)
usage_with_options(stat_usage, options);
if (run_count <= 0)
usage_with_options(stat_usage, options);
+ /* no_aggr is for system-wide only */
+ if (no_aggr && !system_wide)
+ usage_with_options(stat_usage, options);
+
/* Set attrs and nr_counters if no event is selected and !null_run */
if (!null_run && !nr_counters) {
memcpy(attrs, default_attrs, sizeof(default_attrs));
int err = errno;
if (err == EPERM || err == EACCES)
- die("No permission - are you root?\n");
+ die("Permission error - are you root?\n"
+ "\t Consider tweaking"
+ " /proc/sys/kernel/perf_event_paranoid.\n");
/*
* If it's cycles then fall back to hrtimer
* based cpu-clock-tick sw counter, which
goto try_again;
}
printf("\n");
- error("perfcounter syscall returned with %d (%s)\n",
+ error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
fd[i][counter][thread_index], strerror(err));
die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
exit(-1);
extern int cmd_stat(int argc, const char **argv, const char *prefix);
extern int cmd_timechart(int argc, const char **argv, const char *prefix);
extern int cmd_top(int argc, const char **argv, const char *prefix);
-extern int cmd_trace(int argc, const char **argv, const char *prefix);
+extern int cmd_script(int argc, const char **argv, const char *prefix);
extern int cmd_version(int argc, const char **argv, const char *prefix);
extern int cmd_probe(int argc, const char **argv, const char *prefix);
extern int cmd_kmem(int argc, const char **argv, const char *prefix);
perf-stat mainporcelain common
perf-timechart mainporcelain common
perf-top mainporcelain common
-perf-trace mainporcelain common
+perf-script mainporcelain common
perf-probe mainporcelain common
perf-kmem mainporcelain common
perf-lock mainporcelain common
ifndef NO_DWARF
define SOURCE_DWARF
#include <dwarf.h>
-#include <libdw.h>
-#include <version.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
#ifndef _ELFUTILS_PREREQ
#error
#endif
{ "top", cmd_top, 0 },
{ "annotate", cmd_annotate, 0 },
{ "version", cmd_version, 0 },
- { "trace", cmd_trace, 0 },
+ { "script", cmd_script, 0 },
{ "sched", cmd_sched, 0 },
{ "probe", cmd_probe, 0 },
{ "kmem", cmd_kmem, 0 },
/*
- * Context.c. Python interfaces for perf trace.
+ * Context.c. Python interfaces for perf script.
*
* Copyright (C) 2010 Tom Zanussi <tzanussi@gmail.com>
*
return ret;
}
-static int dump_printf_color(const char *fmt, const char *color, ...)
+#ifdef NO_NEWT_SUPPORT
+void ui__warning(const char *format, ...)
{
va_list args;
- int ret = 0;
- if (dump_trace) {
- va_start(args, color);
- ret = color_vfprintf(stdout, color, fmt, args);
- va_end(args);
- }
-
- return ret;
+ va_start(args, format);
+ vfprintf(stderr, format, args);
+ va_end(args);
}
-
+#endif
void trace_event(event_t *event)
{
if (!dump_trace)
return;
- dump_printf(".");
- dump_printf_color("\n. ... raw event: size %d bytes\n", color,
- event->header.size);
+ printf(".");
+ color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
+ event->header.size);
for (i = 0; i < event->header.size; i++) {
if ((i & 15) == 0) {
- dump_printf(".");
- dump_printf_color(" %04x: ", color, i);
+ printf(".");
+ color_fprintf(stdout, color, " %04x: ", i);
}
- dump_printf_color(" %02x", color, raw_event[i]);
+ color_fprintf(stdout, color, " %02x", raw_event[i]);
if (((i & 15) == 15) || i == event->header.size-1) {
- dump_printf_color(" ", color);
+ color_fprintf(stdout, color, " ");
for (j = 0; j < 15-(i & 15); j++)
- dump_printf_color(" ", color);
+ color_fprintf(stdout, color, " ");
for (j = i & ~15; j <= i; j++) {
- dump_printf_color("%c", color,
- isprint(raw_event[j]) ?
- raw_event[j] : '.');
+ color_fprintf(stdout, color, "%c",
+ isprint(raw_event[j]) ?
+ raw_event[j] : '.');
}
- dump_printf_color("\n", color);
+ color_fprintf(stdout, color, "\n");
}
}
- dump_printf(".\n");
+ printf(".\n");
}
#include "ui/progress.h"
#endif
+void ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2)));
+
#endif /* __PERF_DEBUG_H */
* a zero sized synthesized MMAP event for the kernel.
*/
if (maps[MAP__FUNCTION]->end == 0)
- maps[MAP__FUNCTION]->end = ~0UL;
+ maps[MAP__FUNCTION]->end = ~0ULL;
}
static int event__process_kernel_mmap(event_t *self,
set_bit(feat, self->adds_features);
}
+void perf_header__clear_feat(struct perf_header *self, int feat)
+{
+ clear_bit(feat, self->adds_features);
+}
+
bool perf_header__has_feat(const struct perf_header *self, int feat)
{
return test_bit(feat, self->adds_features);
int idx = 0, err;
session = container_of(self, struct perf_session, header);
- if (perf_session__read_build_ids(session, true))
- perf_header__set_feat(self, HEADER_BUILD_ID);
+
+ if (perf_header__has_feat(self, HEADER_BUILD_ID &&
+ !perf_session__read_build_ids(session, true)))
+ perf_header__clear_feat(self, HEADER_BUILD_ID);
nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
if (!nr_sections)
struct perf_event_attr *
perf_header__find_attr(u64 id, struct perf_header *header);
void perf_header__set_feat(struct perf_header *self, int feat);
+void perf_header__clear_feat(struct perf_header *self, int feat);
bool perf_header__has_feat(const struct perf_header *self, int feat);
int perf_header__process_sections(struct perf_header *self, int fd,
--- /dev/null
+
+#ifndef PERF_CPUFEATURE_H
+#define PERF_CPUFEATURE_H
+
+/* cpufeature.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define X86_FEATURE_REP_GOOD 0
+
+#endif /* PERF_CPUFEATURE_H */
--- /dev/null
+
+#ifndef PERF_DWARF2_H
+#define PERF_DWARF2_H
+
+/* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define CFI_STARTPROC
+#define CFI_ENDPROC
+
+#endif /* PERF_DWARF2_H */
+
addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}
+static inline void clear_bit(int nr, unsigned long *addr)
+{
+ addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
+}
+
static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
{
return ((1UL << (nr % BITS_PER_LONG)) &
--- /dev/null
+
+#ifndef PERF_LINUX_LINKAGE_H_
+#define PERF_LINUX_LINKAGE_H_
+
+/* linkage.h ... for including arch/x86/lib/memcpy_64.S */
+
+#define ENTRY(name) \
+ .globl name; \
+ name:
+
+#define ENDPROC(name)
+
+#endif /* PERF_LINUX_LINKAGE_H_ */
id = atoll(id_buf);
attr->config = id;
attr->type = PERF_TYPE_TRACEPOINT;
- *strp = evt_name + evt_length;
+ *strp += strlen(sys_name) + evt_length + 1; /* + 1 for the ':' */
attr->sample_type |= PERF_SAMPLE_RAW;
attr->sample_type |= PERF_SAMPLE_TIME;
struct perf_event_attr *attr)
{
const char *evt_name;
- char *flags;
+ char *flags = NULL, *comma_loc;
char sys_name[MAX_EVENT_LENGTH];
unsigned int sys_length, evt_length;
sys_name[sys_length] = '\0';
evt_name = evt_name + 1;
+ comma_loc = strchr(evt_name, ',');
+ if (comma_loc) {
+ /* take the event name up to the comma */
+ evt_name = strndup(evt_name, comma_loc - evt_name);
+ }
flags = strchr(evt_name, ':');
if (flags) {
/* split it out: */
evt_length = strlen(evt_name);
if (evt_length >= MAX_EVENT_LENGTH)
return EVT_FAILED;
-
if (strpbrk(evt_name, "*?")) {
- *strp = evt_name + evt_length;
+ *strp += strlen(sys_name) + evt_length;
return parse_multiple_tracepoint_event(sys_name, evt_name,
flags);
} else
bool externs);
#include <dwarf.h>
-#include <libdw.h>
-#include <libdwfl.h>
-#include <version.h>
+#include <elfutils/libdw.h>
+#include <elfutils/libdwfl.h>
+#include <elfutils/version.h>
struct probe_finder {
struct perf_probe_event *pev; /* Target probe event */
/*
- * trace-event-perl. Feed perf trace events to an embedded Perl interpreter.
+ * trace-event-perl. Feed perf script events to an embedded Perl interpreter.
*
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*
return -1;
}
- fprintf(ofp, "# perf trace event handlers, "
- "generated by perf trace -g perl\n");
+ fprintf(ofp, "# perf script event handlers, "
+ "generated by perf script -g perl\n");
fprintf(ofp, "# Licensed under the terms of the GNU GPL"
" License version 2\n\n");
fprintf(stderr, "couldn't open %s\n", fname);
return -1;
}
- fprintf(ofp, "# perf trace event handlers, "
- "generated by perf trace -g python\n");
+ fprintf(ofp, "# perf script event handlers, "
+ "generated by perf script -g python\n");
fprintf(ofp, "# Licensed under the terms of the GNU GPL"
" License version 2\n\n");
INIT_LIST_HEAD(&self->dead_threads);
self->hists_tree = RB_ROOT;
self->last_match = NULL;
- self->mmap_window = 32;
+ /*
+ * On 64bit we can mmap the data file in one go. No need for tiny mmap
+ * slices. On 32bit we use 32MB.
+ */
+#if BITS_PER_LONG == 64
+ self->mmap_window = ULLONG_MAX;
+#else
+ self->mmap_window = 32 * 1024 * 1024ULL;
+#endif
self->machines = RB_ROOT;
self->repipe = repipe;
- INIT_LIST_HEAD(&self->ordered_samples.samples_head);
+ INIT_LIST_HEAD(&self->ordered_samples.samples);
+ INIT_LIST_HEAD(&self->ordered_samples.sample_cache);
+ INIT_LIST_HEAD(&self->ordered_samples.to_free);
machine__init(&self->host_machine, "", HOST_KERNEL_ID);
if (mode == O_RDONLY) {
if (handler->exit == NULL)
handler->exit = process_event_stub;
if (handler->lost == NULL)
- handler->lost = process_event_stub;
+ handler->lost = event__process_lost;
if (handler->read == NULL)
handler->read = process_event_stub;
if (handler->throttle == NULL)
struct sample_queue {
u64 timestamp;
- struct sample_event *event;
+ event_t *event;
struct list_head list;
};
+static void perf_session_free_sample_buffers(struct perf_session *session)
+{
+ struct ordered_samples *os = &session->ordered_samples;
+
+ while (!list_empty(&os->to_free)) {
+ struct sample_queue *sq;
+
+ sq = list_entry(os->to_free.next, struct sample_queue, list);
+ list_del(&sq->list);
+ free(sq);
+ }
+}
+
static void flush_sample_queue(struct perf_session *s,
struct perf_event_ops *ops)
{
- struct list_head *head = &s->ordered_samples.samples_head;
- u64 limit = s->ordered_samples.next_flush;
+ struct ordered_samples *os = &s->ordered_samples;
+ struct list_head *head = &os->samples;
struct sample_queue *tmp, *iter;
+ u64 limit = os->next_flush;
+ u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL;
if (!ops->ordered_samples || !limit)
return;
list_for_each_entry_safe(iter, tmp, head, list) {
if (iter->timestamp > limit)
- return;
-
- if (iter == s->ordered_samples.last_inserted)
- s->ordered_samples.last_inserted = NULL;
+ break;
- ops->sample((event_t *)iter->event, s);
+ ops->sample(iter->event, s);
- s->ordered_samples.last_flush = iter->timestamp;
+ os->last_flush = iter->timestamp;
list_del(&iter->list);
- free(iter->event);
- free(iter);
+ list_add(&iter->list, &os->sample_cache);
+ }
+
+ if (list_empty(head)) {
+ os->last_sample = NULL;
+ } else if (last_ts <= limit) {
+ os->last_sample =
+ list_entry(head->prev, struct sample_queue, list);
}
}
return 0;
}
-static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
-{
- struct sample_queue *iter;
-
- list_for_each_entry_reverse(iter, head, list) {
- if (iter->timestamp < new->timestamp) {
- list_add(&new->list, &iter->list);
- return;
- }
- }
-
- list_add(&new->list, head);
-}
-
-static void __queue_sample_before(struct sample_queue *new,
- struct sample_queue *iter,
- struct list_head *head)
-{
- list_for_each_entry_continue_reverse(iter, head, list) {
- if (iter->timestamp < new->timestamp) {
- list_add(&new->list, &iter->list);
- return;
- }
- }
-
- list_add(&new->list, head);
-}
-
-static void __queue_sample_after(struct sample_queue *new,
- struct sample_queue *iter,
- struct list_head *head)
-{
- list_for_each_entry_continue(iter, head, list) {
- if (iter->timestamp > new->timestamp) {
- list_add_tail(&new->list, &iter->list);
- return;
- }
- }
- list_add_tail(&new->list, head);
-}
-
/* The queue is ordered by time */
static void __queue_sample_event(struct sample_queue *new,
struct perf_session *s)
{
- struct sample_queue *last_inserted = s->ordered_samples.last_inserted;
- struct list_head *head = &s->ordered_samples.samples_head;
+ struct ordered_samples *os = &s->ordered_samples;
+ struct sample_queue *sample = os->last_sample;
+ u64 timestamp = new->timestamp;
+ struct list_head *p;
+ os->last_sample = new;
- if (!last_inserted) {
- __queue_sample_end(new, head);
+ if (!sample) {
+ list_add(&new->list, &os->samples);
+ os->max_timestamp = timestamp;
return;
}
/*
- * Most of the time the current event has a timestamp
- * very close to the last event inserted, unless we just switched
- * to another event buffer. Having a sorting based on a list and
- * on the last inserted event that is close to the current one is
- * probably more efficient than an rbtree based sorting.
+ * last_sample might point to some random place in the list as it's
+ * the last queued event. We expect that the new event is close to
+ * this.
*/
- if (last_inserted->timestamp >= new->timestamp)
- __queue_sample_before(new, last_inserted, head);
- else
- __queue_sample_after(new, last_inserted, head);
+ if (sample->timestamp <= timestamp) {
+ while (sample->timestamp <= timestamp) {
+ p = sample->list.next;
+ if (p == &os->samples) {
+ list_add_tail(&new->list, &os->samples);
+ os->max_timestamp = timestamp;
+ return;
+ }
+ sample = list_entry(p, struct sample_queue, list);
+ }
+ list_add_tail(&new->list, &sample->list);
+ } else {
+ while (sample->timestamp > timestamp) {
+ p = sample->list.prev;
+ if (p == &os->samples) {
+ list_add(&new->list, &os->samples);
+ return;
+ }
+ sample = list_entry(p, struct sample_queue, list);
+ }
+ list_add(&new->list, &sample->list);
+ }
}
+#define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct sample_queue))
+
static int queue_sample_event(event_t *event, struct sample_data *data,
struct perf_session *s)
{
+ struct ordered_samples *os = &s->ordered_samples;
+ struct list_head *sc = &os->sample_cache;
u64 timestamp = data->time;
struct sample_queue *new;
-
if (timestamp < s->ordered_samples.last_flush) {
printf("Warning: Timestamp below last timeslice flush\n");
return -EINVAL;
}
- new = malloc(sizeof(*new));
- if (!new)
- return -ENOMEM;
-
- new->timestamp = timestamp;
-
- new->event = malloc(event->header.size);
- if (!new->event) {
- free(new);
- return -ENOMEM;
+ if (!list_empty(sc)) {
+ new = list_entry(sc->next, struct sample_queue, list);
+ list_del(&new->list);
+ } else if (os->sample_buffer) {
+ new = os->sample_buffer + os->sample_buffer_idx;
+ if (++os->sample_buffer_idx == MAX_SAMPLE_BUFFER)
+ os->sample_buffer = NULL;
+ } else {
+ os->sample_buffer = malloc(MAX_SAMPLE_BUFFER * sizeof(*new));
+ if (!os->sample_buffer)
+ return -ENOMEM;
+ list_add(&os->sample_buffer->list, &os->to_free);
+ os->sample_buffer_idx = 2;
+ new = os->sample_buffer + 1;
}
- memcpy(new->event, event, event->header.size);
+ new->timestamp = timestamp;
+ new->event = event;
__queue_sample_event(new, s);
- s->ordered_samples.last_inserted = new;
-
- if (new->timestamp > s->ordered_samples.max_timestamp)
- s->ordered_samples.max_timestamp = new->timestamp;
return 0;
}
static int perf_session__process_event(struct perf_session *self,
event_t *event,
struct perf_event_ops *ops,
- u64 offset, u64 head)
+ u64 file_offset)
{
trace_event(event);
if (event->header.type < PERF_RECORD_HEADER_MAX) {
dump_printf("%#Lx [%#x]: PERF_RECORD_%s",
- offset + head, event->header.size,
+ file_offset, event->header.size,
event__name[event->header.type]);
hists__inc_nr_events(&self->hists, event->header.type);
}
return ops->event_type(event, self);
case PERF_RECORD_HEADER_TRACING_DATA:
/* setup for reading amidst mmap */
- lseek(self->fd, offset + head, SEEK_SET);
+ lseek(self->fd, file_offset, SEEK_SET);
return ops->tracing_data(event, self);
case PERF_RECORD_HEADER_BUILD_ID:
return ops->build_id(event, self);
}
if (size == 0 ||
- (skip = perf_session__process_event(self, &event, ops,
- 0, head)) < 0) {
+ (skip = perf_session__process_event(self, &event, ops, head)) < 0) {
dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
head, event.header.size, event.header.type);
/*
done:
err = 0;
out_err:
+ perf_session_free_sample_buffers(self);
return err;
}
-int __perf_session__process_events(struct perf_session *self,
+int __perf_session__process_events(struct perf_session *session,
u64 data_offset, u64 data_size,
u64 file_size, struct perf_event_ops *ops)
{
- int err, mmap_prot, mmap_flags;
- u64 head, shift;
- u64 offset = 0;
- size_t page_size;
+ u64 head, page_offset, file_offset, file_pos, progress_next;
+ int err, mmap_prot, mmap_flags, map_idx = 0;
+ struct ui_progress *progress;
+ size_t page_size, mmap_size;
+ char *buf, *mmaps[8];
event_t *event;
uint32_t size;
- char *buf;
- struct ui_progress *progress = ui_progress__new("Processing events...",
- self->size);
- if (progress == NULL)
- return -1;
perf_event_ops__fill_defaults(ops);
page_size = sysconf(_SC_PAGESIZE);
- head = data_offset;
- shift = page_size * (head / page_size);
- offset += shift;
- head -= shift;
+ page_offset = page_size * (data_offset / page_size);
+ file_offset = page_offset;
+ head = data_offset - page_offset;
+
+ if (data_offset + data_size < file_size)
+ file_size = data_offset + data_size;
+
+ progress_next = file_size / 16;
+ progress = ui_progress__new("Processing events...", file_size);
+ if (progress == NULL)
+ return -1;
+
+ mmap_size = session->mmap_window;
+ if (mmap_size > file_size)
+ mmap_size = file_size;
+
+ memset(mmaps, 0, sizeof(mmaps));
mmap_prot = PROT_READ;
mmap_flags = MAP_SHARED;
- if (self->header.needs_swap) {
+ if (session->header.needs_swap) {
mmap_prot |= PROT_WRITE;
mmap_flags = MAP_PRIVATE;
}
remap:
- buf = mmap(NULL, page_size * self->mmap_window, mmap_prot,
- mmap_flags, self->fd, offset);
+ buf = mmap(NULL, mmap_size, mmap_prot, mmap_flags, session->fd,
+ file_offset);
if (buf == MAP_FAILED) {
pr_err("failed to mmap file\n");
err = -errno;
goto out_err;
}
+ mmaps[map_idx] = buf;
+ map_idx = (map_idx + 1) & (ARRAY_SIZE(mmaps) - 1);
+ file_pos = file_offset + head;
more:
event = (event_t *)(buf + head);
- ui_progress__update(progress, offset);
- if (self->header.needs_swap)
+ if (session->header.needs_swap)
perf_event_header__bswap(&event->header);
size = event->header.size;
if (size == 0)
size = 8;
- if (head + event->header.size >= page_size * self->mmap_window) {
- int munmap_ret;
-
- shift = page_size * (head / page_size);
-
- munmap_ret = munmap(buf, page_size * self->mmap_window);
- assert(munmap_ret == 0);
+ if (head + event->header.size >= mmap_size) {
+ if (mmaps[map_idx]) {
+ munmap(mmaps[map_idx], mmap_size);
+ mmaps[map_idx] = NULL;
+ }
- offset += shift;
- head -= shift;
+ page_offset = page_size * (head / page_size);
+ file_offset += page_offset;
+ head -= page_offset;
goto remap;
}
size = event->header.size;
dump_printf("\n%#Lx [%#x]: event: %d\n",
- offset + head, event->header.size, event->header.type);
+ file_pos, event->header.size, event->header.type);
if (size == 0 ||
- perf_session__process_event(self, event, ops, offset, head) < 0) {
+ perf_session__process_event(session, event, ops, file_pos) < 0) {
dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
- offset + head, event->header.size,
+ file_offset + head, event->header.size,
event->header.type);
/*
* assume we lost track of the stream, check alignment, and
}
head += size;
+ file_pos += size;
- if (offset + head >= data_offset + data_size)
- goto done;
+ if (file_pos >= progress_next) {
+ progress_next += file_size / 16;
+ ui_progress__update(progress, file_pos);
+ }
- if (offset + head < file_size)
+ if (file_pos < file_size)
goto more;
-done:
+
err = 0;
/* do the final flush for ordered samples */
- self->ordered_samples.next_flush = ULLONG_MAX;
- flush_sample_queue(self, ops);
+ session->ordered_samples.next_flush = ULLONG_MAX;
+ flush_sample_queue(session, ops);
out_err:
ui_progress__delete(progress);
+
+ if (ops->lost == event__process_lost &&
+ session->hists.stats.total_lost != 0) {
+ ui__warning("Processed %Lu events and LOST %Lu!\n\n"
+ "Check IO/CPU overload!\n\n",
+ session->hists.stats.total_period,
+ session->hists.stats.total_lost);
+ }
+
+ if (session->hists.stats.nr_unknown_events != 0) {
+ ui__warning("Found %u unknown events!\n\n"
+ "Is this an older tool processing a perf.data "
+ "file generated by a more recent tool?\n\n"
+ "If that is not the case, consider "
+ "reporting to linux-kernel@vger.kernel.org.\n\n",
+ session->hists.stats.nr_unknown_events);
+ }
+
+ perf_session_free_sample_buffers(session);
return err;
}
u64 last_flush;
u64 next_flush;
u64 max_timestamp;
- struct list_head samples_head;
- struct sample_queue *last_inserted;
+ struct list_head samples;
+ struct list_head sample_cache;
+ struct list_head to_free;
+ struct sample_queue *sample_buffer;
+ struct sample_queue *last_sample;
+ int sample_buffer_idx;
};
struct perf_session {
* We still haven't the actual symbols, so guess the
* last map final address.
*/
- curr->end = ~0UL;
+ curr->end = ~0ULL;
}
static void map_groups__fixup_end(struct map_groups *self)
return rc;
}
-static const char yes[] = "Yes", no[] = "No";
+static const char yes[] = "Yes", no[] = "No",
+ warning_str[] = "Warning!", ok[] = "Ok";
bool ui__dialog_yesno(const char *msg)
{
/* newtWinChoice should really be accepting const char pointers... */
return newtWinChoice(NULL, (char *)yes, (char *)no, (char *)msg) == 1;
}
+
+void ui__warning(const char *format, ...)
+{
+ va_list args;
+
+ va_start(args, format);
+ if (use_browser > 0)
+ newtWinMessagev((char *)warning_str, (char *)ok,
+ (char *)format, args);
+ else
+ vfprintf(stderr, format, args);
+ va_end(args);
+}