From: Ingo Molnar Date: Tue, 19 Aug 2008 01:34:07 +0000 (+0200) Subject: Merge branch 'x86/oprofile' into oprofile X-Git-Url: https://git.karo-electronics.de/?a=commitdiff_plain;h=2879a927bb7a3cf91ae3906a5e59215f9c17dd75;hp=20211e4d344729f4d4c93da37a590fc1c3a1fd9b;p=linux-beck.git Merge branch 'x86/oprofile' into oprofile --- diff --git a/arch/Kconfig b/arch/Kconfig index 364c6dadde0a..0267babe5eb9 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -13,6 +13,20 @@ config OPROFILE If unsure, say N. +config OPROFILE_IBS + bool "OProfile AMD IBS support (EXPERIMENTAL)" + default n + depends on OPROFILE && SMP && X86 + help + Instruction-Based Sampling (IBS) is a new profiling + technique that provides rich, precise program performance + information. IBS is introduced by AMD Family10h processors + (AMD Opteron Quad-Core processor “Barcelona”) to overcome + the limitations of conventional performance counter + sampling. + + If unsure, say N. + config HAVE_OPROFILE def_bool n diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index f88bd0d982b0..0ff576d026a4 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -646,6 +646,9 @@ int setup_profiling_timer(unsigned int multiplier) * * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. 
*/ #define APIC_EILVT_LVTOFF_MCE 0 @@ -669,6 +672,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); /* * Local APIC start and shutdown diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 446c062e831c..57744f4a75b4 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -204,6 +204,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. */ #define APIC_EILVT_LVTOFF_MCE 0 @@ -228,6 +231,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); /* * Program the next event, relative to now diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile index 30f3eb366667..446902b2a6b6 100644 --- a/arch/x86/oprofile/Makefile +++ b/arch/x86/oprofile/Makefile @@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o backtrace.o -oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ +oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ op_model_ppro.o op_model_p4.o oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3f90289410e6..fb4902bc6f14 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -1,10 +1,11 @@ /** * @file nmi_int.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2008 OProfile authors * @remark Read the file COPYING * * @author John Levon + * 
@author Robert Richter */ #include @@ -22,12 +23,18 @@ #include "op_counter.h" #include "op_x86_model.h" +DEFINE_PER_CPU(int, switch_index); + static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); static int nmi_start(void); static void nmi_stop(void); +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs); +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs); +static void nmi_cpu_stop(void *dummy); +static void nmi_cpu_start(void *dummy); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; @@ -80,6 +87,47 @@ static void exit_sysfs(void) #define exit_sysfs() do { } while (0) #endif /* CONFIG_PM */ +static void nmi_cpu_switch(void *dummy) +{ + int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_hardware_counters; + if ((si > model->num_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, smp_processor_id()) = 0; + else + per_cpu(switch_index, smp_processor_id()) = si; + + nmi_cpu_restore_mpx_registers(msrs); + model->setup_ctrs(msrs); + nmi_cpu_start(NULL); +} + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_hardware_counters].count ? 
0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (nmi_multiplex_on() < 0) + return -EINVAL; + + on_each_cpu(nmi_cpu_switch, NULL, 1); + + return 0; +} + static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -143,11 +191,10 @@ static void free_msrs(void) static int allocate_msrs(void) { - int success = 1; + int i, success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; - int i; for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, GFP_KERNEL); @@ -155,8 +202,8 @@ static int allocate_msrs(void) success = 0; break; } - per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, - GFP_KERNEL); + per_cpu(cpu_msrs, i).controls = + kmalloc(controls_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) { success = 0; break; @@ -200,7 +247,8 @@ static int nmi_setup(void) return err; } - /* We need to serialize save and setup for HT because the subset + /* + * We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations */ @@ -216,7 +264,6 @@ static int nmi_setup(void) per_cpu(cpu_msrs, 0).controls, sizeof(struct op_msr) * model->num_controls); } - } on_each_cpu(nmi_save_registers, NULL, 1); on_each_cpu(nmi_cpu_setup, NULL, 1); @@ -224,7 +271,41 @@ static int nmi_setup(void) return 0; } -static void nmi_restore_registers(struct op_msrs *msrs) +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + unsigned int si = __get_cpu_var(switch_index); + unsigned int const nr_ctrs = model->num_hardware_counters; + struct op_msr *counters = &msrs->counters[si]; + unsigned int i; + + for (i = 0; i < nr_ctrs; ++i) { + int offset = i + si; + if (counters[offset].addr) { + rdmsr(counters[offset].addr, + counters[offset].multiplex.low, + counters[offset].multiplex.high); + } + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + 
unsigned int si = __get_cpu_var(switch_index); + unsigned int const nr_ctrs = model->num_hardware_counters; + struct op_msr *counters = &msrs->counters[si]; + unsigned int i; + + for (i = 0; i < nr_ctrs; ++i) { + int offset = i + si; + if (counters[offset].addr) { + wrmsr(counters[offset].addr, + counters[offset].multiplex.low, + counters[offset].multiplex.high); + } + } +} + +static void nmi_cpu_restore_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; unsigned int const nr_ctrls = model->num_controls; @@ -264,7 +345,8 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); - nmi_restore_registers(msrs); + nmi_cpu_restore_registers(msrs); + __get_cpu_var(switch_index) = 0; } static void nmi_shutdown(void) @@ -327,6 +409,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + counter_config[i].save_count_low = 0; } return 0; @@ -411,6 +494,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) __u8 vendor = boot_cpu_data.x86_vendor; __u8 family = boot_cpu_data.x86; char *cpu_type; + int ret = 0; if (!cpu_has_apic) return -ENODEV; @@ -423,19 +507,23 @@ int __init op_nmi_init(struct oprofile_operations *ops) default: return -ENODEV; case 6: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "i386/athlon"; break; case 0xf: - model = &op_athlon_spec; + model = &op_amd_spec; /* Actually it could be i386/hammer too, but give user space an consistent name. 
*/ cpu_type = "x86-64/hammer"; break; case 0x10: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "x86-64/family10"; break; + case 0x11: + model = &op_amd_spec; + cpu_type = "x86-64/family11h"; + break; } break; @@ -462,14 +550,23 @@ int __init op_nmi_init(struct oprofile_operations *ops) return -ENODEV; } - init_sysfs(); - using_nmi = 1; + /* default values, can be overwritten by model */ + __get_cpu_var(switch_index) = 0; ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; + ops->switch_events = nmi_switch_event; + + if (model->init) + ret = model->init(ops); + if (ret) + return ret; + + init_sysfs(); + using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } @@ -478,4 +575,6 @@ void op_nmi_exit(void) { if (using_nmi) exit_sysfs(); + if (model->exit) + model->exit(); } diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 2880b15c4675..786d6e01cf7f 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -10,13 +10,14 @@ #ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 8 +#define OP_MAX_COUNTER 32 /* Per-perfctr configuration as set via * oprofilefs. 
*/ struct op_counter_config { unsigned long count; + unsigned long save_count_low; unsigned long enabled; unsigned long event; unsigned long kernel; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c new file mode 100644 index 000000000000..bbf2b68bcc5d --- /dev/null +++ b/arch/x86/oprofile/op_model_amd.c @@ -0,0 +1,559 @@ +/* + * @file op_model_amd.c + * athlon / K7 / K8 / Family 10h model-specific MSR operations + * + * @remark Copyright 2002-2008 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + * @author Robert Richter + * @author Barry Kasindorf +*/ + +#include +#include +#include +#include + +#include +#include +#include + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_COUNTERS 32 +#define NUM_HARDWARE_COUNTERS 4 +#define NUM_CONTROLS 32 +#define NUM_HARDWARE_CONTROLS 4 + +#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) +#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) +#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) + +#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 
1 : 0) +#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) +#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) +#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) +#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) +#define CTRL_CLEAR_LO(x) (x &= (1<<21)) +#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) +#define CTRL_SET_ENABLE(val) (val |= 1<<20) +#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) +#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) +#define CTRL_SET_UM(val, m) (val |= (m << 8)) +#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) +#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) +#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) +#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) + +static unsigned long reset_value[NUM_COUNTERS]; +DECLARE_PER_CPU(int, switch_index); + +#ifdef CONFIG_OPROFILE_IBS + +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ +#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ +#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ + +/*IbsOpCtl bits */ +#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ +#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ + +/* Codes used in cpu_buffer.c */ +/* This produces duplicate code, need to be fixed */ +#define IBS_FETCH_BEGIN 3 +#define IBS_OP_BEGIN 4 + +/* The function interface needs to be fixed, something like add + data. Should then be added to linux/oprofile.h. 
*/ +extern void oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, u8 code); + +struct ibs_fetch_sample { + /* MSRC001_1031 IBS Fetch Linear Address Register */ + unsigned int ibs_fetch_lin_addr_low; + unsigned int ibs_fetch_lin_addr_high; + /* MSRC001_1030 IBS Fetch Control Register */ + unsigned int ibs_fetch_ctl_low; + unsigned int ibs_fetch_ctl_high; + /* MSRC001_1032 IBS Fetch Physical Address Register */ + unsigned int ibs_fetch_phys_addr_low; + unsigned int ibs_fetch_phys_addr_high; +}; + +struct ibs_op_sample { + /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ + unsigned int ibs_op_rip_low; + unsigned int ibs_op_rip_high; + /* MSRC001_1035 IBS Op Data Register */ + unsigned int ibs_op_data1_low; + unsigned int ibs_op_data1_high; + /* MSRC001_1036 IBS Op Data 2 Register */ + unsigned int ibs_op_data2_low; + unsigned int ibs_op_data2_high; + /* MSRC001_1037 IBS Op Data 3 Register */ + unsigned int ibs_op_data3_low; + unsigned int ibs_op_data3_high; + /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ + unsigned int ibs_dc_linear_low; + unsigned int ibs_dc_linear_high; + /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ + unsigned int ibs_dc_phys_low; + unsigned int ibs_dc_phys_high; +}; + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ +*/ +static void clear_ibs_nmi(void); + +static int ibs_allowed; /* AMD Family10h and later */ + +struct op_ibs_config { + unsigned long op_enabled; + unsigned long fetch_enabled; + unsigned long max_cnt_fetch; + unsigned long max_cnt_op; + unsigned long rand_en; + unsigned long dispatched_ops; +}; + +static struct op_ibs_config ibs_config; + +#endif + +/* functions for op_amd_spec */ + +static void op_amd_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < NUM_COUNTERS; i++) { + int hw_counter = i % NUM_HARDWARE_COUNTERS; + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + hw_counter)) + 
msrs->counters[i].addr = MSR_K7_PERFCTR0 + hw_counter; + else + msrs->counters[i].addr = 0; + } + + for (i = 0; i < NUM_CONTROLS; i++) { + int hw_control = i % NUM_HARDWARE_CONTROLS; + if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + hw_control)) + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + hw_control; + else + msrs->controls[i].addr = 0; + } +} + + +static void op_amd_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0; i < NUM_HARDWARE_CONTROLS; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (counter_config[offset].enabled) + reset_value[offset] = counter_config[offset].count; + else + reset_value[offset] = 0; + } + + /* clear all counters */ + for (i = 0 ; i < NUM_HARDWARE_CONTROLS; ++i) { + if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + continue; + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); + CTRL_WRITE(low, high, msrs, i); + } + + /* avoid a false detection of ctr overflows in NMI handler */ + for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { + if (unlikely(!CTR_IS_RESERVED(msrs, i))) + continue; + CTR_WRITE(1, msrs, i); + } + + /* enable active counters */ + for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { + int offset = i + __get_cpu_var(switch_index); + if ((counter_config[offset].enabled) && (CTR_IS_RESERVED(msrs, i))) { + CTR_WRITE(counter_config[offset].count, msrs, i); + + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); + CTRL_SET_ENABLE(low); + CTRL_SET_USR(low, counter_config[offset].user); + CTRL_SET_KERN(low, counter_config[offset].kernel); + CTRL_SET_UM(low, counter_config[offset].unit_mask); + CTRL_SET_EVENT_LOW(low, counter_config[offset].event); + CTRL_SET_EVENT_HIGH(high, counter_config[offset].event); + CTRL_SET_HOST_ONLY(high, 0); + CTRL_SET_GUEST_ONLY(high, 0); + + CTRL_WRITE(low, high, msrs, i); + } + } +} + +#ifdef CONFIG_OPROFILE_IBS + +static inline int +op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const 
msrs) +{ + unsigned int low, high; + struct ibs_fetch_sample ibs_fetch; + struct ibs_op_sample ibs_op; + + if (!ibs_allowed) + return 1; + + if (ibs_config.fetch_enabled) { + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + if (high & IBS_FETCH_HIGH_VALID_BIT) { + ibs_fetch.ibs_fetch_ctl_high = high; + ibs_fetch.ibs_fetch_ctl_low = low; + rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); + ibs_fetch.ibs_fetch_lin_addr_high = high; + ibs_fetch.ibs_fetch_lin_addr_low = low; + rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); + ibs_fetch.ibs_fetch_phys_addr_high = high; + ibs_fetch.ibs_fetch_phys_addr_low = low; + + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_fetch, + IBS_FETCH_BEGIN); + + /*reenable the IRQ */ + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + high &= ~IBS_FETCH_HIGH_VALID_BIT; + high |= IBS_FETCH_HIGH_ENABLE; + low &= IBS_FETCH_LOW_MAX_CNT_MASK; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + } + + if (ibs_config.op_enabled) { + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + if (low & IBS_OP_LOW_VALID_BIT) { + rdmsr(MSR_AMD64_IBSOPRIP, low, high); + ibs_op.ibs_op_rip_low = low; + ibs_op.ibs_op_rip_high = high; + rdmsr(MSR_AMD64_IBSOPDATA, low, high); + ibs_op.ibs_op_data1_low = low; + ibs_op.ibs_op_data1_high = high; + rdmsr(MSR_AMD64_IBSOPDATA2, low, high); + ibs_op.ibs_op_data2_low = low; + ibs_op.ibs_op_data2_high = high; + rdmsr(MSR_AMD64_IBSOPDATA3, low, high); + ibs_op.ibs_op_data3_low = low; + ibs_op.ibs_op_data3_high = high; + rdmsr(MSR_AMD64_IBSDCLINAD, low, high); + ibs_op.ibs_dc_linear_low = low; + ibs_op.ibs_dc_linear_high = high; + rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); + ibs_op.ibs_dc_phys_low = low; + ibs_op.ibs_dc_phys_high = high; + + /* reenable the IRQ */ + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_op, + IBS_OP_BEGIN); + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + high = 0; + low &= ~IBS_OP_LOW_VALID_BIT; + low |= IBS_OP_LOW_ENABLE; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } + } + + return 1; +} + +#endif + +static int 
op_amd_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (!reset_value[offset]) + continue; + CTR_READ(low, high, msrs, i); + if (CTR_OVERFLOWED(low)) { + oprofile_add_sample(regs, offset); + CTR_WRITE(reset_value[offset], msrs, i); + } + } + +#ifdef CONFIG_OPROFILE_IBS + op_amd_handle_ibs(regs, msrs); +#endif + + /* See op_model_ppro.c */ + return 1; +} + +static void op_amd_start(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (reset_value[offset]) { + CTRL_READ(low, high, msrs, i); + CTRL_SET_ACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + } + +#ifdef CONFIG_OPROFILE_IBS + if (ibs_allowed && ibs_config.fetch_enabled) { + low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + high = IBS_FETCH_HIGH_ENABLE; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +#endif +} + + +static void op_amd_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* Subtle: stop on all counters to avoid race with + * setting our pm callback */ + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + if (!reset_value[i + per_cpu(switch_index, smp_processor_id())]) + continue; + CTRL_READ(low, high, msrs, i); + CTRL_SET_INACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + +#ifdef CONFIG_OPROFILE_IBS + if (ibs_allowed && ibs_config.fetch_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +#endif 
+} + +static void op_amd_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + if (CTR_IS_RESERVED(msrs, i)) + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + } + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + if (CTRL_IS_RESERVED(msrs, i)) + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} + +#ifndef CONFIG_OPROFILE_IBS + +/* no IBS support */ + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#else + +static u8 ibs_eilvt_off; + +static inline void apic_init_ibs_nmi_per_cpu(void *arg) +{ + ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); +} + +static inline void apic_clear_ibs_nmi_per_cpu(void *arg) +{ + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); +} + +static int pfm_amd64_setup_eilvt(void) +{ +#define IBSCTL_LVTOFFSETVAL (1 << 8) +#define IBSCTL 0x1cc + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + /* per CPU setup */ + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVTOFFSETVAL); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x", value); + return 1; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS"); + return 1; + } + +#ifdef CONFIG_NUMA + /* Sanity check */ + /* Works only for 64bit with proper numa implementation. 
*/ + if (nodes != num_possible_nodes()) { + printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " + "found: %d, expected %d", + nodes, num_possible_nodes()); + return 1; + } +#endif + return 0; +} + +/* + * initialize the APIC for the IBS interrupts + * if available (AMD Family10h rev B0 and later) + */ +static void setup_ibs(void) +{ + ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); + + if (!ibs_allowed) + return; + + if (pfm_amd64_setup_eilvt()) { + ibs_allowed = 0; + return; + } + + printk(KERN_INFO "oprofile: AMD IBS detected\n"); +} + + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h + * rev B0 and later */ +static void clear_ibs_nmi(void) +{ + if (ibs_allowed) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); +} + +static int (*create_arch_files)(struct super_block * sb, struct dentry * root); + +static int setup_ibs_files(struct super_block * sb, struct dentry * root) +{ + char buf[12]; + struct dentry *dir; + int ret = 0; + + /* architecture specific files */ + if (create_arch_files) + ret = create_arch_files(sb, root); + + if (ret) + return ret; + + if (!ibs_allowed) + return ret; + + /* model specific files */ + + /* setup some reasonable defaults */ + ibs_config.max_cnt_fetch = 250000; + ibs_config.fetch_enabled = 0; + ibs_config.max_cnt_op = 250000; + ibs_config.op_enabled = 0; + ibs_config.dispatched_ops = 1; + snprintf(buf, sizeof(buf), "ibs_fetch"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_fetch); + snprintf(buf, sizeof(buf), "ibs_uops"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_op); + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + 
&ibs_config.dispatched_ops); + + return 0; +} + +static int op_amd_init(struct oprofile_operations *ops) +{ + setup_ibs(); + create_arch_files = ops->create_files; + ops->create_files = setup_ibs_files; + return 0; +} + +static void op_amd_exit(void) +{ + clear_ibs_nmi(); +} + +#endif + +struct op_x86_model_spec const op_amd_spec = { + .init = op_amd_init, + .exit = op_amd_exit, + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .num_hardware_counters = NUM_HARDWARE_COUNTERS, + .num_hardware_controls = NUM_HARDWARE_CONTROLS, + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown +}; diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c deleted file mode 100644 index 3d534879a9dc..000000000000 --- a/arch/x86/oprofile/op_model_athlon.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * @file op_model_athlon.h - * athlon / K7 / K8 / Family 10h model-specific MSR operations - * - * @remark Copyright 2002 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - * @author Philippe Elie - * @author Graydon Hoare - */ - -#include -#include -#include -#include - -#include "op_x86_model.h" -#include "op_counter.h" - -#define NUM_COUNTERS 4 -#define NUM_CONTROLS 4 - -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 
1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR_LO(x) (x &= (1<<21)) -#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) -#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) -#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) -#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) - -static unsigned long reset_value[NUM_COUNTERS]; - -static void athlon_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < NUM_COUNTERS; i++) { - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; - } - - for (i = 0; i < NUM_CONTROLS; i++) { - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; - } -} - - -static void athlon_setup_ctrs(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) - continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_WRITE(low, high, msrs, i); - } - - /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) - continue; - CTR_WRITE(1, msrs, i); - } - - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { - reset_value[i] = 
counter_config[i].count; - - CTR_WRITE(counter_config[i].count, msrs, i); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - CTRL_SET_HOST_ONLY(high, 0); - CTRL_SET_GUEST_ONLY(high, 0); - - CTRL_WRITE(low, high, msrs, i); - } else { - reset_value[i] = 0; - } - } -} - - -static int athlon_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) - continue; - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); - } - } - - /* See op_model_ppro.c */ - return 1; -} - - -static void athlon_start(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - } -} - - -static void athlon_stop(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - /* Subtle: stop on all counters to avoid race with - * setting our pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (!reset_value[i]) - continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } -} - -static void athlon_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } -} - -struct op_x86_model_spec const op_athlon_spec = { - .num_counters = 
NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .fill_in_addresses = &athlon_fill_in_addresses, - .setup_ctrs = &athlon_setup_ctrs, - .check_ctrs = &athlon_check_ctrs, - .start = &athlon_start, - .stop = &athlon_stop, - .shutdown = &athlon_shutdown -}; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 43ac5af338d8..cacba61ffbac 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -700,6 +700,8 @@ static void p4_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, + .num_hardware_counters = NUM_COUNTERS_HT2, + .num_hardware_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -712,6 +714,8 @@ struct op_x86_model_spec const op_p4_ht2_spec = { struct op_x86_model_spec const op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, + .num_hardware_counters = NUM_COUNTERS_NON_HT, + .num_hardware_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index eff431f6c57b..e5811aa480eb 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -183,6 +183,8 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .num_hardware_counters = NUM_COUNTERS, + .num_hardware_controls = NUM_CONTROLS, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 45b605fa71d0..e07ba1076371 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ 
b/arch/x86/oprofile/op_x86_model.h @@ -19,6 +19,7 @@ struct op_saved_msr { struct op_msr { unsigned long addr; struct op_saved_msr saved; + struct op_saved_msr multiplex; }; struct op_msrs { @@ -32,6 +33,10 @@ struct pt_regs; * various x86 CPU models' perfctr support. */ struct op_x86_model_spec { + int (*init)(struct oprofile_operations *ops); + void (*exit)(void); + unsigned int const num_hardware_counters; + unsigned int const num_hardware_controls; unsigned int const num_counters; unsigned int const num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); @@ -46,6 +51,6 @@ struct op_x86_model_spec { extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; -extern struct op_x86_model_spec const op_athlon_spec; +extern struct op_x86_model_spec const op_amd_spec; #endif /* OP_X86_MODEL_H */ diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 9304c4555079..ed982273fb8b 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -5,6 +5,7 @@ * @remark Read the file COPYING * * @author John Levon + * @author Barry Kasindorf * * This is the core of the buffer management. Each * CPU buffer is processed and entered into the @@ -33,7 +34,7 @@ #include "event_buffer.h" #include "cpu_buffer.h" #include "buffer_sync.h" - + static LIST_HEAD(dying_tasks); static LIST_HEAD(dead_tasks); static cpumask_t marked_cpus = CPU_MASK_NONE; @@ -48,10 +49,11 @@ static void process_task_mortuary(void); * Can be invoked from softirq via RCU callback due to * call_rcu() of the task struct, hence the _irqsave. 
*/ -static int task_free_notify(struct notifier_block * self, unsigned long val, void * data) +static int +task_free_notify(struct notifier_block *self, unsigned long val, void *data) { unsigned long flags; - struct task_struct * task = data; + struct task_struct *task = data; spin_lock_irqsave(&task_mortuary, flags); list_add(&task->tasks, &dying_tasks); spin_unlock_irqrestore(&task_mortuary, flags); @@ -62,13 +64,14 @@ static int task_free_notify(struct notifier_block * self, unsigned long val, voi /* The task is on its way out. A sync of the buffer means we can catch * any remaining samples for this task. */ -static int task_exit_notify(struct notifier_block * self, unsigned long val, void * data) +static int +task_exit_notify(struct notifier_block *self, unsigned long val, void *data) { /* To avoid latency problems, we only process the current CPU, * hoping that most samples for the task are on this CPU */ sync_buffer(raw_smp_processor_id()); - return 0; + return 0; } @@ -77,11 +80,12 @@ static int task_exit_notify(struct notifier_block * self, unsigned long val, voi * we don't lose any. This does not have to be exact, it's a QoI issue * only. */ -static int munmap_notify(struct notifier_block * self, unsigned long val, void * data) +static int +munmap_notify(struct notifier_block *self, unsigned long val, void *data) { unsigned long addr = (unsigned long)data; - struct mm_struct * mm = current->mm; - struct vm_area_struct * mpnt; + struct mm_struct *mm = current->mm; + struct vm_area_struct *mpnt; down_read(&mm->mmap_sem); @@ -99,11 +103,12 @@ static int munmap_notify(struct notifier_block * self, unsigned long val, void * return 0; } - + /* We need to be told about new modules so we don't attribute to a previously * loaded module, or drop the samples on the floor. 
*/ -static int module_load_notify(struct notifier_block * self, unsigned long val, void * data) +static int +module_load_notify(struct notifier_block *self, unsigned long val, void *data) { #ifdef CONFIG_MODULES if (val != MODULE_STATE_COMING) @@ -118,7 +123,7 @@ static int module_load_notify(struct notifier_block * self, unsigned long val, v return 0; } - + static struct notifier_block task_free_nb = { .notifier_call = task_free_notify, }; @@ -135,7 +140,7 @@ static struct notifier_block module_load_nb = { .notifier_call = module_load_notify, }; - + static void end_sync(void) { end_cpu_work(); @@ -208,14 +213,14 @@ static inline unsigned long fast_get_dcookie(struct path *path) * not strictly necessary but allows oprofile to associate * shared-library samples with particular applications */ -static unsigned long get_exec_dcookie(struct mm_struct * mm) +static unsigned long get_exec_dcookie(struct mm_struct *mm) { unsigned long cookie = NO_COOKIE; - struct vm_area_struct * vma; - + struct vm_area_struct *vma; + if (!mm) goto out; - + for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!vma->vm_file) continue; @@ -235,13 +240,14 @@ out: * sure to do this lookup before a mm->mmap modification happens so * we don't lose track. 
*/ -static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, off_t * offset) +static unsigned long +lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset) { unsigned long cookie = NO_COOKIE; - struct vm_area_struct * vma; + struct vm_area_struct *vma; for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { - + if (addr < vma->vm_start || addr >= vma->vm_end) continue; @@ -263,9 +269,20 @@ static unsigned long lookup_dcookie(struct mm_struct * mm, unsigned long addr, o return cookie; } +static void increment_tail(struct oprofile_cpu_buffer *b) +{ + unsigned long new_tail = b->tail_pos + 1; + + rmb(); /* be sure fifo pointers are synchronized */ + + if (new_tail < b->buffer_size) + b->tail_pos = new_tail; + else + b->tail_pos = 0; +} static unsigned long last_cookie = INVALID_COOKIE; - + static void add_cpu_switch(int i) { add_event_entry(ESCAPE_CODE); @@ -278,16 +295,16 @@ static void add_kernel_ctx_switch(unsigned int in_kernel) { add_event_entry(ESCAPE_CODE); if (in_kernel) - add_event_entry(KERNEL_ENTER_SWITCH_CODE); + add_event_entry(KERNEL_ENTER_SWITCH_CODE); else - add_event_entry(KERNEL_EXIT_SWITCH_CODE); + add_event_entry(KERNEL_EXIT_SWITCH_CODE); } - + static void -add_user_ctx_switch(struct task_struct const * task, unsigned long cookie) +add_user_ctx_switch(struct task_struct const *task, unsigned long cookie) { add_event_entry(ESCAPE_CODE); - add_event_entry(CTX_SWITCH_CODE); + add_event_entry(CTX_SWITCH_CODE); add_event_entry(task->pid); add_event_entry(cookie); /* Another code for daemon back-compat */ @@ -296,7 +313,7 @@ add_user_ctx_switch(struct task_struct const * task, unsigned long cookie) add_event_entry(task->tgid); } - + static void add_cookie_switch(unsigned long cookie) { add_event_entry(ESCAPE_CODE); @@ -304,13 +321,78 @@ static void add_cookie_switch(unsigned long cookie) add_event_entry(cookie); } - + static void add_trace_begin(void) { add_event_entry(ESCAPE_CODE); 
add_event_entry(TRACE_BEGIN_CODE); } +#ifdef CONFIG_OPROFILE_IBS + +#define IBS_FETCH_CODE_SIZE 2 +#define IBS_OP_CODE_SIZE 5 +#define IBS_EIP(offset) \ + (((struct op_sample *)&cpu_buf->buffer[(offset)])->eip) +#define IBS_EVENT(offset) \ + (((struct op_sample *)&cpu_buf->buffer[(offset)])->event) + +/* + * Add IBS fetch and op entries to event buffer + */ +static void add_ibs_begin(struct oprofile_cpu_buffer *cpu_buf, int code, + int in_kernel, struct mm_struct *mm) +{ + unsigned long rip; + int i, count; + unsigned long ibs_cookie = 0; + off_t offset; + + increment_tail(cpu_buf); /* move to RIP entry */ + + rip = IBS_EIP(cpu_buf->tail_pos); + +#ifdef __LP64__ + rip += IBS_EVENT(cpu_buf->tail_pos) << 32; +#endif + + if (mm) { + ibs_cookie = lookup_dcookie(mm, rip, &offset); + + if (ibs_cookie == NO_COOKIE) + offset = rip; + if (ibs_cookie == INVALID_COOKIE) { + atomic_inc(&oprofile_stats.sample_lost_no_mapping); + offset = rip; + } + if (ibs_cookie != last_cookie) { + add_cookie_switch(ibs_cookie); + last_cookie = ibs_cookie; + } + } else + offset = rip; + + add_event_entry(ESCAPE_CODE); + add_event_entry(code); + add_event_entry(offset); /* Offset from Dcookie */ + + /* we send the Dcookie offset, but also send the raw linear address */ + add_event_entry(IBS_EIP(cpu_buf->tail_pos)); + add_event_entry(IBS_EVENT(cpu_buf->tail_pos)); + + if (code == IBS_FETCH_CODE) + count = IBS_FETCH_CODE_SIZE; /*IBS FETCH is 2 int64s*/ + else + count = IBS_OP_CODE_SIZE; /*IBS OP is 5 int64s*/ + + for (i = 0; i < count; i++) { + increment_tail(cpu_buf); + add_event_entry(IBS_EIP(cpu_buf->tail_pos)); + add_event_entry(IBS_EVENT(cpu_buf->tail_pos)); + } +} + +#endif static void add_sample_entry(unsigned long offset, unsigned long event) { @@ -319,13 +401,13 @@ static void add_sample_entry(unsigned long offset, unsigned long event) } -static int add_us_sample(struct mm_struct * mm, struct op_sample * s) +static int add_us_sample(struct mm_struct *mm, struct op_sample *s) { unsigned long 
cookie; off_t offset; - - cookie = lookup_dcookie(mm, s->eip, &offset); - + + cookie = lookup_dcookie(mm, s->eip, &offset); + if (cookie == INVALID_COOKIE) { atomic_inc(&oprofile_stats.sample_lost_no_mapping); return 0; @@ -341,13 +423,13 @@ static int add_us_sample(struct mm_struct * mm, struct op_sample * s) return 1; } - + /* Add a sample to the global event buffer. If possible the * sample is converted into a persistent dentry/offset pair * for later lookup from userspace. */ static int -add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel) +add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel) { if (in_kernel) { add_sample_entry(s->eip, s->event); @@ -359,9 +441,9 @@ add_sample(struct mm_struct * mm, struct op_sample * s, int in_kernel) } return 0; } - -static void release_mm(struct mm_struct * mm) + +static void release_mm(struct mm_struct *mm) { if (!mm) return; @@ -370,9 +452,9 @@ static void release_mm(struct mm_struct * mm) } -static struct mm_struct * take_tasks_mm(struct task_struct * task) +static struct mm_struct *take_tasks_mm(struct task_struct *task) { - struct mm_struct * mm = get_task_mm(task); + struct mm_struct *mm = get_task_mm(task); if (mm) down_read(&mm->mmap_sem); return mm; @@ -383,10 +465,10 @@ static inline int is_code(unsigned long val) { return val == ESCAPE_CODE; } - + /* "acquire" as many cpu buffer slots as we can */ -static unsigned long get_slots(struct oprofile_cpu_buffer * b) +static unsigned long get_slots(struct oprofile_cpu_buffer *b) { unsigned long head = b->head_pos; unsigned long tail = b->tail_pos; @@ -412,19 +494,6 @@ static unsigned long get_slots(struct oprofile_cpu_buffer * b) } -static void increment_tail(struct oprofile_cpu_buffer * b) -{ - unsigned long new_tail = b->tail_pos + 1; - - rmb(); - - if (new_tail < b->buffer_size) - b->tail_pos = new_tail; - else - b->tail_pos = 0; -} - - /* Move tasks along towards death. 
Any tasks on dead_tasks * will definitely have no remaining references in any * CPU buffers at this point, because we use two lists, @@ -435,8 +504,8 @@ static void process_task_mortuary(void) { unsigned long flags; LIST_HEAD(local_dead_tasks); - struct task_struct * task; - struct task_struct * ttask; + struct task_struct *task; + struct task_struct *ttask; spin_lock_irqsave(&task_mortuary, flags); @@ -493,7 +562,7 @@ void sync_buffer(int cpu) { struct oprofile_cpu_buffer *cpu_buf = &per_cpu(cpu_buffer, cpu); struct mm_struct *mm = NULL; - struct task_struct * new; + struct task_struct *new; unsigned long cookie = 0; int in_kernel = 1; unsigned int i; @@ -501,7 +570,7 @@ void sync_buffer(int cpu) unsigned long available; mutex_lock(&buffer_mutex); - + add_cpu_switch(cpu); /* Remember, only we can modify tail_pos */ @@ -509,8 +578,8 @@ void sync_buffer(int cpu) available = get_slots(cpu_buf); for (i = 0; i < available; ++i) { - struct op_sample * s = &cpu_buf->buffer[cpu_buf->tail_pos]; - + struct op_sample *s = &cpu_buf->buffer[cpu_buf->tail_pos]; + if (is_code(s->eip)) { if (s->event <= CPU_IS_KERNEL) { /* kernel/userspace switch */ @@ -521,8 +590,18 @@ void sync_buffer(int cpu) } else if (s->event == CPU_TRACE_BEGIN) { state = sb_bt_start; add_trace_begin(); +#ifdef CONFIG_OPROFILE_IBS + } else if (s->event == IBS_FETCH_BEGIN) { + state = sb_bt_start; + add_ibs_begin(cpu_buf, + IBS_FETCH_CODE, in_kernel, mm); + } else if (s->event == IBS_OP_BEGIN) { + state = sb_bt_start; + add_ibs_begin(cpu_buf, + IBS_OP_CODE, in_kernel, mm); +#endif } else { - struct mm_struct * oldmm = mm; + struct mm_struct *oldmm = mm; /* userspace context switch */ new = (struct task_struct *)s->event; @@ -533,13 +612,11 @@ void sync_buffer(int cpu) cookie = get_exec_dcookie(mm); add_user_ctx_switch(new, cookie); } - } else { - if (state >= sb_bt_start && - !add_sample(mm, s, in_kernel)) { - if (state == sb_bt_start) { - state = sb_bt_ignore; - 
atomic_inc(&oprofile_stats.bt_lost_no_mapping); - } + } else if (state >= sb_bt_start && + !add_sample(mm, s, in_kernel)) { + if (state == sb_bt_start) { + state = sb_bt_ignore; + atomic_inc(&oprofile_stats.bt_lost_no_mapping); } } diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 2450b3a393ff..4decab624e76 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -5,6 +5,7 @@ * @remark Read the file COPYING * * @author John Levon + * @author Barry Kasindorf * * Each CPU has a local buffer that stores PC value/event * pairs. We also log context switches when we notice them. @@ -207,7 +208,7 @@ static int log_sample(struct oprofile_cpu_buffer * cpu_buf, unsigned long pc, return 1; } -static int oprofile_begin_trace(struct oprofile_cpu_buffer * cpu_buf) +static int oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) { if (nr_available_slots(cpu_buf) < 4) { cpu_buf->sample_lost_overflow++; @@ -252,6 +253,75 @@ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) oprofile_add_ext_sample(pc, regs, event, is_kernel); } +#ifdef CONFIG_OPROFILE_IBS + +#define MAX_IBS_SAMPLE_SIZE 14 +static int log_ibs_sample(struct oprofile_cpu_buffer *cpu_buf, + unsigned long pc, int is_kernel, unsigned int *ibs, int ibs_code) +{ + struct task_struct *task; + + cpu_buf->sample_received++; + + if (nr_available_slots(cpu_buf) < MAX_IBS_SAMPLE_SIZE) { + cpu_buf->sample_lost_overflow++; + return 0; + } + + is_kernel = !!is_kernel; + + /* notice a switch from user->kernel or vice versa */ + if (cpu_buf->last_is_kernel != is_kernel) { + cpu_buf->last_is_kernel = is_kernel; + add_code(cpu_buf, is_kernel); + } + + /* notice a task switch */ + if (!is_kernel) { + task = current; + + if (cpu_buf->last_task != task) { + cpu_buf->last_task = task; + add_code(cpu_buf, (unsigned long)task); + } + } + + add_code(cpu_buf, ibs_code); + add_sample(cpu_buf, ibs[0], ibs[1]); + add_sample(cpu_buf, ibs[2], ibs[3]); + 
add_sample(cpu_buf, ibs[4], ibs[5]); + + if (ibs_code == IBS_OP_BEGIN) { + add_sample(cpu_buf, ibs[6], ibs[7]); + add_sample(cpu_buf, ibs[8], ibs[9]); + add_sample(cpu_buf, ibs[10], ibs[11]); + } + + return 1; +} + +void oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, u8 code) +{ + int is_kernel = !user_mode(regs); + unsigned long pc = profile_pc(regs); + + struct oprofile_cpu_buffer *cpu_buf = + &per_cpu(cpu_buffer, smp_processor_id()); + + if (!backtrace_depth) { + log_ibs_sample(cpu_buf, pc, is_kernel, ibs_sample, code); + return; + } + + /* if log_ibs_sample() fails we can't backtrace since we lost the source + * of this event */ + if (log_ibs_sample(cpu_buf, pc, is_kernel, ibs_sample, code)) + oprofile_ops.backtrace(regs, backtrace_depth); +} + +#endif + void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); @@ -294,7 +364,7 @@ static void wq_sync_buffer(struct work_struct *work) struct oprofile_cpu_buffer * b = container_of(work, struct oprofile_cpu_buffer, work.work); if (b->cpu != smp_processor_id()) { - printk("WQ on CPU%d, prefer CPU%d\n", + printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n", smp_processor_id(), b->cpu); } sync_buffer(b->cpu); diff --git a/drivers/oprofile/cpu_buffer.h b/drivers/oprofile/cpu_buffer.h index c3e366b52261..9c44d004da69 100644 --- a/drivers/oprofile/cpu_buffer.h +++ b/drivers/oprofile/cpu_buffer.h @@ -55,5 +55,7 @@ void cpu_buffer_reset(struct oprofile_cpu_buffer * cpu_buf); /* transient events for the CPU buffer -> event buffer */ #define CPU_IS_KERNEL 1 #define CPU_TRACE_BEGIN 2 +#define IBS_FETCH_BEGIN 3 +#define IBS_OP_BEGIN 4 #endif /* OPROFILE_CPU_BUFFER_H */ diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 2c645170f06e..b2fa5df64a62 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include 
#include "oprof.h" @@ -19,13 +21,18 @@ #include "cpu_buffer.h" #include "buffer_sync.h" #include "oprofile_stats.h" + +static unsigned long is_setup; +static void switch_worker(struct work_struct *work); +static DECLARE_DELAYED_WORK(switch_work, switch_worker); +static DEFINE_MUTEX(start_mutex); struct oprofile_operations oprofile_ops; +unsigned long timeout_jiffies; unsigned long oprofile_started; unsigned long backtrace_depth; -static unsigned long is_setup; -static DEFINE_MUTEX(start_mutex); +/* Multiplexing defaults at 1 msec*/ /* timer 0 - use performance monitoring hardware if available @@ -87,6 +94,16 @@ out: return err; } +static void start_switch_worker(void) +{ + schedule_delayed_work(&switch_work, timeout_jiffies); +} + +static void switch_worker(struct work_struct *work) +{ + if (!oprofile_ops.switch_events()) + start_switch_worker(); +} /* Actually start profiling (echo 1>/dev/oprofile/enable) */ int oprofile_start(void) @@ -94,7 +111,6 @@ int oprofile_start(void) int err = -EINVAL; mutex_lock(&start_mutex); - if (!is_setup) goto out; @@ -108,6 +124,9 @@ int oprofile_start(void) if ((err = oprofile_ops.start())) goto out; + if (oprofile_ops.switch_events) + start_switch_worker(); + oprofile_started = 1; out: mutex_unlock(&start_mutex); @@ -123,6 +142,7 @@ void oprofile_stop(void) goto out; oprofile_ops.stop(); oprofile_started = 0; + cancel_delayed_work_sync(&switch_work); /* wake up the daemon to read what remains */ wake_up_buffer_waiter(); out: @@ -155,6 +175,32 @@ post_sync: mutex_unlock(&start_mutex); } +/* User inputs in ms, converts to jiffies */ +int oprofile_set_timeout(unsigned long val_msec) +{ + int err = 0; + + mutex_lock(&start_mutex); + + if (oprofile_started) { + err = -EBUSY; + goto out; + } + + if (!oprofile_ops.switch_events) { + err = -EINVAL; + goto out; + } + + timeout_jiffies = msecs_to_jiffies(val_msec); + if (timeout_jiffies == MAX_JIFFY_OFFSET) + timeout_jiffies = msecs_to_jiffies(1); + +out: + mutex_unlock(&start_mutex); + 
return err; + +} int oprofile_set_backtrace(unsigned long val) { @@ -179,10 +225,16 @@ out: return err; } +static void __init oprofile_switch_timer_init(void) +{ + timeout_jiffies = msecs_to_jiffies(1); +} + static int __init oprofile_init(void) { int err; + oprofile_switch_timer_init(); err = oprofile_arch_init(&oprofile_ops); if (err < 0 || timer) { diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index 18323650806e..c4406a7366bb 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -27,7 +27,8 @@ extern unsigned long fs_buffer_watershed; extern struct oprofile_operations oprofile_ops; extern unsigned long oprofile_started; extern unsigned long backtrace_depth; - +extern unsigned long timeout_jiffies; + struct super_block; struct dentry; @@ -35,5 +36,6 @@ void oprofile_create_files(struct super_block * sb, struct dentry * root); void oprofile_timer_init(struct oprofile_operations * ops); int oprofile_set_backtrace(unsigned long depth); +int oprofile_set_timeout(unsigned long time); #endif /* OPROF_H */ diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index ef953ba5ab6b..cc4f5a1f8ef2 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -9,6 +9,7 @@ #include #include +#include #include "event_buffer.h" #include "oprofile_stats.h" @@ -18,6 +19,40 @@ unsigned long fs_buffer_size = 131072; unsigned long fs_cpu_buffer_size = 8192; unsigned long fs_buffer_watershed = 32768; /* FIXME: tune */ +static ssize_t timeout_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + return oprofilefs_ulong_to_user(jiffies_to_msecs(timeout_jiffies), + buf, count, offset); +} + + +static ssize_t timeout_write(struct file *file, char const __user *buf, + size_t count, loff_t *offset) +{ + unsigned long val; + int retval; + + if (*offset) + return -EINVAL; + + retval = oprofilefs_ulong_from_user(&val, buf, count); + if (retval) + return retval; + + retval = 
oprofile_set_timeout(val); + + if (retval) + return retval; + return count; +} + +static const struct file_operations timeout_fops = { + .read = timeout_read, + .write = timeout_write, +}; + + static ssize_t depth_read(struct file * file, char __user * buf, size_t count, loff_t * offset) { return oprofilefs_ulong_to_user(backtrace_depth, buf, count, offset); @@ -85,11 +120,10 @@ static ssize_t enable_write(struct file * file, char const __user * buf, size_t if (*offset) return -EINVAL; - retval = oprofilefs_ulong_from_user(&val, buf, count); if (retval) return retval; - + if (val) retval = oprofile_start(); else @@ -129,6 +163,7 @@ void oprofile_create_files(struct super_block * sb, struct dentry * root) oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops); + oprofilefs_create_file(sb, root, "timeout_ms", &timeout_fops); oprofile_create_stats_files(sb, root); if (oprofile_ops.create_files) oprofile_ops.create_files(sb, root); diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 041bb31100f4..687f2f4c36a1 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -36,6 +36,8 @@ #define XEN_ENTER_SWITCH_CODE 10 #define SPU_PROFILING_CODE 11 #define SPU_CTX_SWITCH_CODE 12 +#define IBS_FETCH_CODE 13 +#define IBS_OP_CODE 14 struct super_block; struct dentry; @@ -65,6 +67,9 @@ struct oprofile_operations { /* Initiate a stack backtrace. Optional. */ void (*backtrace)(struct pt_regs * const regs, unsigned int depth); + + /* Multiplex between different events. Optional. */ + int (*switch_events)(void); /* CPU identification string. 
*/ char * cpu_type; }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 9ec2bcce8e83..4463ca5b8934 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -497,6 +497,11 @@ #define PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP 0x1101 #define PCI_DEVICE_ID_AMD_K8_NB_MEMCTL 0x1102 #define PCI_DEVICE_ID_AMD_K8_NB_MISC 0x1103 +#define PCI_DEVICE_ID_AMD_10H_NB_HT 0x1200 +#define PCI_DEVICE_ID_AMD_10H_NB_MAP 0x1201 +#define PCI_DEVICE_ID_AMD_10H_NB_DRAM 0x1202 +#define PCI_DEVICE_ID_AMD_10H_NB_MISC 0x1203 +#define PCI_DEVICE_ID_AMD_10H_NB_LINK 0x1204 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 #define PCI_DEVICE_ID_AMD_SCSI 0x2020