git.karo-electronics.de Git - linux-beck.git/commitdiff
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author	Linus Torvalds <torvalds@linux-foundation.org>
Sat, 5 Dec 2009 23:30:21 +0000 (15:30 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Sat, 5 Dec 2009 23:30:21 +0000 (15:30 -0800)
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (470 commits)
  x86: Fix comments of register/stack access functions
  perf tools: Replace %m with %a in sscanf
  hw-breakpoints: Keep track of user disabled breakpoints
  tracing/syscalls: Make syscall events print callbacks static
  tracing: Add DEFINE_EVENT(), DEFINE_SINGLE_EVENT() support to docbook
  perf: Don't free perf_mmap_data until work has been done
  perf_event: Fix compile error
  perf tools: Fix _GNU_SOURCE macro related strndup() build error
  trace_syscalls: Remove unused syscall_name_to_nr()
  trace_syscalls: Simplify syscall profile
  trace_syscalls: Remove duplicate init_enter_##sname()
  trace_syscalls: Add syscall_nr field to struct syscall_metadata
  trace_syscalls: Remove enter_id exit_id
  trace_syscalls: Set event_enter_##sname->data to its metadata
  trace_syscalls: Remove unused event_syscall_enter and event_syscall_exit
  perf_event: Initialize data.period in perf_swevent_hrtimer()
  perf probe: Simplify event naming
  perf probe: Add --list option for listing current probe events
  perf probe: Add argv_split() from lib/argv_split.c
  perf probe: Move probe event utility functions to probe-event.c
  ...

251 files changed:
Documentation/DocBook/tracepoint.tmpl
Documentation/trace/kprobetrace.txt [new file with mode: 0644]
arch/Kconfig
arch/powerpc/Kconfig.debug
arch/powerpc/configs/pseries_defconfig
arch/powerpc/include/asm/emulated_ops.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/trace.h [new file with mode: 0644]
arch/powerpc/kernel/align.c
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/perf_event.c
arch/powerpc/kernel/power5+-pmu.c
arch/powerpc/kernel/power5-pmu.c
arch/powerpc/kernel/power6-pmu.c
arch/powerpc/kernel/power7-pmu.c
arch/powerpc/kernel/ppc970-pmu.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/time.c
arch/powerpc/kernel/traps.c
arch/powerpc/lib/copypage_64.S
arch/powerpc/platforms/pseries/hvCall.S
arch/powerpc/platforms/pseries/hvCall_inst.c
arch/powerpc/platforms/pseries/lpar.c
arch/x86/Kconfig
arch/x86/Kconfig.debug
arch/x86/Makefile
arch/x86/include/asm/Kbuild
arch/x86/include/asm/a.out-core.h
arch/x86/include/asm/debugreg.h
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/hw_breakpoint.h [new file with mode: 0644]
arch/x86/include/asm/inat.h [new file with mode: 0644]
arch/x86/include/asm/inat_types.h [new file with mode: 0644]
arch/x86/include/asm/insn.h [new file with mode: 0644]
arch/x86/include/asm/mce.h
arch/x86/include/asm/perf_event.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/ptrace.h
arch/x86/kernel/Makefile
arch/x86/kernel/cpu/Makefile
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/cpu/perf_event.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/hw_breakpoint.c [new file with mode: 0644]
arch/x86/kernel/irq.c
arch/x86/kernel/kgdb.c
arch/x86/kernel/kprobes.c
arch/x86/kernel/machine_kexec_32.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/setup.c
arch/x86/kernel/signal.c
arch/x86/kernel/traps.c
arch/x86/kvm/x86.c
arch/x86/lib/.gitignore [new file with mode: 0644]
arch/x86/lib/Makefile
arch/x86/lib/inat.c [new file with mode: 0644]
arch/x86/lib/insn.c [new file with mode: 0644]
arch/x86/lib/x86-opcode-map.txt [new file with mode: 0644]
arch/x86/mm/fault.c
arch/x86/mm/kmmio.c
arch/x86/power/cpu.c
arch/x86/tools/Makefile [new file with mode: 0644]
arch/x86/tools/chkobjdump.awk [new file with mode: 0644]
arch/x86/tools/distill.awk [new file with mode: 0644]
arch/x86/tools/gen-insn-attr-x86.awk [new file with mode: 0644]
arch/x86/tools/test_get_len.c [new file with mode: 0644]
drivers/edac/edac_mce_amd.c
include/linux/ftrace_event.h
include/linux/hw_breakpoint.h [new file with mode: 0644]
include/linux/kprobes.h
include/linux/perf_counter.h
include/linux/perf_event.h
include/linux/syscalls.h
include/linux/tracepoint.h
include/trace/define_trace.h
include/trace/events/bkl.h
include/trace/events/block.h
include/trace/events/ext4.h
include/trace/events/irq.h
include/trace/events/jbd2.h
include/trace/events/kmem.h
include/trace/events/lock.h [moved from include/trace/events/lockdep.h with 92% similarity]
include/trace/events/mce.h [new file with mode: 0644]
include/trace/events/module.h
include/trace/events/power.h
include/trace/events/sched.h
include/trace/events/signal.h [new file with mode: 0644]
include/trace/events/timer.h
include/trace/events/workqueue.h
include/trace/ftrace.h
include/trace/syscall.h
kernel/Makefile
kernel/exit.c
kernel/hw_breakpoint.c [new file with mode: 0644]
kernel/kallsyms.c
kernel/kprobes.c
kernel/lockdep.c
kernel/notifier.c
kernel/perf_event.c
kernel/signal.c
kernel/trace/Kconfig
kernel/trace/Makefile
kernel/trace/ring_buffer.c
kernel/trace/trace.h
kernel/trace/trace_entries.h
kernel/trace/trace_event_profile.c
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_export.c
kernel/trace/trace_kprobe.c [new file with mode: 0644]
kernel/trace/trace_ksym.c [new file with mode: 0644]
kernel/trace/trace_selftest.c
kernel/trace/trace_syscalls.c
samples/Kconfig
samples/Makefile
samples/hw_breakpoint/Makefile [new file with mode: 0644]
samples/hw_breakpoint/data_breakpoint.c [new file with mode: 0644]
scripts/kernel-doc
tools/perf/.gitignore
tools/perf/Documentation/perf-bench.txt [new file with mode: 0644]
tools/perf/Documentation/perf-buildid-list.txt [new file with mode: 0644]
tools/perf/Documentation/perf-kmem.txt [new file with mode: 0644]
tools/perf/Documentation/perf-probe.txt [new file with mode: 0644]
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-timechart.txt
tools/perf/Documentation/perf-trace-perl.txt [new file with mode: 0644]
tools/perf/Documentation/perf-trace.txt
tools/perf/Makefile
tools/perf/bench/bench.h [new file with mode: 0644]
tools/perf/bench/mem-memcpy.c [new file with mode: 0644]
tools/perf/bench/sched-messaging.c [new file with mode: 0644]
tools/perf/bench/sched-pipe.c [new file with mode: 0644]
tools/perf/builtin-annotate.c
tools/perf/builtin-bench.c [new file with mode: 0644]
tools/perf/builtin-buildid-list.c [new file with mode: 0644]
tools/perf/builtin-help.c
tools/perf/builtin-kmem.c [new file with mode: 0644]
tools/perf/builtin-probe.c [new file with mode: 0644]
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-stat.c
tools/perf/builtin-timechart.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/builtin.h
tools/perf/command-list.txt
tools/perf/design.txt
tools/perf/perf.c
tools/perf/perf.h
tools/perf/scripts/perl/Perf-Trace-Util/Context.c [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/Context.xs [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/Makefile.PL [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/README [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/typemap [new file with mode: 0644]
tools/perf/scripts/perl/bin/check-perf-trace-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/check-perf-trace-report [new file with mode: 0644]
tools/perf/scripts/perl/bin/rw-by-file-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/rw-by-file-report [new file with mode: 0644]
tools/perf/scripts/perl/bin/rw-by-pid-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/rw-by-pid-report [new file with mode: 0644]
tools/perf/scripts/perl/bin/wakeup-latency-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/wakeup-latency-report [new file with mode: 0644]
tools/perf/scripts/perl/bin/workqueue-stats-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/workqueue-stats-report [new file with mode: 0644]
tools/perf/scripts/perl/check-perf-trace.pl [new file with mode: 0644]
tools/perf/scripts/perl/rw-by-file.pl [new file with mode: 0644]
tools/perf/scripts/perl/rw-by-pid.pl [new file with mode: 0644]
tools/perf/scripts/perl/wakeup-latency.pl [new file with mode: 0644]
tools/perf/scripts/perl/workqueue-stats.pl [new file with mode: 0644]
tools/perf/util/cache.h
tools/perf/util/callchain.c
tools/perf/util/callchain.h
tools/perf/util/color.h
tools/perf/util/ctype.c
tools/perf/util/data_map.c [new file with mode: 0644]
tools/perf/util/data_map.h [new file with mode: 0644]
tools/perf/util/debug.c
tools/perf/util/debug.h
tools/perf/util/debugfs.c [new file with mode: 0644]
tools/perf/util/debugfs.h [new file with mode: 0644]
tools/perf/util/event.c [new file with mode: 0644]
tools/perf/util/event.h
tools/perf/util/exec_cmd.h
tools/perf/util/header.c
tools/perf/util/header.h
tools/perf/util/help.h
tools/perf/util/hist.c [new file with mode: 0644]
tools/perf/util/hist.h [new file with mode: 0644]
tools/perf/util/include/asm/asm-offsets.h [new file with mode: 0644]
tools/perf/util/include/asm/bitops.h [new file with mode: 0644]
tools/perf/util/include/asm/bug.h [new file with mode: 0644]
tools/perf/util/include/asm/byteorder.h [new file with mode: 0644]
tools/perf/util/include/asm/swab.h [new file with mode: 0644]
tools/perf/util/include/asm/uaccess.h [new file with mode: 0644]
tools/perf/util/include/linux/bitmap.h [new file with mode: 0644]
tools/perf/util/include/linux/bitops.h [new file with mode: 0644]
tools/perf/util/include/linux/compiler.h [new file with mode: 0644]
tools/perf/util/include/linux/ctype.h [new file with mode: 0644]
tools/perf/util/include/linux/kernel.h
tools/perf/util/include/linux/string.h [new file with mode: 0644]
tools/perf/util/include/linux/types.h [new file with mode: 0644]
tools/perf/util/levenshtein.h
tools/perf/util/map.c
tools/perf/util/module.c [deleted file]
tools/perf/util/module.h [deleted file]
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-options.h
tools/perf/util/probe-event.c [new file with mode: 0644]
tools/perf/util/probe-event.h [new file with mode: 0644]
tools/perf/util/probe-finder.c [new file with mode: 0644]
tools/perf/util/probe-finder.h [new file with mode: 0644]
tools/perf/util/quote.h
tools/perf/util/run-command.h
tools/perf/util/sigchain.h
tools/perf/util/sort.c [new file with mode: 0644]
tools/perf/util/sort.h [new file with mode: 0644]
tools/perf/util/strbuf.h
tools/perf/util/string.c
tools/perf/util/string.h
tools/perf/util/strlist.h
tools/perf/util/svghelper.h
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/thread.c
tools/perf/util/thread.h
tools/perf/util/trace-event-info.c
tools/perf/util/trace-event-parse.c
tools/perf/util/trace-event-perl.c [new file with mode: 0644]
tools/perf/util/trace-event-perl.h [new file with mode: 0644]
tools/perf/util/trace-event-read.c
tools/perf/util/trace-event.h
tools/perf/util/types.h
tools/perf/util/util.h
tools/perf/util/values.h
tools/perf/util/wrapper.c

index b0756d0fd57910a44d2f2d29fdb4edf87dac21ca..8bca1d5cec09a8bf6c8c5f0e6f159d770fcf806c 100644 (file)
@@ -86,4 +86,9 @@
 !Iinclude/trace/events/irq.h
   </chapter>
 
+  <chapter id="signal">
+   <title>SIGNAL</title>
+!Iinclude/trace/events/signal.h
+  </chapter>
+
 </book>
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
new file mode 100644 (file)
index 0000000..47aabee
--- /dev/null
@@ -0,0 +1,149 @@
+                        Kprobe-based Event Tracing
+                        ==========================
+
+                 Documentation is written by Masami Hiramatsu
+
+
+Overview
+--------
+These events are similar to tracepoint-based events. Instead of tracepoints,
+they are based on kprobes (kprobe and kretprobe), so they can probe wherever
+kprobes can probe (that is, everywhere except the bodies of __kprobes
+functions). Unlike tracepoint-based events, they can be added and removed
+dynamically, on the fly.
+
+To enable this feature, build your kernel with CONFIG_KPROBE_EVENT=y.
+
+Similar to the events tracer, this doesn't need to be activated via
+current_tracer. Instead, add probe points via
+/sys/kernel/debug/tracing/kprobe_events, and enable them via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/enable.
+
+
+Synopsis of kprobe_events
+-------------------------
+  p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS]    : Set a probe
+  r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS]               : Set a return probe
+
+ GRP           : Group name. If omitted, use "kprobes" for it.
+ EVENT         : Event name. If omitted, the event name is generated
+                 based on SYMBOL+offs or MEMADDR.
+ SYMBOL[+offs] : Symbol+offset where the probe is inserted.
+ MEMADDR       : Address where the probe is inserted.
+
+ FETCHARGS     : Arguments. Each probe can have up to 128 args.
+  %REG         : Fetch register REG
+  @ADDR                : Fetch memory at ADDR (ADDR should be in kernel)
+  @SYM[+|-offs]        : Fetch memory at SYM +|- offs (SYM should be a data symbol)
+  $stackN      : Fetch Nth entry of stack (N >= 0)
+  $stack       : Fetch stack address.
+  $argN                : Fetch function argument. (N >= 0)(*)
+  $retval      : Fetch return value.(**)
+  +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
+  NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
+
+  (*) $argN may not be correct for asmlinkage'd functions or in the middle
+      of a function body.
+  (**) available only for return probes.
+  (***) useful for fetching a field of a data structure.
+
+
+Per-Probe Event Filtering
+-------------------------
+ The per-probe event filtering feature allows you to set a different filter
+on each probe and to choose which arguments will be shown in the trace
+buffer. If an event name is specified right after 'p:' or 'r:' in
+kprobe_events, an event is added under tracing/events/kprobes/<EVENT>, and in
+that directory you will find 'id', 'enable', 'format' and 'filter'.
+
+enable:
+  You can enable/disable the probe by writing 1 or 0 to it.
+
+format:
+  This shows the format of this probe event.
+
+filter:
+  You can write filtering rules for this event.
+
+id:
+  This shows the id of this probe event.
+
+
+Event Profiling
+---------------
+ You can check the total number of probe hits and probe miss-hits via
+/sys/kernel/debug/tracing/kprobe_profile.
+ The first column is the event name, the second is the number of probe hits,
+and the third is the number of probe miss-hits.
+
+
+Usage examples
+--------------
+To add a probe as a new event, write a new definition to kprobe_events
+as below.
+
+  echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events
+
+ This sets a kprobe at the entry of the do_sys_open() function and records
+its 1st to 4th arguments as the "myprobe" event. As this example shows, users
+can choose more familiar names for each argument.
+
+  echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events
+
+ This sets a kretprobe on the return point of the do_sys_open() function and
+records its return value as the "myretprobe" event.
+ You can see the format of these events via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/format.
+
+  cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
+name: myprobe
+ID: 75
+format:
+       field:unsigned short common_type;       offset:0;       size:2;
+       field:unsigned char common_flags;       offset:2;       size:1;
+       field:unsigned char common_preempt_count;       offset:3;       size:1;
+       field:int common_pid;   offset:4;       size:4;
+       field:int common_tgid;  offset:8;       size:4;
+
+       field: unsigned long ip;        offset:16;      size:8;
+       field: int nargs;       offset:24;      size:4;
+       field: unsigned long dfd;       offset:32;      size:8;
+       field: unsigned long filename;  offset:40;      size:8;
+       field: unsigned long flags;     offset:48;      size:8;
+       field: unsigned long mode;      offset:56;      size:8;
+
+print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode
+
+
+ You can see that the event has 4 arguments as in the expressions you specified.
+
+  echo > /sys/kernel/debug/tracing/kprobe_events
+
+ This clears all probe points.
+
+ Right after definition, each event is disabled by default. To trace these
+events, you need to enable them.
+
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
+
+ And you can see the traced information via /sys/kernel/debug/tracing/trace.
+
+  cat /sys/kernel/debug/tracing/trace
+# tracer: nop
+#
+#           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+#              | |       |          |         |
+           <...>-1447  [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
+           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $retval=fffffffffffffffe
+           <...>-1447  [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6
+           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
+           <...>-1447  [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10
+           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
+
+
+ Each line shows when the kernel hit an event, and <- SYMBOL means the kernel
+returned from SYMBOL (e.g. "sys_open+0x1b/0x1d <- do_sys_open" means the
+kernel returned from do_sys_open to sys_open+0x1b).
+
+
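
The synopsis in the new document also allows a group prefix, @SYM fetches,
$stack/$stackN and NAME=FETCHARG naming, which its usage examples do not
exercise. A minimal sketch combining them (not part of the patch; the probed
symbol and argument names are only illustrative):

  echo 'p:mygrp/myopen do_sys_open flags=$arg2 j=@jiffies sp=$stack top=$stack0' > /sys/kernel/debug/tracing/kprobe_events
  cat /sys/kernel/debug/tracing/kprobe_profile    # per-event hit and miss-hit counts
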
index 7f418bbc261a0f825ab6072f935f1f974a2f746c..eef3bbb970753c1d840cb9c1520147850dda6800 100644 (file)
@@ -126,4 +126,11 @@ config HAVE_DMA_API_DEBUG
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
        bool
 
+config HAVE_HW_BREAKPOINT
+       bool
+       depends on HAVE_PERF_EVENTS
+       select ANON_INODES
+       select PERF_EVENTS
+
+
 source "kernel/gcov/Kconfig"
index 3b1005185390557558c99d3efb971824b8b76b68..bf3382f1904d04d0f0807490c1449691f540a4b7 100644 (file)
@@ -46,7 +46,7 @@ config DEBUG_STACK_USAGE
 
 config HCALL_STATS
        bool "Hypervisor call instrumentation"
-       depends on PPC_PSERIES && DEBUG_FS
+       depends on PPC_PSERIES && DEBUG_FS && TRACEPOINTS
        help
          Adds code to keep track of the number of hypervisor calls made and
          the amount of time spent in hypervisor calls.  Wall time spent in
index f1889abb89b1a196e61861413237f0ecd423368c..c568329723b8d5371f9cb920dbd9f811af9bcb81 100644 (file)
@@ -1683,7 +1683,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 # CONFIG_DEBUG_STACK_USAGE is not set
 # CONFIG_DEBUG_PAGEALLOC is not set
-CONFIG_HCALL_STATS=y
+# CONFIG_HCALL_STATS is not set
 # CONFIG_CODE_PATCHING_SELFTEST is not set
 # CONFIG_FTR_FIXUP_SELFTEST is not set
 # CONFIG_MSI_BITMAP_SELFTEST is not set
index 9154e8526732cf2f3c217f450244f4d8ed5f0f04..f0fb4fc1f6e6878a2617fbdf6782c5e2883af39b 100644 (file)
@@ -19,6 +19,7 @@
 #define _ASM_POWERPC_EMULATED_OPS_H
 
 #include <asm/atomic.h>
+#include <linux/perf_event.h>
 
 
 #ifdef CONFIG_PPC_EMULATED_STATS
@@ -57,7 +58,7 @@ extern u32 ppc_warn_emulated;
 
 extern void ppc_warn_emulated_print(const char *type);
 
-#define PPC_WARN_EMULATED(type)                                                 \
+#define __PPC_WARN_EMULATED(type)                                       \
        do {                                                             \
                atomic_inc(&ppc_emulated.type.val);                      \
                if (ppc_warn_emulated)                                   \
@@ -66,8 +67,22 @@ extern void ppc_warn_emulated_print(const char *type);
 
 #else /* !CONFIG_PPC_EMULATED_STATS */
 
-#define PPC_WARN_EMULATED(type)        do { } while (0)
+#define __PPC_WARN_EMULATED(type)      do { } while (0)
 
 #endif /* !CONFIG_PPC_EMULATED_STATS */
 
+#define PPC_WARN_EMULATED(type, regs)                                  \
+       do {                                                            \
+               perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,           \
+                       1, 0, regs, 0);                                 \
+               __PPC_WARN_EMULATED(type);                              \
+       } while (0)
+
+#define PPC_WARN_ALIGNMENT(type, regs)                                 \
+       do {                                                            \
+               perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS,           \
+                       1, 0, regs, regs->dar);                         \
+               __PPC_WARN_EMULATED(type);                              \
+       } while (0)
+
 #endif /* _ASM_POWERPC_EMULATED_OPS_H */
index 6251a4b10be7a9a2004592b6aa7dff7994c09b0e..c27caac47ad1d6c92980ea78d13e03778053a13f 100644 (file)
@@ -274,6 +274,8 @@ struct hcall_stats {
        unsigned long   num_calls;      /* number of calls (on this CPU) */
        unsigned long   tb_total;       /* total wall time (mftb) of calls. */
        unsigned long   purr_total;     /* total cpu time (PURR) of calls. */
+       unsigned long   tb_start;
+       unsigned long   purr_start;
 };
 #define HCALL_STAT_ARRAY_SIZE  ((MAX_HCALL_OPCODE >> 2) + 1)
 
index 6315edc205d8673e1db6702224b477cee0ea602e..bc8dd53f718a1b201a16e91f4864bb221f5acb25 100644 (file)
 #define SPRN_MMCR1     798
 #define SPRN_MMCRA     0x312
 #define   MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
+#define   MMCRA_SDAR_DCACHE_MISS 0x40000000UL
+#define   MMCRA_SDAR_ERAT_MISS   0x20000000UL
 #define   MMCRA_SIHV   0x10000000UL /* state of MSR HV when SIAR set */
 #define   MMCRA_SIPR   0x08000000UL /* state of MSR PR when SIAR set */
 #define   MMCRA_SLOT   0x07000000UL /* SLOT bits (37-39) */
diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
new file mode 100644 (file)
index 0000000..cbe2297
--- /dev/null
@@ -0,0 +1,133 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM powerpc
+
+#if !defined(_TRACE_POWERPC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_POWERPC_H
+
+#include <linux/tracepoint.h>
+
+struct pt_regs;
+
+TRACE_EVENT(irq_entry,
+
+       TP_PROTO(struct pt_regs *regs),
+
+       TP_ARGS(regs),
+
+       TP_STRUCT__entry(
+               __field(struct pt_regs *, regs)
+       ),
+
+       TP_fast_assign(
+               __entry->regs = regs;
+       ),
+
+       TP_printk("pt_regs=%p", __entry->regs)
+);
+
+TRACE_EVENT(irq_exit,
+
+       TP_PROTO(struct pt_regs *regs),
+
+       TP_ARGS(regs),
+
+       TP_STRUCT__entry(
+               __field(struct pt_regs *, regs)
+       ),
+
+       TP_fast_assign(
+               __entry->regs = regs;
+       ),
+
+       TP_printk("pt_regs=%p", __entry->regs)
+);
+
+TRACE_EVENT(timer_interrupt_entry,
+
+       TP_PROTO(struct pt_regs *regs),
+
+       TP_ARGS(regs),
+
+       TP_STRUCT__entry(
+               __field(struct pt_regs *, regs)
+       ),
+
+       TP_fast_assign(
+               __entry->regs = regs;
+       ),
+
+       TP_printk("pt_regs=%p", __entry->regs)
+);
+
+TRACE_EVENT(timer_interrupt_exit,
+
+       TP_PROTO(struct pt_regs *regs),
+
+       TP_ARGS(regs),
+
+       TP_STRUCT__entry(
+               __field(struct pt_regs *, regs)
+       ),
+
+       TP_fast_assign(
+               __entry->regs = regs;
+       ),
+
+       TP_printk("pt_regs=%p", __entry->regs)
+);
+
+#ifdef CONFIG_PPC_PSERIES
+extern void hcall_tracepoint_regfunc(void);
+extern void hcall_tracepoint_unregfunc(void);
+
+TRACE_EVENT_FN(hcall_entry,
+
+       TP_PROTO(unsigned long opcode, unsigned long *args),
+
+       TP_ARGS(opcode, args),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, opcode)
+       ),
+
+       TP_fast_assign(
+               __entry->opcode = opcode;
+       ),
+
+       TP_printk("opcode=%lu", __entry->opcode),
+
+       hcall_tracepoint_regfunc, hcall_tracepoint_unregfunc
+);
+
+TRACE_EVENT_FN(hcall_exit,
+
+       TP_PROTO(unsigned long opcode, unsigned long retval,
+               unsigned long *retbuf),
+
+       TP_ARGS(opcode, retval, retbuf),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, opcode)
+               __field(unsigned long, retval)
+       ),
+
+       TP_fast_assign(
+               __entry->opcode = opcode;
+               __entry->retval = retval;
+       ),
+
+       TP_printk("opcode=%lu retval=%lu", __entry->opcode, __entry->retval),
+
+       hcall_tracepoint_regfunc, hcall_tracepoint_unregfunc
+);
+#endif
+
+#endif /* _TRACE_POWERPC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+
+#define TRACE_INCLUDE_PATH asm
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
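
The events above are declared under TRACE_SYSTEM powerpc, so they show up in
the standard trace events directory once the kernel is built with tracepoint
support. A rough sketch of observing them at runtime, assuming debugfs is
mounted at /sys/kernel/debug:

  echo 1 > /sys/kernel/debug/tracing/events/powerpc/irq_entry/enable
  echo 1 > /sys/kernel/debug/tracing/events/powerpc/hcall_entry/enable   # pseries kernels only
  cat /sys/kernel/debug/tracing/trace
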
index a5b632e52faea9dab9c63921430d572ca1e9f0f2..3839839f83c79ad8544ef0bbf6ded33e0d96ce13 100644 (file)
@@ -732,7 +732,7 @@ int fix_alignment(struct pt_regs *regs)
 
 #ifdef CONFIG_SPE
        if ((instr >> 26) == 0x4) {
-               PPC_WARN_EMULATED(spe);
+               PPC_WARN_ALIGNMENT(spe, regs);
                return emulate_spe(regs, reg, instr);
        }
 #endif
@@ -786,7 +786,7 @@ int fix_alignment(struct pt_regs *regs)
                        flags |= SPLT;
                        nb = 8;
                }
-               PPC_WARN_EMULATED(vsx);
+               PPC_WARN_ALIGNMENT(vsx, regs);
                return emulate_vsx(addr, reg, areg, regs, flags, nb);
        }
 #endif
@@ -794,7 +794,7 @@ int fix_alignment(struct pt_regs *regs)
         * the exception of DCBZ which is handled as a special case here
         */
        if (instr == DCBZ) {
-               PPC_WARN_EMULATED(dcbz);
+               PPC_WARN_ALIGNMENT(dcbz, regs);
                return emulate_dcbz(regs, addr);
        }
        if (unlikely(nb == 0))
@@ -804,7 +804,7 @@ int fix_alignment(struct pt_regs *regs)
         * function
         */
        if (flags & M) {
-               PPC_WARN_EMULATED(multiple);
+               PPC_WARN_ALIGNMENT(multiple, regs);
                return emulate_multiple(regs, addr, reg, nb,
                                        flags, instr, swiz);
        }
@@ -825,11 +825,11 @@ int fix_alignment(struct pt_regs *regs)
 
        /* Special case for 16-byte FP loads and stores */
        if (nb == 16) {
-               PPC_WARN_EMULATED(fp_pair);
+               PPC_WARN_ALIGNMENT(fp_pair, regs);
                return emulate_fp_pair(addr, reg, flags);
        }
 
-       PPC_WARN_EMULATED(unaligned);
+       PPC_WARN_ALIGNMENT(unaligned, regs);
 
        /* If we are loading, get the data from user space, else
         * get it from register values
index 9763267e38b46cbcdb2d6c9bfffd83d61b1eeb08..bdcb557d470ab1a3f272f0783e0e77244f88cbe2 100644 (file)
@@ -551,7 +551,7 @@ restore:
 BEGIN_FW_FTR_SECTION
        ld      r5,SOFTE(r1)
 FW_FTR_SECTION_ELSE
-       b       iseries_check_pending_irqs
+       b       .Liseries_check_pending_irqs
 ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 2:
        TRACE_AND_RESTORE_IRQ(r5);
@@ -623,7 +623,7 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 
 #endif /* CONFIG_PPC_BOOK3E */
 
-iseries_check_pending_irqs:
+.Liseries_check_pending_irqs:
 #ifdef CONFIG_PPC_ISERIES
        ld      r5,SOFTE(r1)
        cmpdi   0,r5,0
index 1808876edcc91d4f43fdff8fde2c8e1770fc832f..c7eb4e0eb86cae27f3fcd510519d5177b124fa5e 100644 (file)
@@ -185,12 +185,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
         * prolog code of the PerformanceMonitor one. A little
         * trickery is thus necessary
         */
+performance_monitor_pSeries_1:
        . = 0xf00
        b       performance_monitor_pSeries
 
+altivec_unavailable_pSeries_1:
        . = 0xf20
        b       altivec_unavailable_pSeries
 
+vsx_unavailable_pSeries_1:
        . = 0xf40
        b       vsx_unavailable_pSeries
 
index e5d1211779840f029841e1591952c3b5ec9b4fae..02a334662cc02148bf6f1cb9ba85cf118d799059 100644 (file)
@@ -70,6 +70,8 @@
 #include <asm/firmware.h>
 #include <asm/lv1call.h>
 #endif
+#define CREATE_TRACE_POINTS
+#include <asm/trace.h>
 
 int __irq_offset_value;
 static int ppc_spurious_interrupts;
@@ -325,6 +327,8 @@ void do_IRQ(struct pt_regs *regs)
        struct pt_regs *old_regs = set_irq_regs(regs);
        unsigned int irq;
 
+       trace_irq_entry(regs);
+
        irq_enter();
 
        check_stack_overflow();
@@ -348,6 +352,8 @@ void do_IRQ(struct pt_regs *regs)
                timer_interrupt(regs);
        }
 #endif
+
+       trace_irq_exit(regs);
 }
 
 void __init init_IRQ(void)
index 87f1663584b054182b3f7719fe96088e637fb78c..1eb85fbf53a50277f48be5ae5d8c57772d216d61 100644 (file)
@@ -1165,7 +1165,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
         */
        if (record) {
                struct perf_sample_data data = {
-                       .addr   = 0,
+                       .addr   = ~0ULL,
                        .period = event->hw.last_period,
                };
 
index 0f4c1c73a6adea4ca405b3ec223faca3370e0ee3..199de527d411446918651bd374bc5e9586ace46f 100644 (file)
 #define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
 #define MMCR1_PMCSEL_MSK       0x7f
 
-/*
- * Bits in MMCRA
- */
-
 /*
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
index c351b3a57fbb3361cb04103e95f514b86dcaff52..98b6a729a9dd127cc2c88e799b58bdbea2fa313c 100644 (file)
 #define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
 #define MMCR1_PMCSEL_MSK       0x7f
 
-/*
- * Bits in MMCRA
- */
-
 /*
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
@@ -390,7 +386,7 @@ static int power5_compute_mmcr(u64 event[], int n_ev,
                               unsigned int hwc[], unsigned long mmcr[])
 {
        unsigned long mmcr1 = 0;
-       unsigned long mmcra = 0;
+       unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
        unsigned int pmc, unit, byte, psel;
        unsigned int ttm, grp;
        int i, isbus, bit, grsel;
index ca399ba5034c9d13373484476312b6c7bc8691f2..84a607bda8fbc129562943d7d2ce7fe0ee0f11bd 100644 (file)
@@ -178,7 +178,7 @@ static int p6_compute_mmcr(u64 event[], int n_ev,
                           unsigned int hwc[], unsigned long mmcr[])
 {
        unsigned long mmcr1 = 0;
-       unsigned long mmcra = 0;
+       unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
        int i;
        unsigned int pmc, ev, b, u, s, psel;
        unsigned int ttmset = 0;
index 28a4daacdc0222cc6b80c3824bf1529ea2f9556f..852f7b7f6b4045801df807b997c7b110426ce7a6 100644 (file)
 #define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
 #define MMCR1_PMCSEL_MSK       0xff
 
-/*
- * Bits in MMCRA
- */
-
 /*
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
@@ -230,7 +226,7 @@ static int power7_compute_mmcr(u64 event[], int n_ev,
                               unsigned int hwc[], unsigned long mmcr[])
 {
        unsigned long mmcr1 = 0;
-       unsigned long mmcra = 0;
+       unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
        unsigned int pmc, unit, combine, l2sel, psel;
        unsigned int pmc_inuse = 0;
        int i;
index 479574413a93fa5c7aa3b38e28537b2830517f90..8eff48e20dba8ae056f9f4469642a5cdc453aefc 100644 (file)
@@ -83,10 +83,6 @@ static short mmcr1_adder_bits[8] = {
        MMCR1_PMC8_ADDER_SEL_SH
 };
 
-/*
- * Bits in MMCRA
- */
-
 /*
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
index 4271f7a655a3adfb5e0636e83c8f6e59b61e1868..845c72ab7357884c580a1c6f3f3217ed4987ee75 100644 (file)
@@ -660,6 +660,7 @@ late_initcall(check_cache_coherency);
 
 #ifdef CONFIG_DEBUG_FS
 struct dentry *powerpc_debugfs_root;
+EXPORT_SYMBOL(powerpc_debugfs_root);
 
 static int powerpc_debugfs_init(void)
 {
index a136a11c490d0f36a9b32812132e682c38262f8f..36707dec94d775376c0b59eac099ff5e8da75116 100644 (file)
@@ -54,6 +54,7 @@
 #include <linux/irq.h>
 #include <linux/delay.h>
 #include <linux/perf_event.h>
+#include <asm/trace.h>
 
 #include <asm/io.h>
 #include <asm/processor.h>
@@ -571,6 +572,8 @@ void timer_interrupt(struct pt_regs * regs)
        struct clock_event_device *evt = &decrementer->event;
        u64 now;
 
+       trace_timer_interrupt_entry(regs);
+
        /* Ensure a positive value is written to the decrementer, or else
         * some CPUs will continuue to take decrementer exceptions */
        set_dec(DECREMENTER_MAX);
@@ -590,6 +593,7 @@ void timer_interrupt(struct pt_regs * regs)
                now = decrementer->next_tb - now;
                if (now <= DECREMENTER_MAX)
                        set_dec((int)now);
+               trace_timer_interrupt_exit(regs);
                return;
        }
        old_regs = set_irq_regs(regs);
@@ -620,6 +624,8 @@ void timer_interrupt(struct pt_regs * regs)
 
        irq_exit();
        set_irq_regs(old_regs);
+
+       trace_timer_interrupt_exit(regs);
 }
 
 void wakeup_decrementer(void)
index 6f0ae1a9bfae6d78b31afdf15f1c22d554e5900d..9d1f9354d6cafcf12fd7719d599615e31e55b2d9 100644 (file)
@@ -759,7 +759,7 @@ static int emulate_instruction(struct pt_regs *regs)
 
        /* Emulate the mfspr rD, PVR. */
        if ((instword & PPC_INST_MFSPR_PVR_MASK) == PPC_INST_MFSPR_PVR) {
-               PPC_WARN_EMULATED(mfpvr);
+               PPC_WARN_EMULATED(mfpvr, regs);
                rd = (instword >> 21) & 0x1f;
                regs->gpr[rd] = mfspr(SPRN_PVR);
                return 0;
@@ -767,7 +767,7 @@ static int emulate_instruction(struct pt_regs *regs)
 
        /* Emulating the dcba insn is just a no-op.  */
        if ((instword & PPC_INST_DCBA_MASK) == PPC_INST_DCBA) {
-               PPC_WARN_EMULATED(dcba);
+               PPC_WARN_EMULATED(dcba, regs);
                return 0;
        }
 
@@ -776,7 +776,7 @@ static int emulate_instruction(struct pt_regs *regs)
                int shift = (instword >> 21) & 0x1c;
                unsigned long msk = 0xf0000000UL >> shift;
 
-               PPC_WARN_EMULATED(mcrxr);
+               PPC_WARN_EMULATED(mcrxr, regs);
                regs->ccr = (regs->ccr & ~msk) | ((regs->xer >> shift) & msk);
                regs->xer &= ~0xf0000000UL;
                return 0;
@@ -784,19 +784,19 @@ static int emulate_instruction(struct pt_regs *regs)
 
        /* Emulate load/store string insn. */
        if ((instword & PPC_INST_STRING_GEN_MASK) == PPC_INST_STRING) {
-               PPC_WARN_EMULATED(string);
+               PPC_WARN_EMULATED(string, regs);
                return emulate_string_inst(regs, instword);
        }
 
        /* Emulate the popcntb (Population Count Bytes) instruction. */
        if ((instword & PPC_INST_POPCNTB_MASK) == PPC_INST_POPCNTB) {
-               PPC_WARN_EMULATED(popcntb);
+               PPC_WARN_EMULATED(popcntb, regs);
                return emulate_popcntb_inst(regs, instword);
        }
 
        /* Emulate isel (Integer Select) instruction */
        if ((instword & PPC_INST_ISEL_MASK) == PPC_INST_ISEL) {
-               PPC_WARN_EMULATED(isel);
+               PPC_WARN_EMULATED(isel, regs);
                return emulate_isel(regs, instword);
        }
 
@@ -995,7 +995,7 @@ void SoftwareEmulation(struct pt_regs *regs)
 #ifdef CONFIG_MATH_EMULATION
        errcode = do_mathemu(regs);
        if (errcode >= 0)
-               PPC_WARN_EMULATED(math);
+               PPC_WARN_EMULATED(math, regs);
 
        switch (errcode) {
        case 0:
@@ -1018,7 +1018,7 @@ void SoftwareEmulation(struct pt_regs *regs)
 #elif defined(CONFIG_8XX_MINIMAL_FPEMU)
        errcode = Soft_emulate_8xx(regs);
        if (errcode >= 0)
-               PPC_WARN_EMULATED(8xx);
+               PPC_WARN_EMULATED(8xx, regs);
 
        switch (errcode) {
        case 0:
@@ -1129,7 +1129,7 @@ void altivec_assist_exception(struct pt_regs *regs)
 
        flush_altivec_to_thread(current);
 
-       PPC_WARN_EMULATED(altivec);
+       PPC_WARN_EMULATED(altivec, regs);
        err = emulate_altivec(regs);
        if (err == 0) {
                regs->nip += 4;         /* skip emulated instruction */
index 75f3267fdc300977f26ac6cc6e2a0db41add6525..e68beac0a171f41918e37a47aed0722bc0081218 100644 (file)
@@ -26,11 +26,11 @@ BEGIN_FTR_SECTION
        srd     r8,r5,r11
 
        mtctr   r8
-setup:
+.Lsetup:
        dcbt    r9,r4
        dcbz    r9,r3
        add     r9,r9,r12
-       bdnz    setup
+       bdnz    .Lsetup
 END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
        addi    r3,r3,-8
        srdi    r8,r5,7         /* page is copied in 128 byte strides */
index c1427b3634ec8341206d575e4643c7633ffc977d..383a5d0e9818a07758463bf3ddf29a835eb9af99 100644 (file)
        
 #define STK_PARM(i)     (48 + ((i)-3)*8)
 
-#ifdef CONFIG_HCALL_STATS
+#ifdef CONFIG_TRACEPOINTS
+
+       .section        ".toc","aw"
+
+       .globl hcall_tracepoint_refcount
+hcall_tracepoint_refcount:
+       .llong  0
+
+       .section        ".text"
+
 /*
  * precall must preserve all registers.  use unused STK_PARM()
- * areas to save snapshots and opcode.
+ * areas to save snapshots and opcode. We branch around this
+ * in early init (eg when populating the MMU hashtable) by using an
+ * unconditional cpu feature.
  */
-#define HCALL_INST_PRECALL                                     \
-       std     r3,STK_PARM(r3)(r1);    /* save opcode */       \
-       mftb    r0;                     /* get timebase and */  \
-       std     r0,STK_PARM(r5)(r1);    /* save for later */    \
+#define HCALL_INST_PRECALL(FIRST_REG)                          \
 BEGIN_FTR_SECTION;                                             \
-       mfspr   r0,SPRN_PURR;           /* get PURR and */      \
-       std     r0,STK_PARM(r6)(r1);    /* save for later */    \
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);
-       
+       b       1f;                                             \
+END_FTR_SECTION(0, 1);                                         \
+       ld      r12,hcall_tracepoint_refcount@toc(r2);          \
+       cmpdi   r12,0;                                          \
+       beq+    1f;                                             \
+       mflr    r0;                                             \
+       std     r3,STK_PARM(r3)(r1);                            \
+       std     r4,STK_PARM(r4)(r1);                            \
+       std     r5,STK_PARM(r5)(r1);                            \
+       std     r6,STK_PARM(r6)(r1);                            \
+       std     r7,STK_PARM(r7)(r1);                            \
+       std     r8,STK_PARM(r8)(r1);                            \
+       std     r9,STK_PARM(r9)(r1);                            \
+       std     r10,STK_PARM(r10)(r1);                          \
+       std     r0,16(r1);                                      \
+       addi    r4,r1,STK_PARM(FIRST_REG);                      \
+       stdu    r1,-STACK_FRAME_OVERHEAD(r1);                   \
+       bl      .__trace_hcall_entry;                           \
+       addi    r1,r1,STACK_FRAME_OVERHEAD;                     \
+       ld      r0,16(r1);                                      \
+       ld      r3,STK_PARM(r3)(r1);                            \
+       ld      r4,STK_PARM(r4)(r1);                            \
+       ld      r5,STK_PARM(r5)(r1);                            \
+       ld      r6,STK_PARM(r6)(r1);                            \
+       ld      r7,STK_PARM(r7)(r1);                            \
+       ld      r8,STK_PARM(r8)(r1);                            \
+       ld      r9,STK_PARM(r9)(r1);                            \
+       ld      r10,STK_PARM(r10)(r1);                          \
+       mtlr    r0;                                             \
+1:
+
 /*
  * postcall is performed immediately before function return which
  * allows liberal use of volatile registers.  We branch around this
  * in early init (eg when populating the MMU hashtable) by using an
  * unconditional cpu feature.
  */
-#define HCALL_INST_POSTCALL                                    \
+#define __HCALL_INST_POSTCALL                                  \
 BEGIN_FTR_SECTION;                                             \
        b       1f;                                             \
 END_FTR_SECTION(0, 1);                                         \
-       ld      r4,STK_PARM(r3)(r1);    /* validate opcode */   \
-       cmpldi  cr7,r4,MAX_HCALL_OPCODE;                        \
-       bgt-    cr7,1f;                                         \
-                                                               \
-       /* get time and PURR snapshots after hcall */           \
-       mftb    r7;                     /* timebase after */    \
-BEGIN_FTR_SECTION;                                             \
-       mfspr   r8,SPRN_PURR;           /* PURR after */        \
-       ld      r6,STK_PARM(r6)(r1);    /* PURR before */       \
-       subf    r6,r6,r8;               /* delta */             \
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);                           \
-       ld      r5,STK_PARM(r5)(r1);    /* timebase before */   \
-       subf    r5,r5,r7;               /* time delta */        \
-                                                               \
-       /* calculate address of stat structure r4 = opcode */   \
-       srdi    r4,r4,2;                /* index into array */  \
-       mulli   r4,r4,HCALL_STAT_SIZE;                          \
-       LOAD_REG_ADDR(r7, per_cpu__hcall_stats);                \
-       add     r4,r4,r7;                                       \
-       ld      r7,PACA_DATA_OFFSET(r13); /* per cpu offset */  \
-       add     r4,r4,r7;                                       \
-                                                               \
-       /* update stats */                                      \
-       ld      r7,HCALL_STAT_CALLS(r4); /* count */            \
-       addi    r7,r7,1;                                        \
-       std     r7,HCALL_STAT_CALLS(r4);                        \
-       ld      r7,HCALL_STAT_TB(r4);   /* timebase */          \
-       add     r7,r7,r5;                                       \
-       std     r7,HCALL_STAT_TB(r4);                           \
-BEGIN_FTR_SECTION;                                             \
-       ld      r7,HCALL_STAT_PURR(r4); /* PURR */              \
-       add     r7,r7,r6;                                       \
-       std     r7,HCALL_STAT_PURR(r4);                         \
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);                           \
+       ld      r12,hcall_tracepoint_refcount@toc(r2);          \
+       cmpdi   r12,0;                                          \
+       beq+    1f;                                             \
+       mflr    r0;                                             \
+       ld      r6,STK_PARM(r3)(r1);                            \
+       std     r3,STK_PARM(r3)(r1);                            \
+       mr      r4,r3;                                          \
+       mr      r3,r6;                                          \
+       std     r0,16(r1);                                      \
+       stdu    r1,-STACK_FRAME_OVERHEAD(r1);                   \
+       bl      .__trace_hcall_exit;                            \
+       addi    r1,r1,STACK_FRAME_OVERHEAD;                     \
+       ld      r0,16(r1);                                      \
+       ld      r3,STK_PARM(r3)(r1);                            \
+       mtlr    r0;                                             \
 1:
+
+#define HCALL_INST_POSTCALL_NORETS                             \
+       li      r5,0;                                           \
+       __HCALL_INST_POSTCALL
+
+#define HCALL_INST_POSTCALL(BUFREG)                            \
+       mr      r5,BUFREG;                                      \
+       __HCALL_INST_POSTCALL
+
 #else
-#define HCALL_INST_PRECALL
-#define HCALL_INST_POSTCALL
+#define HCALL_INST_PRECALL(FIRST_ARG)
+#define HCALL_INST_POSTCALL_NORETS
+#define HCALL_INST_POSTCALL(BUFREG)
 #endif
 
        .text
@@ -86,11 +112,11 @@ _GLOBAL(plpar_hcall_norets)
        mfcr    r0
        stw     r0,8(r1)
 
-       HCALL_INST_PRECALL
+       HCALL_INST_PRECALL(r4)
 
        HVSC                            /* invoke the hypervisor */
 
-       HCALL_INST_POSTCALL
+       HCALL_INST_POSTCALL_NORETS
 
        lwz     r0,8(r1)
        mtcrf   0xff,r0
@@ -102,7 +128,7 @@ _GLOBAL(plpar_hcall)
        mfcr    r0
        stw     r0,8(r1)
 
-       HCALL_INST_PRECALL
+       HCALL_INST_PRECALL(r5)
 
        std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
 
@@ -121,7 +147,7 @@ _GLOBAL(plpar_hcall)
        std     r6, 16(r12)
        std     r7, 24(r12)
 
-       HCALL_INST_POSTCALL
+       HCALL_INST_POSTCALL(r12)
 
        lwz     r0,8(r1)
        mtcrf   0xff,r0
@@ -168,7 +194,7 @@ _GLOBAL(plpar_hcall9)
        mfcr    r0
        stw     r0,8(r1)
 
-       HCALL_INST_PRECALL
+       HCALL_INST_PRECALL(r5)
 
        std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
 
@@ -196,7 +222,7 @@ _GLOBAL(plpar_hcall9)
        std     r11,56(r12)
        std     r0, 64(r12)
 
-       HCALL_INST_POSTCALL
+       HCALL_INST_POSTCALL(r12)
 
        lwz     r0,8(r1)
        mtcrf   0xff,r0
index 3631a4f277eb2ab8aaf02676d7124d6a2a430e44..2f58c71b72593911694cc0cd6a33588a0259ef20 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/hvcall.h>
 #include <asm/firmware.h>
 #include <asm/cputable.h>
+#include <asm/trace.h>
 
 DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
 
@@ -100,6 +101,35 @@ static const struct file_operations hcall_inst_seq_fops = {
 #define        HCALL_ROOT_DIR          "hcall_inst"
 #define CPU_NAME_BUF_SIZE      32
 
+
+static void probe_hcall_entry(unsigned long opcode, unsigned long *args)
+{
+       struct hcall_stats *h;
+
+       if (opcode > MAX_HCALL_OPCODE)
+               return;
+
+       h = &get_cpu_var(hcall_stats)[opcode / 4];
+       h->tb_start = mftb();
+       h->purr_start = mfspr(SPRN_PURR);
+}
+
+static void probe_hcall_exit(unsigned long opcode, unsigned long retval,
+                            unsigned long *retbuf)
+{
+       struct hcall_stats *h;
+
+       if (opcode > MAX_HCALL_OPCODE)
+               return;
+
+       h = &__get_cpu_var(hcall_stats)[opcode / 4];
+       h->num_calls++;
+       h->tb_total = mftb() - h->tb_start;
+       h->purr_total = mfspr(SPRN_PURR) - h->purr_start;
+
+       put_cpu_var(hcall_stats);
+}
+
 static int __init hcall_inst_init(void)
 {
        struct dentry *hcall_root;
@@ -110,6 +140,14 @@ static int __init hcall_inst_init(void)
        if (!firmware_has_feature(FW_FEATURE_LPAR))
                return 0;
 
+       if (register_trace_hcall_entry(probe_hcall_entry))
+               return -EINVAL;
+
+       if (register_trace_hcall_exit(probe_hcall_exit)) {
+               unregister_trace_hcall_entry(probe_hcall_entry);
+               return -EINVAL;
+       }
+
        hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL);
        if (!hcall_root)
                return -ENOMEM;
index 903eb9eec687471628c37e30026ae6aad5c35548..0707653612bacbc4d7dcf71f45bb4436275a7202 100644 (file)
@@ -39,6 +39,7 @@
 #include <asm/cputable.h>
 #include <asm/udbg.h>
 #include <asm/smp.h>
+#include <asm/trace.h>
 
 #include "plpar_wrappers.h"
 #include "pseries.h"
@@ -661,3 +662,35 @@ void arch_free_page(struct page *page, int order)
 EXPORT_SYMBOL(arch_free_page);
 
 #endif
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * We optimise our hcall path by placing hcall_tracepoint_refcount
+ * directly in the TOC so we can check if the hcall tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long hcall_tracepoint_refcount;
+
+void hcall_tracepoint_regfunc(void)
+{
+       hcall_tracepoint_refcount++;
+}
+
+void hcall_tracepoint_unregfunc(void)
+{
+       hcall_tracepoint_refcount--;
+}
+
+void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
+{
+       trace_hcall_entry(opcode, args);
+}
+
+void __trace_hcall_exit(long opcode, unsigned long retval,
+                       unsigned long *retbuf)
+{
+       trace_hcall_exit(opcode, retval, retbuf);
+}
+#endif
index 72ace9515a07a44525778899e1ea04b32b3accbc..178084b4377ccbebb2fb089ea1c17742f9addbfe 100644 (file)
@@ -49,6 +49,7 @@ config X86
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_BZIP2
        select HAVE_KERNEL_LZMA
+       select HAVE_HW_BREAKPOINT
        select HAVE_ARCH_KMEMCHECK
 
 config OUTPUT_FORMAT
index d105f29bb6bb7c9b75fe3369d66f68f2f3ada5ba..7d0b681a132bb0b241717695e33f61c33e21cb46 100644 (file)
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
 config HAVE_MMIOTRACE_SUPPORT
        def_bool y
 
+config X86_DECODER_SELFTEST
+     bool "x86 instruction decoder selftest"
+     depends on DEBUG_KERNEL
+       ---help---
+        Perform x86 instruction decoder selftests at build time.
+        This option is useful for checking the sanity of x86 instruction
+        decoder code.
+        If unsure, say "N".
+
 #
 # IO delay types:
 #
index d2d24c9ee64d926de80a249330e938095623ca85..78b32be55e9e7a82dafcdf96a5b59be4a26decd8 100644 (file)
@@ -155,6 +155,9 @@ all: bzImage
 KBUILD_IMAGE := $(boot)/bzImage
 
 bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+       $(Q)$(MAKE) $(build)=arch/x86/tools posttest
+endif
        $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
        $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
        $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
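
With CONFIG_X86_DECODER_SELFTEST=y, the hunk above makes the bzImage target
invoke the posttest rule in arch/x86/tools before the boot image is built
(the selftest feeds the objdump disassembly of vmlinux, via distill.awk, into
test_get_len to cross-check the new instruction decoder). A rough sketch,
assuming a normal kbuild tree:

  # select "x86 instruction decoder selftest" under Kernel hacking, then:
  make bzImage    # kbuild descends into arch/x86/tools and runs posttest first
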
index 4a8e80cdcfa57a7faff08a2042a6b6fb64f5ae66..9f828f87ca35f418d24d4b7674477f643eea773d 100644 (file)
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
 header-y += processor-flags.h
+header-y += hw_breakpoint.h
 
 unifdef-y += e820.h
 unifdef-y += ist.h
index bb70e397aa84c0e7cfa452c1efa9298b8a58c985..7a15588e45d47265391ec508c787d98b374aaeb9 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <linux/user.h>
 #include <linux/elfcore.h>
+#include <asm/debugreg.h>
 
 /*
  * fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
                        >> PAGE_SHIFT;
        dump->u_dsize -= dump->u_tsize;
        dump->u_ssize = 0;
-       dump->u_debugreg[0] = current->thread.debugreg0;
-       dump->u_debugreg[1] = current->thread.debugreg1;
-       dump->u_debugreg[2] = current->thread.debugreg2;
-       dump->u_debugreg[3] = current->thread.debugreg3;
-       dump->u_debugreg[4] = 0;
-       dump->u_debugreg[5] = 0;
-       dump->u_debugreg[6] = current->thread.debugreg6;
-       dump->u_debugreg[7] = current->thread.debugreg7;
+       aout_dump_debugregs(dump);
 
        if (dump->start_stack < TASK_SIZE)
                dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
index 3ea6f37be9e2d29a69f6982bb3ddcc80554f1652..8240f76b531e0959be5a4fa823b1820d5d5952b5 100644 (file)
@@ -18,6 +18,7 @@
 #define DR_TRAP1       (0x2)           /* db1 */
 #define DR_TRAP2       (0x4)           /* db2 */
 #define DR_TRAP3       (0x8)           /* db3 */
+#define DR_TRAP_BITS   (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
 
 #define DR_STEP                (0x4000)        /* single-step */
 #define DR_SWITCH      (0x8000)        /* task switch */
@@ -49,6 +50,8 @@
 
 #define DR_LOCAL_ENABLE_SHIFT 0    /* Extra shift to the local enable bit */
 #define DR_GLOBAL_ENABLE_SHIFT 1   /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1)      /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2)     /* Global enable for reg 0 */
 #define DR_ENABLE_SIZE 2           /* 2 enable bits per register */
 
 #define DR_LOCAL_ENABLE_MASK (0x55)  /* Set  local bits for all 4 regs */
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+DECLARE_PER_CPU(unsigned long, cpu_dr7);
+
+static inline void hw_breakpoint_disable(void)
+{
+       /* Zero the control register for HW Breakpoint */
+       set_debugreg(0UL, 7);
+
+       /* Zero-out the individual HW breakpoint address registers */
+       set_debugreg(0UL, 0);
+       set_debugreg(0UL, 1);
+       set_debugreg(0UL, 2);
+       set_debugreg(0UL, 3);
+}
+
+static inline int hw_breakpoint_active(void)
+{
+       return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
+}
+
+extern void aout_dump_debugregs(struct user *dump);
+
+extern void hw_breakpoint_restore(void);
+
+#endif /* __KERNEL__ */
+
 #endif /* _ASM_X86_DEBUGREG_H */
index 82e3e8f010439cde125a3a75c4f0d65a7068b573..108eb6fd1ae7f6da8c94ca72197fdf7f4e3ec74d 100644 (file)
@@ -20,11 +20,11 @@ typedef struct {
        unsigned int irq_call_count;
        unsigned int irq_tlb_count;
 #endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
        unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
        unsigned int irq_threshold_count;
-# endif
 #endif
 } ____cacheline_aligned irq_cpustat_t;
 
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644 (file)
index 0000000..0675a7c
--- /dev/null
@@ -0,0 +1,73 @@
+#ifndef        _I386_HW_BREAKPOINT_H
+#define        _I386_HW_BREAKPOINT_H
+
+#ifdef __KERNEL__
+#define        __ARCH_HW_BREAKPOINT_H
+
+/*
+ * The name should probably be something dealt in
+ * a higher level. While dealing with the user
+ * (display/resolving)
+ */
+struct arch_hw_breakpoint {
+       char            *name; /* Contains name of the symbol to set bkpt */
+       unsigned long   address;
+       u8              len;
+       u8              type;
+};
+
+#include <linux/kdebug.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
+
+/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_1           0x40
+#define X86_BREAKPOINT_LEN_2           0x44
+#define X86_BREAKPOINT_LEN_4           0x4c
+#define X86_BREAKPOINT_LEN_EXECUTE     0x40
+
+#ifdef CONFIG_X86_64
+#define X86_BREAKPOINT_LEN_8           0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define X86_BREAKPOINT_EXECUTE 0x80
+/* trigger on memory write */
+#define X86_BREAKPOINT_WRITE   0x81
+/* trigger on memory read or write */
+#define X86_BREAKPOINT_RW      0x83
+
+/* Total number of available HW breakpoint registers */
+#define HBP_NUM 4
+
+struct perf_event;
+struct pmu;
+
+extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
+                                        struct task_struct *tsk);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+                                          unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+                                 int *gen_len, int *gen_type);
+
+extern struct pmu perf_ops_bp;
+
+#endif /* __KERNEL__ */
+#endif /* _I386_HW_BREAKPOINT_H */
+
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644 (file)
index 0000000..205b063
--- /dev/null
@@ -0,0 +1,220 @@
+#ifndef _ASM_X86_INAT_H
+#define _ASM_X86_INAT_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/inat_types.h>
+
+/*
+ * Internal bits. Don't use bitmasks directly, because these bits are
+ * unstable. You should use checking functions.
+ */
+
+#define INAT_OPCODE_TABLE_SIZE 256
+#define INAT_GROUP_TABLE_SIZE 8
+
+/* Legacy last prefixes */
+#define INAT_PFX_OPNDSZ        1       /* 0x66 */ /* LPFX1 */
+#define INAT_PFX_REPE  2       /* 0xF3 */ /* LPFX2 */
+#define INAT_PFX_REPNE 3       /* 0xF2 */ /* LPFX3 */
+/* Other Legacy prefixes */
+#define INAT_PFX_LOCK  4       /* 0xF0 */
+#define INAT_PFX_CS    5       /* 0x2E */
+#define INAT_PFX_DS    6       /* 0x3E */
+#define INAT_PFX_ES    7       /* 0x26 */
+#define INAT_PFX_FS    8       /* 0x64 */
+#define INAT_PFX_GS    9       /* 0x65 */
+#define INAT_PFX_SS    10      /* 0x36 */
+#define INAT_PFX_ADDRSZ        11      /* 0x67 */
+/* x86-64 REX prefix */
+#define INAT_PFX_REX   12      /* 0x4X */
+/* AVX VEX prefixes */
+#define INAT_PFX_VEX2  13      /* 2-bytes VEX prefix */
+#define INAT_PFX_VEX3  14      /* 3-bytes VEX prefix */
+
+#define INAT_LSTPFX_MAX        3
+#define INAT_LGCPFX_MAX        11
+
+/* Immediate size */
+#define INAT_IMM_BYTE          1
+#define INAT_IMM_WORD          2
+#define INAT_IMM_DWORD         3
+#define INAT_IMM_QWORD         4
+#define INAT_IMM_PTR           5
+#define INAT_IMM_VWORD32       6
+#define INAT_IMM_VWORD         7
+
+/* Legacy prefix */
+#define INAT_PFX_OFFS  0
+#define INAT_PFX_BITS  4
+#define INAT_PFX_MAX    ((1 << INAT_PFX_BITS) - 1)
+#define INAT_PFX_MASK  (INAT_PFX_MAX << INAT_PFX_OFFS)
+/* Escape opcodes */
+#define INAT_ESC_OFFS  (INAT_PFX_OFFS + INAT_PFX_BITS)
+#define INAT_ESC_BITS  2
+#define INAT_ESC_MAX   ((1 << INAT_ESC_BITS) - 1)
+#define INAT_ESC_MASK  (INAT_ESC_MAX << INAT_ESC_OFFS)
+/* Group opcodes (1-16) */
+#define INAT_GRP_OFFS  (INAT_ESC_OFFS + INAT_ESC_BITS)
+#define INAT_GRP_BITS  5
+#define INAT_GRP_MAX   ((1 << INAT_GRP_BITS) - 1)
+#define INAT_GRP_MASK  (INAT_GRP_MAX << INAT_GRP_OFFS)
+/* Immediates */
+#define INAT_IMM_OFFS  (INAT_GRP_OFFS + INAT_GRP_BITS)
+#define INAT_IMM_BITS  3
+#define INAT_IMM_MASK  (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
+/* Flags */
+#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
+#define INAT_MODRM     (1 << (INAT_FLAG_OFFS))
+#define INAT_FORCE64   (1 << (INAT_FLAG_OFFS + 1))
+#define INAT_SCNDIMM   (1 << (INAT_FLAG_OFFS + 2))
+#define INAT_MOFFSET   (1 << (INAT_FLAG_OFFS + 3))
+#define INAT_VARIANT   (1 << (INAT_FLAG_OFFS + 4))
+#define INAT_VEXOK     (1 << (INAT_FLAG_OFFS + 5))
+#define INAT_VEXONLY   (1 << (INAT_FLAG_OFFS + 6))
+/* Attribute making macros for attribute tables */
+#define INAT_MAKE_PREFIX(pfx)  (pfx << INAT_PFX_OFFS)
+#define INAT_MAKE_ESCAPE(esc)  (esc << INAT_ESC_OFFS)
+#define INAT_MAKE_GROUP(grp)   ((grp << INAT_GRP_OFFS) | INAT_MODRM)
+#define INAT_MAKE_IMM(imm)     (imm << INAT_IMM_OFFS)
+
+/* Attribute search APIs */
+extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
+                                            insn_byte_t last_pfx,
+                                            insn_attr_t esc_attr);
+extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
+                                           insn_byte_t last_pfx,
+                                           insn_attr_t esc_attr);
+extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
+                                         insn_byte_t vex_m,
+                                         insn_byte_t vex_pp);
+
+/* Attribute checking functions */
+static inline int inat_is_legacy_prefix(insn_attr_t attr)
+{
+       attr &= INAT_PFX_MASK;
+       return attr && attr <= INAT_LGCPFX_MAX;
+}
+
+static inline int inat_is_address_size_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
+}
+
+static inline int inat_is_operand_size_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
+}
+
+static inline int inat_is_rex_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
+}
+
+static inline int inat_last_prefix_id(insn_attr_t attr)
+{
+       if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
+               return 0;
+       else
+               return attr & INAT_PFX_MASK;
+}
+
+static inline int inat_is_vex_prefix(insn_attr_t attr)
+{
+       attr &= INAT_PFX_MASK;
+       return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_vex3_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_escape(insn_attr_t attr)
+{
+       return attr & INAT_ESC_MASK;
+}
+
+static inline int inat_escape_id(insn_attr_t attr)
+{
+       return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
+}
+
+static inline int inat_is_group(insn_attr_t attr)
+{
+       return attr & INAT_GRP_MASK;
+}
+
+static inline int inat_group_id(insn_attr_t attr)
+{
+       return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
+}
+
+static inline int inat_group_common_attribute(insn_attr_t attr)
+{
+       return attr & ~INAT_GRP_MASK;
+}
+
+static inline int inat_has_immediate(insn_attr_t attr)
+{
+       return attr & INAT_IMM_MASK;
+}
+
+static inline int inat_immediate_size(insn_attr_t attr)
+{
+       return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
+}
+
+static inline int inat_has_modrm(insn_attr_t attr)
+{
+       return attr & INAT_MODRM;
+}
+
+static inline int inat_is_force64(insn_attr_t attr)
+{
+       return attr & INAT_FORCE64;
+}
+
+static inline int inat_has_second_immediate(insn_attr_t attr)
+{
+       return attr & INAT_SCNDIMM;
+}
+
+static inline int inat_has_moffset(insn_attr_t attr)
+{
+       return attr & INAT_MOFFSET;
+}
+
+static inline int inat_has_variant(insn_attr_t attr)
+{
+       return attr & INAT_VARIANT;
+}
+
+static inline int inat_accept_vex(insn_attr_t attr)
+{
+       return attr & INAT_VEXOK;
+}
+
+static inline int inat_must_vex(insn_attr_t attr)
+{
+       return attr & INAT_VEXONLY;
+}
+#endif
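For context, a minimal sketch of how the attribute-lookup API above is intended to be used. The concrete attribute values come from generated opcode tables that are not part of this header, so the classifications in the comments are assumptions; the helper itself is invented for illustration:

#include <linux/kernel.h>
#include <asm/inat.h>

/* Illustrative only: classify a single opcode byte by its attributes. */
static void example_classify_byte(insn_byte_t b)
{
	insn_attr_t attr = inat_get_opcode_attribute(b);

	if (inat_is_legacy_prefix(attr))	/* e.g. 0x66, 0xf2, 0xf3 */
		pr_info("%#x: legacy prefix\n", b);
	else if (inat_is_escape(attr))		/* e.g. the 0x0f escape byte */
		pr_info("%#x: escape to table %d\n", b, inat_escape_id(attr));
	else if (inat_has_modrm(attr))
		pr_info("%#x: opcode followed by a ModRM byte\n", b);
}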
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644 (file)
index 0000000..cb3c20c
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_INAT_TYPES_H
+#define _ASM_X86_INAT_TYPES_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/* Instruction attributes */
+typedef unsigned int insn_attr_t;
+typedef unsigned char insn_byte_t;
+typedef signed int insn_value_t;
+
+#endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644 (file)
index 0000000..96c2e0a
--- /dev/null
@@ -0,0 +1,184 @@
+#ifndef _ASM_X86_INSN_H
+#define _ASM_X86_INSN_H
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/* insn_attr_t is defined in inat.h */
+#include <asm/inat.h>
+
+struct insn_field {
+       union {
+               insn_value_t value;
+               insn_byte_t bytes[4];
+       };
+       /* !0 if we've run insn_get_xxx() for this field */
+       unsigned char got;
+       unsigned char nbytes;
+};
+
+struct insn {
+       struct insn_field prefixes;     /*
+                                        * Prefixes
+                                        * prefixes.bytes[3]: last prefix
+                                        */
+       struct insn_field rex_prefix;   /* REX prefix */
+       struct insn_field vex_prefix;   /* VEX prefix */
+       struct insn_field opcode;       /*
+                                        * opcode.bytes[0]: opcode1
+                                        * opcode.bytes[1]: opcode2
+                                        * opcode.bytes[2]: opcode3
+                                        */
+       struct insn_field modrm;
+       struct insn_field sib;
+       struct insn_field displacement;
+       union {
+               struct insn_field immediate;
+               struct insn_field moffset1;     /* for 64bit MOV */
+               struct insn_field immediate1;   /* for 64bit imm or off16/32 */
+       };
+       union {
+               struct insn_field moffset2;     /* for 64bit MOV */
+               struct insn_field immediate2;   /* for 64bit imm or seg16 */
+       };
+
+       insn_attr_t attr;
+       unsigned char opnd_bytes;
+       unsigned char addr_bytes;
+       unsigned char length;
+       unsigned char x86_64;
+
+       const insn_byte_t *kaddr;       /* kernel address of insn to analyze */
+       const insn_byte_t *next_byte;
+};
+
+#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
+#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
+#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
+
+#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
+#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
+#define X86_SIB_BASE(sib) ((sib) & 0x07)
+
+#define X86_REX_W(rex) ((rex) & 8)
+#define X86_REX_R(rex) ((rex) & 4)
+#define X86_REX_X(rex) ((rex) & 2)
+#define X86_REX_B(rex) ((rex) & 1)
+
+/* VEX bit flags  */
+#define X86_VEX_W(vex) ((vex) & 0x80)  /* VEX3 Byte2 */
+#define X86_VEX_R(vex) ((vex) & 0x80)  /* VEX2/3 Byte1 */
+#define X86_VEX_X(vex) ((vex) & 0x40)  /* VEX3 Byte1 */
+#define X86_VEX_B(vex) ((vex) & 0x20)  /* VEX3 Byte1 */
+#define X86_VEX_L(vex) ((vex) & 0x04)  /* VEX3 Byte2, VEX2 Byte1 */
+/* VEX bit fields */
+#define X86_VEX3_M(vex)        ((vex) & 0x1f)          /* VEX3 Byte1 */
+#define X86_VEX2_M     1                       /* VEX2.M always 1 */
+#define X86_VEX_V(vex) (((vex) & 0x78) >> 3)   /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_P(vex) ((vex) & 0x03)          /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_M_MAX  0x1f                    /* VEX3.M Maximum value */
+
+/* The last prefix is needed for two-byte and three-byte opcodes */
+static inline insn_byte_t insn_last_prefix(struct insn *insn)
+{
+       return insn->prefixes.bytes[3];
+}
+
+extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
+extern void insn_get_prefixes(struct insn *insn);
+extern void insn_get_opcode(struct insn *insn);
+extern void insn_get_modrm(struct insn *insn);
+extern void insn_get_sib(struct insn *insn);
+extern void insn_get_displacement(struct insn *insn);
+extern void insn_get_immediate(struct insn *insn);
+extern void insn_get_length(struct insn *insn);
+
+/* Attribute will be determined after getting ModRM (for opcode groups) */
+static inline void insn_get_attribute(struct insn *insn)
+{
+       insn_get_modrm(insn);
+}
+
+/* Instruction uses RIP-relative addressing */
+extern int insn_rip_relative(struct insn *insn);
+
+/* Init insn for kernel text */
+static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
+{
+#ifdef CONFIG_X86_64
+       insn_init(insn, kaddr, 1);
+#else /* CONFIG_X86_32 */
+       insn_init(insn, kaddr, 0);
+#endif
+}
+
+static inline int insn_is_avx(struct insn *insn)
+{
+       if (!insn->prefixes.got)
+               insn_get_prefixes(insn);
+       return (insn->vex_prefix.value != 0);
+}
+
+static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
+{
+       if (insn->vex_prefix.nbytes == 2)       /* 2 bytes VEX */
+               return X86_VEX2_M;
+       else
+               return X86_VEX3_M(insn->vex_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
+{
+       if (insn->vex_prefix.nbytes == 2)       /* 2 bytes VEX */
+               return X86_VEX_P(insn->vex_prefix.bytes[1]);
+       else
+               return X86_VEX_P(insn->vex_prefix.bytes[2]);
+}
+
+/* Offset of each field from kaddr */
+static inline int insn_offset_rex_prefix(struct insn *insn)
+{
+       return insn->prefixes.nbytes;
+}
+static inline int insn_offset_vex_prefix(struct insn *insn)
+{
+       return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
+}
+static inline int insn_offset_opcode(struct insn *insn)
+{
+       return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
+}
+static inline int insn_offset_modrm(struct insn *insn)
+{
+       return insn_offset_opcode(insn) + insn->opcode.nbytes;
+}
+static inline int insn_offset_sib(struct insn *insn)
+{
+       return insn_offset_modrm(insn) + insn->modrm.nbytes;
+}
+static inline int insn_offset_displacement(struct insn *insn)
+{
+       return insn_offset_sib(insn) + insn->sib.nbytes;
+}
+static inline int insn_offset_immediate(struct insn *insn)
+{
+       return insn_offset_displacement(insn) + insn->displacement.nbytes;
+}
+
+#endif /* _ASM_X86_INSN_H */
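A rough usage sketch for the decoder interface above. The decoding routines themselves are added elsewhere in this series; the function below and the way its result is used are assumptions made purely for illustration:

#include <linux/kernel.h>
#include <asm/insn.h>

/* Illustrative only: return the length of the instruction at 'kaddr'. */
static int example_insn_length(const void *kaddr)
{
	struct insn insn;

	kernel_insn_init(&insn, kaddr);	/* picks 32/64-bit mode for kernel text */
	insn_get_length(&insn);		/* decodes every field up to the length */

	if (insn_rip_relative(&insn))
		pr_debug("ModRM %#x uses RIP-relative addressing\n",
			 insn.modrm.value);

	return insn.length;
}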
index f1363b72364f3e2a53609e77e52379f7f3998b3a..858baa061cfce365a51e48bf162f92cb00a8e8ba 100644 (file)
@@ -108,6 +108,8 @@ struct mce_log {
 #define K8_MCE_THRESHOLD_BANK_5    (MCE_THRESHOLD_BASE + 5 * 9)
 #define K8_MCE_THRESHOLD_DRAM_ECC  (MCE_THRESHOLD_BANK_4 + 0)
 
+extern struct atomic_notifier_head x86_mce_decoder_chain;
+
 #ifdef __KERNEL__
 
 #include <linux/percpu.h>
@@ -118,9 +120,11 @@ extern int mce_disabled;
 extern int mce_p5_enabled;
 
 #ifdef CONFIG_X86_MCE
-void mcheck_init(struct cpuinfo_x86 *c);
+int mcheck_init(void);
+void mcheck_cpu_init(struct cpuinfo_x86 *c);
 #else
-static inline void mcheck_init(struct cpuinfo_x86 *c) {}
+static inline int mcheck_init(void) { return 0; }
+static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
 #endif
 
 #ifdef CONFIG_X86_ANCIENT_MCE
@@ -214,5 +218,11 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
 
 void mce_log_therm_throt_event(__u64 status);
 
+#ifdef CONFIG_X86_THERMAL_VECTOR
+extern void mcheck_intel_therm_init(void);
+#else
+static inline void mcheck_intel_therm_init(void) { }
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
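The new x86_mce_decoder_chain declared above replaces the old single decode callback with an atomic notifier chain (see the mce.c hunk further down, which registers a default decoder at priority -1). A hedged sketch of how an EDAC-style decoder might hook into it; the names and the decode logic are invented for illustration:

#include <linux/notifier.h>
#include <linux/kernel.h>
#include <asm/mce.h>

/* Hypothetical decoder: the chain passes the struct mce via 'data'. */
static int example_decode_mce(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = data;

	pr_emerg("MCE: bank %d status %#llx\n", m->bank, m->status);

	return NOTIFY_STOP;	/* stop the chain so the default hint is not printed */
}

static struct notifier_block example_mce_dec_nb = {
	.notifier_call	= example_decode_mce,
};

/* Registration, e.g. from a module init function:
 *	atomic_notifier_chain_register(&x86_mce_decoder_chain, &example_mce_dec_nb);
 */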
index ad7ce3fd5065a8354035233de59fd1ba21b8d0e8..8d9f8548a870498e94e3de1ced5811692238962c 100644 (file)
  */
 #define ARCH_PERFMON_EVENT_MASK                                    0xffff
 
+/*
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
+ */
+#define ARCH_PERFMON_EVENT_FILTER_MASK                 0xff840000
+
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL                0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK                (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX                 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX                         0
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
                (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
 
index c9786480f0fe4d074e9575557316d7f4b358be4b..6f8ec1c37e0a8c999f9344082465fd2c9b4570f1 100644 (file)
@@ -30,6 +30,7 @@ struct mm_struct;
 #include <linux/math64.h>
 #include <linux/init.h>
 
+#define HBP_NUM 4
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter").
@@ -422,6 +423,8 @@ extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
 
+struct perf_event;
+
 struct thread_struct {
        /* Cached TLS descriptors: */
        struct desc_struct      tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -443,13 +446,10 @@ struct thread_struct {
        unsigned long           fs;
 #endif
        unsigned long           gs;
-       /* Hardware debugging registers: */
-       unsigned long           debugreg0;
-       unsigned long           debugreg1;
-       unsigned long           debugreg2;
-       unsigned long           debugreg3;
-       unsigned long           debugreg6;
-       unsigned long           debugreg7;
+       /* Save middle states of ptrace breakpoints */
+       struct perf_event       *ptrace_bps[HBP_NUM];
+       /* Debug status used for traps, single steps, etc... */
+       unsigned long           debugreg6;
        /* Fault info: */
        unsigned long           cr2;
        unsigned long           trap_no;
index 0f0d908349aa3f375e87f68802cbcb2e7753f725..3d11fd0f44c5f4f86c060207417710db58758094 100644 (file)
@@ -7,6 +7,7 @@
 
 #ifdef __KERNEL__
 #include <asm/segment.h>
+#include <asm/page_types.h>
 #endif
 
 #ifndef __ASSEMBLY__
@@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
        return regs->sp;
 }
 
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs:      pt_regs from which the register value is retrieved.
+ * @offset:    offset of the register within struct pt_regs.
+ *
+ * regs_get_register() returns the value of a register. @offset is the
+ * offset of that register within the struct pt_regs pointed to by @regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+                                             unsigned int offset)
+{
+       if (unlikely(offset > MAX_REG_OFFSET))
+               return 0;
+       return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs:      pt_regs which contains kernel stack pointer.
+ * @addr:      address which is checked.
+ *
+ * regs_within_kernel_stack() checks whether @addr lies within the kernel
+ * stack page(s); it returns true if so and false otherwise.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+                                          unsigned long addr)
+{
+       return ((addr & ~(THREAD_SIZE - 1))  ==
+               (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs:      pt_regs which contains kernel stack pointer.
+ * @n:         stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns the @n-th entry of the kernel stack
+ * specified by @regs. If the @n-th entry lies outside the kernel stack,
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+                                                     unsigned int n)
+{
+       unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+       addr += n;
+       if (regs_within_kernel_stack(regs, (unsigned long)addr))
+               return *addr;
+       else
+               return 0;
+}
+
+/* Get Nth argument at function call */
+extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
+                                          unsigned int n);
+
 /*
  * These are defined as per linux/ptrace.h, which see.
  */
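The register and stack accessors added above are consumed by the new kprobe-based event tracer elsewhere in this series. A minimal, hypothetical kprobe pre-handler shows the intended calling pattern; the register name string and the stack index are assumptions for illustration:

#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/ptrace.h>

/* Illustrative pre-handler: dump one named register and one stack slot. */
static int example_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	int off = regs_query_register_offset("ax");	/* negative if unknown */

	if (off >= 0)
		pr_info("ax=%lx stack[0]=%lx\n",
			regs_get_register(regs, off),
			regs_get_kernel_stack_nth(regs, 0));

	return 0;
}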
index d8e5d0cdd678d3b4396c0e7f859b7c3f6ac0d212..4f2e66e29ecc5cc35e749f18e194b2ad11f398d2 100644 (file)
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64)  += sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)   += syscall_64.o vsyscall_64.o
 obj-y                  += bootflag.o e820.o
 obj-y                  += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y                  += alternative.o i8253.o pci-nommu.o
+obj-y                  += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y                  += tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)   += trampoline.o
index 68537e957a9b2cd621185e275103c201af9502e6..1d2cb383410ebef206fb48e2ccecbe8fd03f90ce 100644 (file)
@@ -5,6 +5,7 @@
 # Don't trace early stages of a secondary CPU boot
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 # Make sure load_percpu_segment has no stackprotector
index cc25c2b4a567c2ca3e020127cefe87b2778f02ee..9053be5d95cd4fb21f4aae94f2e7335af6a6f83a 100644 (file)
@@ -837,10 +837,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                        boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
        }
 
-#ifdef CONFIG_X86_MCE
        /* Init Machine Check Exception if available. */
-       mcheck_init(c);
-#endif
+       mcheck_cpu_init(c);
 
        select_idle_routine(c);
 
index 721a77ca811536eb2129e02449701948fe9aa9c3..0bcaa3875863aaefa98942e00d93ae5be0fecdb6 100644 (file)
@@ -46,6 +46,9 @@
 
 #include "mce-internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
 int mce_disabled __read_mostly;
 
 #define MISC_MCELOG_MINOR      227
@@ -85,18 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int                     cpu_missing;
 
-static void default_decode_mce(struct mce *m)
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
+
+static int default_decode_mce(struct notifier_block *nb, unsigned long val,
+                              void *data)
 {
        pr_emerg("No human readable MCE decoding support on this CPU type.\n");
        pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+
+       return NOTIFY_STOP;
 }
 
-/*
- * CPU/chipset specific EDAC code can register a callback here to print
- * MCE errors in a human-readable form:
- */
-void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
-EXPORT_SYMBOL(x86_mce_decode_callback);
+static struct notifier_block mce_dec_nb = {
+       .notifier_call = default_decode_mce,
+       .priority      = -1,
+};
 
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -141,6 +152,9 @@ void mce_log(struct mce *mce)
 {
        unsigned next, entry;
 
+       /* Emit the trace record: */
+       trace_mce_record(mce);
+
        mce->finished = 0;
        wmb();
        for (;;) {
@@ -204,9 +218,9 @@ static void print_mce(struct mce *m)
 
        /*
         * Print out human-readable details about the MCE error,
-        * (if the CPU has an implementation for that):
+        * (if the CPU has an implementation for that)
         */
-       x86_mce_decode_callback(m);
+       atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 }
 
 static void print_mce_head(void)
@@ -1122,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
 static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_timer(unsigned long data)
+static void mce_start_timer(unsigned long data)
 {
        struct timer_list *t = &per_cpu(mce_timer, data);
        int *n;
@@ -1187,7 +1201,7 @@ int mce_notify_irq(void)
 }
 EXPORT_SYMBOL_GPL(mce_notify_irq);
 
-static int mce_banks_init(void)
+static int __cpuinit __mcheck_cpu_mce_banks_init(void)
 {
        int i;
 
@@ -1206,7 +1220,7 @@ static int mce_banks_init(void)
 /*
  * Initialize Machine Checks for a CPU.
  */
-static int __cpuinit mce_cap_init(void)
+static int __cpuinit __mcheck_cpu_cap_init(void)
 {
        unsigned b;
        u64 cap;
@@ -1228,7 +1242,7 @@ static int __cpuinit mce_cap_init(void)
        WARN_ON(banks != 0 && b != banks);
        banks = b;
        if (!mce_banks) {
-               int err = mce_banks_init();
+               int err = __mcheck_cpu_mce_banks_init();
 
                if (err)
                        return err;
@@ -1244,7 +1258,7 @@ static int __cpuinit mce_cap_init(void)
        return 0;
 }
 
-static void mce_init(void)
+static void __mcheck_cpu_init_generic(void)
 {
        mce_banks_t all_banks;
        u64 cap;
@@ -1273,7 +1287,7 @@ static void mce_init(void)
 }
 
 /* Add per CPU specific workarounds here */
-static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
                pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1341,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
        return 0;
 }
 
-static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
+static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
 {
        if (c->x86 != 5)
                return;
@@ -1355,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
        }
 }
 
-static void mce_cpu_features(struct cpuinfo_x86 *c)
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 {
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
@@ -1369,7 +1383,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
        }
 }
 
-static void mce_init_timer(void)
+static void __mcheck_cpu_init_timer(void)
 {
        struct timer_list *t = &__get_cpu_var(mce_timer);
        int *n = &__get_cpu_var(mce_next_interval);
@@ -1380,7 +1394,7 @@ static void mce_init_timer(void)
        *n = check_interval * HZ;
        if (!*n)
                return;
-       setup_timer(t, mcheck_timer, smp_processor_id());
+       setup_timer(t, mce_start_timer, smp_processor_id());
        t->expires = round_jiffies(jiffies + *n);
        add_timer_on(t, smp_processor_id());
 }
@@ -1400,27 +1414,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off:
  */
-void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 {
        if (mce_disabled)
                return;
 
-       mce_ancient_init(c);
+       __mcheck_cpu_ancient_init(c);
 
        if (!mce_available(c))
                return;
 
-       if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
+       if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
                mce_disabled = 1;
                return;
        }
 
        machine_check_vector = do_machine_check;
 
-       mce_init();
-       mce_cpu_features(c);
-       mce_init_timer();
+       __mcheck_cpu_init_generic();
+       __mcheck_cpu_init_vendor(c);
+       __mcheck_cpu_init_timer();
        INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
+
 }
 
 /*
@@ -1640,6 +1655,15 @@ static int __init mcheck_enable(char *str)
 }
 __setup("mce", mcheck_enable);
 
+int __init mcheck_init(void)
+{
+       atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
+
+       mcheck_intel_therm_init();
+
+       return 0;
+}
+
 /*
  * Sysfs support
  */
@@ -1648,7 +1672,7 @@ __setup("mce", mcheck_enable);
  * Disable machine checks on suspend and shutdown. We can't really handle
  * them later.
  */
-static int mce_disable(void)
+static int mce_disable_error_reporting(void)
 {
        int i;
 
@@ -1663,12 +1687,12 @@ static int mce_disable(void)
 
 static int mce_suspend(struct sys_device *dev, pm_message_t state)
 {
-       return mce_disable();
+       return mce_disable_error_reporting();
 }
 
 static int mce_shutdown(struct sys_device *dev)
 {
-       return mce_disable();
+       return mce_disable_error_reporting();
 }
 
 /*
@@ -1678,8 +1702,8 @@ static int mce_shutdown(struct sys_device *dev)
  */
 static int mce_resume(struct sys_device *dev)
 {
-       mce_init();
-       mce_cpu_features(&current_cpu_data);
+       __mcheck_cpu_init_generic();
+       __mcheck_cpu_init_vendor(&current_cpu_data);
 
        return 0;
 }
@@ -1689,8 +1713,8 @@ static void mce_cpu_restart(void *data)
        del_timer_sync(&__get_cpu_var(mce_timer));
        if (!mce_available(&current_cpu_data))
                return;
-       mce_init();
-       mce_init_timer();
+       __mcheck_cpu_init_generic();
+       __mcheck_cpu_init_timer();
 }
 
 /* Reinit MCEs after user configuration changes */
@@ -1716,7 +1740,7 @@ static void mce_enable_ce(void *all)
        cmci_reenable();
        cmci_recheck();
        if (all)
-               mce_init_timer();
+               __mcheck_cpu_init_timer();
 }
 
 static struct sysdev_class mce_sysclass = {
@@ -1929,13 +1953,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void __cpuinit mce_disable_cpu(void *h)
 {
        unsigned long action = *(unsigned long *)h;
        int i;
 
        if (!mce_available(&current_cpu_data))
                return;
+
        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < banks; i++) {
@@ -1946,7 +1971,7 @@ static void mce_disable_cpu(void *h)
        }
 }
 
-static void mce_reenable_cpu(void *h)
+static void __cpuinit mce_reenable_cpu(void *h)
 {
        unsigned long action = *(unsigned long *)h;
        int i;
@@ -2025,7 +2050,7 @@ static __init void mce_init_banks(void)
        }
 }
 
-static __init int mce_init_device(void)
+static __init int mcheck_init_device(void)
 {
        int err;
        int i = 0;
@@ -2053,7 +2078,7 @@ static __init int mce_init_device(void)
        return err;
 }
 
-device_initcall(mce_init_device);
+device_initcall(mcheck_init_device);
 
 /*
  * Old style boot options parsing. Only for compatibility.
@@ -2101,7 +2126,7 @@ static int fake_panic_set(void *data, u64 val)
 DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");
 
-static int __init mce_debugfs_init(void)
+static int __init mcheck_debugfs_init(void)
 {
        struct dentry *dmce, *ffake_panic;
 
@@ -2115,5 +2140,5 @@ static int __init mce_debugfs_init(void)
 
        return 0;
 }
-late_initcall(mce_debugfs_init);
+late_initcall(mcheck_debugfs_init);
 #endif
index b3a1dba75330a4891b7d49739bd8507076a642da..4fef985fc221622623473c25e9abadda053095c9 100644 (file)
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
 
 static atomic_t therm_throt_en = ATOMIC_INIT(0);
 
+static u32 lvtthmr_init __read_mostly;
+
 #ifdef CONFIG_SYSFS
 #define define_therm_throt_sysdev_one_ro(_name)                                \
        static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
        ack_APIC_irq();
 }
 
+void __init mcheck_intel_therm_init(void)
+{
+       /*
+        * This function is called only on the boot CPU. Save the initial
+        * thermal LVT value of the BSP so it can later be used to restore
+        * the BIOS-programmed thermal LVT entry on the APs.
+        */
+       if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) &&
+               cpu_has(&boot_cpu_data, X86_FEATURE_ACC))
+               lvtthmr_init = apic_read(APIC_LVTTHMR);
+}
+
 void intel_init_thermal(struct cpuinfo_x86 *c)
 {
        unsigned int cpu = smp_processor_id();
@@ -270,7 +284,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
         * since it might be delivered via SMI already:
         */
        rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-       h = apic_read(APIC_LVTTHMR);
+
+       /*
+        * The initial value of the thermal LVT entry on all APs always reads
+        * 0x10000: the BSP wakes the APs with an INIT-SIPI-SIPI sequence, and
+        * receiving the INIT IPI resets their LVT registers to 0 except for
+        * the mask bits, which are set to 1.
+        * Always restore on each AP the value the BIOS programmed, using the
+        * copy saved on the BSP, since the BIOS sets the same value for all
+        * threads/cores.
+        */
+       apic_write(APIC_LVTTHMR, lvtthmr_init);
+
+       h = lvtthmr_init;
+
        if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
                printk(KERN_DEBUG
                       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
index b5801c311846304f2fc2263732e93a6a83c07217..c1bbed1021d96c63e96593f1687d50bc7716b22e 100644 (file)
@@ -77,6 +77,18 @@ struct cpu_hw_events {
        struct debug_store      *ds;
 };
 
+struct event_constraint {
+       unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       int             code;
+};
+
+#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
+#define EVENT_CONSTRAINT_END  { .code = 0, .idxmsk[0] = 0 }
+
+#define for_each_event_constraint(e, c) \
+       for ((e) = (c); (e)->idxmsk[0]; (e)++)
+
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -102,6 +114,8 @@ struct x86_pmu {
        u64             intel_ctrl;
        void            (*enable_bts)(u64 config);
        void            (*disable_bts)(void);
+       int             (*get_event_idx)(struct cpu_hw_events *cpuc,
+                                        struct hw_perf_event *hwc);
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
        .enabled = 1,
 };
 
+static const struct event_constraint *event_constraints;
+
 /*
  * Not sure about some of these
  */
@@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event)
        return hw_event & P6_EVNTSEL_MASK;
 }
 
+static const struct event_constraint intel_p6_event_constraints[] =
+{
+       EVENT_CONSTRAINT(0xc1, 0x1),    /* FLOPS */
+       EVENT_CONSTRAINT(0x10, 0x1),    /* FP_COMP_OPS_EXE */
+       EVENT_CONSTRAINT(0x11, 0x1),    /* FP_ASSIST */
+       EVENT_CONSTRAINT(0x12, 0x2),    /* MUL */
+       EVENT_CONSTRAINT(0x13, 0x2),    /* DIV */
+       EVENT_CONSTRAINT(0x14, 0x1),    /* CYCLES_DIV_BUSY */
+       EVENT_CONSTRAINT_END
+};
 
 /*
  * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] =
   [PERF_COUNT_HW_BUS_CYCLES]           = 0x013c,
 };
 
+static const struct event_constraint intel_core_event_constraints[] =
+{
+       EVENT_CONSTRAINT(0x10, 0x1),    /* FP_COMP_OPS_EXE */
+       EVENT_CONSTRAINT(0x11, 0x2),    /* FP_ASSIST */
+       EVENT_CONSTRAINT(0x12, 0x2),    /* MUL */
+       EVENT_CONSTRAINT(0x13, 0x2),    /* DIV */
+       EVENT_CONSTRAINT(0x14, 0x1),    /* CYCLES_DIV_BUSY */
+       EVENT_CONSTRAINT(0x18, 0x1),    /* IDLE_DURING_DIV */
+       EVENT_CONSTRAINT(0x19, 0x2),    /* DELAYED_BYPASS */
+       EVENT_CONSTRAINT(0xa1, 0x1),    /* RS_UOPS_DISPATCH_CYCLES */
+       EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED */
+       EVENT_CONSTRAINT_END
+};
+
+static const struct event_constraint intel_nehalem_event_constraints[] =
+{
+       EVENT_CONSTRAINT(0x40, 0x3),    /* L1D_CACHE_LD */
+       EVENT_CONSTRAINT(0x41, 0x3),    /* L1D_CACHE_ST */
+       EVENT_CONSTRAINT(0x42, 0x3),    /* L1D_CACHE_LOCK */
+       EVENT_CONSTRAINT(0x43, 0x3),    /* L1D_ALL_REF */
+       EVENT_CONSTRAINT(0x4e, 0x3),    /* L1D_PREFETCH */
+       EVENT_CONSTRAINT(0x4c, 0x3),    /* LOAD_HIT_PRE */
+       EVENT_CONSTRAINT(0x51, 0x3),    /* L1D */
+       EVENT_CONSTRAINT(0x52, 0x3),    /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
+       EVENT_CONSTRAINT(0x53, 0x3),    /* L1D_CACHE_LOCK_FB_HIT */
+       EVENT_CONSTRAINT(0xc5, 0x3),    /* CACHE_LOCK_CYCLES */
+       EVENT_CONSTRAINT_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
        return intel_perfmon_event_map[hw_event];
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX];
 
-static const u64 nehalem_hw_cache_event_ids
+static __initconst u64 nehalem_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
  },
 };
 
-static const u64 core2_hw_cache_event_ids
+static __initconst u64 core2_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
  },
 };
 
-static const u64 atom_hw_cache_event_ids
+static __initconst u64 atom_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
 #define CORE_EVNTSEL_UNIT_MASK         0x0000FF00ULL
 #define CORE_EVNTSEL_EDGE_MASK         0x00040000ULL
 #define CORE_EVNTSEL_INV_MASK          0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK  0xFF000000ULL
+#define CORE_EVNTSEL_REG_MASK          0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK              \
        (CORE_EVNTSEL_EVENT_MASK |      \
@@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
        return hw_event & CORE_EVNTSEL_MASK;
 }
 
-static const u64 amd_hw_cache_event_ids
+static __initconst u64 amd_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event)
         */
        hwc->config = ARCH_PERFMON_EVENTSEL_INT;
 
+       hwc->idx = -1;
+
        /*
         * Count user and OS events unless requested not to.
         */
@@ -1334,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
                x86_pmu_enable_event(hwc, idx);
 }
 
-static int
-fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
+static int fixed_mode_idx(struct hw_perf_event *hwc)
 {
        unsigned int hw_event;
 
@@ -1349,6 +1405,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
        if (!x86_pmu.num_events_fixed)
                return -1;
 
+       /*
+        * fixed counters do not take all possible filters
+        */
+       if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
+               return -1;
+
        if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
                return X86_PMC_IDX_FIXED_INSTRUCTIONS;
        if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1360,22 +1422,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
 }
 
 /*
- * Find a PMC slot for the freshly enabled / scheduled in event:
+ * generic counter allocator: get next free counter
  */
-static int x86_pmu_enable(struct perf_event *event)
+static int
+gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
+{
+       int idx;
+
+       idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
+       return idx == x86_pmu.num_events ? -1 : idx;
+}
+
+/*
+ * intel-specific counter allocator: check event constraints
+ */
+static int
+intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
+{
+       const struct event_constraint *event_constraint;
+       int i, code;
+
+       if (!event_constraints)
+               goto skip;
+
+       code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
+
+       for_each_event_constraint(event_constraint, event_constraints) {
+               if (code == event_constraint->code) {
+                       for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
+                               if (!test_and_set_bit(i, cpuc->used_mask))
+                                       return i;
+                       }
+                       return -1;
+               }
+       }
+skip:
+       return gen_get_event_idx(cpuc, hwc);
+}
+
+static int
+x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
 {
-       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
        int idx;
 
-       idx = fixed_mode_idx(event, hwc);
+       idx = fixed_mode_idx(hwc);
        if (idx == X86_PMC_IDX_FIXED_BTS) {
                /* BTS is already occupied. */
                if (test_and_set_bit(idx, cpuc->used_mask))
                        return -EAGAIN;
 
                hwc->config_base        = 0;
-               hwc->event_base = 0;
+               hwc->event_base         = 0;
                hwc->idx                = idx;
        } else if (idx >= 0) {
                /*
@@ -1396,20 +1493,35 @@ static int x86_pmu_enable(struct perf_event *event)
        } else {
                idx = hwc->idx;
                /* Try to get the previous generic event again */
-               if (test_and_set_bit(idx, cpuc->used_mask)) {
+               if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
 try_generic:
-                       idx = find_first_zero_bit(cpuc->used_mask,
-                                                 x86_pmu.num_events);
-                       if (idx == x86_pmu.num_events)
+                       idx = x86_pmu.get_event_idx(cpuc, hwc);
+                       if (idx == -1)
                                return -EAGAIN;
 
                        set_bit(idx, cpuc->used_mask);
                        hwc->idx = idx;
                }
-               hwc->config_base  = x86_pmu.eventsel;
-               hwc->event_base = x86_pmu.perfctr;
+               hwc->config_base = x86_pmu.eventsel;
+               hwc->event_base  = x86_pmu.perfctr;
        }
 
+       return idx;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in event:
+ */
+static int x86_pmu_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx;
+
+       idx = x86_schedule_event(cpuc, hwc);
+       if (idx < 0)
+               return idx;
+
        perf_events_lapic_init();
 
        x86_pmu.disable(hwc, idx);
@@ -1852,7 +1964,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
        .priority               = 1
 };
 
-static struct x86_pmu p6_pmu = {
+static __initconst struct x86_pmu p6_pmu = {
        .name                   = "p6",
        .handle_irq             = p6_pmu_handle_irq,
        .disable_all            = p6_pmu_disable_all,
@@ -1877,9 +1989,10 @@ static struct x86_pmu p6_pmu = {
         */
        .event_bits             = 32,
        .event_mask             = (1ULL << 32) - 1,
+       .get_event_idx          = intel_get_event_idx,
 };
 
-static struct x86_pmu intel_pmu = {
+static __initconst struct x86_pmu intel_pmu = {
        .name                   = "Intel",
        .handle_irq             = intel_pmu_handle_irq,
        .disable_all            = intel_pmu_disable_all,
@@ -1900,9 +2013,10 @@ static struct x86_pmu intel_pmu = {
        .max_period             = (1ULL << 31) - 1,
        .enable_bts             = intel_pmu_enable_bts,
        .disable_bts            = intel_pmu_disable_bts,
+       .get_event_idx          = intel_get_event_idx,
 };
 
-static struct x86_pmu amd_pmu = {
+static __initconst struct x86_pmu amd_pmu = {
        .name                   = "AMD",
        .handle_irq             = amd_pmu_handle_irq,
        .disable_all            = amd_pmu_disable_all,
@@ -1920,9 +2034,10 @@ static struct x86_pmu amd_pmu = {
        .apic                   = 1,
        /* use highest bit to detect overflow */
        .max_period             = (1ULL << 47) - 1,
+       .get_event_idx          = gen_get_event_idx,
 };
 
-static int p6_pmu_init(void)
+static __init int p6_pmu_init(void)
 {
        switch (boot_cpu_data.x86_model) {
        case 1:
@@ -1932,10 +2047,12 @@ static int p6_pmu_init(void)
        case 7:
        case 8:
        case 11: /* Pentium III */
+               event_constraints = intel_p6_event_constraints;
                break;
        case 9:
        case 13:
                /* Pentium M */
+               event_constraints = intel_p6_event_constraints;
                break;
        default:
                pr_cont("unsupported p6 CPU model %d ",
@@ -1954,7 +2071,7 @@ static int p6_pmu_init(void)
        return 0;
 }
 
-static int intel_pmu_init(void)
+static __init int intel_pmu_init(void)
 {
        union cpuid10_edx edx;
        union cpuid10_eax eax;
@@ -2007,12 +2124,14 @@ static int intel_pmu_init(void)
                       sizeof(hw_cache_event_ids));
 
                pr_cont("Core2 events, ");
+               event_constraints = intel_core_event_constraints;
                break;
        default:
        case 26:
                memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
 
+               event_constraints = intel_nehalem_event_constraints;
                pr_cont("Nehalem/Corei7 events, ");
                break;
        case 28:
@@ -2025,7 +2144,7 @@ static int intel_pmu_init(void)
        return 0;
 }
 
-static int amd_pmu_init(void)
+static __init int amd_pmu_init(void)
 {
        /* Performance-monitoring supported from K7 and later: */
        if (boot_cpu_data.x86 < 6)
@@ -2105,11 +2224,47 @@ static const struct pmu pmu = {
        .unthrottle     = x86_pmu_unthrottle,
 };
 
+static int
+validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+       struct hw_perf_event fake_event = event->hw;
+
+       if (event->pmu && event->pmu != &pmu)
+               return 0;
+
+       return x86_schedule_event(cpuc, &fake_event) >= 0;
+}
+
+static int validate_group(struct perf_event *event)
+{
+       struct perf_event *sibling, *leader = event->group_leader;
+       struct cpu_hw_events fake_pmu;
+
+       memset(&fake_pmu, 0, sizeof(fake_pmu));
+
+       if (!validate_event(&fake_pmu, leader))
+               return -ENOSPC;
+
+       list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+               if (!validate_event(&fake_pmu, sibling))
+                       return -ENOSPC;
+       }
+
+       if (!validate_event(&fake_pmu, event))
+               return -ENOSPC;
+
+       return 0;
+}
+
 const struct pmu *hw_perf_event_init(struct perf_event *event)
 {
        int err;
 
        err = __hw_perf_event_init(event);
+       if (!err) {
+               if (event->group_leader != event)
+                       err = validate_group(event);
+       }
        if (err) {
                if (event->destroy)
                        event->destroy(event);
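The EVENT_CONSTRAINT()/for_each_event_constraint() machinery introduced in this file is what the per-model tables above feed into. A hedged sketch of how such a table could be declared and scanned for some hypothetical model; the event code, counter mask and helper are invented for illustration:

/* Hypothetical constraint table: event 0x3c may only use counters 0 and 1. */
static const struct event_constraint example_event_constraints[] =
{
	EVENT_CONSTRAINT(0x3c, 0x3),
	EVENT_CONSTRAINT_END
};

/* Return the allowed-counter mask for 'code', or ~0UL if unconstrained. */
static unsigned long example_constraint_mask(int code)
{
	const struct event_constraint *c;

	for_each_event_constraint(c, example_event_constraints) {
		if (c->code == code)
			return c->idxmsk[0];
	}

	return ~0UL;
}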
index 7d52e9da5e0cc0d1942b6cd9624f83f504491e77..50b9c220e1213ceacf48e10f4f302a98b65ced4f 100644 (file)
@@ -333,6 +333,10 @@ ENTRY(ret_from_fork)
        CFI_ENDPROC
 END(ret_from_fork)
 
+/*
+ * Interrupt exit functions should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
 /*
  * Return to user mode is not as complex as all this looks,
  * but we want the default path for a system call return to
@@ -383,6 +387,10 @@ need_resched:
 END(resume_kernel)
 #endif
        CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
@@ -513,6 +521,10 @@ sysexit_audit:
        PTGS_TO_GS_EX
 ENDPROC(ia32_sysenter_target)
 
+/*
+ * syscall stub including irq exit should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
        # system call handler stub
 ENTRY(system_call)
        RING0_INT_FRAME                 # can't unwind into user space anyway
@@ -705,6 +717,10 @@ syscall_badsys:
        jmp resume_userspace
 END(syscall_badsys)
        CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 /*
  * System calls that need a pt_regs pointer.
@@ -814,6 +830,10 @@ common_interrupt:
 ENDPROC(common_interrupt)
        CFI_ENDPROC
 
+/*
+ *  Irq entries should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
 #define BUILD_INTERRUPT3(name, nr, fn) \
 ENTRY(name)                            \
        RING0_INT_FRAME;                \
@@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug)
        jmp error_code
        CFI_ENDPROC
 END(spurious_interrupt_bug)
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 ENTRY(kernel_thread_helper)
        pushl $0                # fake return address for unwinder
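The .pushsection/.popsection pairs above move the interrupt and syscall entry/exit paths into the .kprobes.text section so that kprobes refuses to probe them. For reference, the C-side counterpart is the __kprobes annotation from <linux/kprobes.h>; the function below is a made-up example, not part of this patch:

#include <linux/kprobes.h>

/*
 * Hypothetical helper that must never be probed: __kprobes places it in
 * .kprobes.text, the same section the assembly above uses.
 */
static void __kprobes example_unprobable_helper(void)
{
	/* ... code reachable from the int3/debug exception path ... */
}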
index bd5bbddddf91eab956106bff572e03a0ba86b9f9..722df1b1152d57721568ef583e4f2c002922979b 100644 (file)
@@ -803,6 +803,10 @@ END(interrupt)
        call \func
        .endm
 
+/*
+ * Interrupt entry/exit should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
        /*
         * The interrupt stubs push (~vector+0x80) onto the stack and
         * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
 
        CFI_ENDPROC
 END(common_interrupt)
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 /*
  * APIC interrupts.
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644 (file)
index 0000000..d42f65a
--- /dev/null
@@ -0,0 +1,555 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, cpu_dr7);
+EXPORT_PER_CPU_SYMBOL(cpu_dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoint currently installed in each breakpoint address
+ * register, for each CPU.
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+static inline unsigned long
+__encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+       unsigned long bp_info;
+
+       bp_info = (len | type) & 0xf;
+       bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+       bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
+
+       return bp_info;
+}
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+       return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+       int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+       *len = (bp_info & 0xc) | 0x40;
+       *type = (bp_info & 0x3) | 0x80;
+
+       return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We look for a free debug address register and use it for this
+ * breakpoint. Then we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+       unsigned long *dr7;
+       int i;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+               if (!*slot) {
+                       *slot = bp;
+                       break;
+               }
+       }
+
+       if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+               return -EBUSY;
+
+       set_debugreg(info->address, i);
+       __get_cpu_var(cpu_debugreg[i]) = info->address;
+
+       dr7 = &__get_cpu_var(cpu_dr7);
+       *dr7 |= encode_dr7(i, info->len, info->type);
+
+       set_debugreg(*dr7, 7);
+
+       return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search for the debug address register it uses, and then we
+ * disable it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+       unsigned long *dr7;
+       int i;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+               if (*slot == bp) {
+                       *slot = NULL;
+                       break;
+               }
+       }
+
+       if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+               return;
+
+       dr7 = &__get_cpu_var(cpu_dr7);
+       *dr7 &= ~__encode_dr7(i, info->len, info->type);
+
+       set_debugreg(*dr7, 7);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+       unsigned int len_in_bytes = 0;
+
+       switch (hbp_len) {
+       case X86_BREAKPOINT_LEN_1:
+               len_in_bytes = 1;
+               break;
+       case X86_BREAKPOINT_LEN_2:
+               len_in_bytes = 2;
+               break;
+       case X86_BREAKPOINT_LEN_4:
+               len_in_bytes = 4;
+               break;
+#ifdef CONFIG_X86_64
+       case X86_BREAKPOINT_LEN_8:
+               len_in_bytes = 8;
+               break;
+#endif
+       }
+       return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+       unsigned int len;
+
+       len = get_hbp_len(hbp_len);
+
+       return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+       unsigned int len;
+
+       len = get_hbp_len(hbp_len);
+
+       return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+static int arch_store_info(struct perf_event *bp)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+       /*
+        * For kernel addresses, either the address or the symbol name can
+        * be specified.
+        */
+       if (info->name)
+               info->address = (unsigned long)
+                               kallsyms_lookup_name(info->name);
+       if (info->address)
+               return 0;
+
+       return -EINVAL;
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+                          int *gen_len, int *gen_type)
+{
+       /* Len */
+       switch (x86_len) {
+       case X86_BREAKPOINT_LEN_1:
+               *gen_len = HW_BREAKPOINT_LEN_1;
+               break;
+       case X86_BREAKPOINT_LEN_2:
+               *gen_len = HW_BREAKPOINT_LEN_2;
+               break;
+       case X86_BREAKPOINT_LEN_4:
+               *gen_len = HW_BREAKPOINT_LEN_4;
+               break;
+#ifdef CONFIG_X86_64
+       case X86_BREAKPOINT_LEN_8:
+               *gen_len = HW_BREAKPOINT_LEN_8;
+               break;
+#endif
+       default:
+               return -EINVAL;
+       }
+
+       /* Type */
+       switch (x86_type) {
+       case X86_BREAKPOINT_EXECUTE:
+               *gen_type = HW_BREAKPOINT_X;
+               break;
+       case X86_BREAKPOINT_WRITE:
+               *gen_type = HW_BREAKPOINT_W;
+               break;
+       case X86_BREAKPOINT_RW:
+               *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
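
As a usage sketch (not part of the patch), this is roughly how a dr7-driven caller is expected to chain decode_dr7() and arch_bp_generic_fields() when preparing a perf_event_attr; the slot index i and the dr7 value are assumed to come from the surrounding loop, and error handling is abbreviated.

	unsigned len, type;
	int gen_len, gen_type;
	struct perf_event_attr attr = {};

	/* decode_dr7() returns the enable bits, i.e. non-zero if slot i is armed */
	if (decode_dr7(dr7, i, &len, &type) &&
	    !arch_bp_generic_fields(len, type, &gen_len, &gen_type)) {
		attr.bp_len  = gen_len;		/* e.g. HW_BREAKPOINT_LEN_4 */
		attr.bp_type = gen_type;	/* e.g. HW_BREAKPOINT_W */
	}
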
+
+static int arch_build_bp_info(struct perf_event *bp)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+       info->address = bp->attr.bp_addr;
+
+       /* Len */
+       switch (bp->attr.bp_len) {
+       case HW_BREAKPOINT_LEN_1:
+               info->len = X86_BREAKPOINT_LEN_1;
+               break;
+       case HW_BREAKPOINT_LEN_2:
+               info->len = X86_BREAKPOINT_LEN_2;
+               break;
+       case HW_BREAKPOINT_LEN_4:
+               info->len = X86_BREAKPOINT_LEN_4;
+               break;
+#ifdef CONFIG_X86_64
+       case HW_BREAKPOINT_LEN_8:
+               info->len = X86_BREAKPOINT_LEN_8;
+               break;
+#endif
+       default:
+               return -EINVAL;
+       }
+
+       /* Type */
+       switch (bp->attr.bp_type) {
+       case HW_BREAKPOINT_W:
+               info->type = X86_BREAKPOINT_WRITE;
+               break;
+       case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+               info->type = X86_BREAKPOINT_RW;
+               break;
+       case HW_BREAKPOINT_X:
+               info->type = X86_BREAKPOINT_EXECUTE;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp,
+                                 struct task_struct *tsk)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+       unsigned int align;
+       int ret;
+
+       ret = arch_build_bp_info(bp);
+       if (ret)
+               return ret;
+
+       ret = -EINVAL;
+
+       if (info->type == X86_BREAKPOINT_EXECUTE)
+               /*
+                * Ptrace-refactoring note: for now, we only allow
+                * instruction breakpoints on user-space addresses.
+                */
+               if ((!arch_check_va_in_userspace(info->address, info->len)) &&
+                       info->len != X86_BREAKPOINT_EXECUTE)
+                       return ret;
+
+       switch (info->len) {
+       case X86_BREAKPOINT_LEN_1:
+               align = 0;
+               break;
+       case X86_BREAKPOINT_LEN_2:
+               align = 1;
+               break;
+       case X86_BREAKPOINT_LEN_4:
+               align = 3;
+               break;
+#ifdef CONFIG_X86_64
+       case X86_BREAKPOINT_LEN_8:
+               align = 7;
+               break;
+#endif
+       default:
+               return ret;
+       }
+
+       if (bp->callback)
+               ret = arch_store_info(bp);
+
+       if (ret < 0)
+               return ret;
+       /*
+        * Check that the low-order bits of the address are appropriate
+        * for the alignment implied by len.
+        */
+       if (info->address & align)
+               return -EINVAL;
+
+       /* Check that the virtual address is in the proper range */
+       if (tsk) {
+               if (!arch_check_va_in_userspace(info->address, info->len))
+                       return -EFAULT;
+       } else {
+               if (!arch_check_va_in_kernelspace(info->address, info->len))
+                       return -EFAULT;
+       }
+
+       return 0;
+}
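
The alignment test near the end of the function reduces to "address & (len - 1)"; a standalone sketch (not part of the patch, addresses made up) makes the rejection case explicit.

#include <stdio.h>

int main(void)
{
	unsigned long addr = 0x601042;	/* hypothetical watched address */
	unsigned int align = 3;		/* X86_BREAKPOINT_LEN_4 -> low-order mask 0x3 */

	/* A non-zero result means the address violates the implied alignment. */
	printf("%s\n", (addr & align) ? "rejected (-EINVAL)" : "accepted");
	return 0;
}
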
+
+/*
+ * Dump the debug register contents to the user.
+ * We can't dump our per-cpu values because they
+ * may contain cpu-wide breakpoints, which don't
+ * belong to the current task.
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+       int i;
+       int dr7 = 0;
+       struct perf_event *bp;
+       struct arch_hw_breakpoint *info;
+       struct thread_struct *thread = &current->thread;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               bp = thread->ptrace_bps[i];
+
+               if (bp && !bp->attr.disabled) {
+                       dump->u_debugreg[i] = bp->attr.bp_addr;
+                       info = counter_arch_bp(bp);
+                       dr7 |= encode_dr7(i, info->len, info->type);
+               } else {
+                       dump->u_debugreg[i] = 0;
+               }
+       }
+
+       dump->u_debugreg[4] = 0;
+       dump->u_debugreg[5] = 0;
+       dump->u_debugreg[6] = current->thread.debugreg6;
+
+       dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+       int i;
+       struct thread_struct *t = &tsk->thread;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               unregister_hw_breakpoint(t->ptrace_bps[i]);
+               t->ptrace_bps[i] = NULL;
+       }
+}
+
+void hw_breakpoint_restore(void)
+{
+       set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+       set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+       set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+       set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+       set_debugreg(current->thread.debugreg6, 6);
+       set_debugreg(__get_cpu_var(cpu_dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE is returned if one of the following conditions is true:
+ * i) the causative address is from user-space and the exception is a
+ * valid one, i.e. not triggered as a result of lazy debug register
+ * switching;
+ * ii) bits other than trap<n> are set in the DR6 register (such as BD,
+ * BS or BT), indicating that more than one debug condition is met and
+ * requires further handling in do_debug();
+ *
+ * NOTIFY_STOP is returned for all other cases.
+ *
+ */
+static int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+       int i, cpu, rc = NOTIFY_STOP;
+       struct perf_event *bp;
+       unsigned long dr7, dr6;
+       unsigned long *dr6_p;
+
+       /* args->err points to the DR6 value */
+       dr6_p = (unsigned long *)ERR_PTR(args->err);
+       dr6 = *dr6_p;
+
+       /* Do an early return if no trap bits are set in DR6 */
+       if ((dr6 & DR_TRAP_BITS) == 0)
+               return NOTIFY_DONE;
+
+       get_debugreg(dr7, 7);
+       /* Disable breakpoints during exception handling */
+       set_debugreg(0UL, 7);
+       /*
+        * Assert that local interrupts are disabled.
+        * Reset the DRn bits in the virtualized register value.
+        * The ptrace trigger routine will add in whatever is needed.
+        */
+       current->thread.debugreg6 &= ~DR_TRAP_BITS;
+       cpu = get_cpu();
+
+       /* Handle all the breakpoints that were triggered */
+       for (i = 0; i < HBP_NUM; ++i) {
+               if (likely(!(dr6 & (DR_TRAP0 << i))))
+                       continue;
+
+               /*
+                * The counter may be concurrently released but that can only
+                * occur from a call_rcu() path. We can then safely fetch
+                * the breakpoint, use its callback and touch its counter
+                * while we are inside an rcu_read_lock() section.
+                */
+               rcu_read_lock();
+
+               bp = per_cpu(bp_per_reg[i], cpu);
+               if (bp)
+                       rc = NOTIFY_DONE;
+               /*
+                * Reset the 'i'th TRAP bit in dr6 to denote completion of
+                * exception handling
+                */
+               (*dr6_p) &= ~(DR_TRAP0 << i);
+               /*
+                * bp can be NULL due to lazy debug register switching
+                * or due to concurrent perf counter removing.
+                */
+               if (!bp) {
+                       rcu_read_unlock();
+                       break;
+               }
+
+               (bp->callback)(bp, args->regs);
+
+               rcu_read_unlock();
+       }
+       if (dr6 & (~DR_TRAP_BITS))
+               rc = NOTIFY_DONE;
+
+       set_debugreg(dr7, 7);
+       put_cpu();
+
+       return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_exceptions_notify(
+               struct notifier_block *unused, unsigned long val, void *data)
+{
+       if (val != DIE_DEBUG)
+               return NOTIFY_DONE;
+
+       return hw_breakpoint_handler(data);
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+       /* TODO */
+}
+
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
+{
+       /* TODO */
+}
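
The heart of hw_breakpoint_handler() above is the walk over the four DR6 trap bits; a standalone sketch (not part of the patch) reproduces just that walk, with the DR_TRAP0 value and HBP_NUM count mirroring their kernel definitions and a made-up DR6 value.

#include <stdio.h>

#define DEMO_DR_TRAP0 0x1	/* lowest of the four per-breakpoint trap bits */
#define DEMO_HBP_NUM  4

int main(void)
{
	unsigned long dr6 = 0x5;	/* hypothetical: breakpoints 0 and 2 fired */
	int i;

	for (i = 0; i < DEMO_HBP_NUM; i++) {
		if (!(dr6 & (DEMO_DR_TRAP0 << i)))
			continue;
		printf("breakpoint %d triggered\n", i);
		dr6 &= ~(DEMO_DR_TRAP0 << i);	/* mark this trap as handled */
	}
	return 0;
}
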
index 04bbd52785688c4152c449459d943a0482c68830..19212cb01558101c5fd861fc838e83230e94e15c 100644 (file)
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
                seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
        seq_printf(p, "  TLB shootdowns\n");
 #endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
        seq_printf(p, "%*s: ", prec, "TRM");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
        seq_printf(p, "  Thermal event interrupts\n");
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
        seq_printf(p, "%*s: ", prec, "THR");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
        seq_printf(p, "  Threshold APIC interrupts\n");
-# endif
 #endif
 #ifdef CONFIG_X86_MCE
        seq_printf(p, "%*s: ", prec, "MCE");
@@ -194,11 +194,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
        sum += irq_stats(cpu)->irq_call_count;
        sum += irq_stats(cpu)->irq_tlb_count;
 #endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
        sum += irq_stats(cpu)->irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
        sum += irq_stats(cpu)->irq_threshold_count;
-# endif
 #endif
 #ifdef CONFIG_X86_MCE
        sum += per_cpu(mce_exception_count, cpu);
index 8d82a77a3f3b96ea3c0dc37e91551dddc7e10b51..34e86b67550c523d3bbefb0275ede78d526b0826 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/smp.h>
 #include <linux/nmi.h>
 
+#include <asm/debugreg.h>
 #include <asm/apicdef.h>
 #include <asm/system.h>
 
@@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
                        "resuming...\n");
        kgdb_arch_handle_exception(args->trapnr, args->signr,
                                   args->err, "c", "", regs);
+       /*
+        * Reset the BS bit in dr6 (pointed by args->err) to
+        * denote completion of processing
+        */
+       (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
 
        return NOTIFY_STOP;
 }
index 7b5169d2b00026272ed26874913c42ff315befb3..3fe86d706a1493ad59cb655478bd9b41cfd11f4d 100644 (file)
 #include <linux/preempt.h>
 #include <linux/module.h>
 #include <linux/kdebug.h>
+#include <linux/kallsyms.h>
 
 #include <asm/cacheflush.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
 
 void jprobe_return_end(void);
 
@@ -106,50 +109,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
        /*      -----------------------------------------------         */
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
 };
-static const u32 onebyte_has_modrm[256 / 32] = {
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-       /*      -----------------------------------------------         */
-       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
-       W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
-       W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
-       W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
-       W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
-       W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
-       W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
-       W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
-       W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
-       W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
-       W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
-       W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
-       W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
-       W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
-       W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
-       W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1)   /* f0 */
-       /*      -----------------------------------------------         */
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-       /*      -----------------------------------------------         */
-       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
-       W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
-       W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
-       W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
-       W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
-       W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
-       W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
-       W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
-       W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
-       W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
-       W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
-       W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
-       W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
-       W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
-       W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
-       W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* ff */
-       /*      -----------------------------------------------         */
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-};
 #undef W
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +203,75 @@ retry:
        }
 }
 
+/* Recover the probed instruction at addr for further analysis. */
+static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+       struct kprobe *kp;
+       kp = get_kprobe((void *)addr);
+       if (!kp)
+               return -EINVAL;
+
+       /*
+        *  Basically, kp->ainsn.insn holds a copy of the original
+        *  instruction. However, a RIP-relative instruction cannot be
+        *  single-stepped at a different place, so fix_riprel() tweaks
+        *  the displacement of that copy. In that case, we can't recover
+        *  the instruction from kp->ainsn.insn.
+        *
+        *  On the other hand, kp->opcode has a copy of the first byte of
+        *  the probed instruction, which was overwritten by int3. Since
+        *  the instruction at kp->addr is not modified by kprobes except
+        *  for that first byte, we can recover the original instruction
+        *  from kp->addr and kp->opcode.
+        */
+       memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+       buf[0] = kp->opcode;
+       return 0;
+}
+
+/* Dummy buffers for kallsyms_lookup */
+static char __dummy_buf[KSYM_NAME_LEN];
+
+/* Check if paddr is at an instruction boundary */
+static int __kprobes can_probe(unsigned long paddr)
+{
+       int ret;
+       unsigned long addr, offset = 0;
+       struct insn insn;
+       kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+       if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+               return 0;
+
+       /* Decode instructions */
+       addr = paddr - offset;
+       while (addr < paddr) {
+               kernel_insn_init(&insn, (void *)addr);
+               insn_get_opcode(&insn);
+
+               /*
+                * Check if the instruction has been modified by another
+                * kprobe, in which case we replace the breakpoint by the
+                * original instruction in our buffer.
+                */
+               if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+                       ret = recover_probed_instruction(buf, addr);
+                       if (ret)
+                               /*
+                                * Another debugging subsystem might insert
+                                * this breakpoint. In that case, we can't
+                                * recover it.
+                                */
+                               return 0;
+                       kernel_insn_init(&insn, buf);
+               }
+               insn_get_length(&insn);
+               addr += insn.length;
+       }
+
+       return (addr == paddr);
+}
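
A minimal sketch (not part of the patch) of the same boundary walk against a fixed byte buffer may make the idea concrete: the bytes below encode "push %rbp; mov %rsp,%rbp", and a probe requested at offset 2 would land inside the second instruction, so can_probe() refuses it. The insn API calls are the ones introduced by this series.

	static const u8 code[] = { 0x55, 0x48, 0x89, 0xe5 };
	struct insn insn;
	unsigned long off = 0, target = 2;

	while (off < target) {
		kernel_insn_init(&insn, code + off);
		insn_get_length(&insn);	/* 1 byte, then 3 bytes */
		off += insn.length;
	}
	/* off ends up at 4, not 2, so 'target' is not an instruction boundary. */
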
+
 /*
  * Returns non-zero if opcode modifies the interrupt flag.
  */
@@ -277,68 +305,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 static void __kprobes fix_riprel(struct kprobe *p)
 {
 #ifdef CONFIG_X86_64
-       u8 *insn = p->ainsn.insn;
-       s64 disp;
-       int need_modrm;
-
-       /* Skip legacy instruction prefixes.  */
-       while (1) {
-               switch (*insn) {
-               case 0x66:
-               case 0x67:
-               case 0x2e:
-               case 0x3e:
-               case 0x26:
-               case 0x64:
-               case 0x65:
-               case 0x36:
-               case 0xf0:
-               case 0xf3:
-               case 0xf2:
-                       ++insn;
-                       continue;
-               }
-               break;
-       }
+       struct insn insn;
+       kernel_insn_init(&insn, p->ainsn.insn);
 
-       /* Skip REX instruction prefix.  */
-       if (is_REX_prefix(insn))
-               ++insn;
-
-       if (*insn == 0x0f) {
-               /* Two-byte opcode.  */
-               ++insn;
-               need_modrm = test_bit(*insn,
-                                     (unsigned long *)twobyte_has_modrm);
-       } else
-               /* One-byte opcode.  */
-               need_modrm = test_bit(*insn,
-                                     (unsigned long *)onebyte_has_modrm);
-
-       if (need_modrm) {
-               u8 modrm = *++insn;
-               if ((modrm & 0xc7) == 0x05) {
-                       /* %rip+disp32 addressing mode */
-                       /* Displacement follows ModRM byte.  */
-                       ++insn;
-                       /*
-                        * The copied instruction uses the %rip-relative
-                        * addressing mode.  Adjust the displacement for the
-                        * difference between the original location of this
-                        * instruction and the location of the copy that will
-                        * actually be run.  The tricky bit here is making sure
-                        * that the sign extension happens correctly in this
-                        * calculation, since we need a signed 32-bit result to
-                        * be sign-extended to 64 bits when it's added to the
-                        * %rip value and yield the same 64-bit result that the
-                        * sign-extension of the original signed 32-bit
-                        * displacement would have given.
-                        */
-                       disp = (u8 *) p->addr + *((s32 *) insn) -
-                              (u8 *) p->ainsn.insn;
-                       BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
-                       *(s32 *)insn = (s32) disp;
-               }
+       if (insn_rip_relative(&insn)) {
+               s64 newdisp;
+               u8 *disp;
+               insn_get_displacement(&insn);
+               /*
+                * The copied instruction uses the %rip-relative addressing
+                * mode.  Adjust the displacement for the difference between
+                * the original location of this instruction and the location
+                * of the copy that will actually be run.  The tricky bit here
+                * is making sure that the sign extension happens correctly in
+                * this calculation, since we need a signed 32-bit result to
+                * be sign-extended to 64 bits when it's added to the %rip
+                * value and yield the same 64-bit result that the sign-
+                * extension of the original signed 32-bit displacement would
+                * have given.
+                */
+               newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
+                         (u8 *) p->ainsn.insn;
+               BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */
+               disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
+               *(s32 *) disp = (s32) newdisp;
        }
 #endif
 }
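
The displacement fix-up is plain pointer arithmetic, and the instruction length cancels out of the identity, so a worked example (not part of the patch, addresses made up and truncated for readability) only needs three numbers.

#include <stdio.h>

int main(void)
{
	long addr    = 0x81000100;	/* original location of the instruction */
	long slot    = 0xa0002000;	/* location of the copy in the insn slot */
	long disp    = 0x200;		/* original rip-relative displacement */
	long newdisp = addr + disp - slot;

	/* Both forms resolve to the same absolute target, 0x81000300. */
	printf("original target: %#lx\n", addr + disp);
	printf("copy target:     %#lx\n", slot + newdisp);
	return 0;
}
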
@@ -359,6 +349,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
 
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
+       if (!can_probe((unsigned long)p->addr))
+               return -EILSEQ;
        /* insn: must be on special executable page on x86. */
        p->ainsn.insn = get_insn_slot();
        if (!p->ainsn.insn)
@@ -472,17 +464,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 {
        switch (kcb->kprobe_status) {
        case KPROBE_HIT_SSDONE:
-#ifdef CONFIG_X86_64
-               /* TODO: Provide re-entrancy from post_kprobes_handler() and
-                * avoid exception stack corruption while single-stepping on
-                * the instruction of the new probe.
-                */
-               arch_disarm_kprobe(p);
-               regs->ip = (unsigned long)p->addr;
-               reset_current_kprobe();
-               preempt_enable_no_resched();
-               break;
-#endif
        case KPROBE_HIT_ACTIVE:
                save_previous_kprobe(kcb);
                set_current_kprobe(p, regs, kcb);
@@ -491,18 +472,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
                kcb->kprobe_status = KPROBE_REENTER;
                break;
        case KPROBE_HIT_SS:
-               if (p == kprobe_running()) {
-                       regs->flags &= ~X86_EFLAGS_TF;
-                       regs->flags |= kcb->kprobe_saved_flags;
-                       return 0;
-               } else {
-                       /* A probe has been hit in the codepath leading up
-                        * to, or just after, single-stepping of a probed
-                        * instruction. This entire codepath should strictly
-                        * reside in .kprobes.text section. Raise a warning
-                        * to highlight this peculiar case.
-                        */
-               }
+               /* A probe has been hit in the codepath leading up to, or just
+                * after, single-stepping of a probed instruction. This entire
+                * codepath should strictly reside in .kprobes.text section.
+                * Raise a BUG, or we'll continue in an endless re-entering
+                * loop and eventually overflow the stack.
+                */
+               printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
+                      p->addr);
+               dump_kprobe(p);
+               BUG();
        default:
                /* impossible cases */
                WARN_ON(1);
@@ -967,8 +946,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
                        ret = NOTIFY_STOP;
                break;
        case DIE_DEBUG:
-               if (post_kprobe_handler(args->regs))
+               if (post_kprobe_handler(args->regs)) {
+                       /*
+                        * Reset the BS bit in dr6 (pointed by args->err) to
+                        * denote completion of processing
+                        */
+                       (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
                        ret = NOTIFY_STOP;
+               }
                break;
        case DIE_GPF:
                /*
index c1c429d00130c2a233b35f69449d5a02a9f6f2e9..c843f8406da2b95f416cc9a83c2bead2d4d17ae6 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/desc.h>
 #include <asm/system.h>
 #include <asm/cacheflush.h>
+#include <asm/debugreg.h>
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
 
        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
+       hw_breakpoint_disable();
 
        if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
index 84c3bf209e98a390ff536e46fd8fec362479f22a..4a8bb82248ae8a9a945854105c19447fec8a6530 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/debugreg.h>
 
 static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
                                unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
 
        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
+       hw_breakpoint_disable();
 
        if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
index 5284cd2b57769f53e79f520ecc6f8199720497cf..744508e7cfdd051e3896fe5ec28d5d3da0f3c16c 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/clockchips.h>
 #include <linux/random.h>
 #include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
@@ -17,6 +18,7 @@
 #include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -103,14 +105,7 @@ void flush_thread(void)
        }
 #endif
 
-       clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
-       tsk->thread.debugreg0 = 0;
-       tsk->thread.debugreg1 = 0;
-       tsk->thread.debugreg2 = 0;
-       tsk->thread.debugreg3 = 0;
-       tsk->thread.debugreg6 = 0;
-       tsk->thread.debugreg7 = 0;
+       flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
@@ -192,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
        else if (next->debugctlmsr != prev->debugctlmsr)
                update_debugctlmsr(next->debugctlmsr);
 
-       if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-               set_debugreg(next->debugreg0, 0);
-               set_debugreg(next->debugreg1, 1);
-               set_debugreg(next->debugreg2, 2);
-               set_debugreg(next->debugreg3, 3);
-               /* no 4 and 5 */
-               set_debugreg(next->debugreg6, 6);
-               set_debugreg(next->debugreg7, 7);
-       }
-
        if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
            test_tsk_thread_flag(next_p, TIF_NOTSC)) {
                /* prev and next are different */
index 4cf79567cdab0728b33c2f9698e3a5b535e4eb28..d5bd3132ee706d764510eec5058cf3e214cb2cc2 100644 (file)
@@ -58,6 +58,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
        task_user_gs(p) = get_user_gs(regs);
 
+       p->thread.io_bitmap_ptr = NULL;
        tsk = current;
+       err = -ENOMEM;
+
+       memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
        if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
                                                IO_BITMAP_BYTES, GFP_KERNEL);
index eb62cbcaa490ad553ef5d70b6751a2288d25089e..70cf15873f3d65da38e42fbbb670555d2fc22722 100644 (file)
@@ -52,6 +52,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;
+       p->thread.io_bitmap_ptr = NULL;
 
        savesegment(gs, p->thread.gsindex);
        savesegment(fs, p->thread.fsindex);
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
 
+       err = -ENOMEM;
+       memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
@@ -341,6 +346,7 @@ out:
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
+
        return err;
 }
 
@@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         */
        if (preload_fpu)
                __math_state_restore();
+
        return prev_p;
 }
 
index 7b058a2dc66afecdaeb58877102957dff77e7d81..04d182a7cfdbd6e3c88c41658f6d0a91878e6366 100644 (file)
@@ -22,6 +22,8 @@
 #include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/workqueue.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -34,6 +36,7 @@
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
 
 #include "tls.h"
 
@@ -49,6 +52,118 @@ enum x86_regset {
        REGSET_IOPERM32,
 };
 
+struct pt_regs_offset {
+       const char *name;
+       int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+       REG_OFFSET_NAME(r15),
+       REG_OFFSET_NAME(r14),
+       REG_OFFSET_NAME(r13),
+       REG_OFFSET_NAME(r12),
+       REG_OFFSET_NAME(r11),
+       REG_OFFSET_NAME(r10),
+       REG_OFFSET_NAME(r9),
+       REG_OFFSET_NAME(r8),
+#endif
+       REG_OFFSET_NAME(bx),
+       REG_OFFSET_NAME(cx),
+       REG_OFFSET_NAME(dx),
+       REG_OFFSET_NAME(si),
+       REG_OFFSET_NAME(di),
+       REG_OFFSET_NAME(bp),
+       REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+       REG_OFFSET_NAME(ds),
+       REG_OFFSET_NAME(es),
+       REG_OFFSET_NAME(fs),
+       REG_OFFSET_NAME(gs),
+#endif
+       REG_OFFSET_NAME(orig_ax),
+       REG_OFFSET_NAME(ip),
+       REG_OFFSET_NAME(cs),
+       REG_OFFSET_NAME(flags),
+       REG_OFFSET_NAME(sp),
+       REG_OFFSET_NAME(ss),
+       REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name:      the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL.
+ */
+int regs_query_register_offset(const char *name)
+{
+       const struct pt_regs_offset *roff;
+       for (roff = regoffset_table; roff->name != NULL; roff++)
+               if (!strcmp(roff->name, name))
+                       return roff->offset;
+       return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset:    the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL.
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+       const struct pt_regs_offset *roff;
+       for (roff = regoffset_table; roff->name != NULL; roff++)
+               if (roff->offset == offset)
+                       return roff->name;
+       return NULL;
+}
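
A usage sketch (not part of the patch): a kprobe-based tracer resolving a register by name and reading it back out of a struct pt_regs. 'regs' is assumed to be a valid pt_regs pointer obtained from the probe context.

	int off = regs_query_register_offset("ip");

	if (off >= 0) {
		unsigned long val = *(unsigned long *)((char *)regs + off);
		pr_info("%s = %lx\n", regs_query_register_name(off), val);
	}
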
+
+static const int arg_offs_table[] = {
+#ifdef CONFIG_X86_32
+       [0] = offsetof(struct pt_regs, ax),
+       [1] = offsetof(struct pt_regs, dx),
+       [2] = offsetof(struct pt_regs, cx)
+#else /* CONFIG_X86_64 */
+       [0] = offsetof(struct pt_regs, di),
+       [1] = offsetof(struct pt_regs, si),
+       [2] = offsetof(struct pt_regs, dx),
+       [3] = offsetof(struct pt_regs, cx),
+       [4] = offsetof(struct pt_regs, r8),
+       [5] = offsetof(struct pt_regs, r9)
+#endif
+};
+
+/**
+ * regs_get_argument_nth() - get Nth argument at function call
+ * @regs:      pt_regs which contains registers at function entry.
+ * @n:         argument number.
+ *
+ * regs_get_argument_nth() returns the @n th argument of a function call.
+ * Since the kernel stack is usually changed right after function entry,
+ * you must use this at function entry. If the @n th argument is not in the
+ * kernel stack or pt_regs, this returns 0.
+ */
+unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
+{
+       if (n < ARRAY_SIZE(arg_offs_table))
+               return *(unsigned long *)((char *)regs + arg_offs_table[n]);
+       else {
+               /*
+                * The typical case: arg n is on the stack.
+                * (Note: stack[0] = return address, so skip it)
+                */
+               n -= ARRAY_SIZE(arg_offs_table);
+               return regs_get_kernel_stack_nth(regs, 1 + n);
+       }
+}
+
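
A usage sketch (not part of the patch): from a function-entry context such as a kprobe pre-handler ('regs' assumed valid), the first argument comes straight from a register, while on x86-64 the seventh has already spilled to the stack and is fetched through regs_get_kernel_stack_nth() internally.

	unsigned long arg0 = regs_get_argument_nth(regs, 0);	/* di on x86-64, ax on i386 */
	unsigned long arg6 = regs_get_argument_nth(regs, 6);	/* read from the kernel stack */
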
 /*
  * does not yet catch signals sent when the child dies.
  * in exit.c or in signal.c.
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
        return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-       return TASK_SIZE - 3;
-}
-
 #else  /* CONFIG_X86_64 */
 
 #define FLAG_MASK              (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
        return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
-       if (test_tsk_thread_flag(task, TIF_IA32))
-               return IA32_PAGE_OFFSET - 3;
-#endif
-       return TASK_SIZE_MAX - 7;
-}
-
 #endif /* CONFIG_X86_32 */
 
 static unsigned long get_flags(struct task_struct *task)
@@ -454,98 +555,238 @@ static int genregs_set(struct task_struct *target,
        return ret;
 }
 
+static void ptrace_triggered(struct perf_event *bp, void *data)
+{
+       int i;
+       struct thread_struct *thread = &(current->thread);
+
+       /*
+        * Store in the virtual DR6 register the fact that the breakpoint
+        * was hit so the thread's debugger will see it.
+        */
+       for (i = 0; i < HBP_NUM; i++) {
+               if (thread->ptrace_bps[i] == bp)
+                       break;
+       }
+
+       thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
 /*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Walk through every ptrace breakpoint for this thread and
+ * build the dr7 value on top of their attributes.
+ *
  */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
 {
-       switch (n) {
-       case 0:         return child->thread.debugreg0;
-       case 1:         return child->thread.debugreg1;
-       case 2:         return child->thread.debugreg2;
-       case 3:         return child->thread.debugreg3;
-       case 6:         return child->thread.debugreg6;
-       case 7:         return child->thread.debugreg7;
+       int i;
+       int dr7 = 0;
+       struct arch_hw_breakpoint *info;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               if (bp[i] && !bp[i]->attr.disabled) {
+                       info = counter_arch_bp(bp[i]);
+                       dr7 |= encode_dr7(i, info->len, info->type);
+               }
        }
-       return 0;
+
+       return dr7;
 }
 
-static int ptrace_set_debugreg(struct task_struct *child,
-                              int n, unsigned long data)
+static struct perf_event *
+ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+                        struct task_struct *tsk, int disabled)
 {
-       int i;
+       int err;
+       int gen_len, gen_type;
+       DEFINE_BREAKPOINT_ATTR(attr);
 
-       if (unlikely(n == 4 || n == 5))
-               return -EIO;
+       /*
+        * We should have at least an inactive breakpoint at this
+        * slot; otherwise the user is writing dr7 without having
+        * written the address register first.
+        */
+       if (!bp)
+               return ERR_PTR(-EINVAL);
 
-       if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-               return -EIO;
+       err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+       if (err)
+               return ERR_PTR(err);
 
-       switch (n) {
-       case 0:         child->thread.debugreg0 = data; break;
-       case 1:         child->thread.debugreg1 = data; break;
-       case 2:         child->thread.debugreg2 = data; break;
-       case 3:         child->thread.debugreg3 = data; break;
+       attr = bp->attr;
+       attr.bp_len = gen_len;
+       attr.bp_type = gen_type;
+       attr.disabled = disabled;
 
-       case 6:
-               if ((data & ~0xffffffffUL) != 0)
-                       return -EIO;
-               child->thread.debugreg6 = data;
-               break;
+       return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
+}
+
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+       struct thread_struct *thread = &(tsk->thread);
+       unsigned long old_dr7;
+       int i, orig_ret = 0, rc = 0;
+       int enabled, second_pass = 0;
+       unsigned len, type;
+       struct perf_event *bp;
+
+       data &= ~DR_CONTROL_RESERVED;
+       old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+restore:
+       /*
+        * Loop through all the hardware breakpoints, making the
+        * appropriate changes to each.
+        */
+       for (i = 0; i < HBP_NUM; i++) {
+               enabled = decode_dr7(data, i, &len, &type);
+               bp = thread->ptrace_bps[i];
+
+               if (!enabled) {
+                       if (bp) {
+                               /*
+                                * Don't unregister the breakpoints right away,
+                                * unless all register_user_hw_breakpoint()
+                                * requests have succeeded. This prevents
+                                * any window of opportunity for debug
+                                * register grabbing by other users.
+                                */
+                               if (!second_pass)
+                                       continue;
+
+                               thread->ptrace_bps[i] = NULL;
+                               bp = ptrace_modify_breakpoint(bp, len, type,
+                                                             tsk, 1);
+                               if (IS_ERR(bp)) {
+                                       rc = PTR_ERR(bp);
+                                       thread->ptrace_bps[i] = NULL;
+                                       break;
+                               }
+                               thread->ptrace_bps[i] = bp;
+                       }
+                       continue;
+               }
+
+               bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+
+               /* Incorrect bp, or we have a bug in bp API */
+               if (IS_ERR(bp)) {
+                       rc = PTR_ERR(bp);
+                       thread->ptrace_bps[i] = NULL;
+                       break;
+               }
+               thread->ptrace_bps[i] = bp;
+       }
+       /*
+        * Make a second pass to free the remaining unused breakpoints
+        * or to restore the original breakpoints if an error occurred.
+        */
+       if (!second_pass) {
+               second_pass = 1;
+               if (rc < 0) {
+                       orig_ret = rc;
+                       data = old_dr7;
+               }
+               goto restore;
+       }
+       return ((orig_ret < 0) ? orig_ret : rc);
+}
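
For completeness, a userspace sketch (not part of the patch) of the sequence that exercises these paths: a debugger arming a 4-byte write watchpoint on an already-stopped, traced child. Poking u_debugreg[0] ends up in ptrace_set_breakpoint_addr(); poking u_debugreg[7] ends up in ptrace_write_dr7(). 'pid' and 'addr' are placeholders.

#include <stddef.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/user.h>

static void demo_set_watchpoint(pid_t pid, unsigned long addr)
{
	/* DR0 = the watched address */
	ptrace(PTRACE_POKEUSER, pid,
	       offsetof(struct user, u_debugreg[0]), (void *)addr);
	/* DR7: L0 enabled, R/W0 = write (0b01), LEN0 = 4 bytes (0b11) */
	ptrace(PTRACE_POKEUSER, pid,
	       offsetof(struct user, u_debugreg[7]), (void *)0xd0001UL);
}
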
+
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+       struct thread_struct *thread = &(tsk->thread);
+       unsigned long val = 0;
 
-       case 7:
+       if (n < HBP_NUM) {
+               struct perf_event *bp;
+               bp = thread->ptrace_bps[n];
+               if (!bp)
+                       return 0;
+               val = bp->hw.info.address;
+       } else if (n == 6) {
+               val = thread->debugreg6;
+        } else if (n == 7) {
+       } else if (n == 7) {
+       }
+       return val;
+}
+
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+                                     unsigned long addr)
+{
+       struct perf_event *bp;
+       struct thread_struct *t = &tsk->thread;
+       DEFINE_BREAKPOINT_ATTR(attr);
+
+       if (!t->ptrace_bps[nr]) {
                /*
-                * Sanity-check data. Take one half-byte at once with
-                * check = (val >> (16 + 4*i)) & 0xf. It contains the
-                * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-                * 2 and 3 are LENi. Given a list of invalid values,
-                * we do mask |= 1 << invalid_value, so that
-                * (mask >> check) & 1 is a correct test for invalid
-                * values.
-                *
-                * R/Wi contains the type of the breakpoint /
-                * watchpoint, LENi contains the length of the watched
-                * data in the watchpoint case.
-                *
-                * The invalid values are:
-                * - LENi == 0x10 (undefined), so mask |= 0x0f00.       [32-bit]
-                * - R/Wi == 0x10 (break on I/O reads or writes), so
-                *   mask |= 0x4444.
-                * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-                *   0x1110.
-                *
-                * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-                *
-                * See the Intel Manual "System Programming Guide",
-                * 15.2.4
-                *
-                * Note that LENi == 0x10 is defined on x86_64 in long
-                * mode (i.e. even for 32-bit userspace software, but
-                * 64-bit kernel), so the x86_64 mask value is 0x5454.
-                * See the AMD manual no. 24593 (AMD64 System Programming)
+                * Use a stub len and type to register (i.e. reserve) an
+                * inactive but otherwise correct breakpoint.
                 */
-#ifdef CONFIG_X86_32
-#define        DR7_MASK        0x5f54
-#else
-#define        DR7_MASK        0x5554
-#endif
-               data &= ~DR_CONTROL_RESERVED;
-               for (i = 0; i < 4; i++)
-                       if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-                               return -EIO;
-               child->thread.debugreg7 = data;
-               if (data)
-                       set_tsk_thread_flag(child, TIF_DEBUG);
-               else
-                       clear_tsk_thread_flag(child, TIF_DEBUG);
-               break;
+               attr.bp_addr = addr;
+               attr.bp_len = HW_BREAKPOINT_LEN_1;
+               attr.bp_type = HW_BREAKPOINT_W;
+               attr.disabled = 1;
+
+               bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+       } else {
+               bp = t->ptrace_bps[nr];
+               t->ptrace_bps[nr] = NULL;
+
+               attr = bp->attr;
+               attr.bp_addr = addr;
+               bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
        }
+       /*
+        * CHECKME: the previous code returned -EIO if the addr wasn't a
+        * valid task virtual addr. The new one will return -EINVAL in this
+        * case.
+        * -EINVAL may be what we want for in-kernel breakpoint users, but
+        * -EIO looks better for ptrace, since we refuse to write a register
+        * for the user. And anyway this was the previous behaviour.
+        */
+       if (IS_ERR(bp))
+               return PTR_ERR(bp);
+
+       t->ptrace_bps[nr] = bp;
 
        return 0;
 }
 
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+       struct thread_struct *thread = &(tsk->thread);
+       int rc = 0;
+
+       /* There are no DR4 or DR5 registers */
+       if (n == 4 || n == 5)
+               return -EIO;
+
+       if (n == 6) {
+               thread->debugreg6 = val;
+               goto ret_path;
+       }
+       if (n < HBP_NUM) {
+               rc = ptrace_set_breakpoint_addr(tsk, n, val);
+               if (rc)
+                       return rc;
+       }
+       /* All that's left is DR7 */
+       if (n == 7)
+               rc = ptrace_write_dr7(tsk, val);
+
+ret_path:
+       return rc;
+}
+
 /*
  * These access the current or another (stopped) task's io permission
  * bitmap for debugging or core dump.
index 2a34f9c5be214fd428c4197a1f09819d65759a37..c0ca8f921c91f24091cb89af9fad7360c6312313 100644 (file)
 #ifdef CONFIG_X86_64
 #include <asm/numa_64.h>
 #endif
+#include <asm/mce.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -1031,6 +1032,8 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif
        x86_init.oem.banner();
+
+       mcheck_init();
 }
 
 #ifdef CONFIG_X86_32
index 6a44a76055adcb781572978d1e332a9923f7e2d6..fbf3b07c856740805f0dbe64a85fa0d2131df37e 100644 (file)
@@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs)
 
        signr = get_signal_to_deliver(&info, &ka, regs, NULL);
        if (signr > 0) {
-               /*
-                * Re-enable any watchpoints before delivering the
-                * signal to user space. The processor register will
-                * have been cleared if the watchpoint triggered
-                * inside the kernel.
-                */
-               if (current->thread.debugreg7)
-                       set_debugreg(current->thread.debugreg7, 7);
-
                /* Whee! Actually deliver the signal.  */
                if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
                        /*
index 7e37dcee0cc352df1104e211d77cce2f99a9e4e8..33399176512a8a2c4c718d53ad76bdea631bd46e 100644 (file)
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
        struct task_struct *tsk = current;
-       unsigned long condition;
+       unsigned long dr6;
        int si_code;
 
-       get_debugreg(condition, 6);
+       get_debugreg(dr6, 6);
 
        /* Catch kmemcheck conditions first of all! */
-       if (condition & DR_STEP && kmemcheck_trap(regs))
+       if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
                return;
 
+       /* DR6 may or may not be cleared by the CPU */
+       set_debugreg(0, 6);
        /*
         * The processor cleared BTF, so don't mark that we need it set.
         */
        clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
        tsk->thread.debugctlmsr = 0;
 
-       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
-                                               SIGTRAP) == NOTIFY_STOP)
+       /* Store the virtualized DR6 value */
+       tsk->thread.debugreg6 = dr6;
+
+       if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
+                                                       SIGTRAP) == NOTIFY_STOP)
                return;
 
        /* It's safe to allow irq's after DR6 has been saved */
        preempt_conditional_sti(regs);
 
-       /* Mask out spurious debug traps due to lazy DR7 setting */
-       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-               if (!tsk->thread.debugreg7)
-                       goto clear_dr7;
+       if (regs->flags & X86_VM_MASK) {
+               handle_vm86_trap((struct kernel_vm86_regs *) regs,
+                               error_code, 1);
+               return;
        }
 
-#ifdef CONFIG_X86_32
-       if (regs->flags & X86_VM_MASK)
-               goto debug_vm86;
-#endif
-
-       /* Save debug status register where ptrace can see it */
-       tsk->thread.debugreg6 = condition;
-
        /*
-        * Single-stepping through TF: make sure we ignore any events in
-        * kernel space (but re-enable TF when returning to user mode).
+        * Single-stepping through system calls: ignore any exceptions in
+        * kernel space, but re-enable TF when returning to user mode.
+        *
+        * We already checked v86 mode above, so we can check for kernel mode
+        * by just checking the CPL of CS.
         */
-       if (condition & DR_STEP) {
-               if (!user_mode(regs))
-                       goto clear_TF_reenable;
+       if ((dr6 & DR_STEP) && !user_mode(regs)) {
+               tsk->thread.debugreg6 &= ~DR_STEP;
+               set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+               regs->flags &= ~X86_EFLAGS_TF;
        }
-
-       si_code = get_si_code(condition);
-       /* Ok, finally something we can handle */
-       send_sigtrap(tsk, regs, error_code, si_code);
-
-       /*
-        * Disable additional traps. They'll be re-enabled when
-        * the signal is delivered.
-        */
-clear_dr7:
-       set_debugreg(0, 7);
+       si_code = get_si_code(tsk->thread.debugreg6);
+       if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
+               send_sigtrap(tsk, regs, error_code, si_code);
        preempt_conditional_cli(regs);
-       return;
 
-#ifdef CONFIG_X86_32
-debug_vm86:
-       /* reenable preemption: handle_vm86_trap() might sleep */
-       dec_preempt_count();
-       handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-       conditional_cli(regs);
-       return;
-#endif
-
-clear_TF_reenable:
-       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-       regs->flags &= ~X86_EFLAGS_TF;
-       preempt_conditional_cli(regs);
        return;
 }
 
index ae07d261527cba458ed1682118b19295bc997847..4fc80174191ce4b17549b643fe11dee645286c3f 100644 (file)
@@ -42,6 +42,7 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
+#include <asm/debugreg.h>
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
@@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu, kvm_run);
 
-       if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
-               set_debugreg(current->thread.debugreg0, 0);
-               set_debugreg(current->thread.debugreg1, 1);
-               set_debugreg(current->thread.debugreg2, 2);
-               set_debugreg(current->thread.debugreg3, 3);
-               set_debugreg(current->thread.debugreg6, 6);
-               set_debugreg(current->thread.debugreg7, 7);
-       }
+       /*
+        * If the guest has used debug registers, at least dr7
+        * will be disabled while returning to the host.
+        * If we don't have active breakpoints in the host, we don't
+        * care about the messed up debug address registers. But if
+        * we have some of them active, restore the old state.
+        */
+       if (hw_breakpoint_active())
+               hw_breakpoint_restore();
 
        set_bit(KVM_REQ_KICK, &vcpu->requests);
        local_irq_enable();
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644 (file)
index 0000000..8df89f0
--- /dev/null
@@ -0,0 +1 @@
+inat-tables.c
index 85f5db95c60f03718f292080587f7faa337ef570..a2d6472895fb309884d442b100502447fcb4c5e9 100644 (file)
@@ -2,12 +2,25 @@
 # Makefile for x86 specific library files.
 #
 
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN     $@
+      cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@
+
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+       $(call cmd,inat_tables)
+
+$(obj)/inat.o: $(obj)/inat-tables.c
+
+clean-files := inat-tables.c
+
 obj-$(CONFIG_SMP) := msr.o
 
 lib-y := delay.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
+lib-y += insn.o inat.o
 
 obj-y += msr-reg.o msr-reg-export.o
 
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644 (file)
index 0000000..46fc4ee
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * x86 instruction attribute tables
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/insn.h>
+
+/* Attribute tables are generated from opcode map */
+#include "inat-tables.c"
+
+/* Attribute search APIs */
+insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
+{
+       return inat_primary_table[opcode];
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
+                                     insn_attr_t esc_attr)
+{
+       const insn_attr_t *table;
+       insn_attr_t lpfx_attr;
+       int n, m = 0;
+
+       n = inat_escape_id(esc_attr);
+       if (last_pfx) {
+               lpfx_attr = inat_get_opcode_attribute(last_pfx);
+               m = inat_last_prefix_id(lpfx_attr);
+       }
+       table = inat_escape_tables[n][0];
+       if (!table)
+               return 0;
+       if (inat_has_variant(table[opcode]) && m) {
+               table = inat_escape_tables[n][m];
+               if (!table)
+                       return 0;
+       }
+       return table[opcode];
+}
+
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
+                                    insn_attr_t grp_attr)
+{
+       const insn_attr_t *table;
+       insn_attr_t lpfx_attr;
+       int n, m = 0;
+
+       n = inat_group_id(grp_attr);
+       if (last_pfx) {
+               lpfx_attr = inat_get_opcode_attribute(last_pfx);
+               m = inat_last_prefix_id(lpfx_attr);
+       }
+       table = inat_group_tables[n][0];
+       if (!table)
+               return inat_group_common_attribute(grp_attr);
+       if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
+               table = inat_group_tables[n][m];
+               if (!table)
+                       return inat_group_common_attribute(grp_attr);
+       }
+       return table[X86_MODRM_REG(modrm)] |
+              inat_group_common_attribute(grp_attr);
+}
+
+insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
+                                  insn_byte_t vex_p)
+{
+       const insn_attr_t *table;
+       if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
+               return 0;
+       table = inat_avx_tables[vex_m][vex_p];
+       if (!table)
+               return 0;
+       return table[opcode];
+}
+
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644 (file)
index 0000000..9f33b98
--- /dev/null
@@ -0,0 +1,516 @@
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004, 2009
+ */
+
+#include <linux/string.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+
+#define get_next(t, insn)      \
+       ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+
+#define peek_next(t, insn)     \
+       ({t r; r = *(t*)insn->next_byte; r; })
+
+#define peek_nbyte_next(t, insn, n)    \
+       ({t r; r = *(t*)((insn)->next_byte + n); r; })
+
+/**
+ * insn_init() - initialize struct insn
+ * @insn:      &struct insn to be initialized
+ * @kaddr:     address (in kernel memory) of instruction (or copy thereof)
+ * @x86_64:    !0 for 64-bit kernel or 64-bit app
+ */
+void insn_init(struct insn *insn, const void *kaddr, int x86_64)
+{
+       memset(insn, 0, sizeof(*insn));
+       insn->kaddr = kaddr;
+       insn->next_byte = kaddr;
+       insn->x86_64 = x86_64 ? 1 : 0;
+       insn->opnd_bytes = 4;
+       if (x86_64)
+               insn->addr_bytes = 8;
+       else
+               insn->addr_bytes = 4;
+}
+
+/**
+ * insn_get_prefixes - scan x86 instruction prefix bytes
+ * @insn:      &struct insn containing instruction
+ *
+ * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
+ * to point to the (first) opcode.  No effect if @insn->prefixes.got
+ * is already set.
+ */
+void insn_get_prefixes(struct insn *insn)
+{
+       struct insn_field *prefixes = &insn->prefixes;
+       insn_attr_t attr;
+       insn_byte_t b, lb;
+       int i, nb;
+
+       if (prefixes->got)
+               return;
+
+       nb = 0;
+       lb = 0;
+       b = peek_next(insn_byte_t, insn);
+       attr = inat_get_opcode_attribute(b);
+       while (inat_is_legacy_prefix(attr)) {
+               /* Skip if same prefix */
+               for (i = 0; i < nb; i++)
+                       if (prefixes->bytes[i] == b)
+                               goto found;
+               if (nb == 4)
+                       /* Invalid instruction */
+                       break;
+               prefixes->bytes[nb++] = b;
+               if (inat_is_address_size_prefix(attr)) {
+                       /* address size switches 2/4 or 4/8 */
+                       if (insn->x86_64)
+                               insn->addr_bytes ^= 12;
+                       else
+                               insn->addr_bytes ^= 6;
+               } else if (inat_is_operand_size_prefix(attr)) {
+                       /* operand size switches 2/4 */
+                       insn->opnd_bytes ^= 6;
+               }
+found:
+               prefixes->nbytes++;
+               insn->next_byte++;
+               lb = b;
+               b = peek_next(insn_byte_t, insn);
+               attr = inat_get_opcode_attribute(b);
+       }
+       /* Set the last prefix */
+       if (lb && lb != insn->prefixes.bytes[3]) {
+               if (unlikely(insn->prefixes.bytes[3])) {
+                       /* Swap the last prefix */
+                       b = insn->prefixes.bytes[3];
+                       for (i = 0; i < nb; i++)
+                               if (prefixes->bytes[i] == lb)
+                                       prefixes->bytes[i] = b;
+               }
+               insn->prefixes.bytes[3] = lb;
+       }
+
+       /* Decode REX prefix */
+       if (insn->x86_64) {
+               b = peek_next(insn_byte_t, insn);
+               attr = inat_get_opcode_attribute(b);
+               if (inat_is_rex_prefix(attr)) {
+                       insn->rex_prefix.value = b;
+                       insn->rex_prefix.nbytes = 1;
+                       insn->next_byte++;
+                       if (X86_REX_W(b))
+                               /* REX.W overrides opnd_size */
+                               insn->opnd_bytes = 8;
+               }
+       }
+       insn->rex_prefix.got = 1;
+
+       /* Decode VEX prefix */
+       b = peek_next(insn_byte_t, insn);
+       attr = inat_get_opcode_attribute(b);
+       if (inat_is_vex_prefix(attr)) {
+               insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
+               if (!insn->x86_64) {
+                       /*
+                        * In 32-bit mode, if the [7:6] bits (the mod bits of
+                        * ModRM) of the second byte are not 11b, the
+                        * instruction is LDS or LES.
+                        */
+                       if (X86_MODRM_MOD(b2) != 3)
+                               goto vex_end;
+               }
+               insn->vex_prefix.bytes[0] = b;
+               insn->vex_prefix.bytes[1] = b2;
+               if (inat_is_vex3_prefix(attr)) {
+                       b2 = peek_nbyte_next(insn_byte_t, insn, 2);
+                       insn->vex_prefix.bytes[2] = b2;
+                       insn->vex_prefix.nbytes = 3;
+                       insn->next_byte += 3;
+                       if (insn->x86_64 && X86_VEX_W(b2))
+                               /* VEX.W overrides opnd_size */
+                               insn->opnd_bytes = 8;
+               } else {
+                       insn->vex_prefix.nbytes = 2;
+                       insn->next_byte += 2;
+               }
+       }
+vex_end:
+       insn->vex_prefix.got = 1;
+
+       prefixes->got = 1;
+       return;
+}
+
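The two XOR toggles above are just a compact way to flip between the only two legal widths; a standalone sketch (host-side toy, not kernel code) of the arithmetic:

#include <assert.h>

int main(void)
{
	int addr_bytes = 8;	/* 64-bit default address size */
	int opnd_bytes = 4;	/* default operand size */

	addr_bytes ^= 12;	/* 0x67 prefix: 8 -> 4 (and 4 -> 8) */
	assert(addr_bytes == 4);

	opnd_bytes ^= 6;	/* 0x66 prefix: 4 -> 2 (and 2 -> 4) */
	assert(opnd_bytes == 2);

	/* 32-bit mode starts from addr_bytes == 4 and toggles with ^= 6 */
	return 0;
}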
+/**
+ * insn_get_opcode - collect opcode(s)
+ * @insn:      &struct insn containing instruction
+ *
+ * Populates @insn->opcode, updates @insn->next_byte to point past the
+ * opcode byte(s), and sets @insn->attr (except for groups).
+ * If necessary, first collects any preceding (prefix) bytes.
+ * Sets @insn->opcode.value = opcode1.  No effect if @insn->opcode.got
+ * is already 1.
+ */
+void insn_get_opcode(struct insn *insn)
+{
+       struct insn_field *opcode = &insn->opcode;
+       insn_byte_t op, pfx;
+       if (opcode->got)
+               return;
+       if (!insn->prefixes.got)
+               insn_get_prefixes(insn);
+
+       /* Get first opcode */
+       op = get_next(insn_byte_t, insn);
+       opcode->bytes[0] = op;
+       opcode->nbytes = 1;
+
+       /* Check if there is VEX prefix or not */
+       if (insn_is_avx(insn)) {
+               insn_byte_t m, p;
+               m = insn_vex_m_bits(insn);
+               p = insn_vex_p_bits(insn);
+               insn->attr = inat_get_avx_attribute(op, m, p);
+               if (!inat_accept_vex(insn->attr))
+                       insn->attr = 0; /* This instruction is bad */
+               goto end;       /* VEX has only 1 byte for opcode */
+       }
+
+       insn->attr = inat_get_opcode_attribute(op);
+       while (inat_is_escape(insn->attr)) {
+               /* Get escaped opcode */
+               op = get_next(insn_byte_t, insn);
+               opcode->bytes[opcode->nbytes++] = op;
+               pfx = insn_last_prefix(insn);
+               insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
+       }
+       if (inat_must_vex(insn->attr))
+               insn->attr = 0; /* This instruction is bad */
+end:
+       opcode->got = 1;
+}
+
+/**
+ * insn_get_modrm - collect ModRM byte, if any
+ * @insn:      &struct insn containing instruction
+ *
+ * Populates @insn->modrm and updates @insn->next_byte to point past the
+ * ModRM byte, if any.  If necessary, first collects the preceding bytes
+ * (prefixes and opcode(s)).  No effect if @insn->modrm.got is already 1.
+ */
+void insn_get_modrm(struct insn *insn)
+{
+       struct insn_field *modrm = &insn->modrm;
+       insn_byte_t pfx, mod;
+       if (modrm->got)
+               return;
+       if (!insn->opcode.got)
+               insn_get_opcode(insn);
+
+       if (inat_has_modrm(insn->attr)) {
+               mod = get_next(insn_byte_t, insn);
+               modrm->value = mod;
+               modrm->nbytes = 1;
+               if (inat_is_group(insn->attr)) {
+                       pfx = insn_last_prefix(insn);
+                       insn->attr = inat_get_group_attribute(mod, pfx,
+                                                             insn->attr);
+               }
+       }
+
+       if (insn->x86_64 && inat_is_force64(insn->attr))
+               insn->opnd_bytes = 8;
+       modrm->got = 1;
+}
+
+
+/**
+ * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.  No effect if @insn->x86_64 is 0.
+ */
+int insn_rip_relative(struct insn *insn)
+{
+       struct insn_field *modrm = &insn->modrm;
+
+       if (!insn->x86_64)
+               return 0;
+       if (!modrm->got)
+               insn_get_modrm(insn);
+       /*
+        * For rip-relative instructions, the mod field (top 2 bits)
+        * is zero and the r/m field (bottom 3 bits) is 0x5.
+        */
+       return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+}
+
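Once insn_rip_relative() reports true, the effective address is the address of the next instruction plus the sign-extended 32-bit displacement; a sketch of a helper that is not part of this patch (it assumes @ip is the runtime address the decoded bytes were copied from):

#include <asm/insn.h>

static unsigned long insn_rip_target(struct insn *insn, unsigned long ip)
{
	insn_get_length(insn);	/* also collects the displacement bytes */
	return ip + insn->length + (long)insn->displacement.value;
}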
+/**
+ * insn_get_sib() - Get the SIB byte of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.
+ */
+void insn_get_sib(struct insn *insn)
+{
+       insn_byte_t modrm;
+
+       if (insn->sib.got)
+               return;
+       if (!insn->modrm.got)
+               insn_get_modrm(insn);
+       if (insn->modrm.nbytes) {
+               modrm = (insn_byte_t)insn->modrm.value;
+               if (insn->addr_bytes != 2 &&
+                   X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
+                       insn->sib.value = get_next(insn_byte_t, insn);
+                       insn->sib.nbytes = 1;
+               }
+       }
+       insn->sib.got = 1;
+}
+
+
+/**
+ * insn_get_displacement() - Get the displacement of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * SIB byte.
+ * The displacement value is sign-extended.
+ */
+void insn_get_displacement(struct insn *insn)
+{
+       insn_byte_t mod, rm, base;
+
+       if (insn->displacement.got)
+               return;
+       if (!insn->sib.got)
+               insn_get_sib(insn);
+       if (insn->modrm.nbytes) {
+               /*
+                * Interpreting the modrm byte:
+                * mod = 00 - no displacement fields (exceptions below)
+                * mod = 01 - 1-byte displacement field
+                * mod = 10 - displacement field is 4 bytes, or 2 bytes if
+                *      address size = 2 (0x67 prefix in 32-bit mode)
+                * mod = 11 - no memory operand
+                *
+                * If address size = 2...
+                * mod = 00, r/m = 110 - displacement field is 2 bytes
+                *
+                * If address size != 2...
+                * mod != 11, r/m = 100 - SIB byte exists
+                * mod = 00, SIB base = 101 - displacement field is 4 bytes
+                * mod = 00, r/m = 101 - rip-relative addressing, displacement
+                *      field is 4 bytes
+                */
+               mod = X86_MODRM_MOD(insn->modrm.value);
+               rm = X86_MODRM_RM(insn->modrm.value);
+               base = X86_SIB_BASE(insn->sib.value);
+               if (mod == 3)
+                       goto out;
+               if (mod == 1) {
+                       insn->displacement.value = get_next(char, insn);
+                       insn->displacement.nbytes = 1;
+               } else if (insn->addr_bytes == 2) {
+                       if ((mod == 0 && rm == 6) || mod == 2) {
+                               insn->displacement.value =
+                                        get_next(short, insn);
+                               insn->displacement.nbytes = 2;
+                       }
+               } else {
+                       if ((mod == 0 && rm == 5) || mod == 2 ||
+                           (mod == 0 && base == 5)) {
+                               insn->displacement.value = get_next(int, insn);
+                               insn->displacement.nbytes = 4;
+                       }
+               }
+       }
+out:
+       insn->displacement.got = 1;
+}
+
+/* Decode moffset16/32/64 */
+static void __get_moffset(struct insn *insn)
+{
+       switch (insn->addr_bytes) {
+       case 2:
+               insn->moffset1.value = get_next(short, insn);
+               insn->moffset1.nbytes = 2;
+               break;
+       case 4:
+               insn->moffset1.value = get_next(int, insn);
+               insn->moffset1.nbytes = 4;
+               break;
+       case 8:
+               insn->moffset1.value = get_next(int, insn);
+               insn->moffset1.nbytes = 4;
+               insn->moffset2.value = get_next(int, insn);
+               insn->moffset2.nbytes = 4;
+               break;
+       }
+       insn->moffset1.got = insn->moffset2.got = 1;
+}
+
+/* Decode imm v32(Iz) */
+static void __get_immv32(struct insn *insn)
+{
+       switch (insn->opnd_bytes) {
+       case 2:
+               insn->immediate.value = get_next(short, insn);
+               insn->immediate.nbytes = 2;
+               break;
+       case 4:
+       case 8:
+               insn->immediate.value = get_next(int, insn);
+               insn->immediate.nbytes = 4;
+               break;
+       }
+}
+
+/* Decode imm v64(Iv/Ov) */
+static void __get_immv(struct insn *insn)
+{
+       switch (insn->opnd_bytes) {
+       case 2:
+               insn->immediate1.value = get_next(short, insn);
+               insn->immediate1.nbytes = 2;
+               break;
+       case 4:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               break;
+       case 8:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               insn->immediate2.value = get_next(int, insn);
+               insn->immediate2.nbytes = 4;
+               break;
+       }
+       insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/* Decode ptr16:16/32(Ap) */
+static void __get_immptr(struct insn *insn)
+{
+       switch (insn->opnd_bytes) {
+       case 2:
+               insn->immediate1.value = get_next(short, insn);
+               insn->immediate1.nbytes = 2;
+               break;
+       case 4:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               break;
+       case 8:
+               /* ptr16:64 does not exist (no segment) */
+               return;
+       }
+       insn->immediate2.value = get_next(unsigned short, insn);
+       insn->immediate2.nbytes = 2;
+       insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/**
+ * insn_get_immediate() - Get the immediates of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * displacement bytes.
+ * Most immediates are sign-extended.  The unsigned value can be recovered
+ * by masking with ((1 << (nbytes * 8)) - 1); see the sketch after this function.
+ */
+void insn_get_immediate(struct insn *insn)
+{
+       if (insn->immediate.got)
+               return;
+       if (!insn->displacement.got)
+               insn_get_displacement(insn);
+
+       if (inat_has_moffset(insn->attr)) {
+               __get_moffset(insn);
+               goto done;
+       }
+
+       if (!inat_has_immediate(insn->attr))
+               /* no immediates */
+               goto done;
+
+       switch (inat_immediate_size(insn->attr)) {
+       case INAT_IMM_BYTE:
+               insn->immediate.value = get_next(char, insn);
+               insn->immediate.nbytes = 1;
+               break;
+       case INAT_IMM_WORD:
+               insn->immediate.value = get_next(short, insn);
+               insn->immediate.nbytes = 2;
+               break;
+       case INAT_IMM_DWORD:
+               insn->immediate.value = get_next(int, insn);
+               insn->immediate.nbytes = 4;
+               break;
+       case INAT_IMM_QWORD:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               insn->immediate2.value = get_next(int, insn);
+               insn->immediate2.nbytes = 4;
+               break;
+       case INAT_IMM_PTR:
+               __get_immptr(insn);
+               break;
+       case INAT_IMM_VWORD32:
+               __get_immv32(insn);
+               break;
+       case INAT_IMM_VWORD:
+               __get_immv(insn);
+               break;
+       default:
+               break;
+       }
+       if (inat_has_second_immediate(insn->attr)) {
+               insn->immediate2.value = get_next(char, insn);
+               insn->immediate2.nbytes = 1;
+       }
+done:
+       insn->immediate.got = 1;
+}
+
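A sketch of the masking mentioned in the comment above (not part of the patch; assumes a 64-bit build so the shift does not overflow when nbytes == 4):

#include <asm/insn.h>

static unsigned long insn_immediate_unsigned(const struct insn *insn)
{
	/* e.g. "83 c0 f0" (add $-0x10,%eax): value == -16, masked == 0xf0 */
	return (unsigned long)insn->immediate.value &
	       ((1UL << (insn->immediate.nbytes * 8)) - 1);
}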
+/**
+ * insn_get_length() - Get the length of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * immediate bytes.
+ */
+void insn_get_length(struct insn *insn)
+{
+       if (insn->length)
+               return;
+       if (!insn->immediate.got)
+               insn_get_immediate(insn);
+       insn->length = (unsigned char)((unsigned long)insn->next_byte
+                                    - (unsigned long)insn->kaddr);
+}
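A usage sketch tying the API above together (not part of the patch; the byte string and the pr_info() output are invented for illustration, and it assumes a 64-bit kernel build):

#include <linux/kernel.h>
#include <asm/insn.h>

static void insn_decode_demo(void)
{
	/* mov 0x12345678(%rip),%eax  =>  8b 05 78 56 34 12 */
	static const unsigned char buf[] = {
		0x8b, 0x05, 0x78, 0x56, 0x34, 0x12
	};
	struct insn insn;

	insn_init(&insn, buf, 1);	/* decode as 64-bit code */
	insn_get_length(&insn);		/* collects prefixes through immediates */

	pr_info("len=%d opcode=%02x modrm=%02x disp=%#x rip-rel=%d\n",
		insn.length, insn.opcode.bytes[0],
		(unsigned char)insn.modrm.value,
		(unsigned int)insn.displacement.value,
		insn_rip_relative(&insn));
}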
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644 (file)
index 0000000..a793da5
--- /dev/null
@@ -0,0 +1,893 @@
+# x86 Opcode Maps
+#
+#<Opcode maps>
+# Table: table-name
+# Referrer: escaped-name
+# AVXcode: avx-code
+# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...]] [| 2nd-mnemonic ...]
+# (or)
+# opcode: escape # escaped-name
+# EndTable
+#
+#<group maps>
+# GrpTable: GrpXXX
+# reg:  mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...]] [| 2nd-mnemonic ...]
+# EndTable
+#
+# AVX Superscripts
+#  (VEX): this opcode can accept VEX prefix.
+#  (oVEX): this opcode requires VEX prefix.
+#  (o128): this opcode only supports 128bit VEX.
+#  (o256): this opcode only supports 256bit VEX.
+#
+
+Table: one byte opcode
+Referrer:
+AVXcode:
+# 0x00 - 0x0f
+00: ADD Eb,Gb
+01: ADD Ev,Gv
+02: ADD Gb,Eb
+03: ADD Gv,Ev
+04: ADD AL,Ib
+05: ADD rAX,Iz
+06: PUSH ES (i64)
+07: POP ES (i64)
+08: OR Eb,Gb
+09: OR Ev,Gv
+0a: OR Gb,Eb
+0b: OR Gv,Ev
+0c: OR AL,Ib
+0d: OR rAX,Iz
+0e: PUSH CS (i64)
+0f: escape # 2-byte escape
+# 0x10 - 0x1f
+10: ADC Eb,Gb
+11: ADC Ev,Gv
+12: ADC Gb,Eb
+13: ADC Gv,Ev
+14: ADC AL,Ib
+15: ADC rAX,Iz
+16: PUSH SS (i64)
+17: POP SS (i64)
+18: SBB Eb,Gb
+19: SBB Ev,Gv
+1a: SBB Gb,Eb
+1b: SBB Gv,Ev
+1c: SBB AL,Ib
+1d: SBB rAX,Iz
+1e: PUSH DS (i64)
+1f: POP DS (i64)
+# 0x20 - 0x2f
+20: AND Eb,Gb
+21: AND Ev,Gv
+22: AND Gb,Eb
+23: AND Gv,Ev
+24: AND AL,Ib
+25: AND rAX,Iz
+26: SEG=ES (Prefix)
+27: DAA (i64)
+28: SUB Eb,Gb
+29: SUB Ev,Gv
+2a: SUB Gb,Eb
+2b: SUB Gv,Ev
+2c: SUB AL,Ib
+2d: SUB rAX,Iz
+2e: SEG=CS (Prefix)
+2f: DAS (i64)
+# 0x30 - 0x3f
+30: XOR Eb,Gb
+31: XOR Ev,Gv
+32: XOR Gb,Eb
+33: XOR Gv,Ev
+34: XOR AL,Ib
+35: XOR rAX,Iz
+36: SEG=SS (Prefix)
+37: AAA (i64)
+38: CMP Eb,Gb
+39: CMP Ev,Gv
+3a: CMP Gb,Eb
+3b: CMP Gv,Ev
+3c: CMP AL,Ib
+3d: CMP rAX,Iz
+3e: SEG=DS (Prefix)
+3f: AAS (i64)
+# 0x40 - 0x4f
+40: INC eAX (i64) | REX (o64)
+41: INC eCX (i64) | REX.B (o64)
+42: INC eDX (i64) | REX.X (o64)
+43: INC eBX (i64) | REX.XB (o64)
+44: INC eSP (i64) | REX.R (o64)
+45: INC eBP (i64) | REX.RB (o64)
+46: INC eSI (i64) | REX.RX (o64)
+47: INC eDI (i64) | REX.RXB (o64)
+48: DEC eAX (i64) | REX.W (o64)
+49: DEC eCX (i64) | REX.WB (o64)
+4a: DEC eDX (i64) | REX.WX (o64)
+4b: DEC eBX (i64) | REX.WXB (o64)
+4c: DEC eSP (i64) | REX.WR (o64)
+4d: DEC eBP (i64) | REX.WRB (o64)
+4e: DEC eSI (i64) | REX.WRX (o64)
+4f: DEC eDI (i64) | REX.WRXB (o64)
+# 0x50 - 0x5f
+50: PUSH rAX/r8 (d64)
+51: PUSH rCX/r9 (d64)
+52: PUSH rDX/r10 (d64)
+53: PUSH rBX/r11 (d64)
+54: PUSH rSP/r12 (d64)
+55: PUSH rBP/r13 (d64)
+56: PUSH rSI/r14 (d64)
+57: PUSH rDI/r15 (d64)
+58: POP rAX/r8 (d64)
+59: POP rCX/r9 (d64)
+5a: POP rDX/r10 (d64)
+5b: POP rBX/r11 (d64)
+5c: POP rSP/r12 (d64)
+5d: POP rBP/r13 (d64)
+5e: POP rSI/r14 (d64)
+5f: POP rDI/r15 (d64)
+# 0x60 - 0x6f
+60: PUSHA/PUSHAD (i64)
+61: POPA/POPAD (i64)
+62: BOUND Gv,Ma (i64)
+63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
+64: SEG=FS (Prefix)
+65: SEG=GS (Prefix)
+66: Operand-Size (Prefix)
+67: Address-Size (Prefix)
+68: PUSH Iz (d64)
+69: IMUL Gv,Ev,Iz
+6a: PUSH Ib (d64)
+6b: IMUL Gv,Ev,Ib
+6c: INS/INSB Yb,DX
+6d: INS/INSW/INSD Yz,DX
+6e: OUTS/OUTSB DX,Xb
+6f: OUTS/OUTSW/OUTSD DX,Xz
+# 0x70 - 0x7f
+70: JO Jb
+71: JNO Jb
+72: JB/JNAE/JC Jb
+73: JNB/JAE/JNC Jb
+74: JZ/JE Jb
+75: JNZ/JNE Jb
+76: JBE/JNA Jb
+77: JNBE/JA Jb
+78: JS Jb
+79: JNS Jb
+7a: JP/JPE Jb
+7b: JNP/JPO Jb
+7c: JL/JNGE Jb
+7d: JNL/JGE Jb
+7e: JLE/JNG Jb
+7f: JNLE/JG Jb
+# 0x80 - 0x8f
+80: Grp1 Eb,Ib (1A)
+81: Grp1 Ev,Iz (1A)
+82: Grp1 Eb,Ib (1A),(i64)
+83: Grp1 Ev,Ib (1A)
+84: TEST Eb,Gb
+85: TEST Ev,Gv
+86: XCHG Eb,Gb
+87: XCHG Ev,Gv
+88: MOV Eb,Gb
+89: MOV Ev,Gv
+8a: MOV Gb,Eb
+8b: MOV Gv,Ev
+8c: MOV Ev,Sw
+8d: LEA Gv,M
+8e: MOV Sw,Ew
+8f: Grp1A (1A) | POP Ev (d64)
+# 0x90 - 0x9f
+90: NOP | PAUSE (F3) | XCHG r8,rAX
+91: XCHG rCX/r9,rAX
+92: XCHG rDX/r10,rAX
+93: XCHG rBX/r11,rAX
+94: XCHG rSP/r12,rAX
+95: XCHG rBP/r13,rAX
+96: XCHG rSI/r14,rAX
+97: XCHG rDI/r15,rAX
+98: CBW/CWDE/CDQE
+99: CWD/CDQ/CQO
+9a: CALLF Ap (i64)
+9b: FWAIT/WAIT
+9c: PUSHF/D/Q Fv (d64)
+9d: POPF/D/Q Fv (d64)
+9e: SAHF
+9f: LAHF
+# 0xa0 - 0xaf
+a0: MOV AL,Ob
+a1: MOV rAX,Ov
+a2: MOV Ob,AL
+a3: MOV Ov,rAX
+a4: MOVS/B Xb,Yb
+a5: MOVS/W/D/Q Xv,Yv
+a6: CMPS/B Xb,Yb
+a7: CMPS/W/D Xv,Yv
+a8: TEST AL,Ib
+a9: TEST rAX,Iz
+aa: STOS/B Yb,AL
+ab: STOS/W/D/Q Yv,rAX
+ac: LODS/B AL,Xb
+ad: LODS/W/D/Q rAX,Xv
+ae: SCAS/B AL,Yb
+af: SCAS/W/D/Q rAX,Xv
+# 0xb0 - 0xbf
+b0: MOV AL/R8L,Ib
+b1: MOV CL/R9L,Ib
+b2: MOV DL/R10L,Ib
+b3: MOV BL/R11L,Ib
+b4: MOV AH/R12L,Ib
+b5: MOV CH/R13L,Ib
+b6: MOV DH/R14L,Ib
+b7: MOV BH/R15L,Ib
+b8: MOV rAX/r8,Iv
+b9: MOV rCX/r9,Iv
+ba: MOV rDX/r10,Iv
+bb: MOV rBX/r11,Iv
+bc: MOV rSP/r12,Iv
+bd: MOV rBP/r13,Iv
+be: MOV rSI/r14,Iv
+bf: MOV rDI/r15,Iv
+# 0xc0 - 0xcf
+c0: Grp2 Eb,Ib (1A)
+c1: Grp2 Ev,Ib (1A)
+c2: RETN Iw (f64)
+c3: RETN
+c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
+c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
+c6: Grp11 Eb,Ib (1A)
+c7: Grp11 Ev,Iz (1A)
+c8: ENTER Iw,Ib
+c9: LEAVE (d64)
+ca: RETF Iw
+cb: RETF
+cc: INT3
+cd: INT Ib
+ce: INTO (i64)
+cf: IRET/D/Q
+# 0xd0 - 0xdf
+d0: Grp2 Eb,1 (1A)
+d1: Grp2 Ev,1 (1A)
+d2: Grp2 Eb,CL (1A)
+d3: Grp2 Ev,CL (1A)
+d4: AAM Ib (i64)
+d5: AAD Ib (i64)
+d6:
+d7: XLAT/XLATB
+d8: ESC
+d9: ESC
+da: ESC
+db: ESC
+dc: ESC
+dd: ESC
+de: ESC
+df: ESC
+# 0xe0 - 0xef
+e0: LOOPNE/LOOPNZ Jb (f64)
+e1: LOOPE/LOOPZ Jb (f64)
+e2: LOOP Jb (f64)
+e3: JrCXZ Jb (f64)
+e4: IN AL,Ib
+e5: IN eAX,Ib
+e6: OUT Ib,AL
+e7: OUT Ib,eAX
+e8: CALL Jz (f64)
+e9: JMP-near Jz (f64)
+ea: JMP-far Ap (i64)
+eb: JMP-short Jb (f64)
+ec: IN AL,DX
+ed: IN eAX,DX
+ee: OUT DX,AL
+ef: OUT DX,eAX
+# 0xf0 - 0xff
+f0: LOCK (Prefix)
+f1:
+f2: REPNE (Prefix)
+f3: REP/REPE (Prefix)
+f4: HLT
+f5: CMC
+f6: Grp3_1 Eb (1A)
+f7: Grp3_2 Ev (1A)
+f8: CLC
+f9: STC
+fa: CLI
+fb: STI
+fc: CLD
+fd: STD
+fe: Grp4 (1A)
+ff: Grp5 (1A)
+EndTable
+
+Table: 2-byte opcode (0x0f)
+Referrer: 2-byte escape
+AVXcode: 1
+# 0x0f 0x00-0x0f
+00: Grp6 (1A)
+01: Grp7 (1A)
+02: LAR Gv,Ew
+03: LSL Gv,Ew
+04:
+05: SYSCALL (o64)
+06: CLTS
+07: SYSRET (o64)
+08: INVD
+09: WBINVD
+0a:
+0b: UD2 (1B)
+0c:
+0d: NOP Ev | GrpP
+0e: FEMMS
+# 3DNow! uses the last imm byte as opcode extension.
+0f: 3DNow! Pq,Qq,Ib
+# 0x0f 0x10-0x1f
+10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
+11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
+12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
+13: movlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
+14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
+15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
+16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlhps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
+17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
+18: Grp16 (1A)
+19:
+1a:
+1b:
+1c:
+1d:
+1e:
+1f: NOP Ev
+# 0x0f 0x20-0x2f
+20: MOV Rd,Cd
+21: MOV Rd,Dd
+22: MOV Cd,Rd
+23: MOV Dd,Rd
+24:
+25:
+26:
+27:
+28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
+29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
+2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
+2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
+2c: cvttps2pi Ppi,Wps | cvttss2si  Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd  Vsd,Wsd (66),(VEX),(o128)
+2f: comiss Vss,Wss (VEX),(o128) | comisd  Vsd,Wsd (66),(VEX),(o128)
+# 0x0f 0x30-0x3f
+30: WRMSR
+31: RDTSC
+32: RDMSR
+33: RDPMC
+34: SYSENTER
+35: SYSEXIT
+36:
+37: GETSEC
+38: escape # 3-byte escape 1
+39:
+3a: escape # 3-byte escape 2
+3b:
+3c:
+3d:
+3e:
+3f:
+# 0x0f 0x40-0x4f
+40: CMOVO Gv,Ev
+41: CMOVNO Gv,Ev
+42: CMOVB/C/NAE Gv,Ev
+43: CMOVAE/NB/NC Gv,Ev
+44: CMOVE/Z Gv,Ev
+45: CMOVNE/NZ Gv,Ev
+46: CMOVBE/NA Gv,Ev
+47: CMOVA/NBE Gv,Ev
+48: CMOVS Gv,Ev
+49: CMOVNS Gv,Ev
+4a: CMOVP/PE Gv,Ev
+4b: CMOVNP/PO Gv,Ev
+4c: CMOVL/NGE Gv,Ev
+4d: CMOVNL/GE Gv,Ev
+4e: CMOVLE/NG Gv,Ev
+4f: CMOVNLE/G Gv,Ev
+# 0x0f 0x50-0x5f
+50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
+51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
+52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
+53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
+54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
+55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
+56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
+57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
+58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
+59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
+5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
+5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
+5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
+5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
+5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
+5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
+# 0x0f 0x60-0x6f
+60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
+61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
+62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
+63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
+64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
+65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
+66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
+67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
+68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
+69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
+6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
+6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
+6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
+6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
+6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
+6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
+# 0x0f 0x70-0x7f
+70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128)
+71: Grp12 (1A)
+72: Grp13 (1A)
+73: Grp14 (1A)
+74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
+75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
+76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
+77: emms/vzeroupper/vzeroall (VEX)
+78: VMREAD Ed/q,Gd/q
+79: VMWRITE Gd/q,Ed/q
+7a:
+7b:
+7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
+7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
+7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
+7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
+# 0x0f 0x80-0x8f
+80: JO Jz (f64)
+81: JNO Jz (f64)
+82: JB/JNAE/JC Jz (f64)
+83: JNB/JAE/JNC Jz (f64)
+84: JZ/JE Jz (f64)
+85: JNZ/JNE Jz (f64)
+86: JBE/JNA Jz (f64)
+87: JNBE/JA Jz (f64)
+88: JS Jz (f64)
+89: JNS Jz (f64)
+8a: JP/JPE Jz (f64)
+8b: JNP/JPO Jz (f64)
+8c: JL/JNGE Jz (f64)
+8d: JNL/JGE Jz (f64)
+8e: JLE/JNG Jz (f64)
+8f: JNLE/JG Jz (f64)
+# 0x0f 0x90-0x9f
+90: SETO Eb
+91: SETNO Eb
+92: SETB/C/NAE Eb
+93: SETAE/NB/NC Eb
+94: SETE/Z Eb
+95: SETNE/NZ Eb
+96: SETBE/NA Eb
+97: SETA/NBE Eb
+98: SETS Eb
+99: SETNS Eb
+9a: SETP/PE Eb
+9b: SETNP/PO Eb
+9c: SETL/NGE Eb
+9d: SETNL/GE Eb
+9e: SETLE/NG Eb
+9f: SETNLE/G Eb
+# 0x0f 0xa0-0xaf
+a0: PUSH FS (d64)
+a1: POP FS (d64)
+a2: CPUID
+a3: BT Ev,Gv
+a4: SHLD Ev,Gv,Ib
+a5: SHLD Ev,Gv,CL
+a6: GrpPDLK
+a7: GrpRNG
+a8: PUSH GS (d64)
+a9: POP GS (d64)
+aa: RSM
+ab: BTS Ev,Gv
+ac: SHRD Ev,Gv,Ib
+ad: SHRD Ev,Gv,CL
+ae: Grp15 (1A),(1C)
+af: IMUL Gv,Ev
+# 0x0f 0xb0-0xbf
+b0: CMPXCHG Eb,Gb
+b1: CMPXCHG Ev,Gv
+b2: LSS Gv,Mp
+b3: BTR Ev,Gv
+b4: LFS Gv,Mp
+b5: LGS Gv,Mp
+b6: MOVZX Gv,Eb
+b7: MOVZX Gv,Ew
+b8: JMPE | POPCNT Gv,Ev (F3)
+b9: Grp10 (1A)
+ba: Grp8 Ev,Ib (1A)
+bb: BTC Ev,Gv
+bc: BSF Gv,Ev
+bd: BSR Gv,Ev
+be: MOVSX Gv,Eb
+bf: MOVSX Gv,Ew
+# 0x0f 0xc0-0xcf
+c0: XADD Eb,Gb
+c1: XADD Ev,Gv
+c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
+c3: movnti Md/q,Gd/q
+c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
+c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
+c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
+c7: Grp9 (1A)
+c8: BSWAP RAX/EAX/R8/R8D
+c9: BSWAP RCX/ECX/R9/R9D
+ca: BSWAP RDX/EDX/R10/R10D
+cb: BSWAP RBX/EBX/R11/R11D
+cc: BSWAP RSP/ESP/R12/R12D
+cd: BSWAP RBP/EBP/R13/R13D
+ce: BSWAP RSI/ESI/R14/R14D
+cf: BSWAP RDI/EDI/R15/R15D
+# 0x0f 0xd0-0xdf
+d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
+d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
+d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
+d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
+d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
+d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
+d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
+d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
+d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
+da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
+db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
+dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
+dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
+de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
+df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xe0-0xef
+e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
+e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
+e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
+e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
+e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
+e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
+e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
+e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
+e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
+e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
+ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
+eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
+ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
+ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
+ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
+ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xf0-0xff
+f0: lddqu Vdq,Mdq (F2),(VEX)
+f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
+f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
+f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
+f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
+f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
+f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
+f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
+f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
+f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
+fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
+fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
+fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
+fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
+fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
+ff:
+EndTable
+
+Table: 3-byte opcode 1 (0x0f 0x38)
+Referrer: 3-byte escape 1
+AVXcode: 2
+# 0x0f 0x38 0x00-0x0f
+00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
+01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
+02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
+03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
+04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
+05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
+06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
+07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
+08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
+09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
+0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
+0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
+0c: Vpermilps /r (66),(oVEX)
+0d: Vpermilpd /r (66),(oVEX)
+0e: vtestps /r (66),(oVEX)
+0f: vtestpd /r (66),(oVEX)
+# 0x0f 0x38 0x10-0x1f
+10: pblendvb Vdq,Wdq (66)
+11:
+12:
+13:
+14: blendvps Vdq,Wdq (66)
+15: blendvpd Vdq,Wdq (66)
+16:
+17: ptest Vdq,Wdq (66),(VEX)
+18: vbroadcastss /r (66),(oVEX)
+19: vbroadcastsd /r (66),(oVEX),(o256)
+1a: vbroadcastf128 /r (66),(oVEX),(o256)
+1b:
+1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
+1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
+1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
+1f:
+# 0x0f 0x38 0x20-0x2f
+20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
+21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
+22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
+23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
+24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
+25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
+26:
+27:
+28: pmuldq Vdq,Wdq (66),(VEX),(o128)
+29: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
+2a: movntdqa Vdq,Mdq (66),(VEX),(o128)
+2b: packusdw Vdq,Wdq (66),(VEX),(o128)
+2c: vmaskmovps(ld) /r (66),(oVEX)
+2d: vmaskmovpd(ld) /r (66),(oVEX)
+2e: vmaskmovps(st) /r (66),(oVEX)
+2f: vmaskmovpd(st) /r (66),(oVEX)
+# 0x0f 0x38 0x30-0x3f
+30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
+31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
+32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
+33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
+34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
+35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
+36:
+37: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
+38: pminsb Vdq,Wdq (66),(VEX),(o128)
+39: pminsd Vdq,Wdq (66),(VEX),(o128)
+3a: pminuw Vdq,Wdq (66),(VEX),(o128)
+3b: pminud Vdq,Wdq (66),(VEX),(o128)
+3c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
+3d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
+3e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
+3f: pmaxud Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0x38 0x40-0x8f
+40: pmulld Vdq,Wdq (66),(VEX),(o128)
+41: phminposuw Vdq,Wdq (66),(VEX),(o128)
+80: INVEPT Gd/q,Mdq (66)
+81: INVVPID Gd/q,Mdq (66)
+# 0x0f 0x38 0x90-0xbf (FMA)
+96: vfmaddsub132pd/ps /r (66),(VEX)
+97: vfmsubadd132pd/ps /r (66),(VEX)
+98: vfmadd132pd/ps /r (66),(VEX)
+99: vfmadd132sd/ss /r (66),(VEX),(o128)
+9a: vfmsub132pd/ps /r (66),(VEX)
+9b: vfmsub132sd/ss /r (66),(VEX),(o128)
+9c: vfnmadd132pd/ps /r (66),(VEX)
+9d: vfnmadd132sd/ss /r (66),(VEX),(o128)
+9e: vfnmsub132pd/ps /r (66),(VEX)
+9f: vfnmsub132sd/ss /r (66),(VEX),(o128)
+a6: vfmaddsub213pd/ps /r (66),(VEX)
+a7: vfmsubadd213pd/ps /r (66),(VEX)
+a8: vfmadd213pd/ps /r (66),(VEX)
+a9: vfmadd213sd/ss /r (66),(VEX),(o128)
+aa: vfmsub213pd/ps /r (66),(VEX)
+ab: vfmsub213sd/ss /r (66),(VEX),(o128)
+ac: vfnmadd213pd/ps /r (66),(VEX)
+ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
+ae: vfnmsub213pd/ps /r (66),(VEX)
+af: vfnmsub213sd/ss /r (66),(VEX),(o128)
+b6: vfmaddsub231pd/ps /r (66),(VEX)
+b7: vfmsubadd231pd/ps /r (66),(VEX)
+b8: vfmadd231pd/ps /r (66),(VEX)
+b9: vfmadd231sd/ss /r (66),(VEX),(o128)
+ba: vfmsub231pd/ps /r (66),(VEX)
+bb: vfmsub231sd/ss /r (66),(VEX),(o128)
+bc: vfnmadd231pd/ps /r (66),(VEX)
+bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
+be: vfnmsub231pd/ps /r (66),(VEX)
+bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
+# 0x0f 0x38 0xc0-0xff
+db: aesimc Vdq,Wdq (66),(VEX),(o128)
+dc: aesenc Vdq,Wdq (66),(VEX),(o128)
+dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
+de: aesdec Vdq,Wdq (66),(VEX),(o128)
+df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
+f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
+f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x3a)
+Referrer: 3-byte escape 2
+AVXcode: 3
+# 0x0f 0x3a 0x00-0xff
+04: vpermilps /r,Ib (66),(oVEX)
+05: vpermilpd /r,Ib (66),(oVEX)
+06: vperm2f128 /r,Ib (66),(oVEX),(o256)
+08: roundps Vdq,Wdq,Ib (66),(VEX)
+09: roundpd Vdq,Wdq,Ib (66),(VEX)
+0a: roundss Vss,Wss,Ib (66),(VEX),(o128)
+0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
+0c: blendps Vdq,Wdq,Ib (66),(VEX)
+0d: blendpd Vdq,Wdq,Ib (66),(VEX)
+0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
+0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
+14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
+15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
+16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
+17: extractps Ed,Vdq,Ib (66),(VEX),(o128)
+18: vinsertf128 /r,Ib (66),(oVEX),(o256)
+19: vextractf128 /r,Ib (66),(oVEX),(o256)
+20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
+21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
+22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
+40: dpps Vdq,Wdq,Ib (66),(VEX)
+41: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
+42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
+44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
+4a: vblendvps /r,Ib (66),(oVEX)
+4b: vblendvpd /r,Ib (66),(oVEX)
+4c: vpblendvb /r,Ib (66),(oVEX),(o128)
+60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
+61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
+62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
+63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
+df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
+EndTable
+
+GrpTable: Grp1
+0: ADD
+1: OR
+2: ADC
+3: SBB
+4: AND
+5: SUB
+6: XOR
+7: CMP
+EndTable
+
+GrpTable: Grp1A
+0: POP
+EndTable
+
+GrpTable: Grp2
+0: ROL
+1: ROR
+2: RCL
+3: RCR
+4: SHL/SAL
+5: SHR
+6:
+7: SAR
+EndTable
+
+GrpTable: Grp3_1
+0: TEST Eb,Ib
+1:
+2: NOT Eb
+3: NEG Eb
+4: MUL AL,Eb
+5: IMUL AL,Eb
+6: DIV AL,Eb
+7: IDIV AL,Eb
+EndTable
+
+GrpTable: Grp3_2
+0: TEST Ev,Iz
+1:
+2: NOT Ev
+3: NEG Ev
+4: MUL rAX,Ev
+5: IMUL rAX,Ev
+6: DIV rAX,Ev
+7: IDIV rAX,Ev
+EndTable
+
+GrpTable: Grp4
+0: INC Eb
+1: DEC Eb
+EndTable
+
+GrpTable: Grp5
+0: INC Ev
+1: DEC Ev
+2: CALLN Ev (f64)
+3: CALLF Ep
+4: JMPN Ev (f64)
+5: JMPF Ep
+6: PUSH Ev (d64)
+7:
+EndTable
+
+GrpTable: Grp6
+0: SLDT Rv/Mw
+1: STR Rv/Mw
+2: LLDT Ew
+3: LTR Ew
+4: VERR Ew
+5: VERW Ew
+EndTable
+
+GrpTable: Grp7
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
+3: LIDT Ms
+4: SMSW Mw/Rv
+5:
+6: LMSW Ew
+7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
+EndTable
+
+GrpTable: Grp8
+4: BT
+5: BTS
+6: BTR
+7: BTC
+EndTable
+
+GrpTable: Grp9
+1: CMPXCHG8B/16B Mq/Mdq
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
+7: VMPTRST Mq
+EndTable
+
+GrpTable: Grp10
+EndTable
+
+GrpTable: Grp11
+0: MOV
+EndTable
+
+GrpTable: Grp12
+2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
+4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
+6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp13
+2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
+4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
+6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp14
+2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
+3: psrldq Udq,Ib (66),(11B),(VEX),(o128)
+6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
+7: pslldq Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp15
+0: fxsave
+1: fxrstor
+2: ldmxcsr (VEX)
+3: stmxcsr (VEX)
+4: XSAVE
+5: XRSTOR | lfence (11B)
+6: mfence (11B)
+7: clflush | sfence (11B)
+EndTable
+
+GrpTable: Grp16
+0: prefetch NTA
+1: prefetch T0
+2: prefetch T1
+3: prefetch T2
+EndTable
+
+# AMD's Prefetch Group
+GrpTable: GrpP
+0: PREFETCH
+1: PREFETCHW
+EndTable
+
+GrpTable: GrpPDLK
+0: MONTMUL
+1: XSHA1
+2: XSHA2
+EndTable
+
+GrpTable: GrpRNG
+0: xstore-rng
+1: xcrypt-ecb
+2: xcrypt-cbc
+4: xcrypt-cfb
+5: xcrypt-ofb
+EndTable
index f4cee9028cf0b01e11951662b625f63371f627e6..8f4e2ac93928edd82f4b34ac3bdead37eea289d4 100644 (file)
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
  */
-static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+static inline int __kprobes
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
        return 0;
 }
 
-static inline int notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes notify_page_fault(struct pt_regs *regs)
 {
        int ret = 0;
 
@@ -240,7 +241,7 @@ void vmalloc_sync_all(void)
  *
  *   Handle a fault on the vmalloc or module mapping area
  */
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
 {
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
@@ -357,7 +358,7 @@ void vmalloc_sync_all(void)
  *
  * This assumes no large pages in there.
  */
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
 {
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
@@ -860,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
  */
-static noinline int
+static noinline __kprobes int
 spurious_fault(unsigned long error_code, unsigned long address)
 {
        pgd_t *pgd;
index 16ccbd77917f22c1693b9b41fcb8dc7485acee39..11a4ad4d62530ff58b7bfa56d086ad69e3fc0bd1 100644 (file)
@@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
        struct die_args *arg = args;
 
        if (val == DIE_DEBUG && (arg->err & DR_STEP))
-               if (post_kmmio_handler(arg->err, arg->regs) == 1)
+               if (post_kmmio_handler(arg->err, arg->regs) == 1) {
+                       /*
+                        * Reset the BS bit in dr6 (pointed to by arg->err) to
+                        * denote completion of processing
+                        */
+                       (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP;
                        return NOTIFY_STOP;
+               }
 
        return NOTIFY_DONE;
 }
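For context, the odd-looking ERR_PTR() dereference works because the sender encodes a pointer to its local dr6 copy into the die args 'err' field; a sketch of the round trip, assuming the debug exception path in this series passes PTR_ERR(&dr6) as that error value:

#include <linux/err.h>
#include <asm/debugreg.h>

/* Sketch only: encode a dr6 pointer as a long, decode it, and clear
 * DR_STEP through it -- the same operation the hunk above performs
 * on arg->err. */
static void dr6_err_roundtrip(void)
{
	unsigned long dr6 = DR_STEP;
	long err = PTR_ERR(&dr6);				/* encode */
	unsigned long *dr6_p = (unsigned long *)ERR_PTR(err);	/* decode */

	*dr6_p &= ~DR_STEP;	/* the local dr6 is now 0 */
}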
index 8aa85f17667e5034cfd2fae3eecd217c878d0529..0a979f3e5b8a7596aaf7d402cf73b0bb7eace8a9 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/mce.h>
 #include <asm/xcr.h>
 #include <asm/suspend.h>
+#include <asm/debugreg.h>
 
 #ifdef CONFIG_X86_32
 static struct saved_context saved_context;
@@ -142,31 +143,6 @@ static void fix_processor_context(void)
 #endif
        load_TR_desc();                         /* This does ltr */
        load_LDT(&current->active_mm->context); /* This does lldt */
-
-       /*
-        * Now maybe reload the debug registers
-        */
-       if (current->thread.debugreg7) {
-#ifdef CONFIG_X86_32
-               set_debugreg(current->thread.debugreg0, 0);
-               set_debugreg(current->thread.debugreg1, 1);
-               set_debugreg(current->thread.debugreg2, 2);
-               set_debugreg(current->thread.debugreg3, 3);
-               /* no 4 and 5 */
-               set_debugreg(current->thread.debugreg6, 6);
-               set_debugreg(current->thread.debugreg7, 7);
-#else
-               /* CONFIG_X86_64 */
-               loaddebug(&current->thread, 0);
-               loaddebug(&current->thread, 1);
-               loaddebug(&current->thread, 2);
-               loaddebug(&current->thread, 3);
-               /* no 4 and 5 */
-               loaddebug(&current->thread, 6);
-               loaddebug(&current->thread, 7);
-#endif
-       }
-
 }
 
 /**
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644 (file)
index 0000000..f820826
--- /dev/null
@@ -0,0 +1,31 @@
+PHONY += posttest
+
+ifeq ($(KBUILD_VERBOSE),1)
+  posttest_verbose = -v
+else
+  posttest_verbose =
+endif
+
+ifeq ($(CONFIG_64BIT),y)
+  posttest_64bit = -y
+else
+  posttest_64bit = -n
+endif
+
+distill_awk = $(srctree)/arch/x86/tools/distill.awk
+chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
+
+quiet_cmd_posttest = TEST    $@
+      cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
+
+posttest: $(obj)/test_get_len vmlinux
+       $(call cmd,posttest)
+
+hostprogs-y    := test_get_len
+
+# -I is needed for the generated C source and for C source in the kernel tree.
+HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+
+# Dependencies are also needed.
+$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
new file mode 100644 (file)
index 0000000..0d13cd9
--- /dev/null
@@ -0,0 +1,23 @@
+# GNU objdump version checker
+#
+# Usage:
+# objdump -v | awk -f chkobjdump.awk
+BEGIN {
+       # objdump version 2.19 or later is OK for the test.
+       od_ver = 2;
+       od_sver = 19;
+}
+
+/^GNU/ {
+       split($4, ver, ".");
+       if (ver[1] > od_ver ||
+           (ver[1] == od_ver && ver[2] >= od_sver)) {
+               exit 1;
+       } else {
+               printf("Warning: objdump version %s is older than %d.%d\n",
+                      $4, od_ver, od_sver);
+               print("Warning: Skipping posttest.");
+               # Logic is inverted, because we just skip the test without an error.
+               exit 0;
+       }
+}
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk
new file mode 100644 (file)
index 0000000..c13c0ee
--- /dev/null
@@ -0,0 +1,47 @@
+#!/bin/awk -f
+# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+# Distills the disassembly as follows:
+# - Removes all lines except the disassembled instructions.
+# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
+# into a single line.
+# - Removes bad (or prefix-only) instructions
+
+BEGIN {
+       prev_addr = ""
+       prev_hex = ""
+       prev_mnemonic = ""
+       bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
+       fwait_expr = "^9b "
+       fwait_str="9b\tfwait"
+}
+
+/^ *[0-9a-f]+ <[^>]*>:/ {
+       # Symbol entry
+       printf("%s%s\n", $2, $1)
+}
+
+/^ *[0-9a-f]+:/ {
+       if (split($0, field, "\t") < 3) {
+               # This is a continuation of the same insn.
+               prev_hex = prev_hex field[2]
+       } else {
+               # Skip bad instructions
+               if (match(prev_mnemonic, bad_expr))
+                       prev_addr = ""
+               # Split fwait from other f* instructions
+               if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
+                       printf "%s\t%s\n", prev_addr, fwait_str
+                       sub(fwait_expr, "", prev_hex)
+               }
+               if (prev_addr != "")
+                       printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+               prev_addr = field[1]
+               prev_hex = field[2]
+               prev_mnemonic = field[3]
+       }
+}
+
+END {
+       if (prev_addr != "")
+               printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+}
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644 (file)
index 0000000..e34e92a
--- /dev/null
@@ -0,0 +1,380 @@
+#!/bin/awk -f
+# gen-insn-attr-x86.awk: Instruction attribute table generator
+# Written by Masami Hiramatsu <mhiramat@redhat.com>
+#
+# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
+
+# Awk implementation sanity check
+function check_awk_implement() {
+       if (!match("abc", "[[:lower:]]+"))
+               return "Your awk doesn't support character classes."
+       if (sprintf("%x", 0) != "0")
+               return "Your awk has a printf-format problem."
+       return ""
+}
+
+# Clear working vars
+function clear_vars() {
+       delete table
+       delete lptable2
+       delete lptable1
+       delete lptable3
+       eid = -1 # escape id
+       gid = -1 # group id
+       aid = -1 # AVX id
+       tname = ""
+}
+
+BEGIN {
+       # Implementation error checking
+       awkchecked = check_awk_implement()
+       if (awkchecked != "") {
+               print "Error: " awkchecked > "/dev/stderr"
+               print "Please try to use gawk." > "/dev/stderr"
+               exit 1
+       }
+
+       # Setup generating tables
+       print "/* x86 opcode map generated from x86-opcode-map.txt */"
+       print "/* Do not change this code. */\n"
+       ggid = 1
+       geid = 1
+       gaid = 0
+       delete etable
+       delete gtable
+       delete atable
+
+       opnd_expr = "^[[:alpha:]/]"
+       ext_expr = "^\\("
+       sep_expr = "^\\|$"
+       group_expr = "^Grp[[:alnum:]]+"
+
+       imm_expr = "^[IJAO][[:lower:]]"
+       imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+       imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+       imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
+       imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
+       imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
+       imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
+       imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+       imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+       imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
+       imm_flag["Ob"] = "INAT_MOFFSET"
+       imm_flag["Ov"] = "INAT_MOFFSET"
+
+       modrm_expr = "^([CDEGMNPQRSUVW/][[:lower:]]+|NTA|T[012])"
+       force64_expr = "\\([df]64\\)"
+       rex_expr = "^REX(\\.[XRWB]+)*"
+       fpu_expr = "^ESC" # TODO
+
+       lprefix1_expr = "\\(66\\)"
+       lprefix2_expr = "\\(F3\\)"
+       lprefix3_expr = "\\(F2\\)"
+       max_lprefix = 4
+
+       vexok_expr = "\\(VEX\\)"
+       vexonly_expr = "\\(oVEX\\)"
+
+       prefix_expr = "\\(Prefix\\)"
+       prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
+       prefix_num["REPNE"] = "INAT_PFX_REPNE"
+       prefix_num["REP/REPE"] = "INAT_PFX_REPE"
+       prefix_num["LOCK"] = "INAT_PFX_LOCK"
+       prefix_num["SEG=CS"] = "INAT_PFX_CS"
+       prefix_num["SEG=DS"] = "INAT_PFX_DS"
+       prefix_num["SEG=ES"] = "INAT_PFX_ES"
+       prefix_num["SEG=FS"] = "INAT_PFX_FS"
+       prefix_num["SEG=GS"] = "INAT_PFX_GS"
+       prefix_num["SEG=SS"] = "INAT_PFX_SS"
+       prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
+       prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
+       prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
+
+       clear_vars()
+}
+
+function semantic_error(msg) {
+       print "Semantic error at " NR ": " msg > "/dev/stderr"
+       exit 1
+}
+
+function debug(msg) {
+       print "DEBUG: " msg
+}
+
+function array_size(arr,   i,c) {
+       c = 0
+       for (i in arr)
+               c++
+       return c
+}
+
+/^Table:/ {
+       print "/* " $0 " */"
+       if (tname != "")
+               semantic_error("Hit Table: before EndTable:.");
+}
+
+/^Referrer:/ {
+       if (NF != 1) {
+               # escape opcode table
+               ref = ""
+               for (i = 2; i <= NF; i++)
+                       ref = ref $i
+               eid = escape[ref]
+               tname = sprintf("inat_escape_table_%d", eid)
+       }
+}
+
+/^AVXcode:/ {
+       if (NF != 1) {
+               # AVX/escape opcode table
+               aid = $2
+               if (gaid <= aid)
+                       gaid = aid + 1
+               if (tname == "")        # AVX only opcode table
+                       tname = sprintf("inat_avx_table_%d", $2)
+       }
+       if (aid == -1 && eid == -1)     # primary opcode table
+               tname = "inat_primary_table"
+}
+
+/^GrpTable:/ {
+       print "/* " $0 " */"
+       if (!($2 in group))
+               semantic_error("No group: " $2 )
+       gid = group[$2]
+       tname = "inat_group_table_" gid
+}
+
+function print_table(tbl,name,fmt,n)
+{
+       print "const insn_attr_t " name " = {"
+       for (i = 0; i < n; i++) {
+               id = sprintf(fmt, i)
+               if (tbl[id])
+                       print " [" id "] = " tbl[id] ","
+       }
+       print "};"
+}
+
+/^EndTable/ {
+       if (gid != -1) {
+               # print group tables
+               if (array_size(table) != 0) {
+                       print_table(table, tname "[INAT_GROUP_TABLE_SIZE]",
+                                   "0x%x", 8)
+                       gtable[gid,0] = tname
+               }
+               if (array_size(lptable1) != 0) {
+                       print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]",
+                                   "0x%x", 8)
+                       gtable[gid,1] = tname "_1"
+               }
+               if (array_size(lptable2) != 0) {
+                       print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]",
+                                   "0x%x", 8)
+                       gtable[gid,2] = tname "_2"
+               }
+               if (array_size(lptable3) != 0) {
+                       print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]",
+                                   "0x%x", 8)
+                       gtable[gid,3] = tname "_3"
+               }
+       } else {
+               # print primary/escaped tables
+               if (array_size(table) != 0) {
+                       print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]",
+                                   "0x%02x", 256)
+                       etable[eid,0] = tname
+                       if (aid >= 0)
+                               atable[aid,0] = tname
+               }
+               if (array_size(lptable1) != 0) {
+                       print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
+                                   "0x%02x", 256)
+                       etable[eid,1] = tname "_1"
+                       if (aid >= 0)
+                               atable[aid,1] = tname "_1"
+               }
+               if (array_size(lptable2) != 0) {
+                       print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]",
+                                   "0x%02x", 256)
+                       etable[eid,2] = tname "_2"
+                       if (aid >= 0)
+                               atable[aid,2] = tname "_2"
+               }
+               if (array_size(lptable3) != 0) {
+                       print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]",
+                                   "0x%02x", 256)
+                       etable[eid,3] = tname "_3"
+                       if (aid >= 0)
+                               atable[aid,3] = tname "_3"
+               }
+       }
+       print ""
+       clear_vars()
+}
+
+function add_flags(old,new) {
+       if (old && new)
+               return old " | " new
+       else if (old)
+               return old
+       else
+               return new
+}
+
+# convert operands to flags.
+function convert_operands(opnd,       i,imm,mod)
+{
+       imm = null
+       mod = null
+       for (i in opnd) {
+               i  = opnd[i]
+               if (match(i, imm_expr) == 1) {
+                       if (!imm_flag[i])
+                               semantic_error("Unknown imm opnd: " i)
+                       if (imm) {
+                               if (i != "Ib")
+                                       semantic_error("Second IMM error")
+                               imm = add_flags(imm, "INAT_SCNDIMM")
+                       } else
+                               imm = imm_flag[i]
+               } else if (match(i, modrm_expr))
+                       mod = "INAT_MODRM"
+       }
+       return add_flags(imm, mod)
+}
+
+/^[0-9a-f]+\:/ {
+       if (NR == 1)
+               next
+       # get index
+       idx = "0x" substr($1, 1, index($1,":") - 1)
+       if (idx in table)
+               semantic_error("Redefine " idx " in " tname)
+
+       # check if escaped opcode
+       if ("escape" == $2) {
+               if ($3 != "#")
+                       semantic_error("No escaped name")
+               ref = ""
+               for (i = 4; i <= NF; i++)
+                       ref = ref $i
+               if (ref in escape)
+                       semantic_error("Redefine escape (" ref ")")
+               escape[ref] = geid
+               geid++
+               table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")"
+               next
+       }
+
+       variant = null
+       # convert each opcode variant on this line
+       i = 2
+       while (i <= NF) {
+               opcode = $(i++)
+               delete opnds
+               ext = null
+               flags = null
+               opnd = null
+               # parse one opcode
+               if (match($i, opnd_expr)) {
+                       opnd = $i
+                       split($(i++), opnds, ",")
+                       flags = convert_operands(opnds)
+               }
+               if (match($i, ext_expr))
+                       ext = $(i++)
+               if (match($i, sep_expr))
+                       i++
+               else if (i < NF)
+                       semantic_error($i " is not a separator")
+
+               # check if group opcode
+               if (match(opcode, group_expr)) {
+                       if (!(opcode in group)) {
+                               group[opcode] = ggid
+                               ggid++
+                       }
+                       flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")")
+               }
+               # check forced (or default) 64-bit
+               if (match(ext, force64_expr))
+                       flags = add_flags(flags, "INAT_FORCE64")
+
+               # check REX prefix
+               if (match(opcode, rex_expr))
+                       flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
+
+               # check coprocessor escape : TODO
+               if (match(opcode, fpu_expr))
+                       flags = add_flags(flags, "INAT_MODRM")
+
+               # check VEX only code
+               if (match(ext, vexonly_expr))
+                       flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
+
+               # check VEX-capable code
+               if (match(ext, vexok_expr))
+                       flags = add_flags(flags, "INAT_VEXOK")
+
+               # check prefixes
+               if (match(ext, prefix_expr)) {
+                       if (!prefix_num[opcode])
+                               semantic_error("Unknown prefix: " opcode)
+                       flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")")
+               }
+               if (length(flags) == 0)
+                       continue
+               # check if last prefix
+               if (match(ext, lprefix1_expr)) {
+                       lptable1[idx] = add_flags(lptable1[idx],flags)
+                       variant = "INAT_VARIANT"
+               } else if (match(ext, lprefix2_expr)) {
+                       lptable2[idx] = add_flags(lptable2[idx],flags)
+                       variant = "INAT_VARIANT"
+               } else if (match(ext, lprefix3_expr)) {
+                       lptable3[idx] = add_flags(lptable3[idx],flags)
+                       variant = "INAT_VARIANT"
+               } else {
+                       table[idx] = add_flags(table[idx],flags)
+               }
+       }
+       if (variant)
+               table[idx] = add_flags(table[idx],variant)
+}
+
+END {
+       if (awkchecked != "")
+               exit 1
+       # print escape opcode map's array
+       print "/* Escape opcode map array */"
+       print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \
+             "[INAT_LSTPFX_MAX + 1] = {"
+       for (i = 0; i < geid; i++)
+               for (j = 0; j < max_lprefix; j++)
+                       if (etable[i,j])
+                               print " ["i"]["j"] = "etable[i,j]","
+       print "};\n"
+       # print group opcode map's array
+       print "/* Group opcode map array */"
+       print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\
+             "[INAT_LSTPFX_MAX + 1] = {"
+       for (i = 0; i < ggid; i++)
+               for (j = 0; j < max_lprefix; j++)
+                       if (gtable[i,j])
+                               print " ["i"]["j"] = "gtable[i,j]","
+       print "};\n"
+       # print AVX opcode map's array
+       print "/* AVX opcode map array */"
+       print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\
+             "[INAT_LSTPFX_MAX + 1] = {"
+       for (i = 0; i < gaid; i++)
+               for (j = 0; j < max_lprefix; j++)
+                       if (atable[i,j])
+                               print " ["i"]["j"] = "atable[i,j]","
+       print "};"
+}
+
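Taken together, these rules boil each line of the opcode map down to an insn_attr_t initializer and emit one C table per map, plus the three lookup arrays printed in the END block. A minimal sketch of the shape of the generated output, with made-up entries purely for illustration (the real entries are driven by the x86-opcode-map.txt input):

    /* Illustrative only -- actual entries come from the opcode map. */
    const insn_attr_t inat_primary_table[INAT_OPCODE_TABLE_SIZE] = {
            [0x0f] = INAT_MAKE_ESCAPE(1),                /* escape to a second opcode map */
            [0x66] = INAT_MAKE_PREFIX(INAT_PFX_OPNDSZ),  /* operand-size prefix           */
            [0x80] = INAT_MAKE_GROUP(1) | INAT_MODRM,    /* ModRM.reg selects group entry */
    };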
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
new file mode 100644 (file)
index 0000000..d8214dc
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#define unlikely(cond) (cond)
+
+#include <asm/insn.h>
+#include <inat.c>
+#include <insn.c>
+
+/*
+ * Test of instruction analysis in general and insn_get_length() in
+ * particular.  See if insn_get_length() and the disassembler agree
+ * on the length of each instruction in an elf disassembly.
+ *
+ * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+ */
+
+const char *prog;
+static int verbose;
+static int x86_64;
+
+static void usage(void)
+{
+       fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
+               " %s [-y|-n] [-v] \n", prog);
+       fprintf(stderr, "\t-y   64bit mode\n");
+       fprintf(stderr, "\t-n   32bit mode\n");
+       fprintf(stderr, "\t-v   verbose mode\n");
+       exit(1);
+}
+
+static void malformed_line(const char *line, int line_nr)
+{
+       fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line);
+       exit(3);
+}
+
+static void dump_field(FILE *fp, const char *name, const char *indent,
+                      struct insn_field *field)
+{
+       fprintf(fp, "%s.%s = {\n", indent, name);
+       fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
+               indent, field->value, field->bytes[0], field->bytes[1],
+               field->bytes[2], field->bytes[3]);
+       fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
+               field->got, field->nbytes);
+}
+
+static void dump_insn(FILE *fp, struct insn *insn)
+{
+       fprintf(fp, "Instruction = { \n");
+       dump_field(fp, "prefixes", "\t",        &insn->prefixes);
+       dump_field(fp, "rex_prefix", "\t",      &insn->rex_prefix);
+       dump_field(fp, "vex_prefix", "\t",      &insn->vex_prefix);
+       dump_field(fp, "opcode", "\t",          &insn->opcode);
+       dump_field(fp, "modrm", "\t",           &insn->modrm);
+       dump_field(fp, "sib", "\t",             &insn->sib);
+       dump_field(fp, "displacement", "\t",    &insn->displacement);
+       dump_field(fp, "immediate1", "\t",      &insn->immediate1);
+       dump_field(fp, "immediate2", "\t",      &insn->immediate2);
+       fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
+               insn->attr, insn->opnd_bytes, insn->addr_bytes);
+       fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
+               insn->length, insn->x86_64, insn->kaddr);
+}
+
+static void parse_args(int argc, char **argv)
+{
+       int c;
+       prog = argv[0];
+       while ((c = getopt(argc, argv, "ynv")) != -1) {
+               switch (c) {
+               case 'y':
+                       x86_64 = 1;
+                       break;
+               case 'n':
+                       x86_64 = 0;
+                       break;
+               case 'v':
+                       verbose = 1;
+                       break;
+               default:
+                       usage();
+               }
+       }
+}
+
+#define BUFSIZE 256
+
+int main(int argc, char **argv)
+{
+       char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
+       unsigned char insn_buf[16];
+       struct insn insn;
+       int insns = 0, c;
+       int warnings = 0;
+
+       parse_args(argc, argv);
+
+       while (fgets(line, BUFSIZE, stdin)) {
+               char copy[BUFSIZE], *s, *tab1, *tab2;
+               int nb = 0;
+               unsigned int b;
+
+               if (line[0] == '<') {
+                       /* Symbol line */
+                       strcpy(sym, line);
+                       continue;
+               }
+
+               insns++;
+               memset(insn_buf, 0, 16);
+               strcpy(copy, line);
+               tab1 = strchr(copy, '\t');
+               if (!tab1)
+                       malformed_line(line, insns);
+               s = tab1 + 1;
+               s += strspn(s, " ");
+               tab2 = strchr(s, '\t');
+               if (!tab2)
+                       malformed_line(line, insns);
+               *tab2 = '\0';   /* Characters beyond tab2 aren't examined */
+               while (s < tab2) {
+                       if (sscanf(s, "%x", &b) == 1) {
+                               insn_buf[nb++] = (unsigned char) b;
+                               s += 3;
+                       } else
+                               break;
+               }
+               /* Decode an instruction */
+               insn_init(&insn, insn_buf, x86_64);
+               insn_get_length(&insn);
+               if (insn.length != nb) {
+                       warnings++;
+                       fprintf(stderr, "Warning: %s found difference at %s\n",
+                               prog, sym);
+                       fprintf(stderr, "Warning: %s", line);
+                       fprintf(stderr, "Warning: objdump says %d bytes, but "
+                               "insn_get_length() says %d\n", nb,
+                               insn.length);
+                       if (verbose)
+                               dump_insn(stderr, &insn);
+               }
+       }
+       if (warnings)
+               fprintf(stderr, "Warning: decoded and checked %d"
+                       " instructions with %d warnings\n", insns, warnings);
+       else
+               fprintf(stderr, "Succeed: decoded and checked %d"
+                       " instructions\n", insns);
+       return 0;
+}
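For reference, a minimal sketch of driving the decoder the same way main() does, reduced to a single helper; the helper name and buffer handling are illustrative, not part of the tool:

    #include <string.h>
    #include <asm/insn.h>        /* struct insn, insn_init(), insn_get_length() */

    /* Decode one instruction from a byte buffer and return its length. */
    static int decoded_length(const unsigned char *bytes, int n, int x86_64)
    {
            unsigned char buf[16] = { 0 };
            struct insn insn;

            memcpy(buf, bytes, n < (int)sizeof(buf) ? n : (int)sizeof(buf));
            insn_init(&insn, buf, x86_64);   /* bind the buffer and the 32/64-bit mode */
            insn_get_length(&insn);          /* parses prefixes, opcode, ModRM, SIB, disp, imm */
            return insn.length;
    }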
index 713ed7d372475dc325ac7cfca8cc5e22181af29b..689cc6a6214df3b2f9aecb04aedabeaa2779955d 100644 (file)
@@ -3,7 +3,6 @@
 
 static bool report_gart_errors;
 static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
-static void (*orig_mce_callback)(struct mce *m);
 
 void amd_report_gart_errors(bool v)
 {
@@ -363,8 +362,10 @@ static inline void amd_decode_err_code(unsigned int ec)
                pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
 }
 
-static void amd_decode_mce(struct mce *m)
+static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
+                          void *data)
 {
+       struct mce *m = (struct mce *)data;
        struct err_regs regs;
        int node, ecc;
 
@@ -420,20 +421,22 @@ static void amd_decode_mce(struct mce *m)
        }
 
        amd_decode_err_code(m->status & 0xffff);
+
+       return NOTIFY_STOP;
 }
 
+static struct notifier_block amd_mce_dec_nb = {
+       .notifier_call  = amd_decode_mce,
+};
+
 static int __init mce_amd_init(void)
 {
        /*
         * We can decode MCEs for Opteron and later CPUs:
         */
        if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
-           (boot_cpu_data.x86 >= 0xf)) {
-               /* safe the default decode mce callback */
-               orig_mce_callback = x86_mce_decode_callback;
-
-               x86_mce_decode_callback = amd_decode_mce;
-       }
+           (boot_cpu_data.x86 >= 0xf))
+               atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
 
        return 0;
 }
@@ -442,7 +445,7 @@ early_initcall(mce_amd_init);
 #ifdef MODULE
 static void __exit mce_amd_exit(void)
 {
-       x86_mce_decode_callback = orig_mce_callback;
+       atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
 }
 
 MODULE_DESCRIPTION("AMD MCE decoder");
index 4ec5e67e18cfda40ad34a52187a3ff288682f6be..47bbdf9c38d0428328e48d9c772cdc3086375e5c 100644 (file)
@@ -117,12 +117,12 @@ struct ftrace_event_call {
        struct dentry           *dir;
        struct trace_event      *event;
        int                     enabled;
-       int                     (*regfunc)(void *);
-       void                    (*unregfunc)(void *);
+       int                     (*regfunc)(struct ftrace_event_call *);
+       void                    (*unregfunc)(struct ftrace_event_call *);
        int                     id;
-       int                     (*raw_init)(void);
-       int                     (*show_format)(struct ftrace_event_call *call,
-                                              struct trace_seq *s);
+       int                     (*raw_init)(struct ftrace_event_call *);
+       int                     (*show_format)(struct ftrace_event_call *,
+                                              struct trace_seq *);
        int                     (*define_fields)(struct ftrace_event_call *);
        struct list_head        fields;
        int                     filter_active;
@@ -131,20 +131,20 @@ struct ftrace_event_call {
        void                    *data;
 
        atomic_t                profile_count;
-       int                     (*profile_enable)(void);
-       void                    (*profile_disable)(void);
+       int                     (*profile_enable)(struct ftrace_event_call *);
+       void                    (*profile_disable)(struct ftrace_event_call *);
 };
 
 #define FTRACE_MAX_PROFILE_SIZE        2048
 
-extern char                    *trace_profile_buf;
-extern char                    *trace_profile_buf_nmi;
+extern char *perf_trace_buf;
+extern char *perf_trace_buf_nmi;
 
 #define MAX_FILTER_PRED                32
 #define MAX_FILTER_STR_VAL     256     /* Should handle KSYM_SYMBOL_LEN */
 
 extern void destroy_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern int filter_match_preds(struct event_filter *filter, void *rec);
 extern int filter_current_check_discard(struct ring_buffer *buffer,
                                        struct ftrace_event_call *call,
                                        void *rec,
@@ -157,11 +157,12 @@ enum {
        FILTER_PTR_STRING,
 };
 
-extern int trace_define_field(struct ftrace_event_call *call,
-                             const char *type, const char *name,
-                             int offset, int size, int is_signed,
-                             int filter_type);
 extern int trace_define_common_fields(struct ftrace_event_call *call);
+extern int trace_define_field(struct ftrace_event_call *call, const char *type,
+                             const char *name, int offset, int size,
+                             int is_signed, int filter_type);
+extern int trace_add_event_call(struct ftrace_event_call *call);
+extern void trace_remove_event_call(struct ftrace_event_call *call);
 
 #define is_signed_type(type)   (((type)(-1)) < 0)
 
@@ -186,4 +187,13 @@ do {                                                                       \
                __trace_printk(ip, fmt, ##args);                        \
 } while (0)
 
+#ifdef CONFIG_EVENT_PROFILE
+struct perf_event;
+extern int ftrace_profile_enable(int event_id);
+extern void ftrace_profile_disable(int event_id);
+extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+                                    char *filter_str);
+extern void ftrace_profile_free_filter(struct perf_event *event);
+#endif
+
 #endif /* _LINUX_FTRACE_EVENT_H */
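With these prototypes the registration and profiling callbacks receive the ftrace_event_call itself, so one shared callback can serve many events by pulling its per-event payload out of call->data (the syscall events below store their syscall_metadata there). A minimal, hypothetical sketch of a callback written against the new signature; the metadata type is a stand-in, not a kernel API:

    #include <linux/errno.h>
    #include <linux/ftrace_event.h>

    /* Hypothetical per-event payload, installed via .data when the event is defined. */
    struct my_event_metadata {
            int nr;                 /* e.g. a syscall number */
    };

    static int my_event_reg(struct ftrace_event_call *call)
    {
            struct my_event_metadata *meta = call->data;   /* per-event payload */

            return meta->nr >= 0 ? 0 : -ENOSYS;
    }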
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
new file mode 100644 (file)
index 0000000..a03daed
--- /dev/null
@@ -0,0 +1,131 @@
+#ifndef _LINUX_HW_BREAKPOINT_H
+#define _LINUX_HW_BREAKPOINT_H
+
+enum {
+       HW_BREAKPOINT_LEN_1 = 1,
+       HW_BREAKPOINT_LEN_2 = 2,
+       HW_BREAKPOINT_LEN_4 = 4,
+       HW_BREAKPOINT_LEN_8 = 8,
+};
+
+enum {
+       HW_BREAKPOINT_R = 1,
+       HW_BREAKPOINT_W = 2,
+       HW_BREAKPOINT_X = 4,
+};
+
+#ifdef __KERNEL__
+
+#include <linux/perf_event.h>
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+
+/* As it's for in-kernel or ptrace use, we want it to be pinned */
+#define DEFINE_BREAKPOINT_ATTR(name)   \
+struct perf_event_attr name = {                \
+       .type = PERF_TYPE_BREAKPOINT,   \
+       .size = sizeof(name),           \
+       .pinned = 1,                    \
+};
+
+static inline void hw_breakpoint_init(struct perf_event_attr *attr)
+{
+       attr->type = PERF_TYPE_BREAKPOINT;
+       attr->size = sizeof(*attr);
+       attr->pinned = 1;
+}
+
+static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
+{
+       return bp->attr.bp_addr;
+}
+
+static inline int hw_breakpoint_type(struct perf_event *bp)
+{
+       return bp->attr.bp_type;
+}
+
+static inline int hw_breakpoint_len(struct perf_event *bp)
+{
+       return bp->attr.bp_len;
+}
+
+extern struct perf_event *
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+                           perf_callback_t triggered,
+                           struct task_struct *tsk);
+
+/* FIXME: only change from the attr, and don't unregister */
+extern struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+                         struct perf_event_attr *attr,
+                         perf_callback_t triggered,
+                         struct task_struct *tsk);
+
+/*
+ * Kernel breakpoints are not associated with any particular thread.
+ */
+extern struct perf_event *
+register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr,
+                               perf_callback_t triggered,
+                               int cpu);
+
+extern struct perf_event **
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+                           perf_callback_t triggered);
+
+extern int register_perf_hw_breakpoint(struct perf_event *bp);
+extern int __register_perf_hw_breakpoint(struct perf_event *bp);
+extern void unregister_hw_breakpoint(struct perf_event *bp);
+extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events);
+
+extern int reserve_bp_slot(struct perf_event *bp);
+extern void release_bp_slot(struct perf_event *bp);
+
+extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk);
+
+static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
+{
+       return &bp->hw.info;
+}
+
+#else /* !CONFIG_HAVE_HW_BREAKPOINT */
+
+static inline struct perf_event *
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+                           perf_callback_t triggered,
+                           struct task_struct *tsk)    { return NULL; }
+static inline struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp,
+                         struct perf_event_attr *attr,
+                         perf_callback_t triggered,
+                         struct task_struct *tsk)      { return NULL; }
+static inline struct perf_event *
+register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr,
+                               perf_callback_t triggered,
+                               int cpu)                { return NULL; }
+static inline struct perf_event **
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+                           perf_callback_t triggered)  { return NULL; }
+static inline int
+register_perf_hw_breakpoint(struct perf_event *bp)     { return -ENOSYS; }
+static inline int
+__register_perf_hw_breakpoint(struct perf_event *bp)   { return -ENOSYS; }
+static inline void unregister_hw_breakpoint(struct perf_event *bp)     { }
+static inline void
+unregister_wide_hw_breakpoint(struct perf_event **cpu_events)          { }
+static inline int
+reserve_bp_slot(struct perf_event *bp)                 {return -ENOSYS; }
+static inline void release_bp_slot(struct perf_event *bp)              { }
+
+static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { }
+
+static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
+{
+       return NULL;
+}
+
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_HW_BREAKPOINT_H */
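A minimal sketch of how an in-kernel user might arm a CPU-wide write watchpoint with this interface; the variable and handler names are made up, and error handling is abbreviated:

    #include <linux/kernel.h>
    #include <linux/init.h>
    #include <linux/err.h>
    #include <linux/hw_breakpoint.h>

    static int watched;                     /* hypothetical variable to watch */
    static struct perf_event **wide_bp;

    /* Triggered callback, matching perf_callback_t. */
    static void my_bp_handler(struct perf_event *bp, void *data)
    {
            printk(KERN_INFO "write to watched variable at 0x%lx\n",
                   hw_breakpoint_addr(bp));
    }

    static int __init my_bp_init(void)
    {
            struct perf_event_attr attr;

            hw_breakpoint_init(&attr);      /* PERF_TYPE_BREAKPOINT, pinned */
            attr.bp_addr = (unsigned long)&watched;
            attr.bp_len  = HW_BREAKPOINT_LEN_4;
            attr.bp_type = HW_BREAKPOINT_W;

            wide_bp = register_wide_hw_breakpoint(&attr, my_bp_handler);
            if (IS_ERR(wide_bp))
                    return PTR_ERR(wide_bp);
            return 0;
    }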
index 3a46b7b7abb219c40bf39ce4d5f4e448da131212..1b672f74a32f0d76d27c277c759c6c3a24135674 100644 (file)
@@ -296,6 +296,8 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 int disable_kprobe(struct kprobe *kp);
 int enable_kprobe(struct kprobe *kp);
 
+void dump_kprobe(struct kprobe *kp);
+
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobes_built_in(void)
index 7b7fbf433cffb6377605eea61682c48905c7b33e..e3fb256067066ad947596efeb9f848ee136acbe7 100644 (file)
@@ -106,6 +106,8 @@ enum perf_sw_ids {
        PERF_COUNT_SW_CPU_MIGRATIONS            = 4,
        PERF_COUNT_SW_PAGE_FAULTS_MIN           = 5,
        PERF_COUNT_SW_PAGE_FAULTS_MAJ           = 6,
+       PERF_COUNT_SW_ALIGNMENT_FAULTS          = 7,
+       PERF_COUNT_SW_EMULATION_FAULTS          = 8,
 
        PERF_COUNT_SW_MAX,                      /* non-ABI */
 };
@@ -225,6 +227,7 @@ struct perf_counter_attr {
 #define PERF_COUNTER_IOC_RESET         _IO ('$', 3)
 #define PERF_COUNTER_IOC_PERIOD                _IOW('$', 4, u64)
 #define PERF_COUNTER_IOC_SET_OUTPUT    _IO ('$', 5)
+#define PERF_COUNTER_IOC_SET_FILTER    _IOW('$', 6, char *)
 
 enum perf_counter_ioc_flags {
        PERF_IOC_FLAG_GROUP             = 1U << 0,
index 9e7012689a8407b65d21b57f8ea97760a40bb888..43adbd7f0010c21719f75dc05dfbf5d61465abf7 100644 (file)
 #include <linux/ioctl.h>
 #include <asm/byteorder.h>
 
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+#include <asm/hw_breakpoint.h>
+#endif
+
 /*
  * User-space ABI bits:
  */
@@ -31,6 +35,7 @@ enum perf_type_id {
        PERF_TYPE_TRACEPOINT                    = 2,
        PERF_TYPE_HW_CACHE                      = 3,
        PERF_TYPE_RAW                           = 4,
+       PERF_TYPE_BREAKPOINT                    = 5,
 
        PERF_TYPE_MAX,                          /* non-ABI */
 };
@@ -102,6 +107,8 @@ enum perf_sw_ids {
        PERF_COUNT_SW_CPU_MIGRATIONS            = 4,
        PERF_COUNT_SW_PAGE_FAULTS_MIN           = 5,
        PERF_COUNT_SW_PAGE_FAULTS_MAJ           = 6,
+       PERF_COUNT_SW_ALIGNMENT_FAULTS          = 7,
+       PERF_COUNT_SW_EMULATION_FAULTS          = 8,
 
        PERF_COUNT_SW_MAX,                      /* non-ABI */
 };
@@ -207,6 +214,15 @@ struct perf_event_attr {
                __u32           wakeup_events;    /* wakeup every n events */
                __u32           wakeup_watermark; /* bytes before wakeup   */
        };
+
+       union {
+               struct { /* Hardware breakpoint info */
+                       __u64           bp_addr;
+                       __u32           bp_type;
+                       __u32           bp_len;
+               };
+       };
+
        __u32                   __reserved_2;
 
        __u64                   __reserved_3;
@@ -219,8 +235,9 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_DISABLE         _IO ('$', 1)
 #define PERF_EVENT_IOC_REFRESH         _IO ('$', 2)
 #define PERF_EVENT_IOC_RESET           _IO ('$', 3)
-#define PERF_EVENT_IOC_PERIOD          _IOW('$', 4, u64)
+#define PERF_EVENT_IOC_PERIOD          _IOW('$', 4, __u64)
 #define PERF_EVENT_IOC_SET_OUTPUT      _IO ('$', 5)
+#define PERF_EVENT_IOC_SET_FILTER      _IOW('$', 6, char *)
 
 enum perf_event_ioc_flags {
        PERF_IOC_FLAG_GROUP             = 1U << 0,
@@ -475,6 +492,11 @@ struct hw_perf_event {
                        s64             remaining;
                        struct hrtimer  hrtimer;
                };
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+               union { /* breakpoint */
+                       struct arch_hw_breakpoint       info;
+               };
+#endif
        };
        atomic64_t                      prev_count;
        u64                             sample_period;
@@ -543,6 +565,10 @@ struct perf_pending_entry {
        void (*func)(struct perf_pending_entry *);
 };
 
+typedef void (*perf_callback_t)(struct perf_event *, void *);
+
+struct perf_sample_data;
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -585,7 +611,7 @@ struct perf_event {
        u64                             tstamp_running;
        u64                             tstamp_stopped;
 
-       struct perf_event_attr  attr;
+       struct perf_event_attr          attr;
        struct hw_perf_event            hw;
 
        struct perf_event_context       *ctx;
@@ -633,7 +659,20 @@ struct perf_event {
 
        struct pid_namespace            *ns;
        u64                             id;
+
+       void (*overflow_handler)(struct perf_event *event,
+                       int nmi, struct perf_sample_data *data,
+                       struct pt_regs *regs);
+
+#ifdef CONFIG_EVENT_PROFILE
+       struct event_filter             *filter;
 #endif
+
+       perf_callback_t                 callback;
+
+       perf_callback_t                 event_callback;
+
+#endif /* CONFIG_PERF_EVENTS */
 };
 
 /**
@@ -706,7 +745,6 @@ struct perf_output_handle {
        int                             nmi;
        int                             sample;
        int                             locked;
-       unsigned long                   flags;
 };
 
 #ifdef CONFIG_PERF_EVENTS
@@ -738,6 +776,14 @@ extern int hw_perf_group_sched_in(struct perf_event *group_leader,
               struct perf_cpu_context *cpuctx,
               struct perf_event_context *ctx, int cpu);
 extern void perf_event_update_userpage(struct perf_event *event);
+extern int perf_event_release_kernel(struct perf_event *event);
+extern struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr,
+                               int cpu,
+                               pid_t pid,
+                               perf_callback_t callback);
+extern u64 perf_event_read_value(struct perf_event *event,
+                                u64 *enabled, u64 *running);
 
 struct perf_sample_data {
        u64                             type;
@@ -814,6 +860,7 @@ extern int sysctl_perf_event_sample_rate;
 extern void perf_event_init(void);
 extern void perf_tp_event(int event_id, u64 addr, u64 count,
                                 void *record, int entry_size);
+extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
 #define perf_misc_flags(regs)  (user_mode(regs) ? PERF_RECORD_MISC_USER : \
@@ -827,6 +874,8 @@ extern int perf_output_begin(struct perf_output_handle *handle,
 extern void perf_output_end(struct perf_output_handle *handle);
 extern void perf_output_copy(struct perf_output_handle *handle,
                             const void *buf, unsigned int len);
+extern int perf_swevent_get_recursion_context(void);
+extern void perf_swevent_put_recursion_context(int rctx);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *task, int cpu)            { }
@@ -848,11 +897,15 @@ static inline int perf_event_task_enable(void)                            { return -EINVAL; }
 static inline void
 perf_sw_event(u32 event_id, u64 nr, int nmi,
                     struct pt_regs *regs, u64 addr)                    { }
+static inline void
+perf_bp_event(struct perf_event *event, void *data)            { }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)         { }
 static inline void perf_event_comm(struct task_struct *tsk)            { }
 static inline void perf_event_fork(struct task_struct *tsk)            { }
 static inline void perf_event_init(void)                               { }
+static inline int  perf_swevent_get_recursion_context(void)  { return -1; }
+static inline void perf_swevent_put_recursion_context(int rctx)                { }
 
 #endif
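The new kernel-side entry points above (perf_event_create_kernel_counter(), perf_event_read_value(), perf_event_release_kernel()) let kernel code run a counter without going through the syscall. A minimal sketch, with illustrative attribute values and no callback:

    #include <linux/err.h>
    #include <linux/perf_event.h>

    static struct perf_event *cycle_event;

    static int start_cycle_counter(int cpu)
    {
            struct perf_event_attr attr = {
                    .type   = PERF_TYPE_HARDWARE,
                    .config = PERF_COUNT_HW_CPU_CYCLES,
                    .size   = sizeof(attr),
                    .pinned = 1,
            };

            /* cpu >= 0 counts on that CPU; pid == -1 means not bound to a task */
            cycle_event = perf_event_create_kernel_counter(&attr, cpu, -1, NULL);
            return IS_ERR(cycle_event) ? PTR_ERR(cycle_event) : 0;
    }

    static u64 stop_cycle_counter(void)
    {
            u64 enabled, running;
            u64 count = perf_event_read_value(cycle_event, &enabled, &running);

            perf_event_release_kernel(cycle_event);
            return count;
    }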
 
index a990ace1a8380f01901b742a6b0821aff46a5d9d..e79e2f3ccc516e73b42f32c32ddb772c5cb1cc2c 100644 (file)
@@ -99,37 +99,16 @@ struct perf_event_attr;
 #define __SC_TEST6(t6, a6, ...)        __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
 
 #ifdef CONFIG_EVENT_PROFILE
-#define TRACE_SYS_ENTER_PROFILE(sname)                                        \
-static int prof_sysenter_enable_##sname(void)                                 \
-{                                                                             \
-       return reg_prof_syscall_enter("sys"#sname);                            \
-}                                                                             \
-                                                                              \
-static void prof_sysenter_disable_##sname(void)                                       \
-{                                                                             \
-       unreg_prof_syscall_enter("sys"#sname);                                 \
-}
-
-#define TRACE_SYS_EXIT_PROFILE(sname)                                         \
-static int prof_sysexit_enable_##sname(void)                                  \
-{                                                                             \
-       return reg_prof_syscall_exit("sys"#sname);                             \
-}                                                                             \
-                                                                              \
-static void prof_sysexit_disable_##sname(void)                                \
-{                                                                              \
-       unreg_prof_syscall_exit("sys"#sname);                                  \
-}
 
 #define TRACE_SYS_ENTER_PROFILE_INIT(sname)                                   \
        .profile_count = ATOMIC_INIT(-1),                                      \
-       .profile_enable = prof_sysenter_enable_##sname,                        \
-       .profile_disable = prof_sysenter_disable_##sname,
+       .profile_enable = prof_sysenter_enable,                                \
+       .profile_disable = prof_sysenter_disable,
 
 #define TRACE_SYS_EXIT_PROFILE_INIT(sname)                                    \
        .profile_count = ATOMIC_INIT(-1),                                      \
-       .profile_enable = prof_sysexit_enable_##sname,                         \
-       .profile_disable = prof_sysexit_disable_##sname,
+       .profile_enable = prof_sysexit_enable,                                 \
+       .profile_disable = prof_sysexit_disable,
 #else
 #define TRACE_SYS_ENTER_PROFILE(sname)
 #define TRACE_SYS_ENTER_PROFILE_INIT(sname)
@@ -153,74 +132,46 @@ static void prof_sysexit_disable_##sname(void)                                   \
 #define __SC_STR_TDECL6(t, a, ...)     #t, __SC_STR_TDECL5(__VA_ARGS__)
 
 #define SYSCALL_TRACE_ENTER_EVENT(sname)                               \
+       static const struct syscall_metadata __syscall_meta_##sname;    \
        static struct ftrace_event_call event_enter_##sname;            \
-       struct trace_event enter_syscall_print_##sname = {              \
+       static struct trace_event enter_syscall_print_##sname = {       \
                .trace                  = print_syscall_enter,          \
        };                                                              \
-       static int init_enter_##sname(void)                             \
-       {                                                               \
-               int num, id;                                            \
-               num = syscall_name_to_nr("sys"#sname);                  \
-               if (num < 0)                                            \
-                       return -ENOSYS;                                 \
-               id = register_ftrace_event(&enter_syscall_print_##sname);\
-               if (!id)                                                \
-                       return -ENODEV;                                 \
-               event_enter_##sname.id = id;                            \
-               set_syscall_enter_id(num, id);                          \
-               INIT_LIST_HEAD(&event_enter_##sname.fields);            \
-               return 0;                                               \
-       }                                                               \
-       TRACE_SYS_ENTER_PROFILE(sname);                                 \
        static struct ftrace_event_call __used                          \
          __attribute__((__aligned__(4)))                               \
          __attribute__((section("_ftrace_events")))                    \
          event_enter_##sname = {                                       \
                .name                   = "sys_enter"#sname,            \
                .system                 = "syscalls",                   \
-               .event                  = &event_syscall_enter,         \
-               .raw_init               = init_enter_##sname,           \
+               .event                  = &enter_syscall_print_##sname, \
+               .raw_init               = init_syscall_trace,           \
                .show_format            = syscall_enter_format,         \
                .define_fields          = syscall_enter_define_fields,  \
                .regfunc                = reg_event_syscall_enter,      \
                .unregfunc              = unreg_event_syscall_enter,    \
-               .data                   = "sys"#sname,                  \
+               .data                   = (void *)&__syscall_meta_##sname,\
                TRACE_SYS_ENTER_PROFILE_INIT(sname)                     \
        }
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)                                        \
+       static const struct syscall_metadata __syscall_meta_##sname;    \
        static struct ftrace_event_call event_exit_##sname;             \
-       struct trace_event exit_syscall_print_##sname = {               \
+       static struct trace_event exit_syscall_print_##sname = {        \
                .trace                  = print_syscall_exit,           \
        };                                                              \
-       static int init_exit_##sname(void)                              \
-       {                                                               \
-               int num, id;                                            \
-               num = syscall_name_to_nr("sys"#sname);                  \
-               if (num < 0)                                            \
-                       return -ENOSYS;                                 \
-               id = register_ftrace_event(&exit_syscall_print_##sname);\
-               if (!id)                                                \
-                       return -ENODEV;                                 \
-               event_exit_##sname.id = id;                             \
-               set_syscall_exit_id(num, id);                           \
-               INIT_LIST_HEAD(&event_exit_##sname.fields);             \
-               return 0;                                               \
-       }                                                               \
-       TRACE_SYS_EXIT_PROFILE(sname);                                  \
        static struct ftrace_event_call __used                          \
          __attribute__((__aligned__(4)))                               \
          __attribute__((section("_ftrace_events")))                    \
          event_exit_##sname = {                                        \
                .name                   = "sys_exit"#sname,             \
                .system                 = "syscalls",                   \
-               .event                  = &event_syscall_exit,          \
-               .raw_init               = init_exit_##sname,            \
+               .event                  = &exit_syscall_print_##sname,  \
+               .raw_init               = init_syscall_trace,           \
                .show_format            = syscall_exit_format,          \
                .define_fields          = syscall_exit_define_fields,   \
                .regfunc                = reg_event_syscall_exit,       \
                .unregfunc              = unreg_event_syscall_exit,     \
-               .data                   = "sys"#sname,                  \
+               .data                   = (void *)&__syscall_meta_##sname,\
                TRACE_SYS_EXIT_PROFILE_INIT(sname)                      \
        }
 
index 2aac8a83e89b9a7994b7570ffb3ea879d763ee23..f59604ed0ec606c75449d6b2cd8416abf6f7f38f 100644 (file)
@@ -280,6 +280,12 @@ static inline void tracepoint_synchronize_unregister(void)
  * TRACE_EVENT_FN to perform any (un)registration work.
  */
 
+#define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print)
+#define DEFINE_EVENT(template, name, proto, args)              \
+       DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+
 #define TRACE_EVENT(name, proto, args, struct, assign, print)  \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define TRACE_EVENT_FN(name, proto, args, struct,              \
index 2a4b3bf740336b23418ae972745f8704a2f420fb..5acfb1eb4df91cd096da3ee728bd636955247b0f 100644 (file)
                assign, print, reg, unreg)                      \
        DEFINE_TRACE_FN(name, reg, unreg)
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args) \
+       DEFINE_TRACE(name)
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DEFINE_TRACE(name)
+
 #undef DECLARE_TRACE
 #define DECLARE_TRACE(name, proto, args)       \
        DEFINE_TRACE(name)
@@ -63,6 +71,9 @@
 
 #undef TRACE_EVENT
 #undef TRACE_EVENT_FN
+#undef DECLARE_EVENT_CLASS
+#undef DEFINE_EVENT
+#undef DEFINE_EVENT_PRINT
 #undef TRACE_HEADER_MULTI_READ
 
 /* Only undef what we defined in this file */
index 8abd620a490ef7c3cef60dfd3c1c4030b73940bf..1af72dc2427881d203c1984081e4691d23bc0c30 100644 (file)
@@ -13,7 +13,7 @@ TRACE_EVENT(lock_kernel,
        TP_ARGS(func, file, line),
 
        TP_STRUCT__entry(
-               __field(        int,            lock_depth              )
+               __field(        int,            depth                   )
                __field_ext(    const char *,   func, FILTER_PTR_STRING )
                __field_ext(    const char *,   file, FILTER_PTR_STRING )
                __field(        int,            line                    )
@@ -21,13 +21,13 @@ TRACE_EVENT(lock_kernel,
 
        TP_fast_assign(
                /* We want to record the lock_depth after lock is acquired */
-               __entry->lock_depth = current->lock_depth + 1;
+               __entry->depth = current->lock_depth + 1;
                __entry->func = func;
                __entry->file = file;
                __entry->line = line;
        ),
 
-       TP_printk("depth: %d, %s:%d %s()", __entry->lock_depth,
+       TP_printk("depth=%d file:line=%s:%d func=%s()", __entry->depth,
                  __entry->file, __entry->line, __entry->func)
 );
 
@@ -38,20 +38,20 @@ TRACE_EVENT(unlock_kernel,
        TP_ARGS(func, file, line),
 
        TP_STRUCT__entry(
-               __field(int,            lock_depth)
-               __field(const char *,   func)
-               __field(const char *,   file)
-               __field(int,            line)
+               __field(int,            depth           )
+               __field(const char *,   func            )
+               __field(const char *,   file            )
+               __field(int,            line            )
        ),
 
        TP_fast_assign(
-               __entry->lock_depth = current->lock_depth;
+               __entry->depth = current->lock_depth;
                __entry->func = func;
                __entry->file = file;
                __entry->line = line;
        ),
 
-       TP_printk("depth: %d, %s:%d %s()", __entry->lock_depth,
+       TP_printk("depth=%d file:line=%s:%d func=%s()", __entry->depth,
                  __entry->file, __entry->line, __entry->func)
 );
 
index 00405b5f624a2d742a1d95d2a6fb59d530b6e369..5fb72733331e4e8a16d0144ad97097b174de80a8 100644 (file)
@@ -8,7 +8,7 @@
 #include <linux/blkdev.h>
 #include <linux/tracepoint.h>
 
-TRACE_EVENT(block_rq_abort,
+DECLARE_EVENT_CLASS(block_rq_with_error,
 
        TP_PROTO(struct request_queue *q, struct request *rq),
 
@@ -40,41 +40,28 @@ TRACE_EVENT(block_rq_abort,
                  __entry->nr_sector, __entry->errors)
 );
 
-TRACE_EVENT(block_rq_insert,
+DEFINE_EVENT(block_rq_with_error, block_rq_abort,
 
        TP_PROTO(struct request_queue *q, struct request *rq),
 
-       TP_ARGS(q, rq),
+       TP_ARGS(q, rq)
+);
 
-       TP_STRUCT__entry(
-               __field(  dev_t,        dev                     )
-               __field(  sector_t,     sector                  )
-               __field(  unsigned int, nr_sector               )
-               __field(  unsigned int, bytes                   )
-               __array(  char,         rwbs,   6               )
-               __array(  char,         comm,   TASK_COMM_LEN   )
-               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
-       ),
+DEFINE_EVENT(block_rq_with_error, block_rq_requeue,
 
-       TP_fast_assign(
-               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
-               __entry->sector    = blk_pc_request(rq) ? 0 : blk_rq_pos(rq);
-               __entry->nr_sector = blk_pc_request(rq) ? 0 : blk_rq_sectors(rq);
-               __entry->bytes     = blk_pc_request(rq) ? blk_rq_bytes(rq) : 0;
+       TP_PROTO(struct request_queue *q, struct request *rq),
 
-               blk_fill_rwbs_rq(__entry->rwbs, rq);
-               blk_dump_cmd(__get_str(cmd), rq);
-               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
-       ),
+       TP_ARGS(q, rq)
+);
 
-       TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->rwbs, __entry->bytes, __get_str(cmd),
-                 (unsigned long long)__entry->sector,
-                 __entry->nr_sector, __entry->comm)
+DEFINE_EVENT(block_rq_with_error, block_rq_complete,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq)
 );
 
-TRACE_EVENT(block_rq_issue,
+DECLARE_EVENT_CLASS(block_rq,
 
        TP_PROTO(struct request_queue *q, struct request *rq),
 
@@ -86,7 +73,7 @@ TRACE_EVENT(block_rq_issue,
                __field(  unsigned int, nr_sector               )
                __field(  unsigned int, bytes                   )
                __array(  char,         rwbs,   6               )
-               __array(  char,         comm,   TASK_COMM_LEN   )
+               __array(  char,         comm,   TASK_COMM_LEN   )
                __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
        ),
 
@@ -108,68 +95,18 @@ TRACE_EVENT(block_rq_issue,
                  __entry->nr_sector, __entry->comm)
 );
 
-TRACE_EVENT(block_rq_requeue,
+DEFINE_EVENT(block_rq, block_rq_insert,
 
        TP_PROTO(struct request_queue *q, struct request *rq),
 
-       TP_ARGS(q, rq),
-
-       TP_STRUCT__entry(
-               __field(  dev_t,        dev                     )
-               __field(  sector_t,     sector                  )
-               __field(  unsigned int, nr_sector               )
-               __field(  int,          errors                  )
-               __array(  char,         rwbs,   6               )
-               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
-               __entry->sector    = blk_pc_request(rq) ? 0 : blk_rq_pos(rq);
-               __entry->nr_sector = blk_pc_request(rq) ? 0 : blk_rq_sectors(rq);
-               __entry->errors    = rq->errors;
-
-               blk_fill_rwbs_rq(__entry->rwbs, rq);
-               blk_dump_cmd(__get_str(cmd), rq);
-       ),
-
-       TP_printk("%d,%d %s (%s) %llu + %u [%d]",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->rwbs, __get_str(cmd),
-                 (unsigned long long)__entry->sector,
-                 __entry->nr_sector, __entry->errors)
+       TP_ARGS(q, rq)
 );
 
-TRACE_EVENT(block_rq_complete,
+DEFINE_EVENT(block_rq, block_rq_issue,
 
        TP_PROTO(struct request_queue *q, struct request *rq),
 
-       TP_ARGS(q, rq),
-
-       TP_STRUCT__entry(
-               __field(  dev_t,        dev                     )
-               __field(  sector_t,     sector                  )
-               __field(  unsigned int, nr_sector               )
-               __field(  int,          errors                  )
-               __array(  char,         rwbs,   6               )
-               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
-               __entry->sector    = blk_pc_request(rq) ? 0 : blk_rq_pos(rq);
-               __entry->nr_sector = blk_pc_request(rq) ? 0 : blk_rq_sectors(rq);
-               __entry->errors    = rq->errors;
-
-               blk_fill_rwbs_rq(__entry->rwbs, rq);
-               blk_dump_cmd(__get_str(cmd), rq);
-       ),
-
-       TP_printk("%d,%d %s (%s) %llu + %u [%d]",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->rwbs, __get_str(cmd),
-                 (unsigned long long)__entry->sector,
-                 __entry->nr_sector, __entry->errors)
+       TP_ARGS(q, rq)
 );
 
 TRACE_EVENT(block_bio_bounce,
@@ -228,7 +165,7 @@ TRACE_EVENT(block_bio_complete,
                  __entry->nr_sector, __entry->error)
 );
 
-TRACE_EVENT(block_bio_backmerge,
+DECLARE_EVENT_CLASS(block_bio,
 
        TP_PROTO(struct request_queue *q, struct bio *bio),
 
@@ -256,63 +193,28 @@ TRACE_EVENT(block_bio_backmerge,
                  __entry->nr_sector, __entry->comm)
 );
 
-TRACE_EVENT(block_bio_frontmerge,
+DEFINE_EVENT(block_bio, block_bio_backmerge,
 
        TP_PROTO(struct request_queue *q, struct bio *bio),
 
-       TP_ARGS(q, bio),
-
-       TP_STRUCT__entry(
-               __field( dev_t,         dev                     )
-               __field( sector_t,      sector                  )
-               __field( unsigned,      nr_sector               )
-               __array( char,          rwbs,   6               )
-               __array( char,          comm,   TASK_COMM_LEN   )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = bio->bi_bdev->bd_dev;
-               __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
-               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
-               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
-       ),
-
-       TP_printk("%d,%d %s %llu + %u [%s]",
-                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-                 (unsigned long long)__entry->sector,
-                 __entry->nr_sector, __entry->comm)
+       TP_ARGS(q, bio)
 );
 
-TRACE_EVENT(block_bio_queue,
+DEFINE_EVENT(block_bio, block_bio_frontmerge,
 
        TP_PROTO(struct request_queue *q, struct bio *bio),
 
-       TP_ARGS(q, bio),
+       TP_ARGS(q, bio)
+);
 
-       TP_STRUCT__entry(
-               __field( dev_t,         dev                     )
-               __field( sector_t,      sector                  )
-               __field( unsigned int,  nr_sector               )
-               __array( char,          rwbs,   6               )
-               __array( char,          comm,   TASK_COMM_LEN   )
-       ),
+DEFINE_EVENT(block_bio, block_bio_queue,
 
-       TP_fast_assign(
-               __entry->dev            = bio->bi_bdev->bd_dev;
-               __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
-               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
-               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
-       ),
+       TP_PROTO(struct request_queue *q, struct bio *bio),
 
-       TP_printk("%d,%d %s %llu + %u [%s]",
-                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-                 (unsigned long long)__entry->sector,
-                 __entry->nr_sector, __entry->comm)
+       TP_ARGS(q, bio)
 );
 
-TRACE_EVENT(block_getrq,
+DECLARE_EVENT_CLASS(block_get_rq,
 
        TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
 
@@ -341,33 +243,18 @@ TRACE_EVENT(block_getrq,
                  __entry->nr_sector, __entry->comm)
 );
 
-TRACE_EVENT(block_sleeprq,
+DEFINE_EVENT(block_get_rq, block_getrq,
 
        TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
 
-       TP_ARGS(q, bio, rw),
+       TP_ARGS(q, bio, rw)
+);
 
-       TP_STRUCT__entry(
-               __field( dev_t,         dev                     )
-               __field( sector_t,      sector                  )
-               __field( unsigned int,  nr_sector               )
-               __array( char,          rwbs,   6               )
-               __array( char,          comm,   TASK_COMM_LEN   )
-       ),
+DEFINE_EVENT(block_get_rq, block_sleeprq,
 
-       TP_fast_assign(
-               __entry->dev            = bio ? bio->bi_bdev->bd_dev : 0;
-               __entry->sector         = bio ? bio->bi_sector : 0;
-               __entry->nr_sector      = bio ? bio->bi_size >> 9 : 0;
-               blk_fill_rwbs(__entry->rwbs,
-                           bio ? bio->bi_rw : 0, __entry->nr_sector);
-               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
-       ),
+       TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
 
-       TP_printk("%d,%d %s %llu + %u [%s]",
-                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-                 (unsigned long long)__entry->sector,
-                 __entry->nr_sector, __entry->comm)
+       TP_ARGS(q, bio, rw)
 );
 
 TRACE_EVENT(block_plug,
@@ -387,7 +274,7 @@ TRACE_EVENT(block_plug,
        TP_printk("[%s]", __entry->comm)
 );
 
-TRACE_EVENT(block_unplug_timer,
+DECLARE_EVENT_CLASS(block_unplug,
 
        TP_PROTO(struct request_queue *q),
 
@@ -406,23 +293,18 @@ TRACE_EVENT(block_unplug_timer,
        TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
 );
 
-TRACE_EVENT(block_unplug_io,
+DEFINE_EVENT(block_unplug, block_unplug_timer,
 
        TP_PROTO(struct request_queue *q),
 
-       TP_ARGS(q),
+       TP_ARGS(q)
+);
 
-       TP_STRUCT__entry(
-               __field( int,           nr_rq                   )
-               __array( char,          comm,   TASK_COMM_LEN   )
-       ),
+DEFINE_EVENT(block_unplug, block_unplug_io,
 
-       TP_fast_assign(
-               __entry->nr_rq  = q->rq.count[READ] + q->rq.count[WRITE];
-               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
-       ),
+       TP_PROTO(struct request_queue *q),
 
-       TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+       TP_ARGS(q)
 );
 
 TRACE_EVENT(block_split,
index d09550bf3f951ec4a9230f9156a5fd29190b9e13..318f76535bd44c888d2a1045f0f4a520d3c5799a 100644 (file)
@@ -90,7 +90,7 @@ TRACE_EVENT(ext4_allocate_inode,
                  (unsigned long) __entry->dir, __entry->mode)
 );
 
-TRACE_EVENT(ext4_write_begin,
+DECLARE_EVENT_CLASS(ext4__write_begin,
 
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int flags),
@@ -118,7 +118,23 @@ TRACE_EVENT(ext4_write_begin,
                  __entry->pos, __entry->len, __entry->flags)
 );
 
-TRACE_EVENT(ext4_ordered_write_end,
+DEFINE_EVENT(ext4__write_begin, ext4_write_begin,
+
+       TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+                unsigned int flags),
+
+       TP_ARGS(inode, pos, len, flags)
+);
+
+DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,
+
+       TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+                unsigned int flags),
+
+       TP_ARGS(inode, pos, len, flags)
+);
+
+DECLARE_EVENT_CLASS(ext4__write_end,
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                        unsigned int copied),
 
@@ -145,57 +161,36 @@ TRACE_EVENT(ext4_ordered_write_end,
                  __entry->pos, __entry->len, __entry->copied)
 );
 
-TRACE_EVENT(ext4_writeback_write_end,
+DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
+
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),
 
-       TP_ARGS(inode, pos, len, copied),
+       TP_ARGS(inode, pos, len, copied)
+);
 
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
-               __field(        loff_t, pos                     )
-               __field(        unsigned int, len               )
-               __field(        unsigned int, copied            )
-       ),
+DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end,
 
-       TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
-               __entry->ino    = inode->i_ino;
-               __entry->pos    = pos;
-               __entry->len    = len;
-               __entry->copied = copied;
-       ),
+       TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+                unsigned int copied),
 
-       TP_printk("dev %s ino %lu pos %llu len %u copied %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pos, __entry->len, __entry->copied)
+       TP_ARGS(inode, pos, len, copied)
 );
 
-TRACE_EVENT(ext4_journalled_write_end,
+DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,
+
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),
-       TP_ARGS(inode, pos, len, copied),
 
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
-               __field(        loff_t, pos                     )
-               __field(        unsigned int, len               )
-               __field(        unsigned int, copied            )
-       ),
+       TP_ARGS(inode, pos, len, copied)
+);
 
-       TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
-               __entry->ino    = inode->i_ino;
-               __entry->pos    = pos;
-               __entry->len    = len;
-               __entry->copied = copied;
-       ),
+DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
 
-       TP_printk("dev %s ino %lu pos %llu len %u copied %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pos, __entry->len, __entry->copied)
+       TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
+                unsigned int copied),
+
+       TP_ARGS(inode, pos, len, copied)
 );
 
 TRACE_EVENT(ext4_writepage,
@@ -337,60 +332,6 @@ TRACE_EVENT(ext4_da_writepages_result,
                  (unsigned long) __entry->writeback_index)
 );
 
-TRACE_EVENT(ext4_da_write_begin,
-       TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-                       unsigned int flags),
-
-       TP_ARGS(inode, pos, len, flags),
-
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
-               __field(        loff_t, pos                     )
-               __field(        unsigned int, len               )
-               __field(        unsigned int, flags             )
-       ),
-
-       TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
-               __entry->ino    = inode->i_ino;
-               __entry->pos    = pos;
-               __entry->len    = len;
-               __entry->flags  = flags;
-       ),
-
-       TP_printk("dev %s ino %lu pos %llu len %u flags %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pos, __entry->len, __entry->flags)
-);
-
-TRACE_EVENT(ext4_da_write_end,
-       TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-                       unsigned int copied),
-
-       TP_ARGS(inode, pos, len, copied),
-
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
-               __field(        loff_t, pos                     )
-               __field(        unsigned int, len               )
-               __field(        unsigned int, copied            )
-       ),
-
-       TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
-               __entry->ino    = inode->i_ino;
-               __entry->pos    = pos;
-               __entry->len    = len;
-               __entry->copied = copied;
-       ),
-
-       TP_printk("dev %s ino %lu pos %llu len %u copied %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pos, __entry->len, __entry->copied)
-);
-
 TRACE_EVENT(ext4_discard_blocks,
        TP_PROTO(struct super_block *sb, unsigned long long blk,
                        unsigned long long count),
index b89f9db4a404eaeb1800d2424eae5bd6e26387d4..0e4cfb694fe70630457af67e1b1bc568f56c9b09 100644 (file)
@@ -48,7 +48,7 @@ TRACE_EVENT(irq_handler_entry,
                __assign_str(name, action->name);
        ),
 
-       TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name))
+       TP_printk("irq=%d name=%s", __entry->irq, __get_str(name))
 );
 
 /**
@@ -78,22 +78,11 @@ TRACE_EVENT(irq_handler_exit,
                __entry->ret    = ret;
        ),
 
-       TP_printk("irq=%d return=%s",
+       TP_printk("irq=%d ret=%s",
                  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
-/**
- * softirq_entry - called immediately before the softirq handler
- * @h: pointer to struct softirq_action
- * @vec: pointer to first struct softirq_action in softirq_vec array
- *
- * The @h parameter, contains a pointer to the struct softirq_action
- * which has a pointer to the action handler that is called. By subtracting
- * the @vec pointer from the @h pointer, we can determine the softirq
- * number. Also, when used in combination with the softirq_exit tracepoint
- * we can determine the softirq latency.
- */
-TRACE_EVENT(softirq_entry,
+DECLARE_EVENT_CLASS(softirq,
 
        TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
 
@@ -107,10 +96,28 @@ TRACE_EVENT(softirq_entry,
                __entry->vec = (int)(h - vec);
        ),
 
-       TP_printk("softirq=%d action=%s", __entry->vec,
+       TP_printk("vec=%d [action=%s]", __entry->vec,
                  show_softirq_name(__entry->vec))
 );
 
+/**
+ * softirq_entry - called immediately before the softirq handler
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the struct softirq_action
+ * which has a pointer to the action handler that is called. By subtracting
+ * the @vec pointer from the @h pointer, we can determine the softirq
+ * number. When used in combination with the softirq_exit tracepoint,
+ * we can also determine the softirq latency.
+ */
+DEFINE_EVENT(softirq, softirq_entry,
+
+       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+       TP_ARGS(h, vec)
+);
+
 /**
  * softirq_exit - called immediately after the softirq handler returns
  * @h: pointer to struct softirq_action
@@ -122,22 +129,11 @@ TRACE_EVENT(softirq_entry,
  * combination with the softirq_entry tracepoint we can determine the softirq
  * latency.
  */
-TRACE_EVENT(softirq_exit,
+DEFINE_EVENT(softirq, softirq_exit,
 
        TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
 
-       TP_ARGS(h, vec),
-
-       TP_STRUCT__entry(
-               __field(        int,    vec                     )
-       ),
-
-       TP_fast_assign(
-               __entry->vec = (int)(h - vec);
-       ),
-
-       TP_printk("softirq=%d action=%s", __entry->vec,
-                 show_softirq_name(__entry->vec))
+       TP_ARGS(h, vec)
 );
 
 #endif /*  _TRACE_IRQ_H */
index 3c60b75adb9e226de91d34cb9c635fbc999a9fa6..96b370a050deb27ca6fe8393990e52734dbbff09 100644 (file)
@@ -30,7 +30,7 @@ TRACE_EVENT(jbd2_checkpoint,
                  jbd2_dev_to_name(__entry->dev), __entry->result)
 );
 
-TRACE_EVENT(jbd2_start_commit,
+DECLARE_EVENT_CLASS(jbd2_commit,
 
        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
 
@@ -53,73 +53,32 @@ TRACE_EVENT(jbd2_start_commit,
                  __entry->sync_commit)
 );
 
-TRACE_EVENT(jbd2_commit_locking,
+DEFINE_EVENT(jbd2_commit, jbd2_start_commit,
 
        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
 
-       TP_ARGS(journal, commit_transaction),
-
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        char,   sync_commit               )
-               __field(        int,    transaction               )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = journal->j_fs_dev->bd_dev;
-               __entry->sync_commit = commit_transaction->t_synchronous_commit;
-               __entry->transaction    = commit_transaction->t_tid;
-       ),
-
-       TP_printk("dev %s transaction %d sync %d",
-                 jbd2_dev_to_name(__entry->dev), __entry->transaction,
-                 __entry->sync_commit)
+       TP_ARGS(journal, commit_transaction)
 );
 
-TRACE_EVENT(jbd2_commit_flushing,
+DEFINE_EVENT(jbd2_commit, jbd2_commit_locking,
 
        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
 
-       TP_ARGS(journal, commit_transaction),
-
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        char,   sync_commit               )
-               __field(        int,    transaction               )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = journal->j_fs_dev->bd_dev;
-               __entry->sync_commit = commit_transaction->t_synchronous_commit;
-               __entry->transaction    = commit_transaction->t_tid;
-       ),
-
-       TP_printk("dev %s transaction %d sync %d",
-                 jbd2_dev_to_name(__entry->dev), __entry->transaction,
-                 __entry->sync_commit)
+       TP_ARGS(journal, commit_transaction)
 );
 
-TRACE_EVENT(jbd2_commit_logging,
+DEFINE_EVENT(jbd2_commit, jbd2_commit_flushing,
 
        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
 
-       TP_ARGS(journal, commit_transaction),
+       TP_ARGS(journal, commit_transaction)
+);
 
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        char,   sync_commit               )
-               __field(        int,    transaction               )
-       ),
+DEFINE_EVENT(jbd2_commit, jbd2_commit_logging,
 
-       TP_fast_assign(
-               __entry->dev            = journal->j_fs_dev->bd_dev;
-               __entry->sync_commit = commit_transaction->t_synchronous_commit;
-               __entry->transaction    = commit_transaction->t_tid;
-       ),
+       TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
 
-       TP_printk("dev %s transaction %d sync %d",
-                 jbd2_dev_to_name(__entry->dev), __entry->transaction,
-                 __entry->sync_commit)
+       TP_ARGS(journal, commit_transaction)
 );
 
 TRACE_EVENT(jbd2_end_commit,
index eaf46bdd18a5f81719b21de90fba91d5e64e78ea..3adca0ca9dbee10479d34d5a3e3562609ef89e86 100644 (file)
@@ -44,7 +44,7 @@
        {(unsigned long)__GFP_MOVABLE,          "GFP_MOVABLE"}          \
        ) : "GFP_NOWAIT"
 
-TRACE_EVENT(kmalloc,
+DECLARE_EVENT_CLASS(kmem_alloc,
 
        TP_PROTO(unsigned long call_site,
                 const void *ptr,
@@ -78,41 +78,23 @@ TRACE_EVENT(kmalloc,
                show_gfp_flags(__entry->gfp_flags))
 );
 
-TRACE_EVENT(kmem_cache_alloc,
+DEFINE_EVENT(kmem_alloc, kmalloc,
 
-       TP_PROTO(unsigned long call_site,
-                const void *ptr,
-                size_t bytes_req,
-                size_t bytes_alloc,
-                gfp_t gfp_flags),
+       TP_PROTO(unsigned long call_site, const void *ptr,
+                size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),
 
-       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
+);
 
-       TP_STRUCT__entry(
-               __field(        unsigned long,  call_site       )
-               __field(        const void *,   ptr             )
-               __field(        size_t,         bytes_req       )
-               __field(        size_t,         bytes_alloc     )
-               __field(        gfp_t,          gfp_flags       )
-       ),
+DEFINE_EVENT(kmem_alloc, kmem_cache_alloc,
 
-       TP_fast_assign(
-               __entry->call_site      = call_site;
-               __entry->ptr            = ptr;
-               __entry->bytes_req      = bytes_req;
-               __entry->bytes_alloc    = bytes_alloc;
-               __entry->gfp_flags      = gfp_flags;
-       ),
+       TP_PROTO(unsigned long call_site, const void *ptr,
+                size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),
 
-       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
-               __entry->call_site,
-               __entry->ptr,
-               __entry->bytes_req,
-               __entry->bytes_alloc,
-               show_gfp_flags(__entry->gfp_flags))
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
 );
 
-TRACE_EVENT(kmalloc_node,
+DECLARE_EVENT_CLASS(kmem_alloc_node,
 
        TP_PROTO(unsigned long call_site,
                 const void *ptr,
@@ -150,45 +132,25 @@ TRACE_EVENT(kmalloc_node,
                __entry->node)
 );
 
-TRACE_EVENT(kmem_cache_alloc_node,
+DEFINE_EVENT(kmem_alloc_node, kmalloc_node,
 
-       TP_PROTO(unsigned long call_site,
-                const void *ptr,
-                size_t bytes_req,
-                size_t bytes_alloc,
-                gfp_t gfp_flags,
-                int node),
+       TP_PROTO(unsigned long call_site, const void *ptr,
+                size_t bytes_req, size_t bytes_alloc,
+                gfp_t gfp_flags, int node),
 
-       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
+);
 
-       TP_STRUCT__entry(
-               __field(        unsigned long,  call_site       )
-               __field(        const void *,   ptr             )
-               __field(        size_t,         bytes_req       )
-               __field(        size_t,         bytes_alloc     )
-               __field(        gfp_t,          gfp_flags       )
-               __field(        int,            node            )
-       ),
+DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node,
 
-       TP_fast_assign(
-               __entry->call_site      = call_site;
-               __entry->ptr            = ptr;
-               __entry->bytes_req      = bytes_req;
-               __entry->bytes_alloc    = bytes_alloc;
-               __entry->gfp_flags      = gfp_flags;
-               __entry->node           = node;
-       ),
+       TP_PROTO(unsigned long call_site, const void *ptr,
+                size_t bytes_req, size_t bytes_alloc,
+                gfp_t gfp_flags, int node),
 
-       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
-               __entry->call_site,
-               __entry->ptr,
-               __entry->bytes_req,
-               __entry->bytes_alloc,
-               show_gfp_flags(__entry->gfp_flags),
-               __entry->node)
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
 );
 
-TRACE_EVENT(kfree,
+DECLARE_EVENT_CLASS(kmem_free,
 
        TP_PROTO(unsigned long call_site, const void *ptr),
 
@@ -207,23 +169,18 @@ TRACE_EVENT(kfree,
        TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
 );
 
-TRACE_EVENT(kmem_cache_free,
+DEFINE_EVENT(kmem_free, kfree,
 
        TP_PROTO(unsigned long call_site, const void *ptr),
 
-       TP_ARGS(call_site, ptr),
+       TP_ARGS(call_site, ptr)
+);
 
-       TP_STRUCT__entry(
-               __field(        unsigned long,  call_site       )
-               __field(        const void *,   ptr             )
-       ),
+DEFINE_EVENT(kmem_free, kmem_cache_free,
 
-       TP_fast_assign(
-               __entry->call_site      = call_site;
-               __entry->ptr            = ptr;
-       ),
+       TP_PROTO(unsigned long call_site, const void *ptr),
 
-       TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+       TP_ARGS(call_site, ptr)
 );
 
 TRACE_EVENT(mm_page_free_direct,
@@ -299,7 +256,7 @@ TRACE_EVENT(mm_page_alloc,
                show_gfp_flags(__entry->gfp_flags))
 );
 
-TRACE_EVENT(mm_page_alloc_zone_locked,
+DECLARE_EVENT_CLASS(mm_page,
 
        TP_PROTO(struct page *page, unsigned int order, int migratetype),
 
@@ -325,29 +282,22 @@ TRACE_EVENT(mm_page_alloc_zone_locked,
                __entry->order == 0)
 );
 
-TRACE_EVENT(mm_page_pcpu_drain,
+DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,
 
-       TP_PROTO(struct page *page, int order, int migratetype),
+       TP_PROTO(struct page *page, unsigned int order, int migratetype),
 
-       TP_ARGS(page, order, migratetype),
+       TP_ARGS(page, order, migratetype)
+);
 
-       TP_STRUCT__entry(
-               __field(        struct page *,  page            )
-               __field(        int,            order           )
-               __field(        int,            migratetype     )
-       ),
+DEFINE_EVENT_PRINT(mm_page, mm_page_pcpu_drain,
 
-       TP_fast_assign(
-               __entry->page           = page;
-               __entry->order          = order;
-               __entry->migratetype    = migratetype;
-       ),
+       TP_PROTO(struct page *page, unsigned int order, int migratetype),
+
+       TP_ARGS(page, order, migratetype),
 
        TP_printk("page=%p pfn=%lu order=%d migratetype=%d",
-               __entry->page,
-               page_to_pfn(__entry->page),
-               __entry->order,
-               __entry->migratetype)
+               __entry->page, page_to_pfn(__entry->page),
+               __entry->order, __entry->migratetype)
 );
 
 TRACE_EVENT(mm_page_alloc_extfrag,
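
The kmem hunks above also introduce DEFINE_EVENT_PRINT (used for mm_page_pcpu_drain) for events that share a class's fields and assignment but need their own output line. A hedged sketch of that variant; the class bar_template and the event bar_drain are hypothetical, not taken from this diff.

/* Hypothetical class, shown only to illustrate DEFINE_EVENT_PRINT. */
DECLARE_EVENT_CLASS(bar_template,

	TP_PROTO(void *ptr, unsigned int order),

	TP_ARGS(ptr, order),

	TP_STRUCT__entry(
		__field(	void *,		ptr	)
		__field(	unsigned int,	order	)
	),

	TP_fast_assign(
		__entry->ptr	= ptr;
		__entry->order	= order;
	),

	TP_printk("ptr=%p order=%u", __entry->ptr, __entry->order)
);

/* Same fields and assignment as the class, but its own TP_printk. */
DEFINE_EVENT_PRINT(bar_template, bar_drain,

	TP_PROTO(void *ptr, unsigned int order),

	TP_ARGS(ptr, order),

	TP_printk("drained ptr=%p order=%u", __entry->ptr, __entry->order)
);
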
similarity index 92%
rename from include/trace/events/lockdep.h
rename to include/trace/events/lock.h
index bcf1d209a00dabf7838aae49c2e5edcc5fc6dde8..a870ba125aa87509c6a49c0036e3c801d7159aea 100644 (file)
@@ -1,8 +1,8 @@
 #undef TRACE_SYSTEM
-#define TRACE_SYSTEM lockdep
+#define TRACE_SYSTEM lock
 
-#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_LOCKDEP_H
+#if !defined(_TRACE_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCK_H
 
 #include <linux/lockdep.h>
 #include <linux/tracepoint.h>
@@ -90,7 +90,7 @@ TRACE_EVENT(lock_acquired,
 #endif
 #endif
 
-#endif /* _TRACE_LOCKDEP_H */
+#endif /* _TRACE_LOCK_H */
 
 /* This part must be outside protection */
 #include <trace/define_trace.h>
diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
new file mode 100644 (file)
index 0000000..7eee778
--- /dev/null
@@ -0,0 +1,69 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mce
+
+#if !defined(_TRACE_MCE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MCE_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+#include <asm/mce.h>
+
+TRACE_EVENT(mce_record,
+
+       TP_PROTO(struct mce *m),
+
+       TP_ARGS(m),
+
+       TP_STRUCT__entry(
+               __field(        u64,            mcgcap          )
+               __field(        u64,            mcgstatus       )
+               __field(        u8,             bank            )
+               __field(        u64,            status          )
+               __field(        u64,            addr            )
+               __field(        u64,            misc            )
+               __field(        u64,            ip              )
+               __field(        u8,             cs              )
+               __field(        u64,            tsc             )
+               __field(        u64,            walltime        )
+               __field(        u32,            cpu             )
+               __field(        u32,            cpuid           )
+               __field(        u32,            apicid          )
+               __field(        u32,            socketid        )
+               __field(        u8,             cpuvendor       )
+       ),
+
+       TP_fast_assign(
+               __entry->mcgcap         = m->mcgcap;
+               __entry->mcgstatus      = m->mcgstatus;
+               __entry->bank           = m->bank;
+               __entry->status         = m->status;
+               __entry->addr           = m->addr;
+               __entry->misc           = m->misc;
+               __entry->ip             = m->ip;
+               __entry->cs             = m->cs;
+               __entry->tsc            = m->tsc;
+               __entry->walltime       = m->time;
+               __entry->cpu            = m->extcpu;
+               __entry->cpuid          = m->cpuid;
+               __entry->apicid         = m->apicid;
+               __entry->socketid       = m->socketid;
+               __entry->cpuvendor      = m->cpuvendor;
+       ),
+
+       TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, ADDR/MISC: %016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x",
+               __entry->cpu,
+               __entry->mcgcap, __entry->mcgstatus,
+               __entry->bank, __entry->status,
+               __entry->addr, __entry->misc,
+               __entry->cs, __entry->ip,
+               __entry->tsc,
+               __entry->cpuvendor, __entry->cpuid,
+               __entry->walltime,
+               __entry->socketid,
+               __entry->apicid)
+);
+
+#endif /* _TRACE_MCE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index 84160fb18478f99da3ae79bc19bc4da6044d8978..4b0f48ba16a688da9ead5b901604419c7823ea3b 100644 (file)
@@ -51,7 +51,7 @@ TRACE_EVENT(module_free,
        TP_printk("%s", __get_str(name))
 );
 
-TRACE_EVENT(module_get,
+DECLARE_EVENT_CLASS(module_refcnt,
 
        TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
 
@@ -73,26 +73,18 @@ TRACE_EVENT(module_get,
                  __get_str(name), (void *)__entry->ip, __entry->refcnt)
 );
 
-TRACE_EVENT(module_put,
+DEFINE_EVENT(module_refcnt, module_get,
 
        TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
 
-       TP_ARGS(mod, ip, refcnt),
+       TP_ARGS(mod, ip, refcnt)
+);
 
-       TP_STRUCT__entry(
-               __field(        unsigned long,  ip              )
-               __field(        int,            refcnt          )
-               __string(       name,           mod->name       )
-       ),
+DEFINE_EVENT(module_refcnt, module_put,
 
-       TP_fast_assign(
-               __entry->ip     = ip;
-               __entry->refcnt = refcnt;
-               __assign_str(name, mod->name);
-       ),
+       TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
 
-       TP_printk("%s call_site=%pf refcnt=%d",
-                 __get_str(name), (void *)__entry->ip, __entry->refcnt)
+       TP_ARGS(mod, ip, refcnt)
 );
 
 TRACE_EVENT(module_request,
index ea6d579261ad017d3c54c61fcecba118b20cd81f..c4efe9b8280d4f9c4261c24de5f80010f1f445ce 100644 (file)
@@ -16,9 +16,7 @@ enum {
 };
 #endif
 
-
-
-TRACE_EVENT(power_start,
+DECLARE_EVENT_CLASS(power,
 
        TP_PROTO(unsigned int type, unsigned int state),
 
@@ -37,42 +35,36 @@ TRACE_EVENT(power_start,
        TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long)__entry->state)
 );
 
-TRACE_EVENT(power_end,
-
-       TP_PROTO(int dummy),
+DEFINE_EVENT(power, power_start,
 
-       TP_ARGS(dummy),
+       TP_PROTO(unsigned int type, unsigned int state),
 
-       TP_STRUCT__entry(
-               __field(        u64,            dummy           )
-       ),
+       TP_ARGS(type, state)
+);
 
-       TP_fast_assign(
-               __entry->dummy = 0xffff;
-       ),
+DEFINE_EVENT(power, power_frequency,
 
-       TP_printk("dummy=%lu", (unsigned long)__entry->dummy)
+       TP_PROTO(unsigned int type, unsigned int state),
 
+       TP_ARGS(type, state)
 );
 
+TRACE_EVENT(power_end,
 
-TRACE_EVENT(power_frequency,
-
-       TP_PROTO(unsigned int type, unsigned int state),
+       TP_PROTO(int dummy),
 
-       TP_ARGS(type, state),
+       TP_ARGS(dummy),
 
        TP_STRUCT__entry(
-               __field(        u64,            type            )
-               __field(        u64,            state           )
+               __field(        u64,            dummy           )
        ),
 
        TP_fast_assign(
-               __entry->type = type;
-               __entry->state = state;
+               __entry->dummy = 0xffff;
        ),
 
-       TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long) __entry->state)
+       TP_printk("dummy=%lu", (unsigned long)__entry->dummy)
+
 );
 
 #endif /* _TRACE_POWER_H */
index 4069c43f4187e522a65435a65f3888fff39a707e..cfceb0b73e205bb936a6e3fcfeb34f5515ab0feb 100644 (file)
@@ -26,7 +26,7 @@ TRACE_EVENT(sched_kthread_stop,
                __entry->pid    = t->pid;
        ),
 
-       TP_printk("task %s:%d", __entry->comm, __entry->pid)
+       TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
 );
 
 /*
@@ -46,7 +46,7 @@ TRACE_EVENT(sched_kthread_stop_ret,
                __entry->ret    = ret;
        ),
 
-       TP_printk("ret %d", __entry->ret)
+       TP_printk("ret=%d", __entry->ret)
 );
 
 /*
@@ -73,7 +73,7 @@ TRACE_EVENT(sched_wait_task,
                __entry->prio   = p->prio;
        ),
 
-       TP_printk("task %s:%d [%d]",
+       TP_printk("comm=%s pid=%d prio=%d",
                  __entry->comm, __entry->pid, __entry->prio)
 );
 
@@ -83,7 +83,7 @@ TRACE_EVENT(sched_wait_task,
  * (NOTE: the 'rq' argument is not used by generic trace events,
  *        but used by the latency tracer plugin. )
  */
-TRACE_EVENT(sched_wakeup,
+DECLARE_EVENT_CLASS(sched_wakeup_template,
 
        TP_PROTO(struct rq *rq, struct task_struct *p, int success),
 
@@ -94,7 +94,7 @@ TRACE_EVENT(sched_wakeup,
                __field(        pid_t,  pid                     )
                __field(        int,    prio                    )
                __field(        int,    success                 )
-               __field(        int,    cpu                     )
+               __field(        int,    target_cpu              )
        ),
 
        TP_fast_assign(
@@ -102,46 +102,27 @@ TRACE_EVENT(sched_wakeup,
                __entry->pid            = p->pid;
                __entry->prio           = p->prio;
                __entry->success        = success;
-               __entry->cpu            = task_cpu(p);
+               __entry->target_cpu     = task_cpu(p);
        ),
 
-       TP_printk("task %s:%d [%d] success=%d [%03d]",
+       TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d",
                  __entry->comm, __entry->pid, __entry->prio,
-                 __entry->success, __entry->cpu)
+                 __entry->success, __entry->target_cpu)
 );
 
+DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
+            TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+            TP_ARGS(rq, p, success));
+
 /*
  * Tracepoint for waking up a new task:
  *
  * (NOTE: the 'rq' argument is not used by generic trace events,
  *        but used by the latency tracer plugin. )
  */
-TRACE_EVENT(sched_wakeup_new,
-
-       TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-
-       TP_ARGS(rq, p, success),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-               __field(        int,    prio                    )
-               __field(        int,    success                 )
-               __field(        int,    cpu                     )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
-               __entry->success        = success;
-               __entry->cpu            = task_cpu(p);
-       ),
-
-       TP_printk("task %s:%d [%d] success=%d [%03d]",
-                 __entry->comm, __entry->pid, __entry->prio,
-                 __entry->success, __entry->cpu)
-);
+DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
+            TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+            TP_ARGS(rq, p, success));
 
 /*
  * Tracepoint for task switches, performed by the scheduler:
@@ -176,7 +157,7 @@ TRACE_EVENT(sched_switch,
                __entry->next_prio      = next->prio;
        ),
 
-       TP_printk("task %s:%d [%d] (%s) ==> %s:%d [%d]",
+       TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d",
                __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
                __entry->prev_state ?
                  __print_flags(__entry->prev_state, "|",
@@ -211,15 +192,12 @@ TRACE_EVENT(sched_migrate_task,
                __entry->dest_cpu       = dest_cpu;
        ),
 
-       TP_printk("task %s:%d [%d] from: %d  to: %d",
+       TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d",
                  __entry->comm, __entry->pid, __entry->prio,
                  __entry->orig_cpu, __entry->dest_cpu)
 );
 
-/*
- * Tracepoint for freeing a task:
- */
-TRACE_EVENT(sched_process_free,
+DECLARE_EVENT_CLASS(sched_process_template,
 
        TP_PROTO(struct task_struct *p),
 
@@ -237,34 +215,24 @@ TRACE_EVENT(sched_process_free,
                __entry->prio           = p->prio;
        ),
 
-       TP_printk("task %s:%d [%d]",
+       TP_printk("comm=%s pid=%d prio=%d",
                  __entry->comm, __entry->pid, __entry->prio)
 );
 
 /*
- * Tracepoint for a task exiting:
+ * Tracepoint for freeing a task:
  */
-TRACE_EVENT(sched_process_exit,
+DEFINE_EVENT(sched_process_template, sched_process_free,
+            TP_PROTO(struct task_struct *p),
+            TP_ARGS(p));
+            
 
-       TP_PROTO(struct task_struct *p),
-
-       TP_ARGS(p),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-               __field(        int,    prio                    )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
-       ),
-
-       TP_printk("task %s:%d [%d]",
-                 __entry->comm, __entry->pid, __entry->prio)
-);
+/*
+ * Tracepoint for a task exiting:
+ */
+DEFINE_EVENT(sched_process_template, sched_process_exit,
+            TP_PROTO(struct task_struct *p),
+            TP_ARGS(p));
 
 /*
  * Tracepoint for a waiting task:
@@ -287,7 +255,7 @@ TRACE_EVENT(sched_process_wait,
                __entry->prio           = current->prio;
        ),
 
-       TP_printk("task %s:%d [%d]",
+       TP_printk("comm=%s pid=%d prio=%d",
                  __entry->comm, __entry->pid, __entry->prio)
 );
 
@@ -314,46 +282,16 @@ TRACE_EVENT(sched_process_fork,
                __entry->child_pid      = child->pid;
        ),
 
-       TP_printk("parent %s:%d  child %s:%d",
+       TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d",
                __entry->parent_comm, __entry->parent_pid,
                __entry->child_comm, __entry->child_pid)
 );
 
-/*
- * Tracepoint for sending a signal:
- */
-TRACE_EVENT(sched_signal_send,
-
-       TP_PROTO(int sig, struct task_struct *p),
-
-       TP_ARGS(sig, p),
-
-       TP_STRUCT__entry(
-               __field(        int,    sig                     )
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid    = p->pid;
-               __entry->sig    = sig;
-       ),
-
-       TP_printk("sig: %d  task %s:%d",
-                 __entry->sig, __entry->comm, __entry->pid)
-);
-
 /*
  * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
  *     adding sched_stat support to SCHED_FIFO/RR would be welcome.
  */
-
-/*
- * Tracepoint for accounting wait time (time the task is runnable
- * but not actually running due to scheduler contention).
- */
-TRACE_EVENT(sched_stat_wait,
+DECLARE_EVENT_CLASS(sched_stat_template,
 
        TP_PROTO(struct task_struct *tsk, u64 delay),
 
@@ -374,11 +312,36 @@ TRACE_EVENT(sched_stat_wait,
                __perf_count(delay);
        ),
 
-       TP_printk("task: %s:%d wait: %Lu [ns]",
+       TP_printk("comm=%s pid=%d delay=%Lu [ns]",
                        __entry->comm, __entry->pid,
                        (unsigned long long)__entry->delay)
 );
 
+
+/*
+ * Tracepoint for accounting wait time (time the task is runnable
+ * but not actually running due to scheduler contention).
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_wait,
+            TP_PROTO(struct task_struct *tsk, u64 delay),
+            TP_ARGS(tsk, delay));
+
+/*
+ * Tracepoint for accounting sleep time (time the task is not runnable,
+ * including iowait, see below).
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_sleep,
+            TP_PROTO(struct task_struct *tsk, u64 delay),
+            TP_ARGS(tsk, delay));
+
+/*
+ * Tracepoint for accounting iowait time (time the task is not runnable
+ * due to waiting on IO to complete).
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_iowait,
+            TP_PROTO(struct task_struct *tsk, u64 delay),
+            TP_ARGS(tsk, delay));
+
 /*
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
@@ -406,72 +369,12 @@ TRACE_EVENT(sched_stat_runtime,
                __perf_count(runtime);
        ),
 
-       TP_printk("task: %s:%d runtime: %Lu [ns], vruntime: %Lu [ns]",
+       TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
                        __entry->comm, __entry->pid,
                        (unsigned long long)__entry->runtime,
                        (unsigned long long)__entry->vruntime)
 );
 
-/*
- * Tracepoint for accounting sleep time (time the task is not runnable,
- * including iowait, see below).
- */
-TRACE_EVENT(sched_stat_sleep,
-
-       TP_PROTO(struct task_struct *tsk, u64 delay),
-
-       TP_ARGS(tsk, delay),
-
-       TP_STRUCT__entry(
-               __array( char,  comm,   TASK_COMM_LEN   )
-               __field( pid_t, pid                     )
-               __field( u64,   delay                   )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
-               __entry->pid    = tsk->pid;
-               __entry->delay  = delay;
-       )
-       TP_perf_assign(
-               __perf_count(delay);
-       ),
-
-       TP_printk("task: %s:%d sleep: %Lu [ns]",
-                       __entry->comm, __entry->pid,
-                       (unsigned long long)__entry->delay)
-);
-
-/*
- * Tracepoint for accounting iowait time (time the task is not runnable
- * due to waiting on IO to complete).
- */
-TRACE_EVENT(sched_stat_iowait,
-
-       TP_PROTO(struct task_struct *tsk, u64 delay),
-
-       TP_ARGS(tsk, delay),
-
-       TP_STRUCT__entry(
-               __array( char,  comm,   TASK_COMM_LEN   )
-               __field( pid_t, pid                     )
-               __field( u64,   delay                   )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
-               __entry->pid    = tsk->pid;
-               __entry->delay  = delay;
-       )
-       TP_perf_assign(
-               __perf_count(delay);
-       ),
-
-       TP_printk("task: %s:%d iowait: %Lu [ns]",
-                       __entry->comm, __entry->pid,
-                       (unsigned long long)__entry->delay)
-);
-
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
new file mode 100644 (file)
index 0000000..a510b75
--- /dev/null
@@ -0,0 +1,173 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM signal
+
+#if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SIGNAL_H
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#define TP_STORE_SIGINFO(__entry, info)                                \
+       do {                                                    \
+               if (info == SEND_SIG_NOINFO) {                  \
+                       __entry->errno  = 0;                    \
+                       __entry->code   = SI_USER;              \
+               } else if (info == SEND_SIG_PRIV) {             \
+                       __entry->errno  = 0;                    \
+                       __entry->code   = SI_KERNEL;            \
+               } else {                                        \
+                       __entry->errno  = info->si_errno;       \
+                       __entry->code   = info->si_code;        \
+               }                                               \
+       } while (0)
+
+/**
+ * signal_generate - called when a signal is generated
+ * @sig: signal number
+ * @info: pointer to struct siginfo
+ * @task: pointer to struct task_struct
+ *
+ * The current process sends a 'sig' signal to the 'task' process with
+ * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV,
+ * 'info' is not a pointer and you can't access its fields. Instead,
+ * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV
+ * means that si_code is SI_KERNEL.
+ */
+TRACE_EVENT(signal_generate,
+
+       TP_PROTO(int sig, struct siginfo *info, struct task_struct *task),
+
+       TP_ARGS(sig, info, task),
+
+       TP_STRUCT__entry(
+               __field(        int,    sig                     )
+               __field(        int,    errno                   )
+               __field(        int,    code                    )
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+       ),
+
+       TP_fast_assign(
+               __entry->sig    = sig;
+               TP_STORE_SIGINFO(__entry, info);
+               memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+               __entry->pid    = task->pid;
+       ),
+
+       TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d",
+                 __entry->sig, __entry->errno, __entry->code,
+                 __entry->comm, __entry->pid)
+);
+
+/**
+ * signal_deliver - called when a signal is delivered
+ * @sig: signal number
+ * @info: pointer to struct siginfo
+ * @ka: pointer to struct k_sigaction
+ *
+ * A 'sig' signal is delivered to the current process with 'info' siginfo,
+ * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or
+ * SIG_DFL.
+ * Note that some signals reported by the signal_generate tracepoint can be
+ * lost, ignored or modified (by a debugger) before hitting this tracepoint.
+ * This means the tracepoint shows which signals are actually delivered, but
+ * matching generated signals to delivered signals may not always be possible.
+ */
+TRACE_EVENT(signal_deliver,
+
+       TP_PROTO(int sig, struct siginfo *info, struct k_sigaction *ka),
+
+       TP_ARGS(sig, info, ka),
+
+       TP_STRUCT__entry(
+               __field(        int,            sig             )
+               __field(        int,            errno           )
+               __field(        int,            code            )
+               __field(        unsigned long,  sa_handler      )
+               __field(        unsigned long,  sa_flags        )
+       ),
+
+       TP_fast_assign(
+               __entry->sig    = sig;
+               TP_STORE_SIGINFO(__entry, info);
+               __entry->sa_handler     = (unsigned long)ka->sa.sa_handler;
+               __entry->sa_flags       = ka->sa.sa_flags;
+       ),
+
+       TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx",
+                 __entry->sig, __entry->errno, __entry->code,
+                 __entry->sa_handler, __entry->sa_flags)
+);
+
+/**
+ * signal_overflow_fail - called when the signal queue overflows
+ * @sig: signal number
+ * @group: signal to process group or not (bool)
+ * @info: pointer to struct siginfo
+ *
+ * The kernel fails to generate the 'sig' signal with 'info' siginfo because
+ * the siginfo queue has overflowed, and the signal is dropped.
+ * 'group' is not 0 if the signal will be sent to a process group.
+ * 'sig' is always one of the RT signals.
+ */
+TRACE_EVENT(signal_overflow_fail,
+
+       TP_PROTO(int sig, int group, struct siginfo *info),
+
+       TP_ARGS(sig, group, info),
+
+       TP_STRUCT__entry(
+               __field(        int,    sig     )
+               __field(        int,    group   )
+               __field(        int,    errno   )
+               __field(        int,    code    )
+       ),
+
+       TP_fast_assign(
+               __entry->sig    = sig;
+               __entry->group  = group;
+               TP_STORE_SIGINFO(__entry, info);
+       ),
+
+       TP_printk("sig=%d group=%d errno=%d code=%d",
+                 __entry->sig, __entry->group, __entry->errno, __entry->code)
+);
+
+/**
+ * signal_lose_info - called when siginfo is lost
+ * @sig: signal number
+ * @group: signal to process group or not (bool)
+ * @info: pointer to struct siginfo
+ *
+ * The kernel generates the 'sig' signal but loses the 'info' siginfo because
+ * the siginfo queue has overflowed.
+ * 'group' is not 0 if the signal will be sent to a process group.
+ * 'sig' is always one of the non-RT signals.
+ */
+TRACE_EVENT(signal_lose_info,
+
+       TP_PROTO(int sig, int group, struct siginfo *info),
+
+       TP_ARGS(sig, group, info),
+
+       TP_STRUCT__entry(
+               __field(        int,    sig     )
+               __field(        int,    group   )
+               __field(        int,    errno   )
+               __field(        int,    code    )
+       ),
+
+       TP_fast_assign(
+               __entry->sig    = sig;
+               __entry->group  = group;
+               TP_STORE_SIGINFO(__entry, info);
+       ),
+
+       TP_printk("sig=%d group=%d errno=%d code=%d",
+                 __entry->sig, __entry->group, __entry->errno, __entry->code)
+);
+#endif /* _TRACE_SIGNAL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
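
The new header above only declares the signal events; the hooks that actually fire them are placed in kernel/signal.c by other patches in this merge and are not shown in this excerpt. As a rough sketch of how a declared event is fired, the function below is a hypothetical stand-in, not the real kernel code.

/*
 * Hypothetical call site; the real hooks live in kernel/signal.c.
 * Exactly one .c file must define CREATE_TRACE_POINTS before including
 * the trace header so that the tracepoints are instantiated.
 */
#include <trace/events/signal.h>

static void example_queue_signal(int sig, struct siginfo *info,
				 struct task_struct *t)
{
	/* TRACE_EVENT(signal_generate, ...) generates trace_signal_generate(). */
	trace_signal_generate(sig, info, t);
}
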
index 1844c48d640e3477edf791868e4e72248f7e3381..e5ce87a0498da94355af6e0cfdda0bb88e843cec 100644 (file)
@@ -26,7 +26,7 @@ TRACE_EVENT(timer_init,
                __entry->timer  = timer;
        ),
 
-       TP_printk("timer %p", __entry->timer)
+       TP_printk("timer=%p", __entry->timer)
 );
 
 /**
@@ -54,7 +54,7 @@ TRACE_EVENT(timer_start,
                __entry->now            = jiffies;
        ),
 
-       TP_printk("timer %p: func %pf, expires %lu, timeout %ld",
+       TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld]",
                  __entry->timer, __entry->function, __entry->expires,
                  (long)__entry->expires - __entry->now)
 );
@@ -81,7 +81,7 @@ TRACE_EVENT(timer_expire_entry,
                __entry->now            = jiffies;
        ),
 
-       TP_printk("timer %p: now %lu", __entry->timer, __entry->now)
+       TP_printk("timer=%p now=%lu", __entry->timer, __entry->now)
 );
 
 /**
@@ -108,7 +108,7 @@ TRACE_EVENT(timer_expire_exit,
                __entry->timer  = timer;
        ),
 
-       TP_printk("timer %p", __entry->timer)
+       TP_printk("timer=%p", __entry->timer)
 );
 
 /**
@@ -129,7 +129,7 @@ TRACE_EVENT(timer_cancel,
                __entry->timer  = timer;
        ),
 
-       TP_printk("timer %p", __entry->timer)
+       TP_printk("timer=%p", __entry->timer)
 );
 
 /**
@@ -140,24 +140,24 @@ TRACE_EVENT(timer_cancel,
  */
 TRACE_EVENT(hrtimer_init,
 
-       TP_PROTO(struct hrtimer *timer, clockid_t clockid,
+       TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid,
                 enum hrtimer_mode mode),
 
-       TP_ARGS(timer, clockid, mode),
+       TP_ARGS(hrtimer, clockid, mode),
 
        TP_STRUCT__entry(
-               __field( void *,                timer           )
+               __field( void *,                hrtimer         )
                __field( clockid_t,             clockid         )
                __field( enum hrtimer_mode,     mode            )
        ),
 
        TP_fast_assign(
-               __entry->timer          = timer;
+               __entry->hrtimer        = hrtimer;
                __entry->clockid        = clockid;
                __entry->mode           = mode;
        ),
 
-       TP_printk("hrtimer %p, clockid %s, mode %s", __entry->timer,
+       TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer,
                  __entry->clockid == CLOCK_REALTIME ?
                        "CLOCK_REALTIME" : "CLOCK_MONOTONIC",
                  __entry->mode == HRTIMER_MODE_ABS ?
@@ -170,26 +170,26 @@ TRACE_EVENT(hrtimer_init,
  */
 TRACE_EVENT(hrtimer_start,
 
-       TP_PROTO(struct hrtimer *timer),
+       TP_PROTO(struct hrtimer *hrtimer),
 
-       TP_ARGS(timer),
+       TP_ARGS(hrtimer),
 
        TP_STRUCT__entry(
-               __field( void *,        timer           )
+               __field( void *,        hrtimer         )
                __field( void *,        function        )
                __field( s64,           expires         )
                __field( s64,           softexpires     )
        ),
 
        TP_fast_assign(
-               __entry->timer          = timer;
-               __entry->function       = timer->function;
-               __entry->expires        = hrtimer_get_expires(timer).tv64;
-               __entry->softexpires    = hrtimer_get_softexpires(timer).tv64;
+               __entry->hrtimer        = hrtimer;
+               __entry->function       = hrtimer->function;
+               __entry->expires        = hrtimer_get_expires(hrtimer).tv64;
+               __entry->softexpires    = hrtimer_get_softexpires(hrtimer).tv64;
        ),
 
-       TP_printk("hrtimer %p, func %pf, expires %llu, softexpires %llu",
-                 __entry->timer, __entry->function,
+       TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu",
+                 __entry->hrtimer, __entry->function,
                  (unsigned long long)ktime_to_ns((ktime_t) {
                                  .tv64 = __entry->expires }),
                  (unsigned long long)ktime_to_ns((ktime_t) {
@@ -206,23 +206,22 @@ TRACE_EVENT(hrtimer_start,
  */
 TRACE_EVENT(hrtimer_expire_entry,
 
-       TP_PROTO(struct hrtimer *timer, ktime_t *now),
+       TP_PROTO(struct hrtimer *hrtimer, ktime_t *now),
 
-       TP_ARGS(timer, now),
+       TP_ARGS(hrtimer, now),
 
        TP_STRUCT__entry(
-               __field( void *,        timer   )
+               __field( void *,        hrtimer )
                __field( s64,           now     )
        ),
 
        TP_fast_assign(
-               __entry->timer  = timer;
-               __entry->now    = now->tv64;
+               __entry->hrtimer        = hrtimer;
+               __entry->now            = now->tv64;
        ),
 
-       TP_printk("hrtimer %p, now %llu", __entry->timer,
-                 (unsigned long long)ktime_to_ns((ktime_t) {
-                                 .tv64 = __entry->now }))
+       TP_printk("hrtimer=%p now=%llu", __entry->hrtimer,
+                 (unsigned long long)ktime_to_ns((ktime_t) { .tv64 = __entry->now }))
  );
 
 /**
@@ -234,40 +233,40 @@ TRACE_EVENT(hrtimer_expire_entry,
  */
 TRACE_EVENT(hrtimer_expire_exit,
 
-       TP_PROTO(struct hrtimer *timer),
+       TP_PROTO(struct hrtimer *hrtimer),
 
-       TP_ARGS(timer),
+       TP_ARGS(hrtimer),
 
        TP_STRUCT__entry(
-               __field( void *,        timer   )
+               __field( void *,        hrtimer )
        ),
 
        TP_fast_assign(
-               __entry->timer  = timer;
+               __entry->hrtimer        = hrtimer;
        ),
 
-       TP_printk("hrtimer %p", __entry->timer)
+       TP_printk("hrtimer=%p", __entry->hrtimer)
 );
 
 /**
  * hrtimer_cancel - called when the hrtimer is canceled
- * @timer:     pointer to struct hrtimer
+ * @hrtimer:   pointer to struct hrtimer
  */
 TRACE_EVENT(hrtimer_cancel,
 
-       TP_PROTO(struct hrtimer *timer),
+       TP_PROTO(struct hrtimer *hrtimer),
 
-       TP_ARGS(timer),
+       TP_ARGS(hrtimer),
 
        TP_STRUCT__entry(
-               __field( void *,        timer   )
+               __field( void *,        hrtimer )
        ),
 
        TP_fast_assign(
-               __entry->timer  = timer;
+               __entry->hrtimer        = hrtimer;
        ),
 
-       TP_printk("hrtimer %p", __entry->timer)
+       TP_printk("hrtimer=%p", __entry->hrtimer)
 );
 
 /**
@@ -302,7 +301,7 @@ TRACE_EVENT(itimer_state,
                __entry->interval_usec  = value->it_interval.tv_usec;
        ),
 
-       TP_printk("which %d, expires %lu, it_value %lu.%lu, it_interval %lu.%lu",
+       TP_printk("which=%d expires=%lu it_value=%lu.%lu it_interval=%lu.%lu",
                  __entry->which, __entry->expires,
                  __entry->value_sec, __entry->value_usec,
                  __entry->interval_sec, __entry->interval_usec)
@@ -332,7 +331,7 @@ TRACE_EVENT(itimer_expire,
                __entry->pid    = pid_nr(pid);
        ),
 
-           TP_printk("which %d, pid %d, now %lu", __entry->which,
+           TP_printk("which=%d pid=%d now=%lu", __entry->which,
                      (int) __entry->pid, __entry->now)
 );
 
index e4612dbd7ba6070d3d2a28e8a7f4359524cbe77c..d6c974474e70272bd41182ecbbaa4a2102861a0e 100644 (file)
@@ -8,7 +8,7 @@
 #include <linux/sched.h>
 #include <linux/tracepoint.h>
 
-TRACE_EVENT(workqueue_insertion,
+DECLARE_EVENT_CLASS(workqueue,
 
        TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
 
@@ -30,26 +30,18 @@ TRACE_EVENT(workqueue_insertion,
                __entry->thread_pid, __entry->func)
 );
 
-TRACE_EVENT(workqueue_execution,
+DEFINE_EVENT(workqueue, workqueue_insertion,
 
        TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
 
-       TP_ARGS(wq_thread, work),
+       TP_ARGS(wq_thread, work)
+);
 
-       TP_STRUCT__entry(
-               __array(char,           thread_comm,    TASK_COMM_LEN)
-               __field(pid_t,          thread_pid)
-               __field(work_func_t,    func)
-       ),
+DEFINE_EVENT(workqueue, workqueue_execution,
 
-       TP_fast_assign(
-               memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
-               __entry->thread_pid     = wq_thread->pid;
-               __entry->func           = work->func;
-       ),
+       TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
 
-       TP_printk("thread=%s:%d func=%pf", __entry->thread_comm,
-               __entry->thread_pid, __entry->func)
+       TP_ARGS(wq_thread, work)
 );
 
 /* Trace the creation of one workqueue thread on a cpu */
index dacb8ef6700071238a3293799f57610909c6f22f..d1b3de9c1a714f7624956c01fb60bd22898d4c6a 100644 (file)
 
 #include <linux/ftrace_event.h>
 
+/*
+ * DECLARE_EVENT_CLASS can be used to add a generic handler for
+ * events that share the same parameters and differ only in their
+ * tracepoints.
+ * Each such tracepoint is then defined with DEFINE_EVENT, which
+ * maps the DECLARE_EVENT_CLASS to the tracepoint.
+ *
+ * TRACE_EVENT remains a one-to-one mapping between a tracepoint and a template.
+ */
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
+       DECLARE_EVENT_CLASS(name,                              \
+                            PARAMS(proto),                    \
+                            PARAMS(args),                     \
+                            PARAMS(tstruct),                  \
+                            PARAMS(assign),                   \
+                            PARAMS(print));                   \
+       DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
+
+
 #undef __field
 #define __field(type, item)            type    item;
 
 #undef TP_STRUCT__entry
 #define TP_STRUCT__entry(args...) args
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
-       struct ftrace_raw_##name {                              \
-               struct trace_entry      ent;                    \
-               tstruct                                         \
-               char                    __data[0];              \
-       };                                                      \
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print) \
+       struct ftrace_raw_##name {                                      \
+               struct trace_entry      ent;                            \
+               tstruct                                                 \
+               char                    __data[0];                      \
+       };
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)      \
        static struct ftrace_event_call event_##name
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #undef __cpparg
 #define __cpparg(arg...) arg
 
 #undef __string
 #define __string(item, src) __dynamic_array(char, item, -1)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
        struct ftrace_data_offsets_##call {                             \
                tstruct;                                                \
        };
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
 #undef __field
 #define __field(type, item)                                    \
        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                              "offset:%u;\tsize:%u;\n",                \
+                              "offset:%u;\tsize:%u;\tsigned:%u;\n",    \
                               (unsigned int)offsetof(typeof(field), item), \
-                              (unsigned int)sizeof(field.item));       \
+                              (unsigned int)sizeof(field.item),        \
+                              (unsigned int)is_signed_type(type));     \
        if (!ret)                                                       \
                return 0;
 
 #undef __array
 #define __array(type, item, len)                                               \
        ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"    \
-                              "offset:%u;\tsize:%u;\n",                \
+                              "offset:%u;\tsize:%u;\tsigned:%u;\n",    \
                               (unsigned int)offsetof(typeof(field), item), \
-                              (unsigned int)sizeof(field.item));       \
+                              (unsigned int)sizeof(field.item),        \
+                              (unsigned int)is_signed_type(type));     \
        if (!ret)                                                       \
                return 0;
 
 #undef __dynamic_array
 #define __dynamic_array(type, item, len)                                      \
        ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\
-                              "offset:%u;\tsize:%u;\n",                       \
+                              "offset:%u;\tsize:%u;\tsigned:%u;\n",           \
                               (unsigned int)offsetof(typeof(field),           \
                                        __data_loc_##item),                    \
-                              (unsigned int)sizeof(field.__data_loc_##item)); \
+                              (unsigned int)sizeof(field.__data_loc_##item), \
+                              (unsigned int)is_signed_type(type));     \
        if (!ret)                                                              \
                return 0;
 
 #undef TP_perf_assign
 #define TP_perf_assign(args...)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print)   \
 static int                                                             \
-ftrace_format_##call(struct ftrace_event_call *unused,                 \
-                     struct trace_seq *s)                              \
+ftrace_format_setup_##call(struct ftrace_event_call *unused,           \
+                          struct trace_seq *s)                         \
 {                                                                      \
        struct ftrace_raw_##call field __attribute__((unused));         \
        int ret = 0;                                                    \
                                                                        \
        tstruct;                                                        \
                                                                        \
+       return ret;                                                     \
+}                                                                      \
+                                                                       \
+static int                                                             \
+ftrace_format_##call(struct ftrace_event_call *unused,                 \
+                    struct trace_seq *s)                               \
+{                                                                      \
+       int ret = 0;                                                    \
+                                                                       \
+       ret = ftrace_format_setup_##call(unused, s);                    \
+       if (!ret)                                                       \
+               return ret;                                             \
+                                                                       \
+       ret = trace_seq_printf(s, "\nprint fmt: " print);               \
+                                                                       \
+       return ret;                                                     \
+}
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)         \
+static int                                                             \
+ftrace_format_##name(struct ftrace_event_call *unused,                 \
+                     struct trace_seq *s)                              \
+{                                                                      \
+       int ret = 0;                                                    \
+                                                                       \
+       ret = ftrace_format_setup_##template(unused, s);                \
+       if (!ret)                                                       \
+               return ret;                                             \
+                                                                       \
        trace_seq_printf(s, "\nprint fmt: " print);                     \
                                                                        \
        return ret;                                                     \
@@ -252,15 +321,57 @@ ftrace_format_##call(struct ftrace_event_call *unused,                    \
                ftrace_print_symbols_seq(p, value, symbols);            \
        })
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
 static enum print_line_t                                               \
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags)       \
+ftrace_raw_output_id_##call(int event_id, const char *name,            \
+                           struct trace_iterator *iter, int flags)     \
 {                                                                      \
        struct trace_seq *s = &iter->seq;                               \
        struct ftrace_raw_##call *field;                                \
        struct trace_entry *entry;                                      \
        struct trace_seq *p;                                            \
+       int ret;                                                        \
+                                                                       \
+       entry = iter->ent;                                              \
+                                                                       \
+       if (entry->type != event_id) {                                  \
+               WARN_ON_ONCE(1);                                        \
+               return TRACE_TYPE_UNHANDLED;                            \
+       }                                                               \
+                                                                       \
+       field = (typeof(field))entry;                                   \
+                                                                       \
+       p = &get_cpu_var(ftrace_event_seq);                             \
+       trace_seq_init(p);                                              \
+       ret = trace_seq_printf(s, "%s: ", name);                        \
+       if (ret)                                                        \
+               ret = trace_seq_printf(s, print);                       \
+       put_cpu();                                                      \
+       if (!ret)                                                       \
+               return TRACE_TYPE_PARTIAL_LINE;                         \
+                                                                       \
+       return TRACE_TYPE_HANDLED;                                      \
+}
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)                      \
+static enum print_line_t                                               \
+ftrace_raw_output_##name(struct trace_iterator *iter, int flags)       \
+{                                                                      \
+       return ftrace_raw_output_id_##template(event_##name.id,         \
+                                              #name, iter, flags);     \
+}
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, call, proto, args, print)         \
+static enum print_line_t                                               \
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags)       \
+{                                                                      \
+       struct trace_seq *s = &iter->seq;                               \
+       struct ftrace_raw_##template *field;                            \
+       struct trace_entry *entry;                                      \
+       struct trace_seq *p;                                            \
        int ret;                                                        \
                                                                        \
        entry = iter->ent;                                              \
@@ -274,14 +385,16 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)  \
                                                                        \
        p = &get_cpu_var(ftrace_event_seq);                             \
        trace_seq_init(p);                                              \
-       ret = trace_seq_printf(s, #call ": " print);                    \
+       ret = trace_seq_printf(s, "%s: ", #call);                       \
+       if (ret)                                                        \
+               ret = trace_seq_printf(s, print);                       \
        put_cpu();                                                      \
        if (!ret)                                                       \
                return TRACE_TYPE_PARTIAL_LINE;                         \
                                                                        \
        return TRACE_TYPE_HANDLED;                                      \
 }
-       
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #undef __field_ext
@@ -315,8 +428,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)    \
 #undef __string
 #define __string(item, src) __dynamic_array(char, item, -1)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print)   \
 static int                                                             \
 ftrace_define_fields_##call(struct ftrace_event_call *event_call)      \
 {                                                                      \
@@ -332,6 +445,13 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call)  \
        return ret;                                                     \
 }
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
@@ -358,10 +478,10 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
        __data_size += (len) * sizeof(type);
 
 #undef __string
-#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)       \
+#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
 static inline int ftrace_get_offsets_##call(                           \
        struct ftrace_data_offsets_##call *__data_offsets, proto)       \
 {                                                                      \
@@ -373,6 +493,13 @@ static inline int ftrace_get_offsets_##call(                               \
        return __data_size;                                             \
 }
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #ifdef CONFIG_EVENT_PROFILE
@@ -394,21 +521,28 @@ static inline int ftrace_get_offsets_##call(                              \
  *
  */
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, name, proto, args)                      \
                                                                        \
-static void ftrace_profile_##call(proto);                              \
+static void ftrace_profile_##name(proto);                              \
                                                                        \
-static int ftrace_profile_enable_##call(void)                          \
+static int ftrace_profile_enable_##name(struct ftrace_event_call *unused)\
 {                                                                      \
-       return register_trace_##call(ftrace_profile_##call);            \
+       return register_trace_##name(ftrace_profile_##name);            \
 }                                                                      \
                                                                        \
-static void ftrace_profile_disable_##call(void)                                \
+static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\
 {                                                                      \
-       unregister_trace_##call(ftrace_profile_##call);                 \
+       unregister_trace_##name(ftrace_profile_##name);                 \
 }
 
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #endif
@@ -423,7 +557,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     event_trace_printk(_RET_IP_, "<call>: " <fmt>);
  * }
  *
- * static int ftrace_reg_event_<call>(void)
+ * static int ftrace_reg_event_<call>(struct ftrace_event_call *unused)
  * {
  *     int ret;
  *
@@ -434,7 +568,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     return ret;
  * }
  *
- * static void ftrace_unreg_event_<call>(void)
+ * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
  * {
  *     unregister_trace_<call>(ftrace_event_<call>);
  * }
@@ -469,7 +603,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
  * }
  *
- * static int ftrace_raw_reg_event_<call>(void)
+ * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
  * {
  *     int ret;
  *
@@ -480,7 +614,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     return ret;
  * }
  *
- * static void ftrace_unreg_event_<call>(void)
+ * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
  * {
  *     unregister_trace_<call>(ftrace_raw_event_<call>);
  * }
@@ -489,7 +623,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     .trace                  = ftrace_raw_output_<call>, <-- stage 2
  * };
  *
- * static int ftrace_raw_init_event_<call>(void)
+ * static int ftrace_raw_init_event_<call>(struct ftrace_event_call *unused)
  * {
  *     int id;
  *
@@ -547,15 +681,13 @@ static void ftrace_profile_disable_##call(void)                           \
 #define __assign_str(dst, src)                                         \
        strcpy(__get_str(dst), src);
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
                                                                        \
-static struct ftrace_event_call event_##call;                          \
-                                                                       \
-static void ftrace_raw_event_##call(proto)                             \
+static void ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \
+                                      proto)                           \
 {                                                                      \
        struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
-       struct ftrace_event_call *event_call = &event_##call;           \
        struct ring_buffer_event *event;                                \
        struct ftrace_raw_##call *entry;                                \
        struct ring_buffer *buffer;                                     \
@@ -569,7 +701,7 @@ static void ftrace_raw_event_##call(proto)                          \
        __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
                                                                        \
        event = trace_current_buffer_lock_reserve(&buffer,              \
-                                event_##call.id,                       \
+                                event_call->id,                        \
                                 sizeof(*entry) + __data_size,          \
                                 irq_flags, pc);                        \
        if (!event)                                                     \
@@ -584,9 +716,17 @@ static void ftrace_raw_event_##call(proto)                         \
        if (!filter_current_check_discard(buffer, event_call, entry, event)) \
                trace_nowake_buffer_unlock_commit(buffer,               \
                                                  event, irq_flags, pc); \
+}
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, call, proto, args)                      \
+                                                                       \
+static void ftrace_raw_event_##call(proto)                             \
+{                                                                      \
+       ftrace_raw_event_id_##template(&event_##call, args);            \
 }                                                                      \
                                                                        \
-static int ftrace_raw_reg_event_##call(void *ptr)                      \
+static int ftrace_raw_reg_event_##call(struct ftrace_event_call *unused)\
 {                                                                      \
        int ret;                                                        \
                                                                        \
@@ -597,7 +737,7 @@ static int ftrace_raw_reg_event_##call(void *ptr)                   \
        return ret;                                                     \
 }                                                                      \
                                                                        \
-static void ftrace_raw_unreg_event_##call(void *ptr)                   \
+static void ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused)\
 {                                                                      \
        unregister_trace_##call(ftrace_raw_event_##call);               \
 }                                                                      \
@@ -606,7 +746,7 @@ static struct trace_event ftrace_event_type_##call = {                      \
        .trace                  = ftrace_raw_output_##call,             \
 };                                                                     \
                                                                        \
-static int ftrace_raw_init_event_##call(void)                          \
+static int ftrace_raw_init_event_##call(struct ftrace_event_call *unused)\
 {                                                                      \
        int id;                                                         \
                                                                        \
@@ -616,7 +756,36 @@ static int ftrace_raw_init_event_##call(void)                              \
        event_##call.id = id;                                           \
        INIT_LIST_HEAD(&event_##call.fields);                           \
        return 0;                                                       \
-}                                                                      \
+}
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, call, proto, args)                      \
+                                                                       \
+static struct ftrace_event_call __used                                 \
+__attribute__((__aligned__(4)))                                                \
+__attribute__((section("_ftrace_events"))) event_##call = {            \
+       .name                   = #call,                                \
+       .system                 = __stringify(TRACE_SYSTEM),            \
+       .event                  = &ftrace_event_type_##call,            \
+       .raw_init               = ftrace_raw_init_event_##call,         \
+       .regfunc                = ftrace_raw_reg_event_##call,          \
+       .unregfunc              = ftrace_raw_unreg_event_##call,        \
+       .show_format            = ftrace_format_##template,             \
+       .define_fields          = ftrace_define_fields_##template,      \
+       _TRACE_PROFILE_INIT(call)                                       \
+}
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, call, proto, args, print)         \
                                                                        \
 static struct ftrace_event_call __used                                 \
 __attribute__((__aligned__(4)))                                                \
@@ -628,7 +797,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {         \
        .regfunc                = ftrace_raw_reg_event_##call,          \
        .unregfunc              = ftrace_raw_unreg_event_##call,        \
        .show_format            = ftrace_format_##call,                 \
-       .define_fields          = ftrace_define_fields_##call,          \
+       .define_fields          = ftrace_define_fields_##template,      \
        _TRACE_PROFILE_INIT(call)                                       \
 }
 
@@ -646,6 +815,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {         \
  *     struct ftrace_event_call *event_call = &event_<call>;
  *     extern void perf_tp_event(int, u64, u64, void *, int);
  *     struct ftrace_raw_##call *entry;
+ *     struct perf_trace_buf *trace_buf;
  *     u64 __addr = 0, __count = 1;
  *     unsigned long irq_flags;
  *     struct trace_entry *ent;
@@ -670,14 +840,25 @@ __attribute__((section("_ftrace_events"))) event_##call = {               \
  *     __cpu = smp_processor_id();
  *
  *     if (in_nmi())
- *             raw_data = rcu_dereference(trace_profile_buf_nmi);
+ *             trace_buf = rcu_dereference(perf_trace_buf_nmi);
  *     else
- *             raw_data = rcu_dereference(trace_profile_buf);
+ *             trace_buf = rcu_dereference(perf_trace_buf);
  *
- *     if (!raw_data)
+ *     if (!trace_buf)
  *             goto end;
  *
- *     raw_data = per_cpu_ptr(raw_data, __cpu);
+ *     trace_buf = per_cpu_ptr(trace_buf, __cpu);
+ *
+ *     // Avoid recursion from perf that could mess up the buffer
+ *     if (trace_buf->recursion++)
+ *             goto end_recursion;
+ *
+ *     raw_data = trace_buf->buf;
+ *
+ *     // Make recursion update visible before entering perf_tp_event
+ *     // so that we are protected from perf recursion.
+ *
+ *     barrier();
  *
  *     //zero dead bytes from alignment to avoid stack leak to userspace:
  *     *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
@@ -704,21 +885,26 @@ __attribute__((section("_ftrace_events"))) event_##call = {               \
 #undef __perf_count
 #define __perf_count(c) __count = (c)
 
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
-static void ftrace_profile_##call(proto)                               \
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
+static void                                                            \
+ftrace_profile_templ_##call(struct ftrace_event_call *event_call,      \
+                           proto)                                      \
 {                                                                      \
        struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
-       struct ftrace_event_call *event_call = &event_##call;           \
-       extern void perf_tp_event(int, u64, u64, void *, int);  \
+       extern int perf_swevent_get_recursion_context(void);            \
+       extern void perf_swevent_put_recursion_context(int rctx);       \
+       extern void perf_tp_event(int, u64, u64, void *, int);          \
        struct ftrace_raw_##call *entry;                                \
        u64 __addr = 0, __count = 1;                                    \
        unsigned long irq_flags;                                        \
        struct trace_entry *ent;                                        \
        int __entry_size;                                               \
        int __data_size;                                                \
+       char *trace_buf;                                                \
        char *raw_data;                                                 \
        int __cpu;                                                      \
+       int rctx;                                                       \
        int pc;                                                         \
                                                                        \
        pc = preempt_count();                                           \
@@ -733,17 +919,22 @@ static void ftrace_profile_##call(proto)                          \
                return;                                                 \
                                                                        \
        local_irq_save(irq_flags);                                      \
+                                                                       \
+       rctx = perf_swevent_get_recursion_context();                    \
+       if (rctx < 0)                                                   \
+               goto end_recursion;                                     \
+                                                                       \
        __cpu = smp_processor_id();                                     \
                                                                        \
        if (in_nmi())                                                   \
-               raw_data = rcu_dereference(trace_profile_buf_nmi);              \
+               trace_buf = rcu_dereference(perf_trace_buf_nmi);        \
        else                                                            \
-               raw_data = rcu_dereference(trace_profile_buf);          \
+               trace_buf = rcu_dereference(perf_trace_buf);            \
                                                                        \
-       if (!raw_data)                                                  \
+       if (!trace_buf)                                                 \
                goto end;                                               \
                                                                        \
-       raw_data = per_cpu_ptr(raw_data, __cpu);                        \
+       raw_data = per_cpu_ptr(trace_buf, __cpu);                       \
                                                                        \
        *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;         \
        entry = (struct ftrace_raw_##call *)raw_data;                   \
@@ -759,10 +950,25 @@ static void ftrace_profile_##call(proto)                          \
                             __entry_size);                             \
                                                                        \
 end:                                                                   \
+       perf_swevent_put_recursion_context(rctx);                       \
+end_recursion:                                                         \
        local_irq_restore(irq_flags);                                   \
                                                                        \
 }
 
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, call, proto, args)              \
+static void ftrace_profile_##call(proto)                       \
+{                                                              \
+       struct ftrace_event_call *event_call = &event_##call;   \
+                                                               \
+       ftrace_profile_templ_##template(event_call, args);      \
+}
+
+#undef DEFINE_EVENT_PRINT
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 #endif /* CONFIG_EVENT_PROFILE */
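
To see what the DECLARE_EVENT_CLASS()/DEFINE_EVENT() split buys at the call sites, here is a minimal sketch of a tracepoint header written against the new macros; the class, event and field names are hypothetical, and TP_PROTO/TP_ARGS/TP_STRUCT__entry/TP_fast_assign/TP_printk are the usual tracepoint helpers. Events that share a layout now share one class expansion and only pay for a thin DEFINE_EVENT() stub each (TRACE_EVENT() itself reduces to a class plus a single instance of it):

DECLARE_EVENT_CLASS(sample_class,

	TP_PROTO(unsigned long addr, int len),

	TP_ARGS(addr, len),

	TP_STRUCT__entry(
		__field(unsigned long,	addr)
		__field(int,		len)
	),

	TP_fast_assign(
		__entry->addr	= addr;
		__entry->len	= len;
	),

	TP_printk("addr=%lx len=%d", __entry->addr, __entry->len)
);

/* Each instance reuses the class body; only the event name differs. */
DEFINE_EVENT(sample_class, sample_start,
	TP_PROTO(unsigned long addr, int len),
	TP_ARGS(addr, len));

DEFINE_EVENT(sample_class, sample_end,
	TP_PROTO(unsigned long addr, int len),
	TP_ARGS(addr, len));
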
 
index e972f0a40f8d02af648863793adb7e8383673f7b..961fda3556bb828f62508dc3acb440c90f21e32d 100644 (file)
  * A syscall entry in the ftrace syscalls array.
  *
  * @name: name of the syscall
+ * @syscall_nr: number of the syscall
  * @nb_args: number of parameters it takes
  * @types: list of types as strings
  * @args: list of args as strings (args[i] matches types[i])
- * @enter_id: associated ftrace enter event id
- * @exit_id: associated ftrace exit event id
  * @enter_event: associated syscall_enter trace event
  * @exit_event: associated syscall_exit trace event
  */
 struct syscall_metadata {
        const char      *name;
+       int             syscall_nr;
        int             nb_args;
        const char      **types;
        const char      **args;
-       int             enter_id;
-       int             exit_id;
 
        struct ftrace_event_call *enter_event;
        struct ftrace_event_call *exit_event;
@@ -34,29 +32,28 @@ struct syscall_metadata {
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 extern unsigned long arch_syscall_addr(int nr);
-extern int syscall_name_to_nr(char *name);
-void set_syscall_enter_id(int num, int id);
-void set_syscall_exit_id(int num, int id);
-extern struct trace_event event_syscall_enter;
-extern struct trace_event event_syscall_exit;
-extern int reg_event_syscall_enter(void *ptr);
-extern void unreg_event_syscall_enter(void *ptr);
-extern int reg_event_syscall_exit(void *ptr);
-extern void unreg_event_syscall_exit(void *ptr);
+extern int init_syscall_trace(struct ftrace_event_call *call);
+
 extern int syscall_enter_format(struct ftrace_event_call *call,
                                struct trace_seq *s);
 extern int syscall_exit_format(struct ftrace_event_call *call,
                                struct trace_seq *s);
 extern int syscall_enter_define_fields(struct ftrace_event_call *call);
 extern int syscall_exit_define_fields(struct ftrace_event_call *call);
+extern int reg_event_syscall_enter(struct ftrace_event_call *call);
+extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
+extern int reg_event_syscall_exit(struct ftrace_event_call *call);
+extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
+extern int
+ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
 enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
 #endif
 #ifdef CONFIG_EVENT_PROFILE
-int reg_prof_syscall_enter(char *name);
-void unreg_prof_syscall_enter(char *name);
-int reg_prof_syscall_exit(char *name);
-void unreg_prof_syscall_exit(char *name);
+int prof_sysenter_enable(struct ftrace_event_call *call);
+void prof_sysenter_disable(struct ftrace_event_call *call);
+int prof_sysexit_enable(struct ftrace_event_call *call);
+void prof_sysexit_disable(struct ftrace_event_call *call);
 
 #endif
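
For illustration, a hand-rolled syscall_metadata entry for a hypothetical two-argument syscall could look like the sketch below. Real entries are emitted by the SYSCALL_DEFINE wrappers; the names, values and the way syscall_nr gets resolved here are assumptions, not taken from this patch:

static const char *types_sys_sample[] = { "unsigned int", "char __user *" };
static const char *args_sys_sample[]  = { "fd", "buf" };

static struct syscall_metadata __syscall_meta_sys_sample = {
	.name		= "sys_sample",		/* hypothetical syscall */
	.syscall_nr	= -1,			/* assumed: resolved at init time */
	.nb_args	= 2,
	.types		= types_sys_sample,
	.args		= args_sys_sample,
	/* .enter_event and .exit_event point at the ftrace_event_calls that
	   init_syscall_trace() and reg_event_syscall_enter/exit() operate on */
};
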
 
index dcf6789bf547e5af8e1c569b743ac72a2af4af93..982c50e2ce534df9a3bf5a704580fd0e13fe7f5b 100644 (file)
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -97,6 +98,7 @@ obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
index f7864ac2ecc1ad54c0af6b06b6f9d2da4a93f1ac..3f45e3cf931d917fc1dca145be32618afda895cd 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -977,6 +978,10 @@ NORET_TYPE void do_exit(long code)
 
        proc_exit_connector(tsk);
 
+       /*
+        * FIXME: do that only when needed, using sched_exit tracepoint
+        */
+       flush_ptrace_hw_breakpoint(tsk);
        /*
         * Flush inherited counters to the parent - before the parent
         * gets woken up by child-exit notifications.
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644 (file)
index 0000000..cf5ee16
--- /dev/null
@@ -0,0 +1,423 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ * This file contains the arch-independent routines.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <linux/hw_breakpoint.h>
+
+/*
+ * Constraints data
+ */
+
+/* Number of pinned cpu breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
+
+/* Number of pinned task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
+
+/* Number of non-pinned cpu/task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+
+/* Gather the total number of pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+       unsigned int pinned;
+       unsigned int flexible;
+};
+
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * have in this cpu
+ */
+static unsigned int max_task_bp_pinned(int cpu)
+{
+       int i;
+       unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
+
+       for (i = HBP_NUM - 1; i >= 0; i--) {
+               if (tsk_pinned[i] > 0)
+                       return i + 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ */
+static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
+{
+       if (cpu >= 0) {
+               slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+               slots->pinned += max_task_bp_pinned(cpu);
+               slots->flexible = per_cpu(nr_bp_flexible, cpu);
+
+               return;
+       }
+
+       for_each_online_cpu(cpu) {
+               unsigned int nr;
+
+               nr = per_cpu(nr_cpu_bp_pinned, cpu);
+               nr += max_task_bp_pinned(cpu);
+
+               if (nr > slots->pinned)
+                       slots->pinned = nr;
+
+               nr = per_cpu(nr_bp_flexible, cpu);
+
+               if (nr > slots->flexible)
+                       slots->flexible = nr;
+       }
+}
+
+/*
+ * Add or remove a pinned breakpoint for the given task in our constraint table
+ */
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+{
+       int count = 0;
+       struct perf_event *bp;
+       struct perf_event_context *ctx = tsk->perf_event_ctxp;
+       unsigned int *tsk_pinned;
+       struct list_head *list;
+       unsigned long flags;
+
+       if (WARN_ONCE(!ctx, "No perf context for this task"))
+               return;
+
+       list = &ctx->event_list;
+
+       spin_lock_irqsave(&ctx->lock, flags);
+
+       /*
+        * The current breakpoint counter is not included in the list
+        * at the open() callback time
+        */
+       list_for_each_entry(bp, list, event_entry) {
+               if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+                       count++;
+       }
+
+       spin_unlock_irqrestore(&ctx->lock, flags);
+
+       if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
+               return;
+
+       tsk_pinned = per_cpu(task_bp_pinned, cpu);
+       if (enable) {
+               tsk_pinned[count]++;
+               if (count > 0)
+                       tsk_pinned[count-1]--;
+       } else {
+               tsk_pinned[count]--;
+               if (count > 0)
+                       tsk_pinned[count-1]++;
+       }
+}
+
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void toggle_bp_slot(struct perf_event *bp, bool enable)
+{
+       int cpu = bp->cpu;
+       struct task_struct *tsk = bp->ctx->task;
+
+       /* Pinned counter task profiling */
+       if (tsk) {
+               if (cpu >= 0) {
+                       toggle_bp_task_slot(tsk, cpu, enable);
+                       return;
+               }
+
+               for_each_online_cpu(cpu)
+                       toggle_bp_task_slot(tsk, cpu, enable);
+               return;
+       }
+
+       /* Pinned counter cpu profiling */
+       if (enable)
+               per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+       else
+               per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+}
+
+/*
+ * Constraints to check before allowing this new breakpoint counter:
+ *
+ *  == Non-pinned counter == (Considered as pinned for now)
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
+ *           + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
+ *
+ *       -> If there are already non-pinned counters on this cpu, it means
+ *          there is already a free slot for them.
+ *          Otherwise, we check that the maximum number of per-task
+ *          breakpoints (for this cpu) plus the number of per-cpu breakpoints
+ *          (for this cpu) doesn't cover every register.
+ *
+ *   - If attached to every cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
+ *           + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
+ *
+ *       -> This is roughly the same, except we check the number of per-cpu
+ *          breakpoints for every cpu and keep the max one. Same for the
+ *          per-task breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
+ *            + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
+ *
+ *       -> Same checks as before, but now the nr_bp_flexible, if any, must keep
+ *          at least one register (or they will never be fed).
+ *
+ *   - If attached to every cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
+ *            + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
+ */
+int reserve_bp_slot(struct perf_event *bp)
+{
+       struct bp_busy_slots slots = {0};
+       int ret = 0;
+
+       mutex_lock(&nr_bp_mutex);
+
+       fetch_bp_busy_slots(&slots, bp->cpu);
+
+       /* Flexible counters need to keep at least one slot */
+       if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
+               ret = -ENOSPC;
+               goto end;
+       }
+
+       toggle_bp_slot(bp, true);
+
+end:
+       mutex_unlock(&nr_bp_mutex);
+
+       return ret;
+}
+
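
To make the reservation rule above concrete, a worked example with hypothetical counts, assuming HBP_NUM is 4 (the four x86 debug registers):

	per_cpu(nr_cpu_bp_pinned, cpu) = 1	/* one pinned cpu-wide breakpoint */
	max_task_bp_pinned(cpu)        = 2	/* the busiest task pins two      */
	per_cpu(nr_bp_flexible, cpu)   = 1	/* one flexible counter rotating  */

	slots.pinned   = 1 + 2 = 3
	slots.flexible = 1

	slots.pinned + (!!slots.flexible) = 3 + 1 = 4 == HBP_NUM

so reserve_bp_slot() refuses the new pinned breakpoint with -ENOSPC: taking the last register would leave the flexible counter nothing to rotate into.
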
+void release_bp_slot(struct perf_event *bp)
+{
+       mutex_lock(&nr_bp_mutex);
+
+       toggle_bp_slot(bp, false);
+
+       mutex_unlock(&nr_bp_mutex);
+}
+
+
+int __register_perf_hw_breakpoint(struct perf_event *bp)
+{
+       int ret;
+
+       ret = reserve_bp_slot(bp);
+       if (ret)
+               return ret;
+
+       /*
+        * Ptrace breakpoints can be temporary perf events, only
+        * meant to reserve a slot. In that case they are created disabled and
+        * we don't want to check the params right now (as we put a null addr),
+        * but perf tools create events as disabled and we do want to check
+        * the params for them.
+        * This is a quick hack that will be removed soon, once we remove
+        * the tmp breakpoints from ptrace.
+        */
+       if (!bp->attr.disabled || bp->callback == perf_bp_event)
+               ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+
+       return ret;
+}
+
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+       bp->callback = perf_bp_event;
+
+       return __register_perf_hw_breakpoint(bp);
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+                           perf_callback_t triggered,
+                           struct task_struct *tsk)
+{
+       return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+}
+EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+
+/**
+ * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @attr: new breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
+                         perf_callback_t triggered,
+                         struct task_struct *tsk)
+{
+       /*
+        * FIXME: do it without unregistering
+        * - We don't want to lose our slot
+        * - If the new bp is incorrect, don't lose the older one
+        */
+       unregister_hw_breakpoint(bp);
+
+       return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+}
+EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
+
+/**
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
+ * @bp: the breakpoint structure to unregister
+ */
+void unregister_hw_breakpoint(struct perf_event *bp)
+{
+       if (!bp)
+               return;
+       perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+
+/**
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ *
+ * @return a set of per_cpu pointers to perf events
+ */
+struct perf_event **
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+                           perf_callback_t triggered)
+{
+       struct perf_event **cpu_events, **pevent, *bp;
+       long err;
+       int cpu;
+
+       cpu_events = alloc_percpu(typeof(*cpu_events));
+       if (!cpu_events)
+               return ERR_PTR(-ENOMEM);
+
+       for_each_possible_cpu(cpu) {
+               pevent = per_cpu_ptr(cpu_events, cpu);
+               bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+
+               *pevent = bp;
+
+               if (IS_ERR(bp)) {
+                       err = PTR_ERR(bp);
+                       goto fail;
+               }
+       }
+
+       return cpu_events;
+
+fail:
+       for_each_possible_cpu(cpu) {
+               pevent = per_cpu_ptr(cpu_events, cpu);
+               if (IS_ERR(*pevent))
+                       break;
+               unregister_hw_breakpoint(*pevent);
+       }
+       free_percpu(cpu_events);
+       /* return the error if any */
+       return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
+
+/**
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
+ */
+void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
+{
+       int cpu;
+       struct perf_event **pevent;
+
+       for_each_possible_cpu(cpu) {
+               pevent = per_cpu_ptr(cpu_events, cpu);
+               unregister_hw_breakpoint(*pevent);
+       }
+       free_percpu(cpu_events);
+}
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+       .notifier_call = hw_breakpoint_exceptions_notify,
+       /* we need to be notified first */
+       .priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+       return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+core_initcall(init_hw_breakpoint);
+
+
+struct pmu perf_ops_bp = {
+       .enable         = arch_install_hw_breakpoint,
+       .disable        = arch_uninstall_hw_breakpoint,
+       .read           = hw_breakpoint_pmu_read,
+       .unthrottle     = hw_breakpoint_pmu_unthrottle
+};
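
As a usage sketch, a kernel-side client of the wide API could look like the fragment below: it places a write breakpoint on a kernel variable on every cpu. The perf_event_attr field names (bp_addr/bp_type/bp_len), the HW_BREAKPOINT_* constants and the perf_callback_t prototype are assumptions about the companion headers in this series, not guaranteed by this file:

static struct perf_event **sample_wp;
static int watched_value;			/* hypothetical variable to watch */

/* assumed perf_callback_t shape */
static void sample_wp_handler(struct perf_event *bp, void *data)
{
	printk(KERN_INFO "watched_value was written to\n");
}

static int __init sample_wp_init(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_BREAKPOINT,
		.size		= sizeof(attr),
		.bp_addr	= (unsigned long)&watched_value,
		.bp_type	= HW_BREAKPOINT_W,	/* assumed constant */
		.bp_len		= HW_BREAKPOINT_LEN_4,	/* assumed constant */
	};

	sample_wp = register_wide_hw_breakpoint(&attr, sample_wp_handler);
	if (IS_ERR(sample_wp))
		return PTR_ERR(sample_wp);

	return 0;
}

static void __exit sample_wp_exit(void)
{
	unregister_wide_hw_breakpoint(sample_wp);
}
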
index 8b6b8b697c686a297de3c85421e1e5172ff53a27..8e5288a8a3555c419f477e0925f3210e3863cef6 100644 (file)
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
        }
        return module_kallsyms_lookup_name(name);
 }
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
 
 int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
                                      unsigned long),
index 1494e85b35f29bb5c006c34221eadfedceee934d..e5342a344c43bee9140d9127f1fe907b21149e8d 100644 (file)
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
  */
 static struct kprobe_blackpoint kprobe_blacklist[] = {
        {"preempt_schedule",},
+       {"native_get_debugreg",},
+       {"irq_entries_start",},
+       {"common_interrupt",},
        {NULL}    /* Terminator */
 };
 
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
        return (kprobe_opcode_t *)(((char *)addr) + p->offset);
 }
 
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+       struct kprobe *old_p, *list_p;
+
+       old_p = get_kprobe(p->addr);
+       if (unlikely(!old_p))
+               return NULL;
+
+       if (p != old_p) {
+               list_for_each_entry_rcu(list_p, &old_p->list, list)
+                       if (list_p == p)
+                       /* kprobe p is a valid probe */
+                               goto valid;
+               return NULL;
+       }
+valid:
+       return old_p;
+}
+
+/* Return error if the kprobe is being re-registered */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+       int ret = 0;
+       struct kprobe *old_p;
+
+       mutex_lock(&kprobe_mutex);
+       old_p = __get_valid_kprobe(p);
+       if (old_p)
+               ret = -EINVAL;
+       mutex_unlock(&kprobe_mutex);
+       return ret;
+}
+
 int __kprobes register_kprobe(struct kprobe *p)
 {
        int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
                return -EINVAL;
        p->addr = addr;
 
+       ret = check_kprobe_rereg(p);
+       if (ret)
+               return ret;
+
        preempt_disable();
        if (!kernel_text_address((unsigned long) p->addr) ||
            in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
-{
-       struct kprobe *old_p, *list_p;
-
-       old_p = get_kprobe(p->addr);
-       if (unlikely(!old_p))
-               return NULL;
-
-       if (p != old_p) {
-               list_for_each_entry_rcu(list_p, &old_p->list, list)
-                       if (list_p == p)
-                       /* kprobe p is a valid probe */
-                               goto valid;
-               return NULL;
-       }
-valid:
-       return old_p;
-}
-
 /*
  * Unregister a kprobe without a scheduler synchronization.
  */
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
        arch_remove_kprobe(p);
 }
 
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+       printk(KERN_WARNING "Dumping kprobe:\n");
+       printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+              kp->symbol_name, kp->addr, kp->offset);
+}
+
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
                                             unsigned long val, void *data)
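
For reference, the double-registration pattern that check_kprobe_rereg() now catches early, sketched with a hypothetical probe point:

static int sample_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* let the probed instruction execute normally */
}

static struct kprobe sample_kp = {
	.symbol_name	= "do_fork",	/* hypothetical probe target */
	.pre_handler	= sample_pre,
};

static int __init sample_kp_init(void)
{
	int ret = register_kprobe(&sample_kp);	/* first registration: 0 on success */

	if (!ret)
		ret = register_kprobe(&sample_kp);	/* same object again: now -EINVAL */

	return ret;	/* sketch only; a real module would unregister on the error path */
}
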
index 9af56723c09605a07742180983e465500cef2d9d..f5dcd36d3151a0e3563f786f050795d605d25f99 100644 (file)
@@ -49,7 +49,7 @@
 #include "lockdep_internals.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/lockdep.h>
+#include <trace/events/lock.h>
 
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
index 61d5aa5eced3466393582e4f566b63c468ea7cc3..acd24e7643eb8ebdb9a0386b65931294b37c7eaf 100644 (file)
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 
-int notrace notify_die(enum die_val val, const char *str,
+int notrace __kprobes notify_die(enum die_val val, const char *str,
               struct pt_regs *regs, long err, int trap, int sig)
 {
        struct die_args args = {
index 7f29643c898549a5e523d07479d581edc4e870f5..6b7ddba1dd640cc94f2af66fd163961e944a94f2 100644 (file)
@@ -28,6 +28,8 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
+#include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
@@ -244,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx)
        put_ctx(ctx);
 }
 
+static inline u64 perf_clock(void)
+{
+       return cpu_clock(smp_processor_id());
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+       u64 now = perf_clock();
+
+       ctx->time += now - ctx->timestamp;
+       ctx->timestamp = now;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for an event.
+ */
+static void update_event_times(struct perf_event *event)
+{
+       struct perf_event_context *ctx = event->ctx;
+       u64 run_end;
+
+       if (event->state < PERF_EVENT_STATE_INACTIVE ||
+           event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+               return;
+
+       if (ctx->is_active)
+               run_end = ctx->time;
+       else
+               run_end = event->tstamp_stopped;
+
+       event->total_time_enabled = run_end - event->tstamp_enabled;
+
+       if (event->state == PERF_EVENT_STATE_INACTIVE)
+               run_end = event->tstamp_stopped;
+       else
+               run_end = ctx->time;
+
+       event->total_time_running = run_end - event->tstamp_running;
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -292,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
        if (event->group_leader != event)
                event->group_leader->nr_siblings--;
 
+       update_event_times(event);
+
+       /*
+        * If event was in error state, then keep it
+        * that way, otherwise bogus counts will be
+        * returned on read(). The only way to get out
+        * of error state is by explicit re-enabling
+        * of the event
+        */
+       if (event->state > PERF_EVENT_STATE_OFF)
+               event->state = PERF_EVENT_STATE_OFF;
+
        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
@@ -445,50 +502,11 @@ retry:
         * can remove the event safely, if the call above did not
         * succeed.
         */
-       if (!list_empty(&event->group_entry)) {
+       if (!list_empty(&event->group_entry))
                list_del_event(event, ctx);
-       }
        spin_unlock_irq(&ctx->lock);
 }
 
-static inline u64 perf_clock(void)
-{
-       return cpu_clock(smp_processor_id());
-}
-
-/*
- * Update the record of the current time in a context.
- */
-static void update_context_time(struct perf_event_context *ctx)
-{
-       u64 now = perf_clock();
-
-       ctx->time += now - ctx->timestamp;
-       ctx->timestamp = now;
-}
-
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
-       struct perf_event_context *ctx = event->ctx;
-       u64 run_end;
-
-       if (event->state < PERF_EVENT_STATE_INACTIVE ||
-           event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
-               return;
-
-       event->total_time_enabled = ctx->time - event->tstamp_enabled;
-
-       if (event->state == PERF_EVENT_STATE_INACTIVE)
-               run_end = event->tstamp_stopped;
-       else
-               run_end = ctx->time;
-
-       event->total_time_running = run_end - event->tstamp_running;
-}
-
 /*
  * Update total_time_enabled and total_time_running for all events in a group.
  */
@@ -1031,10 +1049,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
        update_context_time(ctx);
 
        perf_disable();
-       if (ctx->nr_active)
+       if (ctx->nr_active) {
                list_for_each_entry(event, &ctx->group_list, group_entry)
                        group_sched_out(event, cpuctx, ctx);
-
+       }
        perf_enable();
  out:
        spin_unlock(&ctx->lock);
@@ -1059,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1,
                && !ctx1->pin_count && !ctx2->pin_count;
 }
 
-static void __perf_event_read(void *event);
-
 static void __perf_event_sync_stat(struct perf_event *event,
                                     struct perf_event *next_event)
 {
@@ -1078,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
         */
        switch (event->state) {
        case PERF_EVENT_STATE_ACTIVE:
-               __perf_event_read(event);
-               break;
+               event->pmu->read(event);
+               /* fall-through */
 
        case PERF_EVENT_STATE_INACTIVE:
                update_event_times(event);
@@ -1118,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
        if (!ctx->nr_stat)
                return;
 
+       update_context_time(ctx);
+
        event = list_first_entry(&ctx->event_list,
                                   struct perf_event, event_entry);
 
@@ -1161,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task,
        if (likely(!ctx || !cpuctx->task_ctx))
                return;
 
-       update_context_time(ctx);
-
        rcu_read_lock();
        parent = rcu_dereference(ctx->parent_ctx);
        next_ctx = next->perf_event_ctxp;
@@ -1515,7 +1531,6 @@ static void __perf_event_read(void *info)
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
-       unsigned long flags;
 
        /*
         * If this is a task context, we need to check whether it is
@@ -1527,12 +1542,12 @@ static void __perf_event_read(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
 
-       local_irq_save(flags);
-       if (ctx->is_active)
-               update_context_time(ctx);
-       event->pmu->read(event);
+       spin_lock(&ctx->lock);
+       update_context_time(ctx);
        update_event_times(event);
-       local_irq_restore(flags);
+       spin_unlock(&ctx->lock);
+
+       event->pmu->read(event);
 }
 
 static u64 perf_event_read(struct perf_event *event)
@@ -1545,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event)
                smp_call_function_single(event->oncpu,
                                         __perf_event_read, event, 1);
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+               struct perf_event_context *ctx = event->ctx;
+               unsigned long flags;
+
+               spin_lock_irqsave(&ctx->lock, flags);
+               update_context_time(ctx);
                update_event_times(event);
+               spin_unlock_irqrestore(&ctx->lock, flags);
        }
 
        return atomic64_read(&event->count);
@@ -1658,6 +1679,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
        return ERR_PTR(err);
 }
 
+static void perf_event_free_filter(struct perf_event *event);
+
 static void free_event_rcu(struct rcu_head *head)
 {
        struct perf_event *event;
@@ -1665,6 +1688,7 @@ static void free_event_rcu(struct rcu_head *head)
        event = container_of(head, struct perf_event, rcu_head);
        if (event->ns)
                put_pid_ns(event->ns);
+       perf_event_free_filter(event);
        kfree(event);
 }
 
@@ -1696,16 +1720,10 @@ static void free_event(struct perf_event *event)
        call_rcu(&event->rcu_head, free_event_rcu);
 }
 
-/*
- * Called when the last reference to the file is gone.
- */
-static int perf_release(struct inode *inode, struct file *file)
+int perf_event_release_kernel(struct perf_event *event)
 {
-       struct perf_event *event = file->private_data;
        struct perf_event_context *ctx = event->ctx;
 
-       file->private_data = NULL;
-
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        perf_event_remove_from_context(event);
@@ -1720,6 +1738,19 @@ static int perf_release(struct inode *inode, struct file *file)
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+       struct perf_event *event = file->private_data;
+
+       file->private_data = NULL;
+
+       return perf_event_release_kernel(event);
+}
 
 static int perf_event_read_size(struct perf_event *event)
 {
@@ -1746,91 +1777,94 @@ static int perf_event_read_size(struct perf_event *event)
        return size;
 }
 
-static u64 perf_event_read_value(struct perf_event *event)
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
        struct perf_event *child;
        u64 total = 0;
 
+       *enabled = 0;
+       *running = 0;
+
+       mutex_lock(&event->child_mutex);
        total += perf_event_read(event);
-       list_for_each_entry(child, &event->child_list, child_list)
+       *enabled += event->total_time_enabled +
+                       atomic64_read(&event->child_total_time_enabled);
+       *running += event->total_time_running +
+                       atomic64_read(&event->child_total_time_running);
+
+       list_for_each_entry(child, &event->child_list, child_list) {
                total += perf_event_read(child);
+               *enabled += child->total_time_enabled;
+               *running += child->total_time_running;
+       }
+       mutex_unlock(&event->child_mutex);
 
        return total;
 }
-
-static int perf_event_read_entry(struct perf_event *event,
-                                  u64 read_format, char __user *buf)
-{
-       int n = 0, count = 0;
-       u64 values[2];
-
-       values[n++] = perf_event_read_value(event);
-       if (read_format & PERF_FORMAT_ID)
-               values[n++] = primary_event_id(event);
-
-       count = n * sizeof(u64);
-
-       if (copy_to_user(buf, values, count))
-               return -EFAULT;
-
-       return count;
-}
+EXPORT_SYMBOL_GPL(perf_event_read_value);
 
 static int perf_event_read_group(struct perf_event *event,
                                   u64 read_format, char __user *buf)
 {
        struct perf_event *leader = event->group_leader, *sub;
-       int n = 0, size = 0, err = -EFAULT;
-       u64 values[3];
+       int n = 0, size = 0, ret = -EFAULT;
+       struct perf_event_context *ctx = leader->ctx;
+       u64 values[5];
+       u64 count, enabled, running;
+
+       mutex_lock(&ctx->mutex);
+       count = perf_event_read_value(leader, &enabled, &running);
 
        values[n++] = 1 + leader->nr_siblings;
-       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-               values[n++] = leader->total_time_enabled +
-                       atomic64_read(&leader->child_total_time_enabled);
-       }
-       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-               values[n++] = leader->total_time_running +
-                       atomic64_read(&leader->child_total_time_running);
-       }
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               values[n++] = enabled;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               values[n++] = running;
+       values[n++] = count;
+       if (read_format & PERF_FORMAT_ID)
+               values[n++] = primary_event_id(leader);
 
        size = n * sizeof(u64);
 
        if (copy_to_user(buf, values, size))
-               return -EFAULT;
-
-       err = perf_event_read_entry(leader, read_format, buf + size);
-       if (err < 0)
-               return err;
+               goto unlock;
 
-       size += err;
+       ret = size;
 
        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-               err = perf_event_read_entry(sub, read_format,
-                               buf + size);
-               if (err < 0)
-                       return err;
+               n = 0;
+
+               values[n++] = perf_event_read_value(sub, &enabled, &running);
+               if (read_format & PERF_FORMAT_ID)
+                       values[n++] = primary_event_id(sub);
+
+               size = n * sizeof(u64);
 
-               size += err;
+               if (copy_to_user(buf + ret, values, size)) {
+                       ret = -EFAULT;
+                       goto unlock;
+               }
+
+               ret += size;
        }
+unlock:
+       mutex_unlock(&ctx->mutex);
 
-       return size;
+       return ret;
 }
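
The reworked group read above lays the buffer out as: the number of events, the optional enabled/running times, then a value (and optional id) for the leader followed by one value/id pair per sibling. A minimal userspace sketch of decoding that layout, assuming the event was opened with PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID; struct read_group and print_group are hypothetical names, not part of the patch:

#include <stdio.h>
#include <unistd.h>
#include <linux/perf_event.h>

/* Mirrors the layout written by perf_event_read_group() for the formats above. */
struct read_group {
        __u64 nr;                               /* 1 + nr_siblings                 */
        __u64 time_enabled;                     /* PERF_FORMAT_TOTAL_TIME_ENABLED  */
        __u64 time_running;                     /* PERF_FORMAT_TOTAL_TIME_RUNNING  */
        struct { __u64 value, id; } cnt[64];    /* leader first, then each sibling */
};

static void print_group(int group_fd)
{
        struct read_group rg;
        __u64 i;

        if (read(group_fd, &rg, sizeof(rg)) <= 0)
                return;

        for (i = 0; i < rg.nr && i < 64; i++)
                printf("id %llu: %llu (enabled %llu, running %llu)\n",
                       (unsigned long long)rg.cnt[i].id,
                       (unsigned long long)rg.cnt[i].value,
                       (unsigned long long)rg.time_enabled,
                       (unsigned long long)rg.time_running);
}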
 
 static int perf_event_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
 {
+       u64 enabled, running;
        u64 values[4];
        int n = 0;
 
-       values[n++] = perf_event_read_value(event);
-       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-               values[n++] = event->total_time_enabled +
-                       atomic64_read(&event->child_total_time_enabled);
-       }
-       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-               values[n++] = event->total_time_running +
-                       atomic64_read(&event->child_total_time_running);
-       }
+       values[n++] = perf_event_read_value(event, &enabled, &running);
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               values[n++] = enabled;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               values[n++] = running;
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(event);
 
@@ -1861,12 +1895,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
                return -ENOSPC;
 
        WARN_ON_ONCE(event->ctx->parent_ctx);
-       mutex_lock(&event->child_mutex);
        if (read_format & PERF_FORMAT_GROUP)
                ret = perf_event_read_group(event, read_format, buf);
        else
                ret = perf_event_read_one(event, read_format, buf);
-       mutex_unlock(&event->child_mutex);
 
        return ret;
 }
@@ -1974,7 +2006,8 @@ unlock:
        return ret;
 }
 
-int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -2002,6 +2035,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case PERF_EVENT_IOC_SET_OUTPUT:
                return perf_event_set_output(event, arg);
 
+       case PERF_EVENT_IOC_SET_FILTER:
+               return perf_event_set_filter(event, (void __user *)arg);
+
        default:
                return -ENOTTY;
        }
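
The new PERF_EVENT_IOC_SET_FILTER ioctl hands an ftrace-style filter string to perf_event_set_filter() below, which only accepts it for PERF_TYPE_TRACEPOINT events. A minimal userspace sketch, assuming fd refers to such a tracepoint event; the expression uses only the common_pid field that every trace event defines, and set_pid_filter is a made-up helper name:

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/*
 * Drop everything generated by PID 1. The string is parsed by the trace
 * event filter code: ==, !=, <, <=, >, >=, && and ||, plus (with this
 * series) '~' for glob matches on string fields.
 */
static int set_pid_filter(int fd)
{
        return ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "common_pid != 1");
}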
@@ -2174,6 +2210,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
        perf_mmap_free_page((unsigned long)data->user_page);
        for (i = 0; i < data->nr_pages; i++)
                perf_mmap_free_page((unsigned long)data->data_pages[i]);
+       kfree(data);
 }
 
 #else
@@ -2214,6 +2251,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
                perf_mmap_unmark_page(base + (i * PAGE_SIZE));
 
        vfree(base);
+       kfree(data);
 }
 
 static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2307,7 +2345,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
        }
 
        if (!data->watermark)
-               data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
+               data->watermark = max_size / 2;
 
 
        rcu_assign_pointer(event->data, data);
@@ -2319,7 +2357,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
 
        data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
        perf_mmap_data_free(data);
-       kfree(data);
 }
 
 static void perf_mmap_data_release(struct perf_event *event)
@@ -2666,20 +2703,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 static void perf_output_lock(struct perf_output_handle *handle)
 {
        struct perf_mmap_data *data = handle->data;
-       int cpu;
+       int cur, cpu = get_cpu();
 
        handle->locked = 0;
 
-       local_irq_save(handle->flags);
-       cpu = smp_processor_id();
-
-       if (in_nmi() && atomic_read(&data->lock) == cpu)
-               return;
+       for (;;) {
+               cur = atomic_cmpxchg(&data->lock, -1, cpu);
+               if (cur == -1) {
+                       handle->locked = 1;
+                       break;
+               }
+               if (cur == cpu)
+                       break;
 
-       while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
                cpu_relax();
-
-       handle->locked = 1;
+       }
 }
 
 static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2725,7 +2763,7 @@ again:
        if (atomic_xchg(&data->wakeup, 0))
                perf_output_wakeup(handle);
 out:
-       local_irq_restore(handle->flags);
+       put_cpu();
 }
 
 void perf_output_copy(struct perf_output_handle *handle,
@@ -3236,15 +3274,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
 {
        struct perf_event *event;
 
-       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-               return;
-
-       rcu_read_lock();
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (perf_event_task_match(event))
                        perf_event_task_output(event, task_event);
        }
-       rcu_read_unlock();
 }
 
 static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3252,11 +3285,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx = task_event->task_ctx;
 
+       rcu_read_lock();
        cpuctx = &get_cpu_var(perf_cpu_context);
        perf_event_task_ctx(&cpuctx->ctx, task_event);
        put_cpu_var(perf_cpu_context);
 
-       rcu_read_lock();
        if (!ctx)
                ctx = rcu_dereference(task_event->task->perf_event_ctxp);
        if (ctx)
@@ -3348,15 +3381,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
 {
        struct perf_event *event;
 
-       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-               return;
-
-       rcu_read_lock();
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (perf_event_comm_match(event))
                        perf_event_comm_output(event, comm_event);
        }
-       rcu_read_unlock();
 }
 
 static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3367,7 +3395,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
        char comm[TASK_COMM_LEN];
 
        memset(comm, 0, sizeof(comm));
-       strncpy(comm, comm_event->task->comm, sizeof(comm));
+       strlcpy(comm, comm_event->task->comm, sizeof(comm));
        size = ALIGN(strlen(comm)+1, sizeof(u64));
 
        comm_event->comm = comm;
@@ -3375,11 +3403,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 
        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
+       rcu_read_lock();
        cpuctx = &get_cpu_var(perf_cpu_context);
        perf_event_comm_ctx(&cpuctx->ctx, comm_event);
        put_cpu_var(perf_cpu_context);
 
-       rcu_read_lock();
        /*
         * doesn't really matter which of the child contexts the
         * events end up in.
@@ -3472,15 +3500,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
 {
        struct perf_event *event;
 
-       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-               return;
-
-       rcu_read_lock();
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (perf_event_mmap_match(event, mmap_event))
                        perf_event_mmap_output(event, mmap_event);
        }
-       rcu_read_unlock();
 }
 
 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3536,11 +3559,11 @@ got_name:
 
        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
+       rcu_read_lock();
        cpuctx = &get_cpu_var(perf_cpu_context);
        perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
        put_cpu_var(perf_cpu_context);
 
-       rcu_read_lock();
        /*
         * doesn't really matter which of the child contexts the
         * events end up in.
@@ -3679,7 +3702,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
                        perf_event_disable(event);
        }
 
-       perf_event_output(event, nmi, data, regs);
+       if (event->overflow_handler)
+               event->overflow_handler(event, nmi, data, regs);
+       else
+               perf_event_output(event, nmi, data, regs);
+
        return ret;
 }
 
@@ -3724,16 +3751,16 @@ again:
        return nr;
 }
 
-static void perf_swevent_overflow(struct perf_event *event,
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
                                    int nmi, struct perf_sample_data *data,
                                    struct pt_regs *regs)
 {
        struct hw_perf_event *hwc = &event->hw;
        int throttle = 0;
-       u64 overflow;
 
        data->period = event->hw.last_period;
-       overflow = perf_swevent_set_period(event);
+       if (!overflow)
+               overflow = perf_swevent_set_period(event);
 
        if (hwc->interrupts == MAX_INTERRUPTS)
                return;
@@ -3766,14 +3793,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 
        atomic64_add(nr, &event->count);
 
+       if (!regs)
+               return;
+
        if (!hwc->sample_period)
                return;
 
-       if (!regs)
+       if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+               return perf_swevent_overflow(event, 1, nmi, data, regs);
+
+       if (atomic64_add_negative(nr, &hwc->period_left))
                return;
 
-       if (!atomic64_add_negative(nr, &hwc->period_left))
-               perf_swevent_overflow(event, nmi, data, regs);
+       perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
 static int perf_swevent_is_counting(struct perf_event *event)
@@ -3806,25 +3838,44 @@ static int perf_swevent_is_counting(struct perf_event *event)
        return 1;
 }
 
+static int perf_tp_event_match(struct perf_event *event,
+                               struct perf_sample_data *data);
+
+static int perf_exclude_event(struct perf_event *event,
+                             struct pt_regs *regs)
+{
+       if (regs) {
+               if (event->attr.exclude_user && user_mode(regs))
+                       return 1;
+
+               if (event->attr.exclude_kernel && !user_mode(regs))
+                       return 1;
+       }
+
+       return 0;
+}
+
 static int perf_swevent_match(struct perf_event *event,
                                enum perf_type_id type,
-                               u32 event_id, struct pt_regs *regs)
+                               u32 event_id,
+                               struct perf_sample_data *data,
+                               struct pt_regs *regs)
 {
        if (!perf_swevent_is_counting(event))
                return 0;
 
        if (event->attr.type != type)
                return 0;
+
        if (event->attr.config != event_id)
                return 0;
 
-       if (regs) {
-               if (event->attr.exclude_user && user_mode(regs))
-                       return 0;
+       if (perf_exclude_event(event, regs))
+               return 0;
 
-               if (event->attr.exclude_kernel && !user_mode(regs))
-                       return 0;
-       }
+       if (event->attr.type == PERF_TYPE_TRACEPOINT &&
+           !perf_tp_event_match(event, data))
+               return 0;
 
        return 1;
 }
@@ -3837,49 +3888,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
 {
        struct perf_event *event;
 
-       if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-               return;
-
-       rcu_read_lock();
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-               if (perf_swevent_match(event, type, event_id, regs))
+               if (perf_swevent_match(event, type, event_id, data, regs))
                        perf_swevent_add(event, nr, nmi, data, regs);
        }
-       rcu_read_unlock();
 }
 
-static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
+int perf_swevent_get_recursion_context(void)
 {
+       struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+       int rctx;
+
        if (in_nmi())
-               return &cpuctx->recursion[3];
+               rctx = 3;
+       else if (in_irq())
+               rctx = 2;
+       else if (in_softirq())
+               rctx = 1;
+       else
+               rctx = 0;
+
+       if (cpuctx->recursion[rctx]) {
+               put_cpu_var(perf_cpu_context);
+               return -1;
+       }
 
-       if (in_irq())
-               return &cpuctx->recursion[2];
+       cpuctx->recursion[rctx]++;
+       barrier();
 
-       if (in_softirq())
-               return &cpuctx->recursion[1];
+       return rctx;
+}
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-       return &cpuctx->recursion[0];
+void perf_swevent_put_recursion_context(int rctx)
+{
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       barrier();
+       cpuctx->recursion[rctx]--;
+       put_cpu_var(perf_cpu_context);
 }
+EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
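
These two exports let callers outside this file (the tracepoint profiling path, for instance) bracket their own per-cpu work with the recursion protection that do_perf_sw_event() used to take internally; __perf_sw_event() below shows the canonical pairing. A minimal sketch, where do_something_percpu() is a placeholder for work that must not nest with itself from softirq/irq/NMI context:

static void do_something_percpu(void)
{
        /* e.g. fill a per-cpu record and hand it to the event code */
}

static void example_swevent_caller(void)
{
        int rctx;

        rctx = perf_swevent_get_recursion_context();
        if (rctx < 0)
                return;         /* already running at this context level */

        do_something_percpu();

        perf_swevent_put_recursion_context(rctx);
}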
 
 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                                    u64 nr, int nmi,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
 {
-       struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
-       int *recursion = perf_swevent_recursion_context(cpuctx);
+       struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
 
-       if (*recursion)
-               goto out;
-
-       (*recursion)++;
-       barrier();
-
+       cpuctx = &__get_cpu_var(perf_cpu_context);
+       rcu_read_lock();
        perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
                                 nr, nmi, data, regs);
-       rcu_read_lock();
        /*
         * doesn't really matter which of the child contexts the
         * events end up in.
@@ -3888,23 +3949,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
        if (ctx)
                perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
        rcu_read_unlock();
-
-       barrier();
-       (*recursion)--;
-
-out:
-       put_cpu_var(perf_cpu_context);
 }
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
                            struct pt_regs *regs, u64 addr)
 {
-       struct perf_sample_data data = {
-               .addr = addr,
-       };
+       struct perf_sample_data data;
+       int rctx;
 
-       do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
-                               &data, regs);
+       rctx = perf_swevent_get_recursion_context();
+       if (rctx < 0)
+               return;
+
+       data.addr = addr;
+       data.raw  = NULL;
+
+       do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+
+       perf_swevent_put_recursion_context(rctx);
 }
 
 static void perf_swevent_read(struct perf_event *event)
@@ -3949,6 +4011,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        event->pmu->read(event);
 
        data.addr = 0;
+       data.period = event->hw.last_period;
        regs = get_irq_regs();
        /*
         * In case we exclude kernel IPs or are somehow not in interrupt
@@ -4108,6 +4171,7 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
+
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
                          int entry_size)
 {
@@ -4126,13 +4190,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
        if (!regs)
                regs = task_pt_regs(current);
 
+       /* Trace events are already protected against recursion */
        do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
                                &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
-extern int ftrace_profile_enable(int);
-extern void ftrace_profile_disable(int);
+static int perf_tp_event_match(struct perf_event *event,
+                               struct perf_sample_data *data)
+{
+       void *record = data->raw->data;
+
+       if (likely(!event->filter) || filter_match_preds(event->filter, record))
+               return 1;
+       return 0;
+}
 
 static void tp_perf_event_destroy(struct perf_event *event)
 {
@@ -4157,11 +4229,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 
        return &perf_ops_generic;
 }
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+       char *filter_str;
+       int ret;
+
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -EINVAL;
+
+       filter_str = strndup_user(arg, PAGE_SIZE);
+       if (IS_ERR(filter_str))
+               return PTR_ERR(filter_str);
+
+       ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+       kfree(filter_str);
+       return ret;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+       ftrace_profile_free_filter(event);
+}
+
 #else
+
+static int perf_tp_event_match(struct perf_event *event,
+                               struct perf_sample_data *data)
+{
+       return 1;
+}
+
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
        return NULL;
 }
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+       return -ENOENT;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+#endif /* CONFIG_EVENT_PROFILE */
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+       release_bp_slot(event);
+}
+
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+       int err;
+       /*
+        * The breakpoint is already filled in if we haven't created the counter
+        * through the perf syscall.
+        * FIXME: manage to get triggered set to NULL if it comes from syscalls
+        */
+       if (!bp->callback)
+               err = register_perf_hw_breakpoint(bp);
+       else
+               err = __register_perf_hw_breakpoint(bp);
+       if (err)
+               return ERR_PTR(err);
+
+       bp->destroy = bp_perf_event_destroy;
+
+       return &perf_ops_bp;
+}
+
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+       struct perf_sample_data sample;
+       struct pt_regs *regs = data;
+
+       sample.addr = bp->attr.bp_addr;
+
+       if (!perf_exclude_event(bp, regs))
+               perf_swevent_add(bp, 1, 1, &sample, regs);
+}
+#else
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+       return NULL;
+}
+
+void perf_bp_event(struct perf_event *bp, void *regs)
+{
+}
 #endif
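
With PERF_TYPE_BREAKPOINT routed through perf_event_alloc() below, a hardware breakpoint becomes an ordinary perf event that an in-kernel user can create with perf_event_create_kernel_counter(). A rough sketch only: the bp_addr/bp_type/bp_len attributes and the HW_BREAKPOINT_* constants come from the new <linux/hw_breakpoint.h> added elsewhere in this merge, the callback follows the perf_callback_t signature, and my_variable, my_hit() and watch_my_variable() are made-up names:

#include <linux/kernel.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static int my_variable;

static void my_hit(struct perf_event *bp, void *data)
{
        pr_info("write to my_variable observed\n");
}

static struct perf_event *watch_my_variable(void)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_BREAKPOINT,
                .size           = sizeof(attr),
                .bp_addr        = (unsigned long)&my_variable,
                .bp_len         = HW_BREAKPOINT_LEN_4,
                .bp_type        = HW_BREAKPOINT_W,
                .sample_period  = 1,    /* fire on every hit */
        };

        /* cpu 0, no task (pid == -1), my_hit() as the callback */
        return perf_event_create_kernel_counter(&attr, 0, -1, my_hit);
}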
 
 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4208,6 +4368,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
        case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
        case PERF_COUNT_SW_CONTEXT_SWITCHES:
        case PERF_COUNT_SW_CPU_MIGRATIONS:
+       case PERF_COUNT_SW_ALIGNMENT_FAULTS:
+       case PERF_COUNT_SW_EMULATION_FAULTS:
                if (!event->parent) {
                        atomic_inc(&perf_swevent_enabled[event_id]);
                        event->destroy = sw_perf_event_destroy;
@@ -4228,6 +4390,7 @@ perf_event_alloc(struct perf_event_attr *attr,
                   struct perf_event_context *ctx,
                   struct perf_event *group_leader,
                   struct perf_event *parent_event,
+                  perf_callback_t callback,
                   gfp_t gfpflags)
 {
        const struct pmu *pmu;
@@ -4270,6 +4433,11 @@ perf_event_alloc(struct perf_event_attr *attr,
 
        event->state            = PERF_EVENT_STATE_INACTIVE;
 
+       if (!callback && parent_event)
+               callback = parent_event->callback;
+
+       event->callback = callback;
+
        if (attr->disabled)
                event->state = PERF_EVENT_STATE_OFF;
 
@@ -4304,6 +4472,11 @@ perf_event_alloc(struct perf_event_attr *attr,
                pmu = tp_perf_event_init(event);
                break;
 
+       case PERF_TYPE_BREAKPOINT:
+               pmu = bp_perf_event_init(event);
+               break;
+
+
        default:
                break;
        }
@@ -4416,7 +4589,7 @@ err_size:
        goto out;
 }
 
-int perf_event_set_output(struct perf_event *event, int output_fd)
+static int perf_event_set_output(struct perf_event *event, int output_fd)
 {
        struct perf_event *output_event = NULL;
        struct file *output_file = NULL;
@@ -4546,7 +4719,7 @@ SYSCALL_DEFINE5(perf_event_open,
        }
 
        event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-                                    NULL, GFP_KERNEL);
+                                    NULL, NULL, GFP_KERNEL);
        err = PTR_ERR(event);
        if (IS_ERR(event))
                goto err_put_context;
@@ -4594,6 +4767,60 @@ err_put_context:
        return err;
 }
 
+/**
+ * perf_event_create_kernel_counter - create a counter from within the kernel
+ *
+ * @attr: attributes of the counter to create
+ * @cpu: cpu to which the counter is bound
+ * @pid: task to profile
+ */
+struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+                                pid_t pid, perf_callback_t callback)
+{
+       struct perf_event *event;
+       struct perf_event_context *ctx;
+       int err;
+
+       /*
+        * Get the target context (task or percpu):
+        */
+
+       ctx = find_get_context(pid, cpu);
+       if (IS_ERR(ctx)) {
+               err = PTR_ERR(ctx);
+               goto err_exit;
+       }
+
+       event = perf_event_alloc(attr, cpu, ctx, NULL,
+                                    NULL, callback, GFP_KERNEL);
+       if (IS_ERR(event)) {
+               err = PTR_ERR(event);
+               goto err_put_context;
+       }
+
+       event->filp = NULL;
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       perf_install_in_context(ctx, event, cpu);
+       ++ctx->generation;
+       mutex_unlock(&ctx->mutex);
+
+       event->owner = current;
+       get_task_struct(current);
+       mutex_lock(&current->perf_event_mutex);
+       list_add_tail(&event->owner_entry, &current->perf_event_list);
+       mutex_unlock(&current->perf_event_mutex);
+
+       return event;
+
+ err_put_context:
+       put_ctx(ctx);
+ err_exit:
+       return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+
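
Taken together with perf_event_read_value() and perf_event_release_kernel() above, this export gives in-kernel users a complete create/read/release cycle. A minimal sketch counting CPU cycles on cpu 0; cycles_event and the two helper names are hypothetical:

#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/perf_event.h>

static struct perf_event *cycles_event;

static int start_cycle_counter(void)
{
        struct perf_event_attr attr = {
                .type   = PERF_TYPE_HARDWARE,
                .size   = sizeof(attr),
                .config = PERF_COUNT_HW_CPU_CYCLES,
        };

        /* cpu 0, no task (pid == -1), no callback */
        cycles_event = perf_event_create_kernel_counter(&attr, 0, -1, NULL);
        if (IS_ERR(cycles_event))
                return PTR_ERR(cycles_event);

        return 0;
}

static void stop_cycle_counter(void)
{
        u64 count, enabled, running;

        count = perf_event_read_value(cycles_event, &enabled, &running);
        pr_info("cycles: %llu (enabled %llu ns, running %llu ns)\n",
                (unsigned long long)count,
                (unsigned long long)enabled,
                (unsigned long long)running);

        perf_event_release_kernel(cycles_event);
}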
 /*
  * inherit an event from parent task to child task:
  */
@@ -4619,7 +4846,7 @@ inherit_event(struct perf_event *parent_event,
        child_event = perf_event_alloc(&parent_event->attr,
                                           parent_event->cpu, child_ctx,
                                           group_leader, parent_event,
-                                          GFP_KERNEL);
+                                          NULL, GFP_KERNEL);
        if (IS_ERR(child_event))
                return child_event;
        get_ctx(child_ctx);
@@ -4637,6 +4864,8 @@ inherit_event(struct perf_event *parent_event,
        if (parent_event->attr.freq)
                child_event->hw.sample_period = parent_event->hw.sample_period;
 
+       child_event->overflow_handler = parent_event->overflow_handler;
+
        /*
         * Link it up in the child's context:
         */
@@ -4726,7 +4955,6 @@ __perf_event_exit_task(struct perf_event *child_event,
 {
        struct perf_event *parent_event;
 
-       update_event_times(child_event);
        perf_event_remove_from_context(child_event);
 
        parent_event = child_event->parent;
@@ -4778,6 +5006,7 @@ void perf_event_exit_task(struct task_struct *child)
         * the events from it.
         */
        unclone_ctx(child_ctx);
+       update_context_time(child_ctx);
        spin_unlock_irqrestore(&child_ctx->lock, flags);
 
        /*
index fe08008133dad06e0acd6efdc98db25c64367565..6b982f2cf524d82e6b72f276f7d9010df9007e47 100644 (file)
@@ -28,7 +28,8 @@
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
-#include <trace/events/sched.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/signal.h>
 
 #include <asm/param.h>
 #include <asm/uaccess.h>
@@ -856,7 +857,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
        struct sigqueue *q;
        int override_rlimit;
 
-       trace_sched_signal_send(sig, t);
+       trace_signal_generate(sig, info, t);
 
        assert_spin_locked(&t->sighand->siglock);
 
@@ -918,12 +919,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
                        break;
                }
        } else if (!is_si_special(info)) {
-               if (sig >= SIGRTMIN && info->si_code != SI_USER)
-               /*
-                * Queue overflow, abort.  We may abort if the signal was rt
-                * and sent by user using something other than kill().
-                */
+               if (sig >= SIGRTMIN && info->si_code != SI_USER) {
+                       /*
+                        * Queue overflow, abort.  We may abort if the
+                        * signal was rt and sent by user using something
+                        * other than kill().
+                        */
+                       trace_signal_overflow_fail(sig, group, info);
                        return -EAGAIN;
+               } else {
+                       /*
+                        * This is a silent loss of information.  We still
+                        * send the signal, but the *info bits are lost.
+                        */
+                       trace_signal_lose_info(sig, group, info);
+               }
        }
 
 out_set:
@@ -1859,6 +1869,9 @@ relock:
                        ka = &sighand->action[signr-1];
                }
 
+               /* Trace actually delivered signals. */
+               trace_signal_deliver(signr, info, ka);
+
                if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
                        continue;
                if (ka->sa.sa_handler != SIG_DFL) {
index b416512ad17ff77eea13b6b391d907f32f76143b..d006554888dc68752a8813e5f6063379c2d98f47 100644 (file)
@@ -339,6 +339,27 @@ config POWER_TRACER
          power management decisions, specifically the C-state and P-state
          behavior.
 
+config KSYM_TRACER
+       bool "Trace read and write access on kernel memory locations"
+       depends on HAVE_HW_BREAKPOINT
+       select TRACING
+       help
+         This tracer helps find read and write operations on any given kernel
+         symbol, i.e. any symbol listed in /proc/kallsyms.
+
+config PROFILE_KSYM_TRACER
+       bool "Profile all kernel memory accesses on 'watched' variables"
+       depends on KSYM_TRACER
+       help
+         This tracer profiles kernel accesses to variables watched through
+         the ksym tracer ftrace plugin. Depending upon the hardware, all
+         read and write operations on watched kernel variables can be
+         monitored and counted.
+
+         The results will be displayed in:
+         /debugfs/tracing/profile_ksym
+
+         Say N if unsure.
 
 config STACK_TRACER
        bool "Trace max stack"
@@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE
 
          If unsure, say N.
 
+config KPROBE_EVENT
+       depends on KPROBES
+       depends on X86
+       bool "Enable kprobes-based dynamic events"
+       select TRACING
+       default y
+       help
+         This allows the user to add tracing events (similar to tracepoints)
+         on the fly via the ftrace interface. See
+         Documentation/trace/kprobetrace.txt for more details.
+
+         Those events can be inserted wherever kprobes can probe, and record
+         various register and memory values.
+
+         This option is also required by the perf-probe subcommand of perf
+         tools. If you want to use perf tools, it is strongly recommended.
+
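
The interface this option enables is the kprobe_events file under the tracing debugfs directory; the full grammar lives in the new Documentation/trace/kprobetrace.txt. As a minimal example (probe name and debugfs mount point assumed, no fetch arguments), a probe on do_sys_open could be added and armed with:

        echo 'p:myprobe do_sys_open' > /sys/kernel/debug/tracing/kprobe_events
        echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable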
 config DYNAMIC_FTRACE
        bool "enable/disable ftrace tracepoints dynamically"
        depends on FUNCTION_TRACER
index 26f03ac07c2bc2164ce809cea5a48fce15d09ff1..cd9ecd89ec7714d34f16fd541beabd9ce0e504d2 100644 (file)
@@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 obj-$(CONFIG_EVENT_TRACING) += power-traces.o
 
 libftrace-y := ftrace.o
index a72c6e03deecb5224c568ba61f2e123a87bec1a8..a1ca4956ab5ec3673b04ddd839dd9edec26d1bf4 100644 (file)
@@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
        int ret;
 
        ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
-                              "offset:0;\tsize:%u;\n",
-                              (unsigned int)sizeof(field.time_stamp));
+                              "offset:0;\tsize:%u;\tsigned:%u;\n",
+                              (unsigned int)sizeof(field.time_stamp),
+                              (unsigned int)is_signed_type(u64));
 
        ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
-                              "offset:%u;\tsize:%u;\n",
+                              "offset:%u;\tsize:%u;\tsigned:%u;\n",
                               (unsigned int)offsetof(typeof(field), commit),
-                              (unsigned int)sizeof(field.commit));
+                              (unsigned int)sizeof(field.commit),
+                              (unsigned int)is_signed_type(long));
 
        ret = trace_seq_printf(s, "\tfield: char data;\t"
-                              "offset:%u;\tsize:%u;\n",
+                              "offset:%u;\tsize:%u;\tsigned:%u;\n",
                               (unsigned int)offsetof(typeof(field), data),
-                              (unsigned int)BUF_PAGE_SIZE);
+                              (unsigned int)BUF_PAGE_SIZE,
+                              (unsigned int)is_signed_type(char));
 
        return ret;
 }
index acef8b4636f0b605d640567a29096fe6200cec19..1d7f4830a80d93dd2b3c77a837ad18c5123819fc 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
+#include <linux/hw_breakpoint.h>
 
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
        TRACE_KMEM_ALLOC,
        TRACE_KMEM_FREE,
        TRACE_BLK,
+       TRACE_KSYM,
 
        __TRACE_LAST_TYPE,
 };
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
 struct syscall_trace_exit {
        struct trace_entry      ent;
        int                     nr;
-       unsigned long           ret;
+       long                    ret;
 };
 
+struct kprobe_trace_entry {
+       struct trace_entry      ent;
+       unsigned long           ip;
+       int                     nargs;
+       unsigned long           args[];
+};
+
+#define SIZEOF_KPROBE_TRACE_ENTRY(n)                   \
+       (offsetof(struct kprobe_trace_entry, args) +    \
+       (sizeof(unsigned long) * (n)))
+
+struct kretprobe_trace_entry {
+       struct trace_entry      ent;
+       unsigned long           func;
+       unsigned long           ret_ip;
+       int                     nargs;
+       unsigned long           args[];
+};
+
+#define SIZEOF_KRETPROBE_TRACE_ENTRY(n)                        \
+       (offsetof(struct kretprobe_trace_entry, args) + \
+       (sizeof(unsigned long) * (n)))
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
                          TRACE_KMEM_ALLOC);    \
                IF_ASSIGN(var, ent, struct kmemtrace_free_entry,        \
                          TRACE_KMEM_FREE);     \
+               IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
                __ftrace_bad_type();                                    \
        } while (0)
 
@@ -364,6 +390,8 @@ int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 int is_tracing_stopped(void);
 
+extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
+
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -438,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
                                         struct trace_array *tr);
 extern int trace_selftest_startup_hw_branches(struct tracer *trace,
                                              struct trace_array *tr);
+extern int trace_selftest_startup_ksym(struct tracer *trace,
+                                        struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
@@ -683,7 +713,6 @@ struct event_filter {
        int                     n_preds;
        struct filter_pred      **preds;
        char                    *filter_string;
-       bool                    no_reset;
 };
 
 struct event_subsystem {
@@ -703,7 +732,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
 typedef int (*regex_match_func)(char *str, struct regex *r, int len);
 
 enum regex_type {
-       MATCH_FULL,
+       MATCH_FULL = 0,
        MATCH_FRONT_ONLY,
        MATCH_MIDDLE_ONLY,
        MATCH_END_ONLY,
@@ -744,7 +773,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
                     struct ring_buffer *buffer,
                     struct ring_buffer_event *event)
 {
-       if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+       if (unlikely(call->filter_active) &&
+           !filter_match_preds(call->filter, rec)) {
                ring_buffer_discard_commit(buffer, event);
                return 1;
        }
index ead3d724599d2d701b29014c1eb575529162d4f9..c16a08f399df53e9d9728d11e039f485b72e4986 100644 (file)
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
        F_printk("type:%u call_site:%lx ptr:%p",
                 __entry->type_id, __entry->call_site, __entry->ptr)
 );
+
+FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
+
+       TRACE_KSYM,
+
+       F_STRUCT(
+               __field(        unsigned long,  ip                        )
+               __field(        unsigned char,  type                      )
+               __array(        char         ,  cmd,       TASK_COMM_LEN  )
+               __field(        unsigned long,  addr                      )
+       ),
+
+       F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
+               (void *)__entry->ip, (unsigned int)__entry->type,
+               (void *)__entry->addr,  __entry->cmd)
+);
index 8d5c171cc9987d924f9fcfd3328fcb966288b9a0..d9c60f80aa0d20958c647a86310b29701b4bcdcc 100644 (file)
@@ -8,17 +8,14 @@
 #include <linux/module.h>
 #include "trace.h"
 
-/*
- * We can't use a size but a type in alloc_percpu()
- * So let's create a dummy type that matches the desired size
- */
-typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
 
-char           *trace_profile_buf;
-EXPORT_SYMBOL_GPL(trace_profile_buf);
+char *perf_trace_buf;
+EXPORT_SYMBOL_GPL(perf_trace_buf);
+
+char *perf_trace_buf_nmi;
+EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
 
-char           *trace_profile_buf_nmi;
-EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
+typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t;
 
 /* Count the events in use (per event id, not per instance) */
 static int     total_profile_count;
@@ -32,20 +29,20 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
                return 0;
 
        if (!total_profile_count) {
-               buf = (char *)alloc_percpu(profile_buf_t);
+               buf = (char *)alloc_percpu(perf_trace_t);
                if (!buf)
                        goto fail_buf;
 
-               rcu_assign_pointer(trace_profile_buf, buf);
+               rcu_assign_pointer(perf_trace_buf, buf);
 
-               buf = (char *)alloc_percpu(profile_buf_t);
+               buf = (char *)alloc_percpu(perf_trace_t);
                if (!buf)
                        goto fail_buf_nmi;
 
-               rcu_assign_pointer(trace_profile_buf_nmi, buf);
+               rcu_assign_pointer(perf_trace_buf_nmi, buf);
        }
 
-       ret = event->profile_enable();
+       ret = event->profile_enable(event);
        if (!ret) {
                total_profile_count++;
                return 0;
@@ -53,10 +50,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 
 fail_buf_nmi:
        if (!total_profile_count) {
-               free_percpu(trace_profile_buf_nmi);
-               free_percpu(trace_profile_buf);
-               trace_profile_buf_nmi = NULL;
-               trace_profile_buf = NULL;
+               free_percpu(perf_trace_buf_nmi);
+               free_percpu(perf_trace_buf);
+               perf_trace_buf_nmi = NULL;
+               perf_trace_buf = NULL;
        }
 fail_buf:
        atomic_dec(&event->profile_count);
@@ -89,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
        if (!atomic_add_negative(-1, &event->profile_count))
                return;
 
-       event->profile_disable();
+       event->profile_disable(event);
 
        if (!--total_profile_count) {
-               buf = trace_profile_buf;
-               rcu_assign_pointer(trace_profile_buf, NULL);
+               buf = perf_trace_buf;
+               rcu_assign_pointer(perf_trace_buf, NULL);
 
-               nmi_buf = trace_profile_buf_nmi;
-               rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+               nmi_buf = perf_trace_buf_nmi;
+               rcu_assign_pointer(perf_trace_buf_nmi, NULL);
 
                /*
                 * Ensure every events in profiling have finished before
index 5e9ffc33f6db7b0e0d992b2de0ce52f380c33476..1d18315dc836e6e15d50cc8d17b4eacec56fd6ba 100644 (file)
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
 }
 EXPORT_SYMBOL_GPL(trace_define_common_fields);
 
-#ifdef CONFIG_MODULES
-
-static void trace_destroy_fields(struct ftrace_event_call *call)
+void trace_destroy_fields(struct ftrace_event_call *call)
 {
        struct ftrace_event_field *field, *next;
 
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
        }
 }
 
-#endif /* CONFIG_MODULES */
-
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
                                        int enable)
 {
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
                if (call->enabled) {
                        call->enabled = 0;
                        tracing_stop_cmdline_record();
-                       call->unregfunc(call->data);
+                       call->unregfunc(call);
                }
                break;
        case 1:
                if (!call->enabled) {
                        call->enabled = 1;
                        tracing_start_cmdline_record();
-                       call->regfunc(call->data);
+                       call->regfunc(call);
                }
                break;
        }
@@ -507,7 +503,7 @@ extern char *__bad_type_size(void);
 #define FIELD(type, name)                                              \
        sizeof(type) != sizeof(field.name) ? __bad_type_size() :        \
        #type, "common_" #name, offsetof(typeof(field), name),          \
-               sizeof(field.name)
+               sizeof(field.name), is_signed_type(type)
 
 static int trace_write_header(struct trace_seq *s)
 {
@@ -515,17 +511,17 @@ static int trace_write_header(struct trace_seq *s)
 
        /* struct trace_entry */
        return trace_seq_printf(s,
-                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                               "\n",
-                               FIELD(unsigned short, type),
-                               FIELD(unsigned char, flags),
-                               FIELD(unsigned char, preempt_count),
-                               FIELD(int, pid),
-                               FIELD(int, lock_depth));
+                       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+                       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+                       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+                       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+                       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
+                       "\n",
+                       FIELD(unsigned short, type),
+                       FIELD(unsigned char, flags),
+                       FIELD(unsigned char, preempt_count),
+                       FIELD(int, pid),
+                       FIELD(int, lock_depth));
 }
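
With the extra is_signed_type() argument in FIELD(), every common-field line in a trace event's format file now carries a signedness flag; the first line emitted above, for example, comes out roughly as:

        field:unsigned short common_type;       offset:0;       size:2; signed:0;

(offsets and sizes follow struct trace_entry's layout on the kernel being built).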
 
 static ssize_t
@@ -937,27 +933,46 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
        return 0;
 }
 
-#define for_each_event(event, start, end)                      \
-       for (event = start;                                     \
-            (unsigned long)event < (unsigned long)end;         \
-            event++)
+static int __trace_add_event_call(struct ftrace_event_call *call)
+{
+       struct dentry *d_events;
+       int ret;
 
-#ifdef CONFIG_MODULES
+       if (!call->name)
+               return -EINVAL;
 
-static LIST_HEAD(ftrace_module_file_list);
+       if (call->raw_init) {
+               ret = call->raw_init(call);
+               if (ret < 0) {
+                       if (ret != -ENOSYS)
+                               pr_warning("Could not initialize trace "
+                               "events/%s\n", call->name);
+                       return ret;
+               }
+       }
 
-/*
- * Modules must own their file_operations to keep up with
- * reference counting.
- */
-struct ftrace_module_file_ops {
-       struct list_head                list;
-       struct module                   *mod;
-       struct file_operations          id;
-       struct file_operations          enable;
-       struct file_operations          format;
-       struct file_operations          filter;
-};
+       d_events = event_trace_events_dir();
+       if (!d_events)
+               return -ENOENT;
+
+       ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
+                               &ftrace_enable_fops, &ftrace_event_filter_fops,
+                               &ftrace_event_format_fops);
+       if (!ret)
+               list_add(&call->list, &ftrace_events);
+
+       return ret;
+}
+
+/* Add an additional event_call dynamically */
+int trace_add_event_call(struct ftrace_event_call *call)
+{
+       int ret;
+       mutex_lock(&event_mutex);
+       ret = __trace_add_event_call(call);
+       mutex_unlock(&event_mutex);
+       return ret;
+}
 
 static void remove_subsystem_dir(const char *name)
 {
@@ -985,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
        }
 }
 
+/*
+ * Must be called under locking both of event_mutex and trace_event_mutex.
+ */
+static void __trace_remove_event_call(struct ftrace_event_call *call)
+{
+       ftrace_event_enable_disable(call, 0);
+       if (call->event)
+               __unregister_ftrace_event(call->event);
+       debugfs_remove_recursive(call->dir);
+       list_del(&call->list);
+       trace_destroy_fields(call);
+       destroy_preds(call);
+       remove_subsystem_dir(call->system);
+}
+
+/* Remove an event_call */
+void trace_remove_event_call(struct ftrace_event_call *call)
+{
+       mutex_lock(&event_mutex);
+       down_write(&trace_event_mutex);
+       __trace_remove_event_call(call);
+       up_write(&trace_event_mutex);
+       mutex_unlock(&event_mutex);
+}
+
+#define for_each_event(event, start, end)                      \
+       for (event = start;                                     \
+            (unsigned long)event < (unsigned long)end;         \
+            event++)
+
+#ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+       struct list_head                list;
+       struct module                   *mod;
+       struct file_operations          id;
+       struct file_operations          enable;
+       struct file_operations          format;
+       struct file_operations          filter;
+};
+
 static struct ftrace_module_file_ops *
 trace_create_file_ops(struct module *mod)
 {
@@ -1042,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
                if (!call->name)
                        continue;
                if (call->raw_init) {
-                       ret = call->raw_init();
+                       ret = call->raw_init(call);
                        if (ret < 0) {
                                if (ret != -ENOSYS)
                                        pr_warning("Could not initialize trace "
@@ -1060,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
                                return;
                }
                call->mod = mod;
-               list_add(&call->list, &ftrace_events);
-               event_create_dir(call, d_events,
-                                &file_ops->id, &file_ops->enable,
-                                &file_ops->filter, &file_ops->format);
+               ret = event_create_dir(call, d_events,
+                                      &file_ops->id, &file_ops->enable,
+                                      &file_ops->filter, &file_ops->format);
+               if (!ret)
+                       list_add(&call->list, &ftrace_events);
        }
 }
 
@@ -1077,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
        list_for_each_entry_safe(call, p, &ftrace_events, list) {
                if (call->mod == mod) {
                        found = true;
-                       ftrace_event_enable_disable(call, 0);
-                       if (call->event)
-                               __unregister_ftrace_event(call->event);
-                       debugfs_remove_recursive(call->dir);
-                       list_del(&call->list);
-                       trace_destroy_fields(call);
-                       destroy_preds(call);
-                       remove_subsystem_dir(call->system);
+                       __trace_remove_event_call(call);
                }
        }
 
@@ -1202,7 +1258,7 @@ static __init int event_trace_init(void)
                if (!call->name)
                        continue;
                if (call->raw_init) {
-                       ret = call->raw_init();
+                       ret = call->raw_init(call);
                        if (ret < 0) {
                                if (ret != -ENOSYS)
                                        pr_warning("Could not initialize trace "
@@ -1210,10 +1266,12 @@ static __init int event_trace_init(void)
                                continue;
                        }
                }
-               list_add(&call->list, &ftrace_events);
-               event_create_dir(call, d_events, &ftrace_event_id_fops,
-                                &ftrace_enable_fops, &ftrace_event_filter_fops,
-                                &ftrace_event_format_fops);
+               ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
+                                      &ftrace_enable_fops,
+                                      &ftrace_event_filter_fops,
+                                      &ftrace_event_format_fops);
+               if (!ret)
+                       list_add(&call->list, &ftrace_events);
        }
 
        while (true) {
index 92672016da284ddaa8a004237425d2d393995faf..50504cb228deded61a141aca151c3d802719b2c2 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/mutex.h>
+#include <linux/perf_event.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -29,6 +30,7 @@ enum filter_op_ids
 {
        OP_OR,
        OP_AND,
+       OP_GLOB,
        OP_NE,
        OP_EQ,
        OP_LT,
@@ -46,16 +48,17 @@ struct filter_op {
 };
 
 static struct filter_op filter_ops[] = {
-       { OP_OR, "||", 1 },
-       { OP_AND, "&&", 2 },
-       { OP_NE, "!=", 4 },
-       { OP_EQ, "==", 4 },
-       { OP_LT, "<", 5 },
-       { OP_LE, "<=", 5 },
-       { OP_GT, ">", 5 },
-       { OP_GE, ">=", 5 },
-       { OP_NONE, "OP_NONE", 0 },
-       { OP_OPEN_PAREN, "(", 0 },
+       { OP_OR,        "||",           1 },
+       { OP_AND,       "&&",           2 },
+       { OP_GLOB,      "~",            4 },
+       { OP_NE,        "!=",           4 },
+       { OP_EQ,        "==",           4 },
+       { OP_LT,        "<",            5 },
+       { OP_LE,        "<=",           5 },
+       { OP_GT,        ">",            5 },
+       { OP_GE,        ">=",           5 },
+       { OP_NONE,      "OP_NONE",      0 },
+       { OP_OPEN_PAREN, "(",           0 },
 };
 
 enum {
@@ -329,22 +332,18 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
        return type;
 }
 
-static int filter_build_regex(struct filter_pred *pred)
+static void filter_build_regex(struct filter_pred *pred)
 {
        struct regex *r = &pred->regex;
-       char *search, *dup;
-       enum regex_type type;
-       int not;
-
-       type = filter_parse_regex(r->pattern, r->len, &search, &not);
-       dup = kstrdup(search, GFP_KERNEL);
-       if (!dup)
-               return -ENOMEM;
-
-       strcpy(r->pattern, dup);
-       kfree(dup);
-
-       r->len = strlen(r->pattern);
+       char *search;
+       enum regex_type type = MATCH_FULL;
+       int not = 0;
+
+       if (pred->op == OP_GLOB) {
+               type = filter_parse_regex(r->pattern, r->len, &search, &not);
+               r->len = strlen(search);
+               memmove(r->pattern, search, r->len+1);
+       }
 
        switch (type) {
        case MATCH_FULL:
@@ -362,14 +361,11 @@ static int filter_build_regex(struct filter_pred *pred)
        }
 
        pred->not ^= not;
-
-       return 0;
 }
 
 /* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+int filter_match_preds(struct event_filter *filter, void *rec)
 {
-       struct event_filter *filter = call->filter;
        int match, top = 0, val1 = 0, val2 = 0;
        int stack[MAX_FILTER_PRED];
        struct filter_pred *pred;
@@ -542,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
                filter->preds[i]->fn = filter_pred_none;
 }
 
-void destroy_preds(struct ftrace_event_call *call)
+static void __free_preds(struct event_filter *filter)
 {
-       struct event_filter *filter = call->filter;
        int i;
 
        if (!filter)
@@ -557,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
        kfree(filter->preds);
        kfree(filter->filter_string);
        kfree(filter);
+}
+
+void destroy_preds(struct ftrace_event_call *call)
+{
+       __free_preds(call->filter);
        call->filter = NULL;
+       call->filter_active = 0;
 }
 
-static int init_preds(struct ftrace_event_call *call)
+static struct event_filter *__alloc_preds(void)
 {
        struct event_filter *filter;
        struct filter_pred *pred;
        int i;
 
-       if (call->filter)
-               return 0;
-
-       filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-       if (!call->filter)
-               return -ENOMEM;
+       filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+       if (!filter)
+               return ERR_PTR(-ENOMEM);
 
        filter->n_preds = 0;
 
@@ -587,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
                filter->preds[i] = pred;
        }
 
-       return 0;
+       return filter;
 
 oom:
-       destroy_preds(call);
+       __free_preds(filter);
+       return ERR_PTR(-ENOMEM);
+}
+
+static int init_preds(struct ftrace_event_call *call)
+{
+       if (call->filter)
+               return 0;
 
-       return -ENOMEM;
+       call->filter_active = 0;
+       call->filter = __alloc_preds();
+       if (IS_ERR(call->filter))
+               return PTR_ERR(call->filter);
+
+       return 0;
 }
 
 static int init_subsystem_preds(struct event_subsystem *system)
@@ -615,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
        return 0;
 }
 
-enum {
-       FILTER_DISABLE_ALL,
-       FILTER_INIT_NO_RESET,
-       FILTER_SKIP_NO_RESET,
-};
-
-static void filter_free_subsystem_preds(struct event_subsystem *system,
-                                       int flag)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
 {
        struct ftrace_event_call *call;
 
@@ -633,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
                if (strcmp(call->system, system->name) != 0)
                        continue;
 
-               if (flag == FILTER_INIT_NO_RESET) {
-                       call->filter->no_reset = false;
-                       continue;
-               }
-
-               if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
-                       continue;
-
                filter_disable_preds(call);
                remove_filter_string(call->filter);
        }
@@ -648,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
 
 static int filter_add_pred_fn(struct filter_parse_state *ps,
                              struct ftrace_event_call *call,
+                             struct event_filter *filter,
                              struct filter_pred *pred,
                              filter_pred_fn_t fn)
 {
-       struct event_filter *filter = call->filter;
        int idx, err;
 
        if (filter->n_preds == MAX_FILTER_PRED) {
@@ -666,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
                return err;
 
        filter->n_preds++;
-       call->filter_active = 1;
 
        return 0;
 }
@@ -691,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field)
 
 static int is_legal_op(struct ftrace_event_field *field, int op)
 {
-       if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
+       if (is_string_field(field) &&
+           (op != OP_EQ && op != OP_NE && op != OP_GLOB))
+               return 0;
+       if (!is_string_field(field) && op == OP_GLOB)
                return 0;
 
        return 1;
@@ -742,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
 
 static int filter_add_pred(struct filter_parse_state *ps,
                           struct ftrace_event_call *call,
+                          struct event_filter *filter,
                           struct filter_pred *pred,
                           bool dry_run)
 {
@@ -776,15 +774,13 @@ static int filter_add_pred(struct filter_parse_state *ps,
        }
 
        if (is_string_field(field)) {
-               ret = filter_build_regex(pred);
-               if (ret)
-                       return ret;
+               filter_build_regex(pred);
 
                if (field->filter_type == FILTER_STATIC_STRING) {
                        fn = filter_pred_string;
                        pred->regex.field_len = field->size;
                } else if (field->filter_type == FILTER_DYN_STRING)
-                               fn = filter_pred_strloc;
+                       fn = filter_pred_strloc;
                else {
                        fn = filter_pred_pchar;
                        pred->regex.field_len = strlen(pred->regex.pattern);
@@ -813,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 
 add_pred_fn:
        if (!dry_run)
-               return filter_add_pred_fn(ps, call, pred, fn);
-       return 0;
-}
-
-static int filter_add_subsystem_pred(struct filter_parse_state *ps,
-                                    struct event_subsystem *system,
-                                    struct filter_pred *pred,
-                                    char *filter_string,
-                                    bool dry_run)
-{
-       struct ftrace_event_call *call;
-       int err = 0;
-       bool fail = true;
-
-       list_for_each_entry(call, &ftrace_events, list) {
-
-               if (!call->define_fields)
-                       continue;
-
-               if (strcmp(call->system, system->name))
-                       continue;
-
-               if (call->filter->no_reset)
-                       continue;
-
-               err = filter_add_pred(ps, call, pred, dry_run);
-               if (err)
-                       call->filter->no_reset = true;
-               else
-                       fail = false;
-
-               if (!dry_run)
-                       replace_filter_string(call->filter, filter_string);
-       }
-
-       if (fail) {
-               parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-               return err;
-       }
+               return filter_add_pred_fn(ps, call, filter, pred, fn);
        return 0;
 }
 
@@ -1209,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps)
        return 0;
 }
 
-static int replace_preds(struct event_subsystem *system,
-                        struct ftrace_event_call *call,
+static int replace_preds(struct ftrace_event_call *call,
+                        struct event_filter *filter,
                         struct filter_parse_state *ps,
                         char *filter_string,
                         bool dry_run)
@@ -1257,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system,
 add_pred:
                if (!pred)
                        return -ENOMEM;
-               if (call)
-                       err = filter_add_pred(ps, call, pred, false);
-               else
-                       err = filter_add_subsystem_pred(ps, system, pred,
-                                               filter_string, dry_run);
+               err = filter_add_pred(ps, call, filter, pred, dry_run);
                filter_free_pred(pred);
                if (err)
                        return err;
@@ -1272,10 +1226,50 @@ add_pred:
        return 0;
 }
 
-int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+static int replace_system_preds(struct event_subsystem *system,
+                               struct filter_parse_state *ps,
+                               char *filter_string)
 {
+       struct ftrace_event_call *call;
+       bool fail = true;
        int err;
 
+       list_for_each_entry(call, &ftrace_events, list) {
+               struct event_filter *filter = call->filter;
+
+               if (!call->define_fields)
+                       continue;
+
+               if (strcmp(call->system, system->name) != 0)
+                       continue;
+
+               /* try to see if the filter can be applied */
+               err = replace_preds(call, filter, ps, filter_string, true);
+               if (err)
+                       continue;
+
+               /* really apply the filter */
+               filter_disable_preds(call);
+               err = replace_preds(call, filter, ps, filter_string, false);
+               if (err)
+                       filter_disable_preds(call);
+               else {
+                       call->filter_active = 1;
+                       replace_filter_string(filter, filter_string);
+               }
+               fail = false;
+       }
+
+       if (fail) {
+               parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+       int err;
        struct filter_parse_state *ps;
 
        mutex_lock(&event_mutex);
@@ -1287,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
        if (!strcmp(strstrip(filter_string), "0")) {
                filter_disable_preds(call);
                remove_filter_string(call->filter);
-               mutex_unlock(&event_mutex);
-               return 0;
+               goto out_unlock;
        }
 
        err = -ENOMEM;
@@ -1306,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
                goto out;
        }
 
-       err = replace_preds(NULL, call, ps, filter_string, false);
+       err = replace_preds(call, call->filter, ps, filter_string, false);
        if (err)
                append_filter_err(ps, call->filter);
-
+       else
+               call->filter_active = 1;
 out:
        filter_opstack_clear(ps);
        postfix_clear(ps);
@@ -1324,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
                                 char *filter_string)
 {
        int err;
-
        struct filter_parse_state *ps;
 
        mutex_lock(&event_mutex);
@@ -1334,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
                goto out_unlock;
 
        if (!strcmp(strstrip(filter_string), "0")) {
-               filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
+               filter_free_subsystem_preds(system);
                remove_filter_string(system->filter);
-               mutex_unlock(&event_mutex);
-               return 0;
+               goto out_unlock;
        }
 
        err = -ENOMEM;
@@ -1354,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
                goto out;
        }
 
-       filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
-
-       /* try to see the filter can be applied to which events */
-       err = replace_preds(system, NULL, ps, filter_string, true);
-       if (err) {
+       err = replace_system_preds(system, ps, filter_string);
+       if (err)
                append_filter_err(ps, system->filter);
-               goto out;
+
+out:
+       filter_opstack_clear(ps);
+       postfix_clear(ps);
+       kfree(ps);
+out_unlock:
+       mutex_unlock(&event_mutex);
+
+       return err;
+}
+
+#ifdef CONFIG_EVENT_PROFILE
+
+void ftrace_profile_free_filter(struct perf_event *event)
+{
+       struct event_filter *filter = event->filter;
+
+       event->filter = NULL;
+       __free_preds(filter);
+}
+
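+/*
+ * Presumably reached from the perf syscall side, i.e. when userspace does
+ * ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "filter string") on a tracepoint
+ * event; the resulting event_filter hangs off the perf_event itself rather
+ * than off the ftrace_event_call, so per-event-fd filters do not disturb
+ * the global trace filter of the call.
+ */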
+int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+                             char *filter_str)
+{
+       int err;
+       struct event_filter *filter;
+       struct filter_parse_state *ps;
+       struct ftrace_event_call *call = NULL;
+
+       mutex_lock(&event_mutex);
+
+       list_for_each_entry(call, &ftrace_events, list) {
+               if (call->id == event_id)
+                       break;
        }
 
-       filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
+       err = -EINVAL;
+       if (!call)
+               goto out_unlock;
 
-       /* really apply the filter to the events */
-       err = replace_preds(system, NULL, ps, filter_string, false);
-       if (err) {
-               append_filter_err(ps, system->filter);
-               filter_free_subsystem_preds(system, 2);
+       err = -EEXIST;
+       if (event->filter)
+               goto out_unlock;
+
+       filter = __alloc_preds();
+       if (IS_ERR(filter)) {
+               err = PTR_ERR(filter);
+               goto out_unlock;
        }
 
-out:
+       err = -ENOMEM;
+       ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+       if (!ps)
+               goto free_preds;
+
+       parse_init(ps, filter_ops, filter_str);
+       err = filter_parse(ps);
+       if (err)
+               goto free_ps;
+
+       err = replace_preds(call, filter, ps, filter_str, false);
+       if (!err)
+               event->filter = filter;
+
+free_ps:
        filter_opstack_clear(ps);
        postfix_clear(ps);
        kfree(ps);
+
+free_preds:
+       if (err)
+               __free_preds(filter);
+
 out_unlock:
        mutex_unlock(&event_mutex);
 
        return err;
 }
 
+#endif /* CONFIG_EVENT_PROFILE */
+
index c74848ddb85a236916bfb7bdf64d322a6f37008d..dff8c84ddf17589543ca90e71af630cf7b0755b7 100644 (file)
@@ -66,44 +66,47 @@ static void __always_unused ____ftrace_check_##name(void)   \
 #undef __field
 #define __field(type, item)                                            \
        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                              "offset:%zu;\tsize:%zu;\n",              \
+                              "offset:%zu;\tsize:%zu;\tsigned:%u;\n",  \
                               offsetof(typeof(field), item),           \
-                              sizeof(field.item));                     \
+                              sizeof(field.item), is_signed_type(type)); \
        if (!ret)                                                       \
                return 0;
 
 #undef __field_desc
 #define __field_desc(type, container, item)                            \
        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                              "offset:%zu;\tsize:%zu;\n",              \
+                              "offset:%zu;\tsize:%zu;\tsigned:%u;\n",  \
                               offsetof(typeof(field), container.item), \
-                              sizeof(field.container.item));           \
+                              sizeof(field.container.item),            \
+                              is_signed_type(type));                   \
        if (!ret)                                                       \
                return 0;
 
 #undef __array
 #define __array(type, item, len)                                       \
        ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
-                              "offset:%zu;\tsize:%zu;\n",              \
-                              offsetof(typeof(field), item),   \
-                              sizeof(field.item));             \
+                              "offset:%zu;\tsize:%zu;\tsigned:%u;\n",  \
+                              offsetof(typeof(field), item),           \
+                              sizeof(field.item), is_signed_type(type)); \
        if (!ret)                                                       \
                return 0;
 
 #undef __array_desc
 #define __array_desc(type, container, item, len)                       \
        ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
-                              "offset:%zu;\tsize:%zu;\n",              \
+                              "offset:%zu;\tsize:%zu;\tsigned:%u;\n",  \
                               offsetof(typeof(field), container.item), \
-                              sizeof(field.container.item));           \
+                              sizeof(field.container.item),            \
+                              is_signed_type(type));                   \
        if (!ret)                                                       \
                return 0;
 
 #undef __dynamic_array
 #define __dynamic_array(type, item)                                    \
        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                              "offset:%zu;\tsize:0;\n",                \
-                              offsetof(typeof(field), item));          \
+                              "offset:%zu;\tsize:0;\tsigned:%u;\n",    \
+                              offsetof(typeof(field), item),           \
+                              is_signed_type(type));                   \
        if (!ret)                                                       \
                return 0;
 
@@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused,                      \
 
 #include "trace_entries.h"
 
-
 #undef __field
 #define __field(type, item)                                            \
        ret = trace_define_field(event_call, #type, #item,              \
@@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)  \
 
 #include "trace_entries.h"
 
+static int ftrace_raw_init_event(struct ftrace_event_call *call)
+{
+       INIT_LIST_HEAD(&call->fields);
+       return 0;
+}
 
 #undef __field
 #define __field(type, item)
@@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)   \
 
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, type, tstruct, print)          \
-static int ftrace_raw_init_event_##call(void);                         \
                                                                        \
 struct ftrace_event_call __used                                                \
 __attribute__((__aligned__(4)))                                                \
@@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = {                \
        .name                   = #call,                                \
        .id                     = type,                                 \
        .system                 = __stringify(TRACE_SYSTEM),            \
-       .raw_init               = ftrace_raw_init_event_##call,         \
+       .raw_init               = ftrace_raw_init_event,                \
        .show_format            = ftrace_format_##call,                 \
        .define_fields          = ftrace_define_fields_##call,          \
 };                                                                     \
-static int ftrace_raw_init_event_##call(void)                          \
-{                                                                      \
-       INIT_LIST_HEAD(&event_##call.fields);                           \
-       return 0;                                                       \
-}                                                                      \
 
 #include "trace_entries.h"
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644 (file)
index 0000000..aff5f80
--- /dev/null
@@ -0,0 +1,1523 @@
+/*
+ * Kprobes-based tracing events
+ *
+ * Created by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+#include <linux/perf_event.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define MAX_TRACE_ARGS 128
+#define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
+#define KPROBE_EVENT_SYSTEM "kprobes"
+
+/* Reserved field names */
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_NARGS "__probe_nargs"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
+
+const char *reserved_field_names[] = {
+       "common_type",
+       "common_flags",
+       "common_preempt_count",
+       "common_pid",
+       "common_tgid",
+       "common_lock_depth",
+       FIELD_STRING_IP,
+       FIELD_STRING_NARGS,
+       FIELD_STRING_RETIP,
+       FIELD_STRING_FUNC,
+};
+
+struct fetch_func {
+       unsigned long (*func)(struct pt_regs *, void *);
+       void *data;
+};
+
+static __kprobes unsigned long call_fetch(struct fetch_func *f,
+                                         struct pt_regs *regs)
+{
+       return f->func(regs, f->data);
+}
+
+/* fetch handlers */
+static __kprobes unsigned long fetch_register(struct pt_regs *regs,
+                                             void *offset)
+{
+       return regs_get_register(regs, (unsigned int)((unsigned long)offset));
+}
+
+static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
+                                          void *num)
+{
+       return regs_get_kernel_stack_nth(regs,
+                                        (unsigned int)((unsigned long)num));
+}
+
+static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
+{
+       unsigned long retval;
+
+       if (probe_kernel_address(addr, retval))
+               return 0;
+       return retval;
+}
+
+static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
+{
+       return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
+}
+
+static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
+                                             void *dummy)
+{
+       return regs_return_value(regs);
+}
+
+static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
+                                                  void *dummy)
+{
+       return kernel_stack_pointer(regs);
+}
+
+/* Memory fetching by symbol */
+struct symbol_cache {
+       char *symbol;
+       long offset;
+       unsigned long addr;
+};
+
+static unsigned long update_symbol_cache(struct symbol_cache *sc)
+{
+       sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
+       if (sc->addr)
+               sc->addr += sc->offset;
+       return sc->addr;
+}
+
+static void free_symbol_cache(struct symbol_cache *sc)
+{
+       kfree(sc->symbol);
+       kfree(sc);
+}
+
+static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+       struct symbol_cache *sc;
+
+       if (!sym || strlen(sym) == 0)
+               return NULL;
+       sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+       if (!sc)
+               return NULL;
+
+       sc->symbol = kstrdup(sym, GFP_KERNEL);
+       if (!sc->symbol) {
+               kfree(sc);
+               return NULL;
+       }
+       sc->offset = offset;
+
+       update_symbol_cache(sc);
+       return sc;
+}
+
+static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
+{
+       struct symbol_cache *sc = data;
+
+       if (sc->addr)
+               return fetch_memory(regs, (void *)sc->addr);
+       else
+               return 0;
+}
+
+/* Special indirect memory access interface */
+struct indirect_fetch_data {
+       struct fetch_func orig;
+       long offset;
+};
+
+static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
+{
+       struct indirect_fetch_data *ind = data;
+       unsigned long addr;
+
+       addr = call_fetch(&ind->orig, regs);
+       if (addr) {
+               addr += ind->offset;
+               return fetch_memory(regs, (void *)addr);
+       } else
+               return 0;
+}
+
+static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
+{
+       if (data->orig.func == fetch_indirect)
+               free_indirect_fetch_data(data->orig.data);
+       else if (data->orig.func == fetch_symbol)
+               free_symbol_cache(data->orig.data);
+       kfree(data);
+}
+
+/**
+ * Kprobe event core functions
+ */
+
+struct probe_arg {
+       struct fetch_func       fetch;
+       const char              *name;
+};
+
+/* Flags for trace_probe */
+#define TP_FLAG_TRACE  1
+#define TP_FLAG_PROFILE        2
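+/*
+ * TP_FLAG_TRACE is set while the regular ftrace consumer of this probe is
+ * enabled, TP_FLAG_PROFILE while the perf/profile consumer is; the
+ * underlying k(ret)probe stays enabled as long as either flag is set
+ * (see probe_event_disable() below).
+ */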
+
+struct trace_probe {
+       struct list_head        list;
+       struct kretprobe        rp;     /* Use rp.kp for kprobe use */
+       unsigned long           nhit;
+       unsigned int            flags;  /* For TP_FLAG_* */
+       const char              *symbol;        /* symbol name */
+       struct ftrace_event_call        call;
+       struct trace_event              event;
+       unsigned int            nr_args;
+       struct probe_arg        args[];
+};
+
+#define SIZEOF_TRACE_PROBE(n)                  \
+       (offsetof(struct trace_probe, args) +   \
+       (sizeof(struct probe_arg) * (n)))
+
+static __kprobes int probe_is_return(struct trace_probe *tp)
+{
+       return tp->rp.handler != NULL;
+}
+
+static __kprobes const char *probe_symbol(struct trace_probe *tp)
+{
+       return tp->symbol ? tp->symbol : "unknown";
+}
+
+static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
+{
+       int ret = -EINVAL;
+
+       if (ff->func == fetch_argument)
+               ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
+       else if (ff->func == fetch_register) {
+               const char *name;
+               name = regs_query_register_name((unsigned int)((long)ff->data));
+               ret = snprintf(buf, n, "%%%s", name);
+       } else if (ff->func == fetch_stack)
+               ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
+       else if (ff->func == fetch_memory)
+               ret = snprintf(buf, n, "@0x%p", ff->data);
+       else if (ff->func == fetch_symbol) {
+               struct symbol_cache *sc = ff->data;
+               if (sc->offset)
+                       ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
+                                       sc->offset);
+               else
+                       ret = snprintf(buf, n, "@%s", sc->symbol);
+       } else if (ff->func == fetch_retvalue)
+               ret = snprintf(buf, n, "$retval");
+       else if (ff->func == fetch_stack_address)
+               ret = snprintf(buf, n, "$stack");
+       else if (ff->func == fetch_indirect) {
+               struct indirect_fetch_data *id = ff->data;
+               size_t l = 0;
+               ret = snprintf(buf, n, "%+ld(", id->offset);
+               if (ret >= n)
+                       goto end;
+               l += ret;
+               ret = probe_arg_string(buf + l, n - l, &id->orig);
+               if (ret < 0)
+                       goto end;
+               l += ret;
+               ret = snprintf(buf + l, n - l, ")");
+               ret += l;
+       }
+end:
+       if (ret >= n)
+               return -ENOSPC;
+       return ret;
+}
+
+static int register_probe_event(struct trace_probe *tp);
+static void unregister_probe_event(struct trace_probe *tp);
+
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_dispatcher(struct kretprobe_instance *ri,
+                               struct pt_regs *regs);
+
+/*
+ * Allocate new trace_probe and initialize it (including kprobes).
+ */
+static struct trace_probe *alloc_trace_probe(const char *group,
+                                            const char *event,
+                                            void *addr,
+                                            const char *symbol,
+                                            unsigned long offs,
+                                            int nargs, int is_return)
+{
+       struct trace_probe *tp;
+
+       tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
+       if (!tp)
+               return ERR_PTR(-ENOMEM);
+
+       if (symbol) {
+               tp->symbol = kstrdup(symbol, GFP_KERNEL);
+               if (!tp->symbol)
+                       goto error;
+               tp->rp.kp.symbol_name = tp->symbol;
+               tp->rp.kp.offset = offs;
+       } else
+               tp->rp.kp.addr = addr;
+
+       if (is_return)
+               tp->rp.handler = kretprobe_dispatcher;
+       else
+               tp->rp.kp.pre_handler = kprobe_dispatcher;
+
+       if (!event)
+               goto error;
+       tp->call.name = kstrdup(event, GFP_KERNEL);
+       if (!tp->call.name)
+               goto error;
+
+       if (!group)
+               goto error;
+       tp->call.system = kstrdup(group, GFP_KERNEL);
+       if (!tp->call.system)
+               goto error;
+
+       INIT_LIST_HEAD(&tp->list);
+       return tp;
+error:
+       kfree(tp->call.name);
+       kfree(tp->symbol);
+       kfree(tp);
+       return ERR_PTR(-ENOMEM);
+}
+
+static void free_probe_arg(struct probe_arg *arg)
+{
+       if (arg->fetch.func == fetch_symbol)
+               free_symbol_cache(arg->fetch.data);
+       else if (arg->fetch.func == fetch_indirect)
+               free_indirect_fetch_data(arg->fetch.data);
+       kfree(arg->name);
+}
+
+static void free_trace_probe(struct trace_probe *tp)
+{
+       int i;
+
+       for (i = 0; i < tp->nr_args; i++)
+               free_probe_arg(&tp->args[i]);
+
+       kfree(tp->call.system);
+       kfree(tp->call.name);
+       kfree(tp->symbol);
+       kfree(tp);
+}
+
+static struct trace_probe *find_probe_event(const char *event,
+                                           const char *group)
+{
+       struct trace_probe *tp;
+
+       list_for_each_entry(tp, &probe_list, list)
+               if (strcmp(tp->call.name, event) == 0 &&
+                   strcmp(tp->call.system, group) == 0)
+                       return tp;
+       return NULL;
+}
+
+/* Unregister a trace_probe and probe_event: call with probe_lock held */
+static void unregister_trace_probe(struct trace_probe *tp)
+{
+       if (probe_is_return(tp))
+               unregister_kretprobe(&tp->rp);
+       else
+               unregister_kprobe(&tp->rp.kp);
+       list_del(&tp->list);
+       unregister_probe_event(tp);
+}
+
+/* Register a trace_probe and probe_event */
+static int register_trace_probe(struct trace_probe *tp)
+{
+       struct trace_probe *old_tp;
+       int ret;
+
+       mutex_lock(&probe_lock);
+
+       /* register as an event */
+       old_tp = find_probe_event(tp->call.name, tp->call.system);
+       if (old_tp) {
+               /* delete old event */
+               unregister_trace_probe(old_tp);
+               free_trace_probe(old_tp);
+       }
+       ret = register_probe_event(tp);
+       if (ret) {
+               pr_warning("Failed to register probe event(%d)\n", ret);
+               goto end;
+       }
+
+       tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
+       if (probe_is_return(tp))
+               ret = register_kretprobe(&tp->rp);
+       else
+               ret = register_kprobe(&tp->rp.kp);
+
+       if (ret) {
+               pr_warning("Could not insert probe(%d)\n", ret);
+               if (ret == -EILSEQ) {
+                       pr_warning("Probing address(0x%p) is not an "
+                                  "instruction boundary.\n",
+                                  tp->rp.kp.addr);
+                       ret = -EINVAL;
+               }
+               unregister_probe_event(tp);
+       } else
+               list_add_tail(&tp->list, &probe_list);
+end:
+       mutex_unlock(&probe_lock);
+       return ret;
+}
+
+/* Split symbol and offset. */
+static int split_symbol_offset(char *symbol, unsigned long *offset)
+{
+       char *tmp;
+       int ret;
+
+       if (!offset)
+               return -EINVAL;
+
+       tmp = strchr(symbol, '+');
+       if (tmp) {
+               /* skip sign because strict_strtol doesn't accept '+' */
+               ret = strict_strtoul(tmp + 1, 0, offset);
+               if (ret)
+                       return ret;
+               *tmp = '\0';
+       } else
+               *offset = 0;
+       return 0;
+}
+
+#define PARAM_MAX_ARGS 16
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
+{
+       int ret = 0;
+       unsigned long param;
+
+       if (strcmp(arg, "retval") == 0) {
+               if (is_return) {
+                       ff->func = fetch_retvalue;
+                       ff->data = NULL;
+               } else
+                       ret = -EINVAL;
+       } else if (strncmp(arg, "stack", 5) == 0) {
+               if (arg[5] == '\0') {
+                       ff->func = fetch_stack_address;
+                       ff->data = NULL;
+               } else if (isdigit(arg[5])) {
+                       ret = strict_strtoul(arg + 5, 10, &param);
+                       if (ret || param > PARAM_MAX_STACK)
+                               ret = -EINVAL;
+                       else {
+                               ff->func = fetch_stack;
+                               ff->data = (void *)param;
+                       }
+               } else
+                       ret = -EINVAL;
+       } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
+               ret = strict_strtoul(arg + 3, 10, &param);
+               if (ret || param > PARAM_MAX_ARGS)
+                       ret = -EINVAL;
+               else {
+                       ff->func = fetch_argument;
+                       ff->data = (void *)param;
+               }
+       } else
+               ret = -EINVAL;
+       return ret;
+}
+
+/* Recursive argument parser */
+static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+{
+       int ret = 0;
+       unsigned long param;
+       long offset;
+       char *tmp;
+
+       switch (arg[0]) {
+       case '$':
+               ret = parse_probe_vars(arg + 1, ff, is_return);
+               break;
+       case '%':       /* named register */
+               ret = regs_query_register_offset(arg + 1);
+               if (ret >= 0) {
+                       ff->func = fetch_register;
+                       ff->data = (void *)(unsigned long)ret;
+                       ret = 0;
+               }
+               break;
+       case '@':       /* memory or symbol */
+               if (isdigit(arg[1])) {
+                       ret = strict_strtoul(arg + 1, 0, &param);
+                       if (ret)
+                               break;
+                       ff->func = fetch_memory;
+                       ff->data = (void *)param;
+               } else {
+                       ret = split_symbol_offset(arg + 1, &offset);
+                       if (ret)
+                               break;
+                       ff->data = alloc_symbol_cache(arg + 1, offset);
+                       if (ff->data)
+                               ff->func = fetch_symbol;
+                       else
+                               ret = -EINVAL;
+               }
+               break;
+       case '+':       /* indirect memory */
+       case '-':
+               tmp = strchr(arg, '(');
+               if (!tmp) {
+                       ret = -EINVAL;
+                       break;
+               }
+               *tmp = '\0';
+               ret = strict_strtol(arg + 1, 0, &offset);
+               if (ret)
+                       break;
+               if (arg[0] == '-')
+                       offset = -offset;
+               arg = tmp + 1;
+               tmp = strrchr(arg, ')');
+               if (tmp) {
+                       struct indirect_fetch_data *id;
+                       *tmp = '\0';
+                       id = kzalloc(sizeof(struct indirect_fetch_data),
+                                    GFP_KERNEL);
+                       if (!id)
+                               return -ENOMEM;
+                       id->offset = offset;
+                       ret = __parse_probe_arg(arg, &id->orig, is_return);
+                       if (ret)
+                               kfree(id);
+                       else {
+                               ff->func = fetch_indirect;
+                               ff->data = (void *)id;
+                       }
+               } else
+                       ret = -EINVAL;
+               break;
+       default:
+               /* TODO: support custom handler */
+               ret = -EINVAL;
+       }
+       return ret;
+}
+
+/* String length checking wrapper */
+static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+{
+       if (strlen(arg) > MAX_ARGSTR_LEN) {
+               pr_info("Argument is too long: %s\n", arg);
+               return -ENOSPC;
+       }
+       return __parse_probe_arg(arg, ff, is_return);
+}
+
+/* Return 1 if name is reserved or already used by another argument */
+static int conflict_field_name(const char *name,
+                              struct probe_arg *args, int narg)
+{
+       int i;
+       for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
+               if (strcmp(reserved_field_names[i], name) == 0)
+                       return 1;
+       for (i = 0; i < narg; i++)
+               if (strcmp(args[i].name, name) == 0)
+                       return 1;
+       return 0;
+}
+
+static int create_trace_probe(int argc, char **argv)
+{
+       /*
+        * Argument syntax:
+        *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
+        *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
+        * Fetch args:
+        *  $argN       : fetch Nth of function argument. (N:0-)
+        *  $retval     : fetch return value
+        *  $stack      : fetch stack address
+        *  $stackN     : fetch Nth of stack (N:0-)
+        *  @ADDR       : fetch memory at ADDR (ADDR should be in kernel)
+        *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+        *  %REG        : fetch register REG
+        * Indirect memory fetch:
+        *  +|-offs(ARG) : fetch memory at ARG +|- offs address.
+        * Alias name of args:
+        *  NAME=FETCHARG : set NAME as alias of FETCHARG.
+        */
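+       /*
+        * For example (illustrative only; "myprobe" and "myretprobe" are
+        * made-up event names), as written to the kprobe_events file:
+        *
+        *   p:myprobe do_sys_open dfd=$arg0 flags=$arg2
+        *   r:myretprobe do_sys_open $retval
+        */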
+       struct trace_probe *tp;
+       int i, ret = 0;
+       int is_return = 0;
+       char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
+       unsigned long offset = 0;
+       void *addr = NULL;
+       char buf[MAX_EVENT_NAME_LEN];
+
+       if (argc < 2) {
+               pr_info("Probe point is not specified.\n");
+               return -EINVAL;
+       }
+
+       if (argv[0][0] == 'p')
+               is_return = 0;
+       else if (argv[0][0] == 'r')
+               is_return = 1;
+       else {
+               pr_info("Probe definition must start with 'p' or 'r'.\n");
+               return -EINVAL;
+       }
+
+       if (argv[0][1] == ':') {
+               event = &argv[0][2];
+               if (strchr(event, '/')) {
+                       group = event;
+                       event = strchr(group, '/') + 1;
+                       event[-1] = '\0';
+                       if (strlen(group) == 0) {
+                               pr_info("Group name is not specified\n");
+                               return -EINVAL;
+                       }
+               }
+               if (strlen(event) == 0) {
+                       pr_info("Event name is not specified\n");
+                       return -EINVAL;
+               }
+       }
+
+       if (isdigit(argv[1][0])) {
+               if (is_return) {
+                       pr_info("Return probe point must be a symbol.\n");
+                       return -EINVAL;
+               }
+               /* an address specified */
+               ret = strict_strtoul(argv[1], 0, (unsigned long *)&addr);
+               if (ret) {
+                       pr_info("Failed to parse address.\n");
+                       return ret;
+               }
+       } else {
+               /* a symbol specified */
+               symbol = argv[1];
+               /* TODO: support .init module functions */
+               ret = split_symbol_offset(symbol, &offset);
+               if (ret) {
+                       pr_info("Failed to parse symbol.\n");
+                       return ret;
+               }
+               if (offset && is_return) {
+                       pr_info("Return probe must be used without offset.\n");
+                       return -EINVAL;
+               }
+       }
+       argc -= 2; argv += 2;
+
+       /* setup a probe */
+       if (!group)
+               group = KPROBE_EVENT_SYSTEM;
+       if (!event) {
+               /* Make a new event name */
+               if (symbol)
+                       snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
+                                is_return ? 'r' : 'p', symbol, offset);
+               else
+                       snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
+                                is_return ? 'r' : 'p', addr);
+               event = buf;
+       }
+       tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
+                              is_return);
+       if (IS_ERR(tp)) {
+               pr_info("Failed to allocate trace_probe.(%d)\n",
+                       (int)PTR_ERR(tp));
+               return PTR_ERR(tp);
+       }
+
+       /* parse arguments */
+       ret = 0;
+       for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+               /* Parse argument name */
+               arg = strchr(argv[i], '=');
+               if (arg)
+                       *arg++ = '\0';
+               else
+                       arg = argv[i];
+
+               if (conflict_field_name(argv[i], tp->args, i)) {
+                       pr_info("Argument%d name '%s' conflicts with "
+                               "another field.\n", i, argv[i]);
+                       ret = -EINVAL;
+                       goto error;
+               }
+
+               tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+               if (!tp->args[i].name) {
+                       pr_info("Failed to allocate argument%d name '%s'.\n",
+                               i, argv[i]);
+                       ret = -ENOMEM;
+                       goto error;
+               }
+
+               /* Parse fetch argument */
+               ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
+               if (ret) {
+                       pr_info("Parse error at argument%d. (%d)\n", i, ret);
+                       kfree(tp->args[i].name);
+                       goto error;
+               }
+
+               tp->nr_args++;
+       }
+
+       ret = register_trace_probe(tp);
+       if (ret)
+               goto error;
+       return 0;
+
+error:
+       free_trace_probe(tp);
+       return ret;
+}
+
+static void cleanup_all_probes(void)
+{
+       struct trace_probe *tp;
+
+       mutex_lock(&probe_lock);
+       /* TODO: Use batch unregistration */
+       while (!list_empty(&probe_list)) {
+               tp = list_entry(probe_list.next, struct trace_probe, list);
+               unregister_trace_probe(tp);
+               free_trace_probe(tp);
+       }
+       mutex_unlock(&probe_lock);
+}
+
+
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+       mutex_lock(&probe_lock);
+       return seq_list_start(&probe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       return seq_list_next(v, &probe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+       mutex_unlock(&probe_lock);
+}
+
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+       struct trace_probe *tp = v;
+       int i, ret;
+       char buf[MAX_ARGSTR_LEN + 1];
+
+       seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
+       seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
+
+       if (!tp->symbol)
+               seq_printf(m, " 0x%p", tp->rp.kp.addr);
+       else if (tp->rp.kp.offset)
+               seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
+       else
+               seq_printf(m, " %s", probe_symbol(tp));
+
+       for (i = 0; i < tp->nr_args; i++) {
+               ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
+               if (ret < 0) {
+                       pr_warning("Argument%d decoding error(%d).\n", i, ret);
+                       return ret;
+               }
+               seq_printf(m, " %s=%s", tp->args[i].name, buf);
+       }
+       seq_printf(m, "\n");
+       return 0;
+}
+
+static const struct seq_operations probes_seq_op = {
+       .start  = probes_seq_start,
+       .next   = probes_seq_next,
+       .stop   = probes_seq_stop,
+       .show   = probes_seq_show
+};
+
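+/*
+ * Opening the control file for writing with O_TRUNC (e.g. a plain
+ * "> kprobe_events" shell redirect) removes every existing probe before
+ * any new definitions are written.
+ */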
+static int probes_open(struct inode *inode, struct file *file)
+{
+       if ((file->f_mode & FMODE_WRITE) &&
+           (file->f_flags & O_TRUNC))
+               cleanup_all_probes();
+
+       return seq_open(file, &probes_seq_op);
+}
+
+static int command_trace_probe(const char *buf)
+{
+       char **argv;
+       int argc = 0, ret = 0;
+
+       argv = argv_split(GFP_KERNEL, buf, &argc);
+       if (!argv)
+               return -ENOMEM;
+
+       if (argc)
+               ret = create_trace_probe(argc, argv);
+
+       argv_free(argv);
+       return ret;
+}
+
+#define WRITE_BUFSIZE 128
+
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+                           size_t count, loff_t *ppos)
+{
+       char *kbuf, *tmp;
+       int ret;
+       size_t done;
+       size_t size;
+
+       kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+       if (!kbuf)
+               return -ENOMEM;
+
+       ret = done = 0;
+       while (done < count) {
+               size = count - done;
+               if (size >= WRITE_BUFSIZE)
+                       size = WRITE_BUFSIZE - 1;
+               if (copy_from_user(kbuf, buffer + done, size)) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+               kbuf[size] = '\0';
+               tmp = strchr(kbuf, '\n');
+               if (tmp) {
+                       *tmp = '\0';
+                       size = tmp - kbuf + 1;
+               } else if (done + size < count) {
+                       pr_warning("Line length is too long: "
+                                  "Should be less than %d.", WRITE_BUFSIZE);
+                       ret = -EINVAL;
+                       goto out;
+               }
+               done += size;
+               /* Remove comments */
+               tmp = strchr(kbuf, '#');
+               if (tmp)
+                       *tmp = '\0';
+
+               ret = command_trace_probe(kbuf);
+               if (ret)
+                       goto out;
+       }
+       ret = done;
+out:
+       kfree(kbuf);
+       return ret;
+}
+
+static const struct file_operations kprobe_events_ops = {
+       .owner          = THIS_MODULE,
+       .open           = probes_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+       .write          = probes_write,
+};
+
+/* Probes profiling interfaces */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+       struct trace_probe *tp = v;
+
+       seq_printf(m, "  %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
+                  tp->rp.kp.nmissed);
+
+       return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+       .start  = probes_seq_start,
+       .next   = probes_seq_next,
+       .stop   = probes_seq_stop,
+       .show   = probes_profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &profile_seq_op);
+}
+
+static const struct file_operations kprobe_profile_ops = {
+       .owner          = THIS_MODULE,
+       .open           = profile_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
+/* Kprobe handler */
+static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+       struct kprobe_trace_entry *entry;
+       struct ring_buffer_event *event;
+       struct ring_buffer *buffer;
+       int size, i, pc;
+       unsigned long irq_flags;
+       struct ftrace_event_call *call = &tp->call;
+
+       tp->nhit++;
+
+       local_save_flags(irq_flags);
+       pc = preempt_count();
+
+       size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+
+       event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
+                                                 irq_flags, pc);
+       if (!event)
+               return 0;
+
+       entry = ring_buffer_event_data(event);
+       entry->nargs = tp->nr_args;
+       entry->ip = (unsigned long)kp->addr;
+       for (i = 0; i < tp->nr_args; i++)
+               entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+
+       if (!filter_current_check_discard(buffer, call, entry, event))
+               trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+       return 0;
+}
+
+/* Kretprobe handler */
+static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+                                         struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+       struct kretprobe_trace_entry *entry;
+       struct ring_buffer_event *event;
+       struct ring_buffer *buffer;
+       int size, i, pc;
+       unsigned long irq_flags;
+       struct ftrace_event_call *call = &tp->call;
+
+       local_save_flags(irq_flags);
+       pc = preempt_count();
+
+       size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+
+       event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
+                                                 irq_flags, pc);
+       if (!event)
+               return 0;
+
+       entry = ring_buffer_event_data(event);
+       entry->nargs = tp->nr_args;
+       entry->func = (unsigned long)tp->rp.kp.addr;
+       entry->ret_ip = (unsigned long)ri->ret_addr;
+       for (i = 0; i < tp->nr_args; i++)
+               entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+
+       if (!filter_current_check_discard(buffer, call, entry, event))
+               trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+
+       return 0;
+}
+
+/* Event entry printers */
+enum print_line_t
+print_kprobe_event(struct trace_iterator *iter, int flags)
+{
+       struct kprobe_trace_entry *field;
+       struct trace_seq *s = &iter->seq;
+       struct trace_event *event;
+       struct trace_probe *tp;
+       int i;
+
+       field = (struct kprobe_trace_entry *)iter->ent;
+       event = ftrace_find_event(field->ent.type);
+       tp = container_of(event, struct trace_probe, event);
+
+       if (!trace_seq_printf(s, "%s: (", tp->call.name))
+               goto partial;
+
+       if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+               goto partial;
+
+       if (!trace_seq_puts(s, ")"))
+               goto partial;
+
+       for (i = 0; i < field->nargs; i++)
+               if (!trace_seq_printf(s, " %s=%lx",
+                                     tp->args[i].name, field->args[i]))
+                       goto partial;
+
+       if (!trace_seq_puts(s, "\n"))
+               goto partial;
+
+       return TRACE_TYPE_HANDLED;
+partial:
+       return TRACE_TYPE_PARTIAL_LINE;
+}
+
+enum print_line_t
+print_kretprobe_event(struct trace_iterator *iter, int flags)
+{
+       struct kretprobe_trace_entry *field;
+       struct trace_seq *s = &iter->seq;
+       struct trace_event *event;
+       struct trace_probe *tp;
+       int i;
+
+       field = (struct kretprobe_trace_entry *)iter->ent;
+       event = ftrace_find_event(field->ent.type);
+       tp = container_of(event, struct trace_probe, event);
+
+       if (!trace_seq_printf(s, "%s: (", tp->call.name))
+               goto partial;
+
+       if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+               goto partial;
+
+       if (!trace_seq_puts(s, " <- "))
+               goto partial;
+
+       if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+               goto partial;
+
+       if (!trace_seq_puts(s, ")"))
+               goto partial;
+
+       for (i = 0; i < field->nargs; i++)
+               if (!trace_seq_printf(s, " %s=%lx",
+                                     tp->args[i].name, field->args[i]))
+                       goto partial;
+
+       if (!trace_seq_puts(s, "\n"))
+               goto partial;
+
+       return TRACE_TYPE_HANDLED;
+partial:
+       return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int probe_event_enable(struct ftrace_event_call *call)
+{
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       tp->flags |= TP_FLAG_TRACE;
+       if (probe_is_return(tp))
+               return enable_kretprobe(&tp->rp);
+       else
+               return enable_kprobe(&tp->rp.kp);
+}
+
+static void probe_event_disable(struct ftrace_event_call *call)
+{
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       tp->flags &= ~TP_FLAG_TRACE;
+       if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
+               if (probe_is_return(tp))
+                       disable_kretprobe(&tp->rp);
+               else
+                       disable_kprobe(&tp->rp.kp);
+       }
+}
+
+static int probe_event_raw_init(struct ftrace_event_call *event_call)
+{
+       INIT_LIST_HEAD(&event_call->fields);
+
+       return 0;
+}
+
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed)                      \
+       do {                                                            \
+               ret = trace_define_field(event_call, #type, name,       \
+                                        offsetof(typeof(field), item), \
+                                        sizeof(field.item), is_signed, \
+                                        FILTER_OTHER);                 \
+               if (ret)                                                \
+                       return ret;                                     \
+       } while (0)
+
+static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+       int ret, i;
+       struct kprobe_trace_entry field;
+       struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+       ret = trace_define_common_fields(event_call);
+       if (ret)
+               return ret;
+
+       DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+       DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
+       /* Set argument names as fields */
+       for (i = 0; i < tp->nr_args; i++)
+               DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+       return 0;
+}
+
+static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+       int ret, i;
+       struct kretprobe_trace_entry field;
+       struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+       ret = trace_define_common_fields(event_call);
+       if (ret)
+               return ret;
+
+       DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+       DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+       DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
+       /* Set argument names as fields */
+       for (i = 0; i < tp->nr_args; i++)
+               DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+       return 0;
+}
+
+static int __probe_event_show_format(struct trace_seq *s,
+                                    struct trace_probe *tp, const char *fmt,
+                                    const char *arg)
+{
+       int i;
+
+       /* Show format */
+       if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
+               return 0;
+
+       for (i = 0; i < tp->nr_args; i++)
+               if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
+                       return 0;
+
+       if (!trace_seq_printf(s, "\", %s", arg))
+               return 0;
+
+       for (i = 0; i < tp->nr_args; i++)
+               if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
+                       return 0;
+
+       return trace_seq_puts(s, "\n");
+}
+
+#undef SHOW_FIELD
+#define SHOW_FIELD(type, item, name)                                   \
+       do {                                                            \
+               ret = trace_seq_printf(s, "\tfield: " #type " %s;\t"    \
+                               "offset:%u;\tsize:%u;\n", name,         \
+                               (unsigned int)offsetof(typeof(field), item),\
+                               (unsigned int)sizeof(type));            \
+               if (!ret)                                               \
+                       return 0;                                       \
+       } while (0)
+
+static int kprobe_event_show_format(struct ftrace_event_call *call,
+                                   struct trace_seq *s)
+{
+       struct kprobe_trace_entry field __attribute__((unused));
+       int ret, i;
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
+       SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
+
+       /* Show fields */
+       for (i = 0; i < tp->nr_args; i++)
+               SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
+       trace_seq_puts(s, "\n");
+
+       return __probe_event_show_format(s, tp, "(%lx)",
+                                        "REC->" FIELD_STRING_IP);
+}
+
+static int kretprobe_event_show_format(struct ftrace_event_call *call,
+                                      struct trace_seq *s)
+{
+       struct kretprobe_trace_entry field __attribute__((unused));
+       int ret, i;
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
+       SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
+       SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
+
+       /* Show fields */
+       for (i = 0; i < tp->nr_args; i++)
+               SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
+       trace_seq_puts(s, "\n");
+
+       return __probe_event_show_format(s, tp, "(%lx <- %lx)",
+                                        "REC->" FIELD_STRING_FUNC
+                                        ", REC->" FIELD_STRING_RETIP);
+}
+
+#ifdef CONFIG_EVENT_PROFILE
+
+/* Kprobe profile handler */
+static __kprobes int kprobe_profile_func(struct kprobe *kp,
+                                        struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+       struct ftrace_event_call *call = &tp->call;
+       struct kprobe_trace_entry *entry;
+       struct trace_entry *ent;
+       int size, __size, i, pc, __cpu;
+       unsigned long irq_flags;
+       char *trace_buf;
+       char *raw_data;
+       int rctx;
+
+       pc = preempt_count();
+       __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+       size = ALIGN(__size + sizeof(u32), sizeof(u64));
+       size -= sizeof(u32);
+       if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+                    "profile buffer not large enough"))
+               return 0;
+
+       /*
+        * Protect the non-NMI buffer.
+        * This also protects the RCU read side.
+        */
+       local_irq_save(irq_flags);
+
+       rctx = perf_swevent_get_recursion_context();
+       if (rctx < 0)
+               goto end_recursion;
+
+       __cpu = smp_processor_id();
+
+       if (in_nmi())
+               trace_buf = rcu_dereference(perf_trace_buf_nmi);
+       else
+               trace_buf = rcu_dereference(perf_trace_buf);
+
+       if (!trace_buf)
+               goto end;
+
+       raw_data = per_cpu_ptr(trace_buf, __cpu);
+
+       /* Zero dead bytes from alignment to avoid buffer leak to userspace */
+       *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+       entry = (struct kprobe_trace_entry *)raw_data;
+       ent = &entry->ent;
+
+       tracing_generic_entry_update(ent, irq_flags, pc);
+       ent->type = call->id;
+       entry->nargs = tp->nr_args;
+       entry->ip = (unsigned long)kp->addr;
+       for (i = 0; i < tp->nr_args; i++)
+               entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+       perf_tp_event(call->id, entry->ip, 1, entry, size);
+
+end:
+       perf_swevent_put_recursion_context(rctx);
+end_recursion:
+       local_irq_restore(irq_flags);
+
+       return 0;
+}
+
+/* Kretprobe profile handler */
+static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+                                           struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+       struct ftrace_event_call *call = &tp->call;
+       struct kretprobe_trace_entry *entry;
+       struct trace_entry *ent;
+       int size, __size, i, pc, __cpu;
+       unsigned long irq_flags;
+       char *trace_buf;
+       char *raw_data;
+       int rctx;
+
+       pc = preempt_count();
+       __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+       size = ALIGN(__size + sizeof(u32), sizeof(u64));
+       size -= sizeof(u32);
+       if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+                    "profile buffer not large enough"))
+               return 0;
+
+       /*
+        * Protect the non-NMI buffer.
+        * This also protects the RCU read side.
+        */
+       local_irq_save(irq_flags);
+
+       rctx = perf_swevent_get_recursion_context();
+       if (rctx < 0)
+               goto end_recursion;
+
+       __cpu = smp_processor_id();
+
+       if (in_nmi())
+               trace_buf = rcu_dereference(perf_trace_buf_nmi);
+       else
+               trace_buf = rcu_dereference(perf_trace_buf);
+
+       if (!trace_buf)
+               goto end;
+
+       raw_data = per_cpu_ptr(trace_buf, __cpu);
+
+       /* Zero dead bytes from alignment to avoid buffer leak to userspace */
+       *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+       entry = (struct kretprobe_trace_entry *)raw_data;
+       ent = &entry->ent;
+
+       tracing_generic_entry_update(ent, irq_flags, pc);
+       ent->type = call->id;
+       entry->nargs = tp->nr_args;
+       entry->func = (unsigned long)tp->rp.kp.addr;
+       entry->ret_ip = (unsigned long)ri->ret_addr;
+       for (i = 0; i < tp->nr_args; i++)
+               entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+       perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
+
+end:
+       perf_swevent_put_recursion_context(rctx);
+end_recursion:
+       local_irq_restore(irq_flags);
+
+       return 0;
+}
+
+static int probe_profile_enable(struct ftrace_event_call *call)
+{
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       tp->flags |= TP_FLAG_PROFILE;
+
+       if (probe_is_return(tp))
+               return enable_kretprobe(&tp->rp);
+       else
+               return enable_kprobe(&tp->rp.kp);
+}
+
+static void probe_profile_disable(struct ftrace_event_call *call)
+{
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       tp->flags &= ~TP_FLAG_PROFILE;
+
+       if (!(tp->flags & TP_FLAG_TRACE)) {
+               if (probe_is_return(tp))
+                       disable_kretprobe(&tp->rp);
+               else
+                       disable_kprobe(&tp->rp.kp);
+       }
+}
+#endif /* CONFIG_EVENT_PROFILE */
+
+
+static __kprobes
+int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+
+       if (tp->flags & TP_FLAG_TRACE)
+               kprobe_trace_func(kp, regs);
+#ifdef CONFIG_EVENT_PROFILE
+       if (tp->flags & TP_FLAG_PROFILE)
+               kprobe_profile_func(kp, regs);
+#endif /* CONFIG_EVENT_PROFILE */
+       return 0;       /* We don't tweak the kernel, so just return 0 */
+}
+
+static __kprobes
+int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+
+       if (tp->flags & TP_FLAG_TRACE)
+               kretprobe_trace_func(ri, regs);
+#ifdef CONFIG_EVENT_PROFILE
+       if (tp->flags & TP_FLAG_PROFILE)
+               kretprobe_profile_func(ri, regs);
+#endif /* CONFIG_EVENT_PROFILE */
+       return 0;       /* We don't tweak the kernel, so just return 0 */
+}
+
+static int register_probe_event(struct trace_probe *tp)
+{
+       struct ftrace_event_call *call = &tp->call;
+       int ret;
+
+       /* Initialize ftrace_event_call */
+       if (probe_is_return(tp)) {
+               tp->event.trace = print_kretprobe_event;
+               call->raw_init = probe_event_raw_init;
+               call->show_format = kretprobe_event_show_format;
+               call->define_fields = kretprobe_event_define_fields;
+       } else {
+               tp->event.trace = print_kprobe_event;
+               call->raw_init = probe_event_raw_init;
+               call->show_format = kprobe_event_show_format;
+               call->define_fields = kprobe_event_define_fields;
+       }
+       call->event = &tp->event;
+       call->id = register_ftrace_event(&tp->event);
+       if (!call->id)
+               return -ENODEV;
+       call->enabled = 0;
+       call->regfunc = probe_event_enable;
+       call->unregfunc = probe_event_disable;
+
+#ifdef CONFIG_EVENT_PROFILE
+       atomic_set(&call->profile_count, -1);
+       call->profile_enable = probe_profile_enable;
+       call->profile_disable = probe_profile_disable;
+#endif
+       call->data = tp;
+       ret = trace_add_event_call(call);
+       if (ret) {
+               pr_info("Failed to register kprobe event: %s\n", call->name);
+               unregister_ftrace_event(&tp->event);
+       }
+       return ret;
+}
+
+static void unregister_probe_event(struct trace_probe *tp)
+{
+       /* tp->event is unregistered in trace_remove_event_call() */
+       trace_remove_event_call(&tp->call);
+}
+
+/* Make a debugfs interface for controlling probe points */
+static __init int init_kprobe_trace(void)
+{
+       struct dentry *d_tracer;
+       struct dentry *entry;
+
+       d_tracer = tracing_init_dentry();
+       if (!d_tracer)
+               return 0;
+
+       entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
+                                   NULL, &kprobe_events_ops);
+
+       /* Event list interface */
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'kprobe_events' entry\n");
+
+       /* Profile interface */
+       entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
+                                   NULL, &kprobe_profile_ops);
+
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'kprobe_profile' entry\n");
+       return 0;
+}
+fs_initcall(init_kprobe_trace);
+
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static int kprobe_trace_selftest_target(int a1, int a2, int a3,
+                                       int a4, int a5, int a6)
+{
+       return a1 + a2 + a3 + a4 + a5 + a6;
+}
+
+static __init int kprobe_trace_self_tests_init(void)
+{
+       int ret;
+       int (*target)(int, int, int, int, int, int);
+
+       target = kprobe_trace_selftest_target;
+
+       pr_info("Testing kprobe tracing: ");
+
+       ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
+                                 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
+       if (WARN_ON_ONCE(ret))
+               pr_warning("error enabling function entry\n");
+
+       ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
+                                 "$retval");
+       if (WARN_ON_ONCE(ret))
+               pr_warning("error enabling function return\n");
+
+       ret = target(1, 2, 3, 4, 5, 6);
+
+       cleanup_all_probes();
+
+       pr_cont("OK\n");
+       return 0;
+}
+
+late_initcall(kprobe_trace_self_tests_init);
+
+#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644 (file)
index 0000000..ddfa0fd
--- /dev/null
@@ -0,0 +1,550 @@
+/*
+ * trace_ksym.c - Kernel Symbol Tracer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+#include "trace_stat.h"
+#include "trace.h"
+
+#include <linux/hw_breakpoint.h>
+#include <asm/hw_breakpoint.h>
+
+/*
+ * For now, restrict the number of symbols traced simultaneously to the
+ * number of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HBP_NUM
+
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+
+struct trace_ksym {
+       struct perf_event       **ksym_hbp;
+       struct perf_event_attr  attr;
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+       unsigned long           counter;
+#endif
+       struct hlist_node       ksym_hlist;
+};
+
+static struct trace_array *ksym_trace_array;
+
+static unsigned int ksym_filter_entry_count;
+static unsigned int ksym_tracing_enabled;
+
+static HLIST_HEAD(ksym_filter_head);
+
+static DEFINE_MUTEX(ksym_tracer_mutex);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+
+#define MAX_UL_INT 0xffffffff
+
+void ksym_collect_stats(unsigned long hbp_hit_addr)
+{
+       struct hlist_node *node;
+       struct trace_ksym *entry;
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
+               if ((entry->attr.bp_addr == hbp_hit_addr) &&
+                   (entry->counter <= MAX_UL_INT)) {
+                       entry->counter++;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+}
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+void ksym_hbp_handler(struct perf_event *hbp, void *data)
+{
+       struct ring_buffer_event *event;
+       struct ksym_trace_entry *entry;
+       struct pt_regs *regs = data;
+       struct ring_buffer *buffer;
+       int pc;
+
+       if (!ksym_tracing_enabled)
+               return;
+
+       buffer = ksym_trace_array->buffer;
+
+       pc = preempt_count();
+
+       event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
+                                                       sizeof(*entry), 0, pc);
+       if (!event)
+               return;
+
+       entry           = ring_buffer_event_data(event);
+       entry->ip       = instruction_pointer(regs);
+       entry->type     = hw_breakpoint_type(hbp);
+       entry->addr     = hw_breakpoint_addr(hbp);
+       strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+       ksym_collect_stats(hw_breakpoint_addr(hbp));
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+
+       trace_buffer_unlock_commit(buffer, event, 0, pc);
+}
+
+/* Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Breakpoints (Not available yet)
+ *
+ */
+static int ksym_trace_get_access_type(char *str)
+{
+       int access = 0;
+
+       if (str[0] == 'r')
+               access |= HW_BREAKPOINT_R;
+
+       if (str[1] == 'w')
+               access |= HW_BREAKPOINT_W;
+
+       if (str[2] == 'x')
+               access |= HW_BREAKPOINT_X;
+
+       switch (access) {
+       case HW_BREAKPOINT_R:
+       case HW_BREAKPOINT_W:
+       case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+               return access;
+       default:
+               return -EINVAL;
+       }
+}
+
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules:
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter,
+ *    i.e. multiple ':' symbols are disallowed. Malformed requests look like
+ *    <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ */
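+/*
+ * As an illustration of the accepted format (the symbol name below is only
+ * an example, and debugfs is assumed to be mounted at /sys/kernel/debug),
+ * well-formed requests look like:
+ *
+ *   echo "pid_max:rw-" > /sys/kernel/debug/tracing/ksym_trace_filter
+ *   echo "pid_max:-w-" > /sys/kernel/debug/tracing/ksym_trace_filter
+ *   echo 0 > /sys/kernel/debug/tracing/ksym_trace_filter
+ *
+ * The first line arms a read-write breakpoint on pid_max, the second
+ * switches it to write-only, and the last one clears all breakpoints.
+ */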
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+                                                       unsigned long *addr)
+{
+       int ret;
+
+       *ksymname = strsep(&input_string, ":");
+       *addr = kallsyms_lookup_name(*ksymname);
+
+       /* Check for malformed request: (2), (1) and (5) */
+       if ((!input_string) ||
+           (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
+           (*addr == 0))
+               return -EINVAL;
+
+       ret = ksym_trace_get_access_type(input_string);
+
+       return ret;
+}
+
+int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
+{
+       struct trace_ksym *entry;
+       int ret = -ENOMEM;
+
+       if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+               printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
+               " new requests for tracing can be accepted now.\n",
+                       KSYM_TRACER_MAX);
+               return -ENOSPC;
+       }
+
+       entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
+       if (!entry)
+               return -ENOMEM;
+
+       hw_breakpoint_init(&entry->attr);
+
+       entry->attr.bp_type = op;
+       entry->attr.bp_addr = addr;
+       entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
+
+       ret = -EAGAIN;
+       entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
+                                       ksym_hbp_handler);
+
+       if (IS_ERR(entry->ksym_hbp)) {
+               ret = PTR_ERR(entry->ksym_hbp);
+               printk(KERN_INFO "ksym_tracer request failed. Try again"
+                                       " later!!\n");
+               goto err;
+       }
+
+       hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
+       ksym_filter_entry_count++;
+
+       return 0;
+
+err:
+       kfree(entry);
+
+       return ret;
+}
+
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+                                               size_t count, loff_t *ppos)
+{
+       struct trace_ksym *entry;
+       struct hlist_node *node;
+       struct trace_seq *s;
+       ssize_t cnt = 0;
+       int ret;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (!s)
+               return -ENOMEM;
+       trace_seq_init(s);
+
+       mutex_lock(&ksym_tracer_mutex);
+
+       hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+               ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
+               if (entry->attr.bp_type == HW_BREAKPOINT_R)
+                       ret = trace_seq_puts(s, "r--\n");
+               else if (entry->attr.bp_type == HW_BREAKPOINT_W)
+                       ret = trace_seq_puts(s, "-w-\n");
+               else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
+                       ret = trace_seq_puts(s, "rw-\n");
+               WARN_ON_ONCE(!ret);
+       }
+
+       cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+       mutex_unlock(&ksym_tracer_mutex);
+
+       kfree(s);
+
+       return cnt;
+}
+
+static void __ksym_trace_reset(void)
+{
+       struct trace_ksym *entry;
+       struct hlist_node *node, *node1;
+
+       mutex_lock(&ksym_tracer_mutex);
+       hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+                                                               ksym_hlist) {
+               unregister_wide_hw_breakpoint(entry->ksym_hbp);
+               ksym_filter_entry_count--;
+               hlist_del_rcu(&(entry->ksym_hlist));
+               synchronize_rcu();
+               kfree(entry);
+       }
+       mutex_unlock(&ksym_tracer_mutex);
+}
+
+static ssize_t ksym_trace_filter_write(struct file *file,
+                                       const char __user *buffer,
+                                               size_t count, loff_t *ppos)
+{
+       struct trace_ksym *entry;
+       struct hlist_node *node;
+       char *input_string, *ksymname = NULL;
+       unsigned long ksym_addr = 0;
+       int ret, op, changed = 0;
+
+       input_string = kzalloc(count + 1, GFP_KERNEL);
+       if (!input_string)
+               return -ENOMEM;
+
+       if (copy_from_user(input_string, buffer, count)) {
+               kfree(input_string);
+               return -EFAULT;
+       }
+       input_string[count] = '\0';
+
+       strstrip(input_string);
+
+       /*
+        * Clear all breakpoints if:
+        * 1: echo > ksym_trace_filter
+        * 2: echo 0 > ksym_trace_filter
+        * 3: echo "*:---" > ksym_trace_filter
+        */
+       if (!input_string[0] || !strcmp(input_string, "0") ||
+           !strcmp(input_string, "*:---")) {
+               __ksym_trace_reset();
+               kfree(input_string);
+               return count;
+       }
+
+       ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+       if (ret < 0) {
+               kfree(input_string);
+               return ret;
+       }
+
+       mutex_lock(&ksym_tracer_mutex);
+
+       ret = -EINVAL;
+       hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+               if (entry->attr.bp_addr == ksym_addr) {
+                       /* Check for malformed request: (6) */
+                       if (entry->attr.bp_type != op)
+                               changed = 1;
+                       else
+                               goto out;
+                       break;
+               }
+       }
+       if (changed) {
+               unregister_wide_hw_breakpoint(entry->ksym_hbp);
+               entry->attr.bp_type = op;
+               ret = 0;
+               if (op > 0) {
+                       entry->ksym_hbp =
+                               register_wide_hw_breakpoint(&entry->attr,
+                                       ksym_hbp_handler);
+                       if (IS_ERR(entry->ksym_hbp))
+                               ret = PTR_ERR(entry->ksym_hbp);
+                       else
+                               goto out;
+               }
+               /* Error or "symbol:---" case: drop it */
+               ksym_filter_entry_count--;
+               hlist_del_rcu(&(entry->ksym_hlist));
+               synchronize_rcu();
+               kfree(entry);
+               goto out;
+       } else {
+               /* Check for malformed request: (4) */
+               if (op == 0)
+                       goto out;
+               ret = process_new_ksym_entry(ksymname, op, ksym_addr);
+       }
+out:
+       mutex_unlock(&ksym_tracer_mutex);
+
+       kfree(input_string);
+
+       if (!ret)
+               ret = count;
+       return ret;
+}
+
+static const struct file_operations ksym_tracing_fops = {
+       .open           = tracing_open_generic,
+       .read           = ksym_trace_filter_read,
+       .write          = ksym_trace_filter_write,
+};
+
+static void ksym_trace_reset(struct trace_array *tr)
+{
+       ksym_tracing_enabled = 0;
+       __ksym_trace_reset();
+}
+
+static int ksym_trace_init(struct trace_array *tr)
+{
+       int cpu, ret = 0;
+
+       for_each_online_cpu(cpu)
+               tracing_reset(tr, cpu);
+       ksym_tracing_enabled = 1;
+       ksym_trace_array = tr;
+
+       return ret;
+}
+
+static void ksym_trace_print_header(struct seq_file *m)
+{
+       seq_puts(m,
+                "#       TASK-PID   CPU#      Symbol                    "
+                "Type    Function\n");
+       seq_puts(m,
+                "#          |        |          |                       "
+                " |         |\n");
+}
+
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+       struct trace_entry *entry = iter->ent;
+       struct trace_seq *s = &iter->seq;
+       struct ksym_trace_entry *field;
+       char str[KSYM_SYMBOL_LEN];
+       int ret;
+
+       if (entry->type != TRACE_KSYM)
+               return TRACE_TYPE_UNHANDLED;
+
+       trace_assign_type(field, entry);
+
+       ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
+                               entry->pid, iter->cpu, (char *)field->addr);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       switch (field->type) {
+       case HW_BREAKPOINT_R:
+               ret = trace_seq_printf(s, " R  ");
+               break;
+       case HW_BREAKPOINT_W:
+               ret = trace_seq_printf(s, " W  ");
+               break;
+       case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
+               ret = trace_seq_printf(s, " RW ");
+               break;
+       default:
+               return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       sprint_symbol(str, field->ip);
+       ret = trace_seq_printf(s, "%s\n", str);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+struct tracer ksym_tracer __read_mostly =
+{
+       .name           = "ksym_tracer",
+       .init           = ksym_trace_init,
+       .reset          = ksym_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest       = trace_selftest_startup_ksym,
+#endif
+       .print_header   = ksym_trace_print_header,
+       .print_line     = ksym_trace_output
+};
+
+__init static int init_ksym_trace(void)
+{
+       struct dentry *d_tracer;
+       struct dentry *entry;
+
+       d_tracer = tracing_init_dentry();
+       ksym_filter_entry_count = 0;
+
+       entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
+                                   NULL, &ksym_tracing_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'ksym_trace_filter' file\n");
+
+       return register_tracer(&ksym_tracer);
+}
+device_initcall(init_ksym_trace);
+
+
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+static int ksym_tracer_stat_headers(struct seq_file *m)
+{
+       seq_puts(m, "  Access Type ");
+       seq_puts(m, "  Symbol                                       Counter\n");
+       seq_puts(m, "  ----------- ");
+       seq_puts(m, "  ------                                       -------\n");
+       return 0;
+}
+
+static int ksym_tracer_stat_show(struct seq_file *m, void *v)
+{
+       struct hlist_node *stat = v;
+       struct trace_ksym *entry;
+       int access_type = 0;
+       char fn_name[KSYM_NAME_LEN];
+
+       entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
+
+       access_type = entry->attr.bp_type;
+
+       switch (access_type) {
+       case HW_BREAKPOINT_R:
+               seq_puts(m, "  R           ");
+               break;
+       case HW_BREAKPOINT_W:
+               seq_puts(m, "  W           ");
+               break;
+       case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
+               seq_puts(m, "  RW          ");
+               break;
+       default:
+               seq_puts(m, "  NA          ");
+       }
+
+       if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
+               seq_printf(m, "  %-36s", fn_name);
+       else
+               seq_printf(m, "  %-36s", "<NA>");
+       seq_printf(m, " %15lu\n", entry->counter);
+
+       return 0;
+}
+
+static void *ksym_tracer_stat_start(struct tracer_stat *trace)
+{
+       return ksym_filter_head.first;
+}
+
+static void *
+ksym_tracer_stat_next(void *v, int idx)
+{
+       struct hlist_node *stat = v;
+
+       return stat->next;
+}
+
+static struct tracer_stat ksym_tracer_stats = {
+       .name = "ksym_tracer",
+       .stat_start = ksym_tracer_stat_start,
+       .stat_next = ksym_tracer_stat_next,
+       .stat_headers = ksym_tracer_stat_headers,
+       .stat_show = ksym_tracer_stat_show
+};
+
+__init static int ksym_tracer_stat_init(void)
+{
+       int ret;
+
+       ret = register_stat_tracer(&ksym_tracer_stats);
+       if (ret) {
+               printk(KERN_WARNING "Warning: could not register "
+                                   "ksym tracer stats\n");
+               return 1;
+       }
+
+       return 0;
+}
+fs_initcall(ksym_tracer_stat_init);
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
index d2cdbabb4eadd4b0780950b46e9e4deda92198be..dc98309e839a7ca63ff20b05786e87d51e350c92 100644 (file)
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
        case TRACE_GRAPH_ENT:
        case TRACE_GRAPH_RET:
        case TRACE_HW_BRANCHES:
+       case TRACE_KSYM:
                return 1;
        }
        return 0;
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
        return ret;
 }
 #endif /* CONFIG_HW_BRANCH_TRACER */
+
+#ifdef CONFIG_KSYM_TRACER
+static int ksym_selftest_dummy;
+
+int
+trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+       unsigned long count;
+       int ret;
+
+       /* start the tracing */
+       ret = tracer_init(trace, tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               return ret;
+       }
+
+       ksym_selftest_dummy = 0;
+       /* Register the read-write tracing request */
+
+       ret = process_new_ksym_entry("ksym_selftest_dummy",
+                                    HW_BREAKPOINT_R | HW_BREAKPOINT_W,
+                                       (unsigned long)(&ksym_selftest_dummy));
+
+       if (ret < 0) {
+               printk(KERN_CONT "ksym_trace read-write startup test failed\n");
+               goto ret_path;
+       }
+       /* Perform a read and a write operation over the dummy variable to
+        * trigger the tracer
+        */
+       if (ksym_selftest_dummy == 0)
+               ksym_selftest_dummy++;
+
+       /* stop the tracing. */
+       tracing_stop();
+       /* check the trace buffer */
+       ret = trace_test_buffer(tr, &count);
+       trace->reset(tr);
+       tracing_start();
+
+       /* read & write operations - one each is performed on the dummy variable
+        * triggering two entries in the trace buffer
+        */
+       if (!ret && count != 2) {
+               printk(KERN_CONT "Ksym tracer startup test failed");
+               ret = -1;
+       }
+
+ret_path:
+       return ret;
+}
+#endif /* CONFIG_KSYM_TRACER */
+
index ddee9c593732557b5451e741e0d58d9046f76ae6..57501d90096abbba5ee3a87de54476a794ba176e 100644 (file)
@@ -51,32 +51,6 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
        return syscalls_metadata[nr];
 }
 
-int syscall_name_to_nr(char *name)
-{
-       int i;
-
-       if (!syscalls_metadata)
-               return -1;
-
-       for (i = 0; i < NR_syscalls; i++) {
-               if (syscalls_metadata[i]) {
-                       if (!strcmp(syscalls_metadata[i]->name, name))
-                               return i;
-               }
-       }
-       return -1;
-}
-
-void set_syscall_enter_id(int num, int id)
-{
-       syscalls_metadata[num]->enter_id = id;
-}
-
-void set_syscall_exit_id(int num, int id)
-{
-       syscalls_metadata[num]->exit_id = id;
-}
-
 enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags)
 {
@@ -93,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
        if (!entry)
                goto end;
 
-       if (entry->enter_id != ent->type) {
+       if (entry->enter_event->id != ent->type) {
                WARN_ON_ONCE(1);
                goto end;
        }
@@ -148,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
                return TRACE_TYPE_HANDLED;
        }
 
-       if (entry->exit_id != ent->type) {
+       if (entry->exit_event->id != ent->type) {
                WARN_ON_ONCE(1);
                return TRACE_TYPE_UNHANDLED;
        }
@@ -166,24 +140,19 @@ extern char *__bad_type_size(void);
 #define SYSCALL_FIELD(type, name)                                      \
        sizeof(type) != sizeof(trace.name) ?                            \
                __bad_type_size() :                                     \
-               #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
+               #type, #name, offsetof(typeof(trace), name),            \
+               sizeof(trace.name), is_signed_type(type)
 
 int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
 {
        int i;
-       int nr;
        int ret;
-       struct syscall_metadata *entry;
+       struct syscall_metadata *entry = call->data;
        struct syscall_trace_enter trace;
        int offset = offsetof(struct syscall_trace_enter, args);
 
-       nr = syscall_name_to_nr(call->data);
-       entry = syscall_nr_to_meta(nr);
-
-       if (!entry)
-               return 0;
-
-       ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
+       ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
+                              "\tsigned:%u;\n",
                               SYSCALL_FIELD(int, nr));
        if (!ret)
                return 0;
@@ -193,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
                                        entry->args[i]);
                if (!ret)
                        return 0;
-               ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
-                                      sizeof(unsigned long));
+               ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
+                                      "\tsigned:%u;\n", offset,
+                                      sizeof(unsigned long),
+                                      is_signed_type(unsigned long));
                if (!ret)
                        return 0;
                offset += sizeof(unsigned long);
@@ -226,8 +197,10 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
        struct syscall_trace_exit trace;
 
        ret = trace_seq_printf(s,
-                              "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                              "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
+                              "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
+                              "\tsigned:%u;\n"
+                              "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
+                              "\tsigned:%u;\n",
                               SYSCALL_FIELD(int, nr),
                               SYSCALL_FIELD(long, ret));
        if (!ret)
@@ -239,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
 int syscall_enter_define_fields(struct ftrace_event_call *call)
 {
        struct syscall_trace_enter trace;
-       struct syscall_metadata *meta;
+       struct syscall_metadata *meta = call->data;
        int ret;
-       int nr;
        int i;
        int offset = offsetof(typeof(trace), args);
 
-       nr = syscall_name_to_nr(call->data);
-       meta = syscall_nr_to_meta(nr);
-
-       if (!meta)
-               return 0;
-
        ret = trace_define_common_fields(call);
        if (ret)
                return ret;
 
+       ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+       if (ret)
+               return ret;
+
        for (i = 0; i < meta->nb_args; i++) {
                ret = trace_define_field(call, meta->types[i],
                                         meta->args[i], offset,
@@ -275,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
        if (ret)
                return ret;
 
-       ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0,
+       ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+       if (ret)
+               return ret;
+
+       ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
                                 FILTER_OTHER);
 
        return ret;
@@ -302,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
 
        size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
-       event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
-                                                 size, 0, 0);
+       event = trace_current_buffer_lock_reserve(&buffer,
+                       sys_data->enter_event->id, size, 0, 0);
        if (!event)
                return;
 
@@ -334,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
        if (!sys_data)
                return;
 
-       event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
-                               sizeof(*entry), 0, 0);
+       event = trace_current_buffer_lock_reserve(&buffer,
+                       sys_data->exit_event->id, sizeof(*entry), 0, 0);
        if (!event)
                return;
 
@@ -348,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 }
 
-int reg_event_syscall_enter(void *ptr)
+int reg_event_syscall_enter(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
-       char *name;
 
-       name = (char *)ptr;
-       num = syscall_name_to_nr(name);
+       num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (num < 0 || num >= NR_syscalls)
                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
@@ -372,13 +344,11 @@ int reg_event_syscall_enter(void *ptr)
        return ret;
 }
 
-void unreg_event_syscall_enter(void *ptr)
+void unreg_event_syscall_enter(struct ftrace_event_call *call)
 {
        int num;
-       char *name;
 
-       name = (char *)ptr;
-       num = syscall_name_to_nr(name);
+       num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (num < 0 || num >= NR_syscalls)
                return;
        mutex_lock(&syscall_trace_lock);
@@ -389,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr)
        mutex_unlock(&syscall_trace_lock);
 }
 
-int reg_event_syscall_exit(void *ptr)
+int reg_event_syscall_exit(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
-       char *name;
 
-       name = (char *)ptr;
-       num = syscall_name_to_nr(name);
+       num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (num < 0 || num >= NR_syscalls)
                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
@@ -413,13 +381,11 @@ int reg_event_syscall_exit(void *ptr)
        return ret;
 }
 
-void unreg_event_syscall_exit(void *ptr)
+void unreg_event_syscall_exit(struct ftrace_event_call *call)
 {
        int num;
-       char *name;
 
-       name = (char *)ptr;
-       num = syscall_name_to_nr(name);
+       num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (num < 0 || num >= NR_syscalls)
                return;
        mutex_lock(&syscall_trace_lock);
@@ -430,13 +396,17 @@ void unreg_event_syscall_exit(void *ptr)
        mutex_unlock(&syscall_trace_lock);
 }
 
-struct trace_event event_syscall_enter = {
-       .trace                  = print_syscall_enter,
-};
+int init_syscall_trace(struct ftrace_event_call *call)
+{
+       int id;
 
-struct trace_event event_syscall_exit = {
-       .trace                  = print_syscall_exit,
-};
+       id = register_ftrace_event(call->event);
+       if (!id)
+               return -ENODEV;
+       call->id = id;
+       INIT_LIST_HEAD(&call->fields);
+       return 0;
+}
 
 int __init init_ftrace_syscalls(void)
 {
@@ -454,6 +424,10 @@ int __init init_ftrace_syscalls(void)
        for (i = 0; i < NR_syscalls; i++) {
                addr = arch_syscall_addr(i);
                meta = find_syscall_meta(addr);
+               if (!meta)
+                       continue;
+
+               meta->syscall_nr = i;
                syscalls_metadata[i] = meta;
        }
 
@@ -473,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
        struct syscall_metadata *sys_data;
        struct syscall_trace_enter *rec;
        unsigned long flags;
+       char *trace_buf;
        char *raw_data;
        int syscall_nr;
+       int rctx;
        int size;
        int cpu;
 
@@ -498,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
        /* Protect the per cpu buffer, begin the rcu read side */
        local_irq_save(flags);
 
+       rctx = perf_swevent_get_recursion_context();
+       if (rctx < 0)
+               goto end_recursion;
+
        cpu = smp_processor_id();
 
-       if (in_nmi())
-               raw_data = rcu_dereference(trace_profile_buf_nmi);
-       else
-               raw_data = rcu_dereference(trace_profile_buf);
+       trace_buf = rcu_dereference(perf_trace_buf);
 
-       if (!raw_data)
+       if (!trace_buf)
                goto end;
 
-       raw_data = per_cpu_ptr(raw_data, cpu);
+       raw_data = per_cpu_ptr(trace_buf, cpu);
 
        /* zero the dead bytes from align to not leak stack to user */
        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
 
        rec = (struct syscall_trace_enter *) raw_data;
        tracing_generic_entry_update(&rec->ent, 0, 0);
-       rec->ent.type = sys_data->enter_id;
+       rec->ent.type = sys_data->enter_event->id;
        rec->nr = syscall_nr;
        syscall_get_arguments(current, regs, 0, sys_data->nb_args,
                               (unsigned long *)&rec->args);
-       perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
+       perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
 
 end:
+       perf_swevent_put_recursion_context(rctx);
+end_recursion:
        local_irq_restore(flags);
 }
 
-int reg_prof_syscall_enter(char *name)
+int prof_sysenter_enable(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
 
-       num = syscall_name_to_nr(name);
-       if (num < 0 || num >= NR_syscalls)
-               return -ENOSYS;
+       num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
        mutex_lock(&syscall_trace_lock);
        if (!sys_prof_refcount_enter)
@@ -548,13 +525,11 @@ int reg_prof_syscall_enter(char *name)
        return ret;
 }
 
-void unreg_prof_syscall_enter(char *name)
+void prof_sysenter_disable(struct ftrace_event_call *call)
 {
        int num;
 
-       num = syscall_name_to_nr(name);
-       if (num < 0 || num >= NR_syscalls)
-               return;
+       num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
        mutex_lock(&syscall_trace_lock);
        sys_prof_refcount_enter--;
@@ -570,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
        struct syscall_trace_exit *rec;
        unsigned long flags;
        int syscall_nr;
+       char *trace_buf;
        char *raw_data;
+       int rctx;
        int size;
        int cpu;
 
@@ -596,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 
        /* Protect the per cpu buffer, begin the rcu read side */
        local_irq_save(flags);
+
+       rctx = perf_swevent_get_recursion_context();
+       if (rctx < 0)
+               goto end_recursion;
+
        cpu = smp_processor_id();
 
-       if (in_nmi())
-               raw_data = rcu_dereference(trace_profile_buf_nmi);
-       else
-               raw_data = rcu_dereference(trace_profile_buf);
+       trace_buf = rcu_dereference(perf_trace_buf);
 
-       if (!raw_data)
+       if (!trace_buf)
                goto end;
 
-       raw_data = per_cpu_ptr(raw_data, cpu);
+       raw_data = per_cpu_ptr(trace_buf, cpu);
 
        /* zero the dead bytes from align to not leak stack to user */
        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -614,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
        rec = (struct syscall_trace_exit *)raw_data;
 
        tracing_generic_entry_update(&rec->ent, 0, 0);
-       rec->ent.type = sys_data->exit_id;
+       rec->ent.type = sys_data->exit_event->id;
        rec->nr = syscall_nr;
        rec->ret = syscall_get_return_value(current, regs);
 
-       perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
+       perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
 
 end:
+       perf_swevent_put_recursion_context(rctx);
+end_recursion:
        local_irq_restore(flags);
 }
 
-int reg_prof_syscall_exit(char *name)
+int prof_sysexit_enable(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
 
-       num = syscall_name_to_nr(name);
-       if (num < 0 || num >= NR_syscalls)
-               return -ENOSYS;
+       num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
        mutex_lock(&syscall_trace_lock);
        if (!sys_prof_refcount_exit)
@@ -647,13 +626,11 @@ int reg_prof_syscall_exit(char *name)
        return ret;
 }
 
-void unreg_prof_syscall_exit(char *name)
+void prof_sysexit_disable(struct ftrace_event_call *call)
 {
        int num;
 
-       num = syscall_name_to_nr(name);
-       if (num < 0 || num >= NR_syscalls)
-               return;
+       num = ((struct syscall_metadata *)call->data)->syscall_nr;
 
        mutex_lock(&syscall_trace_lock);
        sys_prof_refcount_exit--;
index b92bde3c6a89e7fdb09721f1e29a5aa3105c604a..e4be84ac3d381c217982cdffc71ec92fa7cd6c9b 100644 (file)
@@ -40,5 +40,11 @@ config SAMPLE_KRETPROBES
        default m
        depends on SAMPLE_KPROBES && KRETPROBES
 
+config SAMPLE_HW_BREAKPOINT
+       tristate "Build kernel hardware breakpoint examples -- loadable module only"
+       depends on HAVE_HW_BREAKPOINT && m
+       help
+         This builds kernel hardware breakpoint example modules.
+
 endif # SAMPLES
 
index 43343a03b1f4a8cbeca3b1371802135366bbaea5..0f15e6d77fd641a516f3ed53ba2d16a3a273108d 100644 (file)
@@ -1,3 +1,4 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)  += kobject/ kprobes/ tracepoints/ trace_events/
+obj-$(CONFIG_SAMPLES)  += kobject/ kprobes/ tracepoints/ trace_events/ \
+                          hw_breakpoint/
diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile
new file mode 100644 (file)
index 0000000..0f5c31c
--- /dev/null
@@ -0,0 +1 @@
+obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
new file mode 100644 (file)
index 0000000..2952550
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * usage: insmod data_breakpoint.ko ksym=<ksym_name>
+ *
+ * This file is a kernel module that places a breakpoint on the ksym_name
+ * kernel variable using a hardware breakpoint register. The corresponding
+ * handler, which prints a backtrace, is invoked every time a write operation
+ * is performed on that variable.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ *
+ * Author: K.Prasad <prasad@linux.vnet.ibm.com>
+ */
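+/*
+ * For example (the symbol name below is only an illustration; it is also
+ * the module's default when no ksym= parameter is given):
+ *
+ *   insmod data_breakpoint.ko ksym=pid_max
+ */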
+#include <linux/module.h>      /* Needed by all modules */
+#include <linux/kernel.h>      /* Needed for KERN_INFO */
+#include <linux/init.h>                /* Needed for the macros */
+#include <linux/kallsyms.h>
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+
+struct perf_event **sample_hbp;
+
+static char ksym_name[KSYM_NAME_LEN] = "pid_max";
+module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO);
+MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any"
+                       " write operations on the kernel symbol");
+
+static void sample_hbp_handler(struct perf_event *temp, void *data)
+{
+       printk(KERN_INFO "%s value is changed\n", ksym_name);
+       dump_stack();
+       printk(KERN_INFO "Dump stack from sample_hbp_handler\n");
+}
+
+static int __init hw_break_module_init(void)
+{
+       int ret;
+       DEFINE_BREAKPOINT_ATTR(attr);
+
+       attr.bp_addr = kallsyms_lookup_name(ksym_name);
+       attr.bp_len = HW_BREAKPOINT_LEN_4;
+       attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+
+       sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler);
+       if (IS_ERR(sample_hbp)) {
+               ret = PTR_ERR(sample_hbp);
+               goto fail;
+       }
+
+       printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name);
+
+       return 0;
+
+fail:
+       printk(KERN_INFO "Breakpoint registration failed\n");
+
+       return ret;
+}
+
+static void __exit hw_break_module_exit(void)
+{
+       unregister_wide_hw_breakpoint(sample_hbp);
+       printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name);
+}
+
+module_init(hw_break_module_init);
+module_exit(hw_break_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("K.Prasad");
+MODULE_DESCRIPTION("ksym breakpoint");
index ea9f8a58678f3d9baaa0e073c9233584874700fa..241310e59cd6e6db33a24761a74cfda3254969f1 100755 (executable)
@@ -1852,10 +1852,17 @@ sub tracepoint_munge($) {
        my $tracepointname = 0;
        my $tracepointargs = 0;
 
-       if($prototype =~ m/TRACE_EVENT\((.*?),/) {
+       if ($prototype =~ m/TRACE_EVENT\((.*?),/) {
                $tracepointname = $1;
        }
-       if($prototype =~ m/TP_PROTO\((.*?)\)/) {
+       if ($prototype =~ m/DEFINE_SINGLE_EVENT\((.*?),/) {
+               $tracepointname = $1;
+       }
+       if ($prototype =~ m/DEFINE_EVENT\((.*?),(.*?),/) {
+               $tracepointname = $2;
+       }
+       $tracepointname =~ s/^\s+//; #strip leading whitespace
+       if ($prototype =~ m/TP_PROTO\((.*?)\)/) {
                $tracepointargs = $1;
        }
        if (($tracepointname eq 0) || ($tracepointargs eq 0)) {
@@ -1920,7 +1927,9 @@ sub process_state3_function($$) {
        if ($prototype =~ /SYSCALL_DEFINE/) {
                syscall_munge();
        }
-       if ($prototype =~ /TRACE_EVENT/) {
+       if ($prototype =~ /TRACE_EVENT/ || $prototype =~ /DEFINE_EVENT/ ||
+           $prototype =~ /DEFINE_SINGLE_EVENT/)
+       {
                tracepoint_munge($file);
        }
        dump_function($prototype, $file);
index 0854f110bf7f79a7de7617bd37a3c75e06d11430..fe08660ce0bd05841869ed59f762eceed7c089d7 100644 (file)
@@ -12,6 +12,7 @@ perf*.1
 perf*.xml
 perf*.html
 common-cmds.h
+perf.data
 tags
 TAGS
 cscope*
diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
new file mode 100644 (file)
index 0000000..ae525ac
--- /dev/null
@@ -0,0 +1,120 @@
+perf-bench(1)
+============
+
+NAME
+----
+perf-bench - General framework for benchmark suites
+
+SYNOPSIS
+--------
+[verse]
+'perf bench' [<common options>] <subsystem> <suite> [<options>]
+
+DESCRIPTION
+-----------
+This 'perf bench' command is a general framework for benchmark suites.
+
+COMMON OPTIONS
+--------------
+-f::
+--format=::
+Specify format style.
+Currently available format styles are:
+
+'default'::
+Default style. This is mainly for human reading.
+---------------------
+% perf bench sched pipe                      # with no style specified
+(executing 1000000 pipe operations between two tasks)
+        Total time:5.855 sec
+                5.855061 usecs/op
+               170792 ops/sec
+---------------------
+
+'simple'::
+This simple style is friendly for automated
+processing by scripts.
+---------------------
+% perf bench --format=simple sched pipe      # simple style specified
+5.988
+---------------------
+
+SUBSYSTEM
+---------
+
+'sched'::
+       Scheduler and IPC mechanisms.
+
+SUITES FOR 'sched'
+~~~~~~~~~~~~~~~~~~
+*messaging*::
+Suite for evaluating performance of scheduler and IPC mechanisms.
+Based on hackbench by Rusty Russell.
+
+Options of *messaging*
+^^^^^^^^^^^^^^^^^^^^^^
+-p::
+--pipe::
+Use pipe() instead of socketpair()
+
+-t::
+--thread::
+Be multi-threaded instead of multi-process
+
+-g::
+--group=::
+Specify number of groups
+
+-l::
+--loop=::
+Specify number of loops
+
+Example of *messaging*
+^^^^^^^^^^^^^^^^^^^^^^
+
+---------------------
+% perf bench sched messaging                 # run with default options
+(20 sender and receiver processes per group)
+(10 groups == 400 processes run)
+
+      Total time:0.308 sec
+
+% perf bench sched messaging -t -g 20        # be multi-thread, with 20 groups
+(20 sender and receiver threads per group)
+(20 groups == 800 threads run)
+
+      Total time:0.582 sec
+---------------------
+
+*pipe*::
+Suite for pipe() system call.
+Based on pipe-test-1m.c by Ingo Molnar.
+
+Options of *pipe*
+^^^^^^^^^^^^^^^^^
+-l::
+--loop=::
+Specify number of loops.
+
+Example of *pipe*
+^^^^^^^^^^^^^^^^^
+
+---------------------
+% perf bench sched pipe
+(executing 1000000 pipe operations between two tasks)
+
+        Total time:8.091 sec
+                8.091833 usecs/op
+                123581 ops/sec
+
+% perf bench sched pipe -l 1000              # loop 1000
+(executing 1000 pipe operations between two tasks)
+
+        Total time:0.016 sec
+                16.948000 usecs/op
+                59004 ops/sec
+---------------------
+
+SEE ALSO
+--------
+linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt
new file mode 100644 (file)
index 0000000..01b642c
--- /dev/null
@@ -0,0 +1,34 @@
+perf-buildid-list(1)
+====================
+
+NAME
+----
+perf-buildid-list - List the buildids in a perf.data file
+
+SYNOPSIS
+--------
+[verse]
+'perf buildid-list <options>'
+
+DESCRIPTION
+-----------
+This command displays the buildids found in a perf.data file, so that other
+tools can be used to fetch packages with matching symbol tables for use by
+perf report.
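+
+For example, to list the build-ids recorded in the default input file (the
+file name below is merely the default, see the -i option):
+
+ 'perf buildid-list' -i perf.data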
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+-f::
+--force::
+       Don't do ownership validation.
+-v::
+--verbose::
+       Be more verbose.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-top[1],
+linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
new file mode 100644 (file)
index 0000000..44b0ce3
--- /dev/null
@@ -0,0 +1,44 @@
+perf-kmem(1)
+==============
+
+NAME
+----
+perf-kmem - Tool to trace/measure kernel memory (slab) properties
+
+SYNOPSIS
+--------
+[verse]
+'perf kmem' {record} [<options>]
+
+DESCRIPTION
+-----------
+There are two variants of 'perf kmem':
+
+  'perf kmem record <command>' to record the kmem events
+  of an arbitrary workload.
+
+  'perf kmem' to report kernel memory statistics.
+
+OPTIONS
+-------
+-i <file>::
+--input=<file>::
+       Select the input file (default: perf.data)
+
+--stat=<caller|alloc>::
+       Select per callsite or per allocation statistics
+
+-s <key[,key2...]>::
+--sort=<key[,key2...]>::
+       Sort the output (default: frag,hit,bytes)
+
+-l <num>::
+--line=<num>::
+       Print n lines only
+
+--raw-ip::
+       Print raw ip instead of symbol
+
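+EXAMPLES
+--------
+A typical session first records the kmem events of a workload and then
+reports per-callsite statistics ('sleep 10' below is only a placeholder
+workload):
+
+---------------------
+% perf kmem record sleep 10
+% perf kmem --stat=caller -l 20
+---------------------
+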
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
new file mode 100644 (file)
index 0000000..9270594
--- /dev/null
@@ -0,0 +1,49 @@
+perf-probe(1)
+=============
+
+NAME
+----
+perf-probe - Define new dynamic tracepoints
+
+SYNOPSIS
+--------
+[verse]
+'perf probe' [options] --add 'PROBE' [--add 'PROBE' ...]
+or
+'perf probe' [options] 'PROBE' ['PROBE' ...]
+
+
+DESCRIPTION
+-----------
+This command defines dynamic tracepoint events, by symbol and registers
+without debuginfo, or by C expressions (C line numbers, C function names,
+and C local variables) with debuginfo.
+
+
+OPTIONS
+-------
+-k::
+--vmlinux=PATH::
+       Specify vmlinux path which has debuginfo (Dwarf binary).
+
+-v::
+--verbose::
+        Be more verbose (show parsed arguments, etc).
+
+-a::
+--add::
+       Define a probe point (see PROBE SYNTAX for detail)
+
+PROBE SYNTAX
+------------
+Probe points are defined by the following syntax:
+
+ "FUNC[+OFFS|:RLN|%return][@SRC]|SRC:ALN [ARG ...]"
+
+'FUNC' specifies a probed function name, and it may have one of the following options; '+OFFS' is the offset from function entry address in bytes, 'RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. In addition, 'SRC' specifies a source file which has that function.
+It is also possible to specify a probe point by the source line number by using 'SRC:ALN' syntax, where 'SRC' is the source file path and 'ALN' is the line number.
+'ARG' specifies the arguments of this probe point. You can use the name of a local variable, or the kprobe-tracer argument format (e.g. $retval, %ax, etc.).
+
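For illustration, a few definitions that follow this syntax (the function,
file, line, and variable names are placeholders, not part of the patch):

 perf probe --add 'schedule'                  # probe at function entry
 perf probe --add 'schedule%return $retval'   # probe at function return, record the return value
 perf probe --add 'do_sys_open+4 dfd flags'   # 4 bytes past the entry, record two local variables
 perf probe --add 'fs/open.c:123'             # probe at an absolute source line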
+SEE ALSO
+--------
+linkperf:perf-trace[1], linkperf:perf-record[1]
index 0ff23de9e4539599acdba5a8ade8325117e75caa..fc46c0b40f6e431bd5124885544b5827a96234c8 100644 (file)
@@ -26,11 +26,19 @@ OPTIONS
 
 -e::
 --event=::
-       Select the PMU event. Selection can be a symbolic event name
-       (use 'perf list' to list all events) or a raw PMU
-       event (eventsel+umask) in the form of rNNN where NNN is a
-       hexadecimal event descriptor.
+       Select the PMU event. Selection can be:
 
+        - a symbolic event name        (use 'perf list' to list all events)
+
+        - a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a
+         hexadecimal event descriptor.
+
+        - a hardware breakpoint event in the form of '\mem:addr[:access]'
+          where addr is the address in memory you want to break at.
+          Access is the memory access type (read, write, execute); it can
+          be passed as follows: '\mem:addr[:[r][w][x]]'.
+          For example, to profile read-write accesses at 0x1000, use
+          'mem:0x1000:rw'.
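A complete, purely illustrative command line built from this syntax, with an
arbitrary address and workload, could be:

 perf record -e mem:0x1000:rw -a sleep 10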
 -a::
         System-wide collection.
 
index 59f0b846cd7141a1c76de2486dd531b45b624334..9dccb180b7af3c878e41b8807de2533efbe269c4 100644 (file)
@@ -24,11 +24,11 @@ OPTIONS
 --dsos=::
        Only consider symbols in these dsos. CSV that understands
        file://filename entries.
--n
---show-nr-samples
+-n::
+--show-nr-samples::
        Show the number of samples for each symbol
--T
---threads
+-T::
+--threads::
        Show per-thread event counters
 -C::
 --comms=::
index a7910099d6fd5111470b543d9aa2bcfe57fc96c4..4b1788355ecac3d305bf72e6f58d5a477e08ba7b 100644 (file)
@@ -31,9 +31,12 @@ OPTIONS
 -w::
 --width=::
         Select the width of the SVG file (default: 1000)
--p::
+-P::
 --power-only::
         Only output the CPU power section of the diagram
+-p::
+--process::
+        Select the processes to display, by name or PID
 
 
 SEE ALSO
diff --git a/tools/perf/Documentation/perf-trace-perl.txt b/tools/perf/Documentation/perf-trace-perl.txt
new file mode 100644 (file)
index 0000000..c5f55f4
--- /dev/null
@@ -0,0 +1,219 @@
+perf-trace-perl(1)
+==================
+
+NAME
+----
+perf-trace-perl - Process trace data with a Perl script
+
+SYNOPSIS
+--------
+[verse]
+'perf trace' [-s [lang]:script[.ext] ]
+
+DESCRIPTION
+-----------
+
+This perf trace option is used to process perf trace data using perf's
+built-in Perl interpreter.  It reads and processes the input file and
+displays the results of the trace analysis implemented in the given
+Perl script, if any.
+
+STARTER SCRIPTS
+---------------
+
+You can avoid reading the rest of this document by running 'perf trace
+-g perl' in the same directory as an existing perf.data trace file.
+That will generate a starter script containing a handler for each of
+the event types in the trace file; it simply prints every available
+field for each event in the trace file.
+
+You can also look at the existing scripts in
+~/libexec/perf-core/scripts/perl for typical examples showing how to
+do basic things like aggregate event data, print results, etc.  Also,
+the check-perf-trace.pl script, while not interesting for its results,
+attempts to exercise all of the main scripting features.
+
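For illustration, a minimal end-to-end workflow might look like this (the
workload is arbitrary, and the generated script name assumes the
perf-trace.[ext] convention described in the perf-trace page):

 perf record -c 1 -f -a -M -R -e sched:sched_wakeup sleep 5   # record raw trace data
 perf trace -g perl                                           # generate the Perl starter script
 perf trace -s perf-trace.pl                                  # run it over perf.data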
+EVENT HANDLERS
+--------------
+
+When perf trace is invoked using a trace script, a user-defined
+'handler function' is called for each event in the trace.  If there's
+no handler function defined for a given event type, the event is
+ignored (or passed to a 'trace_unhandled' function, see below) and the
+next event is processed.
+
+Most of the event's field values are passed as arguments to the
+handler function; some of the less common ones aren't - those are
+available as calls back into the perf executable (see below).
+
+As an example, the following perf record command can be used to record
+all sched_wakeup events in the system:
+
+ # perf record -c 1 -f -a -M -R -e sched:sched_wakeup
+
+Traces meant to be processed using a script should be recorded with
+the above options: -c 1 says to sample every event, -a to enable
+system-wide collection, -M to multiplex the output, and -R to collect
+raw samples.
+
+The format file for the sched_wakeup event defines the following fields
+(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
+
+----
+ format:
+        field:unsigned short common_type;
+        field:unsigned char common_flags;
+        field:unsigned char common_preempt_count;
+        field:int common_pid;
+        field:int common_lock_depth;
+
+        field:char comm[TASK_COMM_LEN];
+        field:pid_t pid;
+        field:int prio;
+        field:int success;
+        field:int target_cpu;
+----
+
+The handler function for this event would be defined as:
+
+----
+sub sched::sched_wakeup
+{
+   my ($event_name, $context, $common_cpu, $common_secs,
+       $common_nsecs, $common_pid, $common_comm,
+       $comm, $pid, $prio, $success, $target_cpu) = @_;
+}
+----
+
+The handler function takes the form subsystem::event_name.
+
+The $common_* arguments in the handler's argument list are the set of
+arguments passed to all event handlers; some of the fields correspond
+to the common_* fields in the format file, but some are synthesized,
+and some of the common_* fields aren't common enough to be passed
+to every event as arguments but are available as library functions.
+
+Here's a brief description of each of the invariant event args:
+
+ $event_name               the name of the event as text
+ $context                  an opaque 'cookie' used in calls back into perf
+ $common_cpu               the cpu the event occurred on
+ $common_secs              the secs portion of the event timestamp
+ $common_nsecs             the nsecs portion of the event timestamp
+ $common_pid               the pid of the current task
+ $common_comm              the name of the current process
+
+All of the remaining fields in the event's format file have
+counterparts as handler function arguments of the same name, as can be
+seen in the example above.
+
+The above provides the basics needed to directly access every field of
+every event in a trace, which covers 90% of what you need to know to
+write a useful trace script.  The sections below cover the rest.
+
+SCRIPT LAYOUT
+-------------
+
+Every perf trace Perl script should start by setting up a Perl module
+search path and 'use'ing a few support modules (see module
+descriptions below):
+
+----
+ use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+ use lib "./Perf-Trace-Util/lib";
+ use Perf::Trace::Core;
+ use Perf::Trace::Context;
+ use Perf::Trace::Util;
+----
+
+The rest of the script can contain handler functions and support
+functions in any order.
+
+Aside from the event handler functions discussed above, every script
+can implement a set of optional functions:
+
+*trace_begin*, if defined, is called before any event is processed and
+gives scripts a chance to do setup tasks:
+
+----
+ sub trace_begin
+ {
+ }
+----
+
+*trace_end*, if defined, is called after all events have been
+ processed and gives scripts a chance to do end-of-script tasks, such
+ as displaying results:
+
+----
+sub trace_end
+{
+}
+----
+
+*trace_unhandled*, if defined, is called for any event that
+ doesn't have a handler explicitly defined for it.  The standard set
+ of common arguments is passed into it:
+
+----
+sub trace_unhandled
+{
+    my ($event_name, $context, $common_cpu, $common_secs,
+        $common_nsecs, $common_pid, $common_comm) = @_;
+}
+----
+
+The remaining sections provide descriptions of each of the available
+built-in perf trace Perl modules and their associated functions.
+
+AVAILABLE MODULES AND FUNCTIONS
+-------------------------------
+
+The following sections describe the functions and variables available
+via the various Perf::Trace::* Perl modules.  To use the functions and
+variables from the given module, add the corresponding 'use
+Perf::Trace::XXX' line to your perf trace script.
+
+Perf::Trace::Core Module
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+This module provides some essential functions to user scripts.
+
+The *flag_str* and *symbol_str* functions provide human-readable
+strings for flag and symbolic fields.  These correspond to the strings
+and values parsed from the 'print fmt' fields of the event format
+files:
+
+  flag_str($event_name, $field_name, $field_value) - returns the string representation corresponding to $field_value for the flag field $field_name of event $event_name
+  symbol_str($event_name, $field_name, $field_value) - returns the string representation corresponding to $field_value for the symbolic field $field_name of event $event_name
+
+Perf::Trace::Context Module
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some of the 'common' fields in the event format file aren't all that
+common, but need to be made accessible to user scripts nonetheless.
+
+Perf::Trace::Context defines a set of functions that can be used to
+access this data in the context of the current event.  Each of these
+functions expects a $context variable, which is the same as the
+$context variable passed into every event handler as the second
+argument.
+
+ common_pc($context) - returns common_preempt_count for the current event
+ common_flags($context) - returns common_flags for the current event
+ common_lock_depth($context) - returns common_lock_depth for the current event
+
+Perf::Trace::Util Module
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Various utility functions for use with perf trace:
+
+  nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair
+  nsecs_secs($nsecs) - returns whole secs portion given nsecs
+  nsecs_nsecs($nsecs) - returns nsecs remainder given nsecs
+  nsecs_str($nsecs) - returns printable string in the form secs.nsecs
+  avg($total, $n) - returns average given a sum and a total number of values
+
+SEE ALSO
+--------
+linkperf:perf-trace[1]
index 41ed75398ca98efd9211d367e4e64f5de6da06e0..07065efa60e09b9a8ca3559583b7cdfd63ac2c0a 100644 (file)
@@ -20,6 +20,15 @@ OPTIONS
 --dump-raw-trace=::
         Display verbose dump of the trace data.
 
+-s::
+--script=::
+        Process trace data with the given script ([lang]:script[.ext]).
+
+-g::
+--gen-script=::
+        Generate perf-trace.[ext] starter script for given language,
+        using current perf.data.
+
 SEE ALSO
 --------
-linkperf:perf-record[1]
+linkperf:perf-record[1], linkperf:perf-trace-perl[1]
index 7e190d522cd5848123b0d98220e6555fd3e70c69..23ec66098bdc45c6c81ca9559fd16dc46f3b5ffb 100644 (file)
@@ -2,6 +2,7 @@
 all::
 
 # Define V=1 to have a more verbose compile.
+# Define V=2 to have an even more verbose compile.
 #
 # Define SNPRINTF_RETURNS_BOGUS if you are on a system which snprintf()
 # or vsnprintf() return -1 instead of number of characters which would
@@ -145,6 +146,10 @@ all::
 # Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
 # your external grep (e.g., if your system lacks grep, if its grep is
 # broken, or spawning external process is slower than built-in grep perf has).
+#
+# Define LDFLAGS=-static to build a static binary.
+#
+# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
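For illustration, the knobs documented in the comments above map onto the make
command line roughly as follows (assuming the usual tools/perf location):

 make -C tools/perf                     # default build
 make -C tools/perf V=2                 # even more verbose compile
 make -C tools/perf DEBUG=1             # skip -O6, see the DEBUG handling below
 make -C tools/perf LDFLAGS=-static     # static binary
 make -C tools/perf EXTRA_CFLAGS=-m32   # 32-bit cross-build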
 
 PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
        @$(SHELL_PATH) util/PERF-VERSION-GEN
@@ -157,20 +162,6 @@ uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
 uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
 uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
-#
-# Add -m32 for cross-builds:
-#
-ifdef NO_64BIT
-  MBITS := -m32
-else
-  #
-  # If we're on a 64-bit kernel, use -m64:
-  #
-  ifneq ($(patsubst %64,%,$(uname_M)),$(uname_M))
-    MBITS := -m64
-  endif
-endif
-
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 #
@@ -200,8 +191,15 @@ EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wold-style-definition
 EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wstrict-prototypes
 EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wdeclaration-after-statement
 
-CFLAGS = $(MBITS) -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -fstack-protector-all -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS)
-LDFLAGS = -lpthread -lrt -lelf -lm
+ifeq ("$(origin DEBUG)", "command line")
+  PERF_DEBUG = $(DEBUG)
+endif
+ifndef PERF_DEBUG
+  CFLAGS_OPTIMIZE = -O6
+endif
+
+CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
+EXTLIBS = -lpthread -lrt -lelf -lm
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
 STRIP ?= strip
@@ -252,6 +250,9 @@ PTHREAD_LIBS = -lpthread
 # explicitly what architecture to check for. Fix this up for yours..
 SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
 
+ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -c -Werror -fstack-protector-all - -o /dev/null "$(QUIET_STDERR)" && echo y"), y)
+  CFLAGS := $(CFLAGS) -fstack-protector-all
+endif
 
 
 ### --- END CONFIGURATION SECTION ---
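The stack-protector check above, like the library checks further down, uses a
compile-probe pattern: pipe a tiny program into the compiler and test whether
it builds. As a standalone sketch (using gcc directly instead of $(CC)):

 # prints y only if the compiler accepts -fstack-protector-all
 echo 'int foo(void) {char X[2]; return 3;}' | \
         gcc -x c -c -Werror -fstack-protector-all - -o /dev/null 2>/dev/null && echo y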
@@ -327,8 +328,28 @@ LIB_FILE=libperf.a
 LIB_H += ../../include/linux/perf_event.h
 LIB_H += ../../include/linux/rbtree.h
 LIB_H += ../../include/linux/list.h
+LIB_H += ../../include/linux/stringify.h
+LIB_H += util/include/linux/bitmap.h
+LIB_H += util/include/linux/bitops.h
+LIB_H += util/include/linux/compiler.h
+LIB_H += util/include/linux/ctype.h
+LIB_H += util/include/linux/kernel.h
 LIB_H += util/include/linux/list.h
+LIB_H += util/include/linux/module.h
+LIB_H += util/include/linux/poison.h
+LIB_H += util/include/linux/prefetch.h
+LIB_H += util/include/linux/rbtree.h
+LIB_H += util/include/linux/string.h
+LIB_H += util/include/linux/types.h
+LIB_H += util/include/asm/asm-offsets.h
+LIB_H += util/include/asm/bitops.h
+LIB_H += util/include/asm/byteorder.h
+LIB_H += util/include/asm/swab.h
+LIB_H += util/include/asm/system.h
+LIB_H += util/include/asm/uaccess.h
 LIB_H += perf.h
+LIB_H += util/debugfs.h
+LIB_H += util/event.h
 LIB_H += util/types.h
 LIB_H += util/levenshtein.h
 LIB_H += util/parse-options.h
@@ -342,15 +363,22 @@ LIB_H += util/strlist.h
 LIB_H += util/run-command.h
 LIB_H += util/sigchain.h
 LIB_H += util/symbol.h
-LIB_H += util/module.h
 LIB_H += util/color.h
 LIB_H += util/values.h
+LIB_H += util/sort.h
+LIB_H += util/hist.h
+LIB_H += util/thread.h
+LIB_H += util/data_map.h
+LIB_H += util/probe-finder.h
+LIB_H += util/probe-event.h
 
 LIB_OBJS += util/abspath.o
 LIB_OBJS += util/alias.o
 LIB_OBJS += util/config.o
 LIB_OBJS += util/ctype.o
+LIB_OBJS += util/debugfs.o
 LIB_OBJS += util/environment.o
+LIB_OBJS += util/event.o
 LIB_OBJS += util/exec_cmd.o
 LIB_OBJS += util/help.o
 LIB_OBJS += util/levenshtein.o
@@ -358,6 +386,9 @@ LIB_OBJS += util/parse-options.o
 LIB_OBJS += util/parse-events.o
 LIB_OBJS += util/path.o
 LIB_OBJS += util/rbtree.o
+LIB_OBJS += util/bitmap.o
+LIB_OBJS += util/hweight.o
+LIB_OBJS += util/find_next_bit.o
 LIB_OBJS += util/run-command.o
 LIB_OBJS += util/quote.o
 LIB_OBJS += util/strbuf.o
@@ -367,7 +398,6 @@ LIB_OBJS += util/usage.o
 LIB_OBJS += util/wrapper.o
 LIB_OBJS += util/sigchain.o
 LIB_OBJS += util/symbol.o
-LIB_OBJS += util/module.o
 LIB_OBJS += util/color.o
 LIB_OBJS += util/pager.o
 LIB_OBJS += util/header.o
@@ -379,11 +409,25 @@ LIB_OBJS += util/thread.o
 LIB_OBJS += util/trace-event-parse.o
 LIB_OBJS += util/trace-event-read.o
 LIB_OBJS += util/trace-event-info.o
+LIB_OBJS += util/trace-event-perl.o
 LIB_OBJS += util/svghelper.o
+LIB_OBJS += util/sort.o
+LIB_OBJS += util/hist.o
+LIB_OBJS += util/data_map.o
+LIB_OBJS += util/probe-event.o
 
 BUILTIN_OBJS += builtin-annotate.o
+
+BUILTIN_OBJS += builtin-bench.o
+
+# Benchmark modules
+BUILTIN_OBJS += bench/sched-messaging.o
+BUILTIN_OBJS += bench/sched-pipe.o
+BUILTIN_OBJS += bench/mem-memcpy.o
+
 BUILTIN_OBJS += builtin-help.o
 BUILTIN_OBJS += builtin-sched.o
+BUILTIN_OBJS += builtin-buildid-list.o
 BUILTIN_OBJS += builtin-list.o
 BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
@@ -391,9 +435,16 @@ BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-timechart.o
 BUILTIN_OBJS += builtin-top.o
 BUILTIN_OBJS += builtin-trace.o
+BUILTIN_OBJS += builtin-probe.o
+BUILTIN_OBJS += builtin-kmem.o
 
 PERFLIBS = $(LIB_FILE)
 
+ifeq ($(V), 2)
+       QUIET_STDERR = ">/dev/null"
+else
+       QUIET_STDERR = ">/dev/null 2>&1"
+endif
 #
 # Platform specific tweaks
 #
@@ -421,36 +472,58 @@ ifeq ($(uname_S),Darwin)
        PTHREAD_LIBS =
 endif
 
-ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) > /dev/null 2>&1 && echo y"), y)
-       ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) > /dev/null 2>&1 && echo y"), y)
+ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+       msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static);
+endif
+
+       ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
                BASIC_CFLAGS += -DLIBELF_NO_MMAP
        endif
 else
        msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]);
 endif
 
+ifneq ($(shell sh -c "(echo '\#include <libdwarf/dwarf.h>'; echo '\#include <libdwarf/libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -ldwarf -lelf -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+       msg := $(warning No libdwarf.h found or libdwarf.h too old, disabling dwarf support. Please install libdwarf-dev/libdwarf-devel >= 20081231);
+       BASIC_CFLAGS += -DNO_LIBDWARF
+else
+       EXTLIBS += -lelf -ldwarf
+       LIB_OBJS += util/probe-finder.o
+endif
+
+PERL_EMBED_LDOPTS = `perl -MExtUtils::Embed -e ldopts 2>/dev/null`
+PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
+
+ifneq ($(shell sh -c "(echo '\#include <EXTERN.h>'; echo '\#include <perl.h>'; echo 'int main(void) { perl_alloc(); return 0; }') | $(CC) -x c - $(PERL_EMBED_CCOPTS) -o /dev/null $(PERL_EMBED_LDOPTS) > /dev/null 2>&1 && echo y"), y)
+       BASIC_CFLAGS += -DNO_LIBPERL
+else
+       ALL_LDFLAGS += $(PERL_EMBED_LDOPTS)
+       LIB_OBJS += scripts/perl/Perf-Trace-Util/Context.o
+endif
+
 ifdef NO_DEMANGLE
        BASIC_CFLAGS += -DNO_DEMANGLE
 else
-       has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd > /dev/null 2>&1 && echo y")
+       has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd "$(QUIET_STDERR)" && echo y")
 
        ifeq ($(has_bfd),y)
                EXTLIBS += -lbfd
        else
-               has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y")
+               has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty "$(QUIET_STDERR)" && echo y")
                ifeq ($(has_bfd_iberty),y)
                        EXTLIBS += -lbfd -liberty
                else
-                       has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y")
+                       has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty -lz "$(QUIET_STDERR)" && echo y")
                        ifeq ($(has_bfd_iberty_z),y)
                                EXTLIBS += -lbfd -liberty -lz
                        else
-                               has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -liberty > /dev/null 2>&1 && echo y")
+                               has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -liberty "$(QUIET_STDERR)" && echo y")
                                ifeq ($(has_cplus_demangle),y)
                                        EXTLIBS += -liberty
                                        BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE
                                else
-                                       msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling)
+                                       msg := $(warning No bfd.h/libbfd found, install binutils-dev[el]/zlib-static to gain symbol demangling)
                                        BASIC_CFLAGS += -DNO_DEMANGLE
                                endif
                        endif
@@ -787,6 +860,25 @@ util/config.o: util/config.c PERF-CFLAGS
 util/rbtree.o: ../../lib/rbtree.c PERF-CFLAGS
        $(QUIET_CC)$(CC) -o util/rbtree.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
+# Some of perf's warning flags don't suit lib/bitmap.c, e.g. -Wshadow complains about
+# variable shadowing from <string.h> pulled in through the kernel header wrappers.
+KBITMAP_FLAGS=`echo $(ALL_CFLAGS) | sed s/-Wshadow// | sed s/-Wswitch-default// | sed s/-Wextra//`
+
+util/bitmap.o: ../../lib/bitmap.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o util/bitmap.o -c $(KBITMAP_FLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+util/hweight.o: ../../lib/hweight.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o util/hweight.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+util/find_next_bit.o: ../../lib/find_next_bit.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o util/find_next_bit.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+util/trace-event-perl.o: util/trace-event-perl.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o util/trace-event-perl.o -c $(ALL_CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
+
+scripts/perl/Perf-Trace-Util/Context.o: scripts/perl/Perf-Trace-Util/Context.c PERF-CFLAGS
+       $(QUIET_CC)$(CC) -o scripts/perl/Perf-Trace-Util/Context.o -c $(ALL_CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
+
 perf-%$X: %.o $(PERFLIBS)
        $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
 
@@ -894,6 +986,13 @@ export perfexec_instdir
 install: all
        $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
        $(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
+       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
+       $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
+       $(INSTALL) scripts/perl/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
+       $(INSTALL) scripts/perl/*.pl -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl'
+       $(INSTALL) scripts/perl/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
+       $(INSTALL) scripts/perl/Perf-Trace-Util/Makefile.PL -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util'
+       $(INSTALL) scripts/perl/Perf-Trace-Util/README -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util'
 ifdef BUILT_INS
        $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
        $(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
@@ -979,7 +1078,7 @@ distclean: clean
 #      $(RM) configure
 
 clean:
-       $(RM) *.o */*.o $(LIB_FILE)
+       $(RM) *.o */*.o */*/*.o */*/*/*.o $(LIB_FILE)
        $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
        $(RM) $(TEST_PROGRAMS)
        $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
new file mode 100644 (file)
index 0000000..f7781c6
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef BENCH_H
+#define BENCH_H
+
+extern int bench_sched_messaging(int argc, const char **argv, const char *prefix);
+extern int bench_sched_pipe(int argc, const char **argv, const char *prefix);
+extern int bench_mem_memcpy(int argc, const char **argv, const char *prefix __used);
+
+#define BENCH_FORMAT_DEFAULT_STR       "default"
+#define BENCH_FORMAT_DEFAULT           0
+#define BENCH_FORMAT_SIMPLE_STR                "simple"
+#define BENCH_FORMAT_SIMPLE            1
+
+#define BENCH_FORMAT_UNKNOWN           -1
+
+extern int bench_format;
+
+#endif
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
new file mode 100644 (file)
index 0000000..8977317
--- /dev/null
@@ -0,0 +1,193 @@
+/*
+ * mem-memcpy.c
+ *
+ * memcpy: Simple memory copy in various ways
+ *
+ * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
+ */
+#include <ctype.h>
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/parse-options.h"
+#include "../util/string.h"
+#include "../util/header.h"
+#include "bench.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <errno.h>
+
+#define K 1024
+
+static const char      *length_str     = "1MB";
+static const char      *routine        = "default";
+static int             use_clock       = 0;
+static int             clock_fd;
+
+static const struct option options[] = {
+       OPT_STRING('l', "length", &length_str, "1MB",
+                   "Specify length of memory to copy. "
+                   "available units: B, MB, GB (upper or lower case)"),
+       OPT_STRING('r', "routine", &routine, "default",
+                   "Specify routine to copy"),
+       OPT_BOOLEAN('c', "clock", &use_clock,
+                   "Use CPU clock for measuring"),
+       OPT_END()
+};
+
+struct routine {
+       const char *name;
+       const char *desc;
+       void * (*fn)(void *dst, const void *src, size_t len);
+};
+
+struct routine routines[] = {
+       { "default",
+         "Default memcpy() provided by glibc",
+         memcpy },
+       { NULL,
+         NULL,
+         NULL   }
+};
+
+static const char * const bench_mem_memcpy_usage[] = {
+       "perf bench mem memcpy <options>",
+       NULL
+};
+
+static struct perf_event_attr clock_attr = {
+       .type           = PERF_TYPE_HARDWARE,
+       .config         = PERF_COUNT_HW_CPU_CYCLES
+};
+
+static void init_clock(void)
+{
+       clock_fd = sys_perf_event_open(&clock_attr, getpid(), -1, -1, 0);
+
+       if (clock_fd < 0 && errno == ENOSYS)
+               die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
+       else
+               BUG_ON(clock_fd < 0);
+}
+
+static u64 get_clock(void)
+{
+       int ret;
+       u64 clk;
+
+       ret = read(clock_fd, &clk, sizeof(u64));
+       BUG_ON(ret != sizeof(u64));
+
+       return clk;
+}
+
+static double timeval2double(struct timeval *ts)
+{
+       return (double)ts->tv_sec +
+               (double)ts->tv_usec / (double)1000000;
+}
+
+int bench_mem_memcpy(int argc, const char **argv,
+                    const char *prefix __used)
+{
+       int i;
+       void *dst, *src;
+       size_t length;
+       double bps = 0.0;
+       struct timeval tv_start, tv_end, tv_diff;
+       u64 clock_start, clock_end, clock_diff;
+
+       clock_start = clock_end = clock_diff = 0ULL;
+       argc = parse_options(argc, argv, options,
+                            bench_mem_memcpy_usage, 0);
+
+       tv_diff.tv_sec = 0;
+       tv_diff.tv_usec = 0;
+       length = (size_t)perf_atoll((char *)length_str);
+
+       if ((s64)length <= 0) {
+               fprintf(stderr, "Invalid length:%s\n", length_str);
+               return 1;
+       }
+
+       for (i = 0; routines[i].name; i++) {
+               if (!strcmp(routines[i].name, routine))
+                       break;
+       }
+       if (!routines[i].name) {
+               printf("Unknown routine:%s\n", routine);
+               printf("Available routines...\n");
+               for (i = 0; routines[i].name; i++) {
+                       printf("\t%s ... %s\n",
+                              routines[i].name, routines[i].desc);
+               }
+               return 1;
+       }
+
+       dst = zalloc(length);
+       if (!dst)
+               die("memory allocation failed - maybe length is too large?\n");
+
+       src = zalloc(length);
+       if (!src)
+               die("memory allocation failed - maybe length is too large?\n");
+
+       if (bench_format == BENCH_FORMAT_DEFAULT) {
+               printf("# Copying %s Bytes from %p to %p ...\n\n",
+                      length_str, src, dst);
+       }
+
+       if (use_clock) {
+               init_clock();
+               clock_start = get_clock();
+       } else {
+               BUG_ON(gettimeofday(&tv_start, NULL));
+       }
+
+       routines[i].fn(dst, src, length);
+
+       if (use_clock) {
+               clock_end = get_clock();
+               clock_diff = clock_end - clock_start;
+       } else {
+               BUG_ON(gettimeofday(&tv_end, NULL));
+               timersub(&tv_end, &tv_start, &tv_diff);
+               bps = (double)((double)length / timeval2double(&tv_diff));
+       }
+
+       switch (bench_format) {
+       case BENCH_FORMAT_DEFAULT:
+               if (use_clock) {
+                       printf(" %14lf Clock/Byte\n",
+                              (double)clock_diff / (double)length);
+               } else {
+                       if (bps < K)
+                               printf(" %14lf B/Sec\n", bps);
+                       else if (bps < K * K)
+                               printf(" %14lf KB/Sec\n", bps / 1024);
+                       else if (bps < K * K * K)
+                               printf(" %14lf MB/Sec\n", bps / 1024 / 1024);
+                       else {
+                               printf(" %14lf GB/Sec\n",
+                                      bps / 1024 / 1024 / 1024);
+                       }
+               }
+               break;
+       case BENCH_FORMAT_SIMPLE:
+               if (use_clock) {
+                       printf("%14lf\n",
+                              (double)clock_diff / (double)length);
+               } else
+                       printf("%lf\n", bps);
+               break;
+       default:
+               /* reaching this means bench_format holds an unknown value: */
+               die("unknown format: %d\n", bench_format);
+               break;
+       }
+
+       return 0;
+}
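For illustration, the benchmark added above would be driven through perf bench
roughly like this (the length value is only an example):

 perf bench mem memcpy                  # copy the default 1MB using glibc memcpy
 perf bench mem memcpy -l 1GB -c        # copy 1GB, report CPU clock cycles per byte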
diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
new file mode 100644 (file)
index 0000000..605a2a9
--- /dev/null
@@ -0,0 +1,336 @@
+/*
+ *
+ * sched-messaging.c
+ *
+ * messaging: Benchmark for scheduler and IPC mechanisms
+ *
+ * Based on hackbench by Rusty Russell <rusty@rustcorp.com.au>
+ * Ported to perf by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
+ *
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/parse-options.h"
+#include "../builtin.h"
+#include "bench.h"
+
+/* Test groups of 20 processes spraying to 20 receivers */
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <sys/time.h>
+#include <sys/poll.h>
+#include <limits.h>
+
+#define DATASIZE 100
+
+static int use_pipes = 0;
+static unsigned int loops = 100;
+static unsigned int thread_mode = 0;
+static unsigned int num_groups = 10;
+
+struct sender_context {
+       unsigned int num_fds;
+       int ready_out;
+       int wakefd;
+       int out_fds[0];
+};
+
+struct receiver_context {
+       unsigned int num_packets;
+       int in_fds[2];
+       int ready_out;
+       int wakefd;
+};
+
+static void barf(const char *msg)
+{
+       fprintf(stderr, "%s (error: %s)\n", msg, strerror(errno));
+       exit(1);
+}
+
+static void fdpair(int fds[2])
+{
+       if (use_pipes) {
+               if (pipe(fds) == 0)
+                       return;
+       } else {
+               if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == 0)
+                       return;
+       }
+
+       barf(use_pipes ? "pipe()" : "socketpair()");
+}
+
+/* Block until we're ready to go */
+static void ready(int ready_out, int wakefd)
+{
+       char dummy;
+       struct pollfd pollfd = { .fd = wakefd, .events = POLLIN };
+
+       /* Tell them we're ready. */
+       if (write(ready_out, &dummy, 1) != 1)
+               barf("CLIENT: ready write");
+
+       /* Wait for "GO" signal */
+       if (poll(&pollfd, 1, -1) != 1)
+               barf("poll");
+}
+
+/* Sender sprays loops messages down each file descriptor */
+static void *sender(struct sender_context *ctx)
+{
+       char data[DATASIZE];
+       unsigned int i, j;
+
+       ready(ctx->ready_out, ctx->wakefd);
+
+       /* Now pump to every receiver. */
+       for (i = 0; i < loops; i++) {
+               for (j = 0; j < ctx->num_fds; j++) {
+                       int ret, done = 0;
+
+again:
+                       ret = write(ctx->out_fds[j], data + done,
+                                   sizeof(data)-done);
+                       if (ret < 0)
+                               barf("SENDER: write");
+                       done += ret;
+                       if (done < DATASIZE)
+                               goto again;
+               }
+       }
+
+       return NULL;
+}
+
+
+/* One receiver per fd */
+static void *receiver(struct receiver_context* ctx)
+{
+       unsigned int i;
+
+       if (!thread_mode)
+               close(ctx->in_fds[1]);
+
+       /* Wait for start... */
+       ready(ctx->ready_out, ctx->wakefd);
+
+       /* Receive them all */
+       for (i = 0; i < ctx->num_packets; i++) {
+               char data[DATASIZE];
+               int ret, done = 0;
+
+again:
+               ret = read(ctx->in_fds[0], data + done, DATASIZE - done);
+               if (ret < 0)
+                       barf("SERVER: read");
+               done += ret;
+               if (done < DATASIZE)
+                       goto again;
+       }
+
+       return NULL;
+}
+
+static pthread_t create_worker(void *ctx, void *(*func)(void *))
+{
+       pthread_attr_t attr;
+       pthread_t childid;
+       int err;
+
+       if (!thread_mode) {
+               /* process mode */
+               /* Fork the receiver. */
+               switch (fork()) {
+               case -1:
+                       barf("fork()");
+                       break;
+               case 0:
+                       (*func) (ctx);
+                       exit(0);
+                       break;
+               default:
+                       break;
+               }
+
+               return (pthread_t)0;
+       }
+
+       if (pthread_attr_init(&attr) != 0)
+               barf("pthread_attr_init:");
+
+#ifndef __ia64__
+       if (pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN) != 0)
+               barf("pthread_attr_setstacksize");
+#endif
+
+       err = pthread_create(&childid, &attr, func, ctx);
+       if (err != 0) {
+               fprintf(stderr, "pthread_create failed: %s (%d)\n",
+                       strerror(err), err);
+               exit(-1);
+       }
+       return childid;
+}
+
+static void reap_worker(pthread_t id)
+{
+       int proc_status;
+       void *thread_status;
+
+       if (!thread_mode) {
+               /* process mode */
+               wait(&proc_status);
+               if (!WIFEXITED(proc_status))
+                       exit(1);
+       } else {
+               pthread_join(id, &thread_status);
+       }
+}
+
+/* One group of senders and receivers */
+static unsigned int group(pthread_t *pth,
+               unsigned int num_fds,
+               int ready_out,
+               int wakefd)
+{
+       unsigned int i;
+       struct sender_context *snd_ctx = malloc(sizeof(struct sender_context)
+                       + num_fds * sizeof(int));
+
+       if (!snd_ctx)
+               barf("malloc()");
+
+       for (i = 0; i < num_fds; i++) {
+               int fds[2];
+               struct receiver_context *ctx = malloc(sizeof(*ctx));
+
+               if (!ctx)
+                       barf("malloc()");
+
+
+               /* Create the pipe between client and server */
+               fdpair(fds);
+
+               ctx->num_packets = num_fds * loops;
+               ctx->in_fds[0] = fds[0];
+               ctx->in_fds[1] = fds[1];
+               ctx->ready_out = ready_out;
+               ctx->wakefd = wakefd;
+
+               pth[i] = create_worker(ctx, (void *)receiver);
+
+               snd_ctx->out_fds[i] = fds[1];
+               if (!thread_mode)
+                       close(fds[0]);
+       }
+
+       /* Now we have all the fds, fork the senders */
+       for (i = 0; i < num_fds; i++) {
+               snd_ctx->ready_out = ready_out;
+               snd_ctx->wakefd = wakefd;
+               snd_ctx->num_fds = num_fds;
+
+               pth[num_fds+i] = create_worker(snd_ctx, (void *)sender);
+       }
+
+       /* Close the fds we have left */
+       if (!thread_mode)
+               for (i = 0; i < num_fds; i++)
+                       close(snd_ctx->out_fds[i]);
+
+       /* Return number of children to reap */
+       return num_fds * 2;
+}
+
+static const struct option options[] = {
+       OPT_BOOLEAN('p', "pipe", &use_pipes,
+                   "Use pipe() instead of socketpair()"),
+       OPT_BOOLEAN('t', "thread", &thread_mode,
+                   "Be multi thread instead of multi process"),
+       OPT_INTEGER('g', "group", &num_groups,
+                   "Specify number of groups"),
+       OPT_INTEGER('l', "loop", &loops,
+                   "Specify number of loops"),
+       OPT_END()
+};
+
+static const char * const bench_sched_message_usage[] = {
+       "perf bench sched messaging <options>",
+       NULL
+};
+
+int bench_sched_messaging(int argc, const char **argv,
+                   const char *prefix __used)
+{
+       unsigned int i, total_children;
+       struct timeval start, stop, diff;
+       unsigned int num_fds = 20;
+       int readyfds[2], wakefds[2];
+       char dummy;
+       pthread_t *pth_tab;
+
+       argc = parse_options(argc, argv, options,
+                            bench_sched_message_usage, 0);
+
+       pth_tab = malloc(num_fds * 2 * num_groups * sizeof(pthread_t));
+       if (!pth_tab)
+               barf("main:malloc()");
+
+       fdpair(readyfds);
+       fdpair(wakefds);
+
+       total_children = 0;
+       for (i = 0; i < num_groups; i++)
+               total_children += group(pth_tab+total_children, num_fds,
+                                       readyfds[1], wakefds[0]);
+
+       /* Wait for everyone to be ready */
+       for (i = 0; i < total_children; i++)
+               if (read(readyfds[0], &dummy, 1) != 1)
+                       barf("Reading for readyfds");
+
+       gettimeofday(&start, NULL);
+
+       /* Kick them off */
+       if (write(wakefds[1], &dummy, 1) != 1)
+               barf("Writing to start them");
+
+       /* Reap them all */
+       for (i = 0; i < total_children; i++)
+               reap_worker(pth_tab[i]);
+
+       gettimeofday(&stop, NULL);
+
+       timersub(&stop, &start, &diff);
+
+       switch (bench_format) {
+       case BENCH_FORMAT_DEFAULT:
+               printf("# %d sender and receiver %s per group\n",
+                      num_fds, thread_mode ? "threads" : "processes");
+               printf("# %d groups == %d %s run\n\n",
+                      num_groups, num_groups * 2 * num_fds,
+                      thread_mode ? "threads" : "processes");
+               printf(" %14s: %lu.%03lu [sec]\n", "Total time",
+                      diff.tv_sec, diff.tv_usec/1000);
+               break;
+       case BENCH_FORMAT_SIMPLE:
+               printf("%lu.%03lu\n", diff.tv_sec, diff.tv_usec/1000);
+               break;
+       default:
+               /* reaching here means bench_format is invalid */
+               fprintf(stderr, "Unknown format:%d\n", bench_format);
+               exit(1);
+               break;
+       }
+
+       return 0;
+}
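For illustration, the benchmark added above accepts the options defined in its
option table, e.g.:

 perf bench sched messaging              # 10 groups of 20 senders and 20 receivers over socketpairs
 perf bench sched messaging -p -t -g 20  # pipes and threads instead, 20 groups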
diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
new file mode 100644 (file)
index 0000000..238185f
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ *
+ * sched-pipe.c
+ *
+ * pipe: Benchmark for pipe()
+ *
+ * Based on pipe-test-1m.c by Ingo Molnar <mingo@redhat.com>
+ *  http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c
+ * Ported to perf by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
+ *
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/parse-options.h"
+#include "../builtin.h"
+#include "bench.h"
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#define LOOPS_DEFAULT 1000000
+static int loops = LOOPS_DEFAULT;
+
+static const struct option options[] = {
+       OPT_INTEGER('l', "loop", &loops,
+                   "Specify number of loops"),
+       OPT_END()
+};
+
+static const char * const bench_sched_pipe_usage[] = {
+       "perf bench sched pipe <options>",
+       NULL
+};
+
+int bench_sched_pipe(int argc, const char **argv,
+                    const char *prefix __used)
+{
+       int pipe_1[2], pipe_2[2];
+       int m = 0, i;
+       struct timeval start, stop, diff;
+       unsigned long long result_usec = 0;
+
+       /*
+        * Why does "ret" exist?
+        * Discarding the return value of read()/write() would trigger
+        * a warning and break perf's -Werror build.
+        */
+       int ret, wait_stat;
+       pid_t pid, retpid;
+
+       argc = parse_options(argc, argv, options,
+                            bench_sched_pipe_usage, 0);
+
+       assert(!pipe(pipe_1));
+       assert(!pipe(pipe_2));
+
+       pid = fork();
+       assert(pid >= 0);
+
+       gettimeofday(&start, NULL);
+
+       if (!pid) {
+               for (i = 0; i < loops; i++) {
+                       ret = read(pipe_1[0], &m, sizeof(int));
+                       ret = write(pipe_2[1], &m, sizeof(int));
+               }
+       } else {
+               for (i = 0; i < loops; i++) {
+                       ret = write(pipe_1[1], &m, sizeof(int));
+                       ret = read(pipe_2[0], &m, sizeof(int));
+               }
+       }
+
+       gettimeofday(&stop, NULL);
+       timersub(&stop, &start, &diff);
+
+       if (pid) {
+               retpid = waitpid(pid, &wait_stat, 0);
+               assert((retpid == pid) && WIFEXITED(wait_stat));
+               return 0;
+       }
+
+       switch (bench_format) {
+       case BENCH_FORMAT_DEFAULT:
+               printf("# Executed %d pipe operations between two tasks\n\n",
+                       loops);
+
+               result_usec = diff.tv_sec * 1000000;
+               result_usec += diff.tv_usec;
+
+               printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
+                      diff.tv_sec, diff.tv_usec/1000);
+
+               printf(" %14lf usecs/op\n",
+                      (double)result_usec / (double)loops);
+               printf(" %14d ops/sec\n",
+                      (int)((double)loops /
+                            ((double)result_usec / (double)1000000)));
+               break;
+
+       case BENCH_FORMAT_SIMPLE:
+               printf("%lu.%03lu\n",
+                      diff.tv_sec, diff.tv_usec / 1000);
+               break;
+
+       default:
+               /* reaching here means bench_format is invalid */
+               fprintf(stderr, "Unknown format:%d\n", bench_format);
+               exit(1);
+               break;
+       }
+
+       return 0;
+}
index 1ec741615814dddfefb7ad11391e2dfa26ce8b3a..0bf2e8f9af5776538237fa7e842c33e070b4dbe0 100644 (file)
 #include "perf.h"
 #include "util/debug.h"
 
+#include "util/event.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
 #include "util/thread.h"
+#include "util/sort.h"
+#include "util/hist.h"
+#include "util/data_map.h"
 
 static char            const *input_name = "perf.data";
 
-static char            default_sort_order[] = "comm,symbol";
-static char            *sort_order = default_sort_order;
-
 static int             force;
-static int             input;
-static int             show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int             full_paths;
 
 static int             print_line;
 
-static unsigned long   page_size;
-static unsigned long   mmap_window = 32;
-
-static struct rb_root  threads;
-static struct thread   *last_match;
-
+struct sym_hist {
+       u64             sum;
+       u64             ip[0];
+};
 
 struct sym_ext {
        struct rb_node  node;
@@ -49,247 +46,38 @@ struct sym_ext {
        char            *path;
 };
 
-/*
- * histogram, sorted on item, collects counts
- */
-
-static struct rb_root hist;
-
-struct hist_entry {
-       struct rb_node   rb_node;
-
-       struct thread    *thread;
-       struct map       *map;
-       struct dso       *dso;
-       struct symbol    *sym;
-       u64      ip;
-       char             level;
-
-       uint32_t         count;
-};
-
-/*
- * configurable sorting bits
- */
-
-struct sort_entry {
-       struct list_head list;
-
-       const char *header;
-
-       int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
-       int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
-       size_t  (*print)(FILE *fp, struct hist_entry *);
-};
-
-/* --sort pid */
-
-static int64_t
-sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       return right->thread->pid - left->thread->pid;
-}
-
-static size_t
-sort__thread_print(FILE *fp, struct hist_entry *self)
-{
-       return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
-}
-
-static struct sort_entry sort_thread = {
-       .header = "         Command:  Pid",
-       .cmp    = sort__thread_cmp,
-       .print  = sort__thread_print,
-};
-
-/* --sort comm */
-
-static int64_t
-sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       return right->thread->pid - left->thread->pid;
-}
-
-static int64_t
-sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
-{
-       char *comm_l = left->thread->comm;
-       char *comm_r = right->thread->comm;
-
-       if (!comm_l || !comm_r) {
-               if (!comm_l && !comm_r)
-                       return 0;
-               else if (!comm_l)
-                       return -1;
-               else
-                       return 1;
-       }
-
-       return strcmp(comm_l, comm_r);
-}
-
-static size_t
-sort__comm_print(FILE *fp, struct hist_entry *self)
-{
-       return fprintf(fp, "%16s", self->thread->comm);
-}
-
-static struct sort_entry sort_comm = {
-       .header         = "         Command",
-       .cmp            = sort__comm_cmp,
-       .collapse       = sort__comm_collapse,
-       .print          = sort__comm_print,
-};
-
-/* --sort dso */
-
-static int64_t
-sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       struct dso *dso_l = left->dso;
-       struct dso *dso_r = right->dso;
-
-       if (!dso_l || !dso_r) {
-               if (!dso_l && !dso_r)
-                       return 0;
-               else if (!dso_l)
-                       return -1;
-               else
-                       return 1;
-       }
-
-       return strcmp(dso_l->name, dso_r->name);
-}
-
-static size_t
-sort__dso_print(FILE *fp, struct hist_entry *self)
-{
-       if (self->dso)
-               return fprintf(fp, "%-25s", self->dso->name);
-
-       return fprintf(fp, "%016llx         ", (u64)self->ip);
-}
-
-static struct sort_entry sort_dso = {
-       .header = "Shared Object            ",
-       .cmp    = sort__dso_cmp,
-       .print  = sort__dso_print,
-};
-
-/* --sort symbol */
-
-static int64_t
-sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       u64 ip_l, ip_r;
-
-       if (left->sym == right->sym)
-               return 0;
-
-       ip_l = left->sym ? left->sym->start : left->ip;
-       ip_r = right->sym ? right->sym->start : right->ip;
-
-       return (int64_t)(ip_r - ip_l);
-}
-
-static size_t
-sort__sym_print(FILE *fp, struct hist_entry *self)
-{
-       size_t ret = 0;
-
-       if (verbose)
-               ret += fprintf(fp, "%#018llx  ", (u64)self->ip);
-
-       if (self->sym) {
-               ret += fprintf(fp, "[%c] %s",
-                       self->dso == kernel_dso ? 'k' : '.', self->sym->name);
-       } else {
-               ret += fprintf(fp, "%#016llx", (u64)self->ip);
-       }
-
-       return ret;
-}
-
-static struct sort_entry sort_sym = {
-       .header = "Symbol",
-       .cmp    = sort__sym_cmp,
-       .print  = sort__sym_print,
-};
-
-static int sort__need_collapse = 0;
-
-struct sort_dimension {
-       const char              *name;
-       struct sort_entry       *entry;
-       int                     taken;
+struct sym_priv {
+       struct sym_hist *hist;
+       struct sym_ext  *ext;
 };
 
-static struct sort_dimension sort_dimensions[] = {
-       { .name = "pid",        .entry = &sort_thread,  },
-       { .name = "comm",       .entry = &sort_comm,    },
-       { .name = "dso",        .entry = &sort_dso,     },
-       { .name = "symbol",     .entry = &sort_sym,     },
+static struct symbol_conf symbol_conf = {
+       .priv_size        = sizeof(struct sym_priv),
+       .try_vmlinux_path = true,
 };
 
-static LIST_HEAD(hist_entry__sort_list);
+static const char *sym_hist_filter;
 
-static int sort_dimension__add(char *tok)
+static int symbol_filter(struct map *map __used, struct symbol *sym)
 {
-       unsigned int i;
-
-       for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
-               struct sort_dimension *sd = &sort_dimensions[i];
-
-               if (sd->taken)
-                       continue;
-
-               if (strncasecmp(tok, sd->name, strlen(tok)))
-                       continue;
-
-               if (sd->entry->collapse)
-                       sort__need_collapse = 1;
-
-               list_add_tail(&sd->entry->list, &hist_entry__sort_list);
-               sd->taken = 1;
+       if (sym_hist_filter == NULL ||
+           strcmp(sym->name, sym_hist_filter) == 0) {
+               struct sym_priv *priv = symbol__priv(sym);
+               const int size = (sizeof(*priv->hist) +
+                                 (sym->end - sym->start) * sizeof(u64));
 
+               priv->hist = malloc(size);
+               if (priv->hist)
+                       memset(priv->hist, 0, size);
                return 0;
        }
-
-       return -ESRCH;
-}
-
-static int64_t
-hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       struct sort_entry *se;
-       int64_t cmp = 0;
-
-       list_for_each_entry(se, &hist_entry__sort_list, list) {
-               cmp = se->cmp(left, right);
-               if (cmp)
-                       break;
-       }
-
-       return cmp;
-}
-
-static int64_t
-hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
-{
-       struct sort_entry *se;
-       int64_t cmp = 0;
-
-       list_for_each_entry(se, &hist_entry__sort_list, list) {
-               int64_t (*f)(struct hist_entry *, struct hist_entry *);
-
-               f = se->collapse ?: se->cmp;
-
-               cmp = f(left, right);
-               if (cmp)
-                       break;
-       }
-
-       return cmp;
+       /*
+        * FIXME: We should really filter it out, as we don't want to go thru symbols
+        * we're not interested in, and if a DSO ends up with no symbols, delete it too,
+        * but right now the kernel loading routines in symbol.c bail out if no symbols
+        * are found, fix it later.
+        */
+       return 0;
 }
 
 /*
@@ -299,380 +87,81 @@ static void hist_hit(struct hist_entry *he, u64 ip)
 {
        unsigned int sym_size, offset;
        struct symbol *sym = he->sym;
+       struct sym_priv *priv;
+       struct sym_hist *h;
 
        he->count++;
 
-       if (!sym || !sym->hist)
+       if (!sym || !he->map)
+               return;
+
+       priv = symbol__priv(sym);
+       if (!priv->hist)
                return;
 
        sym_size = sym->end - sym->start;
        offset = ip - sym->start;
 
+       if (verbose)
+               fprintf(stderr, "%s: ip=%Lx\n", __func__,
+                       he->map->unmap_ip(he->map, ip));
+
        if (offset >= sym_size)
                return;
 
-       sym->hist_sum++;
-       sym->hist[offset]++;
+       h = priv->hist;
+       h->sum++;
+       h->ip[offset]++;
 
        if (verbose >= 3)
                printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
                        (void *)(unsigned long)he->sym->start,
                        he->sym->name,
                        (void *)(unsigned long)ip, ip - he->sym->start,
-                       sym->hist[offset]);
+                       h->ip[offset]);
 }
 
-static int
-hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-               struct symbol *sym, u64 ip, char level)
+static int hist_entry__add(struct addr_location *al, u64 count)
 {
-       struct rb_node **p = &hist.rb_node;
-       struct rb_node *parent = NULL;
-       struct hist_entry *he;
-       struct hist_entry entry = {
-               .thread = thread,
-               .map    = map,
-               .dso    = dso,
-               .sym    = sym,
-               .ip     = ip,
-               .level  = level,
-               .count  = 1,
-       };
-       int cmp;
-
-       while (*p != NULL) {
-               parent = *p;
-               he = rb_entry(parent, struct hist_entry, rb_node);
-
-               cmp = hist_entry__cmp(&entry, he);
-
-               if (!cmp) {
-                       hist_hit(he, ip);
-
-                       return 0;
-               }
-
-               if (cmp < 0)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-
-       he = malloc(sizeof(*he));
-       if (!he)
+       bool hit;
+       struct hist_entry *he = __hist_entry__add(al, NULL, count, &hit);
+       if (he == NULL)
                return -ENOMEM;
-       *he = entry;
-       rb_link_node(&he->rb_node, parent, p);
-       rb_insert_color(&he->rb_node, &hist);
-
+       hist_hit(he, al->addr);
        return 0;
 }
 
-static void hist_entry__free(struct hist_entry *he)
-{
-       free(he);
-}
-
-/*
- * collapse the histogram
- */
-
-static struct rb_root collapse_hists;
-
-static void collapse__insert_entry(struct hist_entry *he)
-{
-       struct rb_node **p = &collapse_hists.rb_node;
-       struct rb_node *parent = NULL;
-       struct hist_entry *iter;
-       int64_t cmp;
-
-       while (*p != NULL) {
-               parent = *p;
-               iter = rb_entry(parent, struct hist_entry, rb_node);
-
-               cmp = hist_entry__collapse(iter, he);
-
-               if (!cmp) {
-                       iter->count += he->count;
-                       hist_entry__free(he);
-                       return;
-               }
-
-               if (cmp < 0)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-
-       rb_link_node(&he->rb_node, parent, p);
-       rb_insert_color(&he->rb_node, &collapse_hists);
-}
-
-static void collapse__resort(void)
-{
-       struct rb_node *next;
-       struct hist_entry *n;
-
-       if (!sort__need_collapse)
-               return;
-
-       next = rb_first(&hist);
-       while (next) {
-               n = rb_entry(next, struct hist_entry, rb_node);
-               next = rb_next(&n->rb_node);
-
-               rb_erase(&n->rb_node, &hist);
-               collapse__insert_entry(n);
-       }
-}
-
-/*
- * reverse the map, sort on count.
- */
-
-static struct rb_root output_hists;
-
-static void output__insert_entry(struct hist_entry *he)
+static int process_sample_event(event_t *event)
 {
-       struct rb_node **p = &output_hists.rb_node;
-       struct rb_node *parent = NULL;
-       struct hist_entry *iter;
+       struct addr_location al;
 
-       while (*p != NULL) {
-               parent = *p;
-               iter = rb_entry(parent, struct hist_entry, rb_node);
+       dump_printf("(IP, %d): %d: %p\n", event->header.misc,
+                   event->ip.pid, (void *)(long)event->ip.ip);
 
-               if (he->count > iter->count)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-
-       rb_link_node(&he->rb_node, parent, p);
-       rb_insert_color(&he->rb_node, &output_hists);
-}
-
-static void output__resort(void)
-{
-       struct rb_node *next;
-       struct hist_entry *n;
-       struct rb_root *tree = &hist;
-
-       if (sort__need_collapse)
-               tree = &collapse_hists;
-
-       next = rb_first(tree);
-
-       while (next) {
-               n = rb_entry(next, struct hist_entry, rb_node);
-               next = rb_next(&n->rb_node);
-
-               rb_erase(&n->rb_node, tree);
-               output__insert_entry(n);
-       }
-}
-
-static unsigned long total = 0,
-                    total_mmap = 0,
-                    total_comm = 0,
-                    total_fork = 0,
-                    total_unknown = 0;
-
-static int
-process_sample_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       char level;
-       int show = 0;
-       struct dso *dso = NULL;
-       struct thread *thread;
-       u64 ip = event->ip.ip;
-       struct map *map = NULL;
-
-       thread = threads__findnew(event->ip.pid, &threads, &last_match);
-
-       dump_printf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->header.misc,
-               event->ip.pid,
-               (void *)(long)ip);
-
-       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
-       if (thread == NULL) {
+       if (event__preprocess_sample(event, &al, symbol_filter) < 0) {
                fprintf(stderr, "problem processing %d event, skipping it.\n",
                        event->header.type);
                return -1;
        }
 
-       if (event->header.misc & PERF_RECORD_MISC_KERNEL) {
-               show = SHOW_KERNEL;
-               level = 'k';
-
-               dso = kernel_dso;
-
-               dump_printf(" ...... dso: %s\n", dso->name);
-
-       } else if (event->header.misc & PERF_RECORD_MISC_USER) {
-
-               show = SHOW_USER;
-               level = '.';
-
-               map = thread__find_map(thread, ip);
-               if (map != NULL) {
-                       ip = map->map_ip(map, ip);
-                       dso = map->dso;
-               } else {
-                       /*
-                        * If this is outside of all known maps,
-                        * and is a negative address, try to look it
-                        * up in the kernel dso, as it might be a
-                        * vsyscall (which executes in user-mode):
-                        */
-                       if ((long long)ip < 0)
-                               dso = kernel_dso;
-               }
-               dump_printf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
-
-       } else {
-               show = SHOW_HV;
-               level = 'H';
-               dump_printf(" ...... dso: [hypervisor]\n");
-       }
-
-       if (show & show_mask) {
-               struct symbol *sym = NULL;
-
-               if (dso)
-                       sym = dso->find_symbol(dso, ip);
-
-               if (hist_entry__add(thread, map, dso, sym, ip, level)) {
-                       fprintf(stderr,
-               "problem incrementing symbol count, skipping event\n");
-                       return -1;
-               }
-       }
-       total++;
-
-       return 0;
-}
-
-static int
-process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       struct thread *thread;
-       struct map *map = map__new(&event->mmap, NULL, 0);
-
-       thread = threads__findnew(event->mmap.pid, &threads, &last_match);
-
-       dump_printf("%p [%p]: PERF_RECORD_MMAP %d: [%p(%p) @ %p]: %s\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->mmap.pid,
-               (void *)(long)event->mmap.start,
-               (void *)(long)event->mmap.len,
-               (void *)(long)event->mmap.pgoff,
-               event->mmap.filename);
-
-       if (thread == NULL || map == NULL) {
-               dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n");
-               return 0;
-       }
-
-       thread__insert_map(thread, map);
-       total_mmap++;
-
-       return 0;
-}
-
-static int
-process_comm_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       struct thread *thread;
-
-       thread = threads__findnew(event->comm.pid, &threads, &last_match);
-       dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->comm.comm, event->comm.pid);
-
-       if (thread == NULL ||
-           thread__set_comm(thread, event->comm.comm)) {
-               dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
-               return -1;
-       }
-       total_comm++;
-
-       return 0;
-}
-
-static int
-process_fork_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       struct thread *thread;
-       struct thread *parent;
-
-       thread = threads__findnew(event->fork.pid, &threads, &last_match);
-       parent = threads__findnew(event->fork.ppid, &threads, &last_match);
-       dump_printf("%p [%p]: PERF_RECORD_FORK: %d:%d\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->fork.pid, event->fork.ppid);
-
-       /*
-        * A thread clone will have the same PID for both
-        * parent and child.
-        */
-       if (thread == parent)
-               return 0;
-
-       if (!thread || !parent || thread__fork(thread, parent)) {
-               dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n");
-               return -1;
-       }
-       total_fork++;
-
-       return 0;
-}
-
-static int
-process_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       switch (event->header.type) {
-       case PERF_RECORD_SAMPLE:
-               return process_sample_event(event, offset, head);
-
-       case PERF_RECORD_MMAP:
-               return process_mmap_event(event, offset, head);
-
-       case PERF_RECORD_COMM:
-               return process_comm_event(event, offset, head);
-
-       case PERF_RECORD_FORK:
-               return process_fork_event(event, offset, head);
-       /*
-        * We dont process them right now but they are fine:
-        */
-
-       case PERF_RECORD_THROTTLE:
-       case PERF_RECORD_UNTHROTTLE:
-               return 0;
-
-       default:
+       if (hist_entry__add(&al, 1)) {
+               fprintf(stderr, "problem incrementing symbol count, "
+                               "skipping event\n");
                return -1;
        }
 
        return 0;
 }
 
-static int
-parse_line(FILE *file, struct symbol *sym, u64 start, u64 len)
+static int parse_line(FILE *file, struct hist_entry *he, u64 len)
 {
+       struct symbol *sym = he->sym;
        char *line = NULL, *tmp, *tmp2;
        static const char *prev_line;
        static const char *prev_color;
        unsigned int offset;
        size_t line_len;
+       u64 start;
        s64 line_ip;
        int ret;
        char *c;
@@ -709,22 +198,26 @@ parse_line(FILE *file, struct symbol *sym, u64 start, u64 len)
                        line_ip = -1;
        }
 
+       start = he->map->unmap_ip(he->map, sym->start);
+
        if (line_ip != -1) {
                const char *path = NULL;
                unsigned int hits = 0;
                double percent = 0.0;
                const char *color;
-               struct sym_ext *sym_ext = sym->priv;
+               struct sym_priv *priv = symbol__priv(sym);
+               struct sym_ext *sym_ext = priv->ext;
+               struct sym_hist *h = priv->hist;
 
                offset = line_ip - start;
                if (offset < len)
-                       hits = sym->hist[offset];
+                       hits = h->ip[offset];
 
                if (offset < len && sym_ext) {
                        path = sym_ext[offset].path;
                        percent = sym_ext[offset].percent;
-               } else if (sym->hist_sum)
-                       percent = 100.0 * hits / sym->hist_sum;
+               } else if (h->sum)
+                       percent = 100.0 * hits / h->sum;
 
                color = get_percent_color(percent);
 
@@ -777,9 +270,10 @@ static void insert_source_line(struct sym_ext *sym_ext)
        rb_insert_color(&sym_ext->node, &root_sym_ext);
 }
 
-static void free_source_line(struct symbol *sym, int len)
+static void free_source_line(struct hist_entry *he, int len)
 {
-       struct sym_ext *sym_ext = sym->priv;
+       struct sym_priv *priv = symbol__priv(he->sym);
+       struct sym_ext *sym_ext = priv->ext;
        int i;
 
        if (!sym_ext)
@@ -789,26 +283,30 @@ static void free_source_line(struct symbol *sym, int len)
                free(sym_ext[i].path);
        free(sym_ext);
 
-       sym->priv = NULL;
+       priv->ext = NULL;
        root_sym_ext = RB_ROOT;
 }
 
 /* Get the filename:line for the colored entries */
 static void
-get_source_line(struct symbol *sym, u64 start, int len, const char *filename)
+get_source_line(struct hist_entry *he, int len, const char *filename)
 {
+       struct symbol *sym = he->sym;
+       u64 start;
        int i;
        char cmd[PATH_MAX * 2];
        struct sym_ext *sym_ext;
+       struct sym_priv *priv = symbol__priv(sym);
+       struct sym_hist *h = priv->hist;
 
-       if (!sym->hist_sum)
+       if (!h->sum)
                return;
 
-       sym->priv = calloc(len, sizeof(struct sym_ext));
-       if (!sym->priv)
+       sym_ext = priv->ext = calloc(len, sizeof(struct sym_ext));
+       if (!priv->ext)
                return;
 
-       sym_ext = sym->priv;
+       start = he->map->unmap_ip(he->map, sym->start);
 
        for (i = 0; i < len; i++) {
                char *path = NULL;
@@ -816,7 +314,7 @@ get_source_line(struct symbol *sym, u64 start, int len, const char *filename)
                u64 offset;
                FILE *fp;
 
-               sym_ext[i].percent = 100.0 * sym->hist[i] / sym->hist_sum;
+               sym_ext[i].percent = 100.0 * h->ip[i] / h->sum;
                if (sym_ext[i].percent <= 0.5)
                        continue;
 
@@ -870,33 +368,34 @@ static void print_summary(const char *filename)
        }
 }
 
-static void annotate_sym(struct dso *dso, struct symbol *sym)
+static void annotate_sym(struct hist_entry *he)
 {
-       const char *filename = dso->name, *d_filename;
-       u64 start, end, len;
+       struct map *map = he->map;
+       struct dso *dso = map->dso;
+       struct symbol *sym = he->sym;
+       const char *filename = dso->long_name, *d_filename;
+       u64 len;
        char command[PATH_MAX*2];
        FILE *file;
 
        if (!filename)
                return;
-       if (sym->module)
-               filename = sym->module->path;
-       else if (dso == kernel_dso)
-               filename = vmlinux_name;
-
-       start = sym->obj_start;
-       if (!start)
-               start = sym->start;
+
+       if (verbose)
+               fprintf(stderr, "%s: filename=%s, sym=%s, start=%Lx, end=%Lx\n",
+                       __func__, filename, sym->name,
+                       map->unmap_ip(map, sym->start),
+                       map->unmap_ip(map, sym->end));
+
        if (full_paths)
                d_filename = filename;
        else
                d_filename = basename(filename);
 
-       end = start + sym->end - sym->start + 1;
        len = sym->end - sym->start;
 
        if (print_line) {
-               get_source_line(sym, start, len, filename);
+               get_source_line(he, len, filename);
                print_summary(filename);
        }
 
@@ -905,10 +404,12 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
        printf("------------------------------------------------\n");
 
        if (verbose >= 2)
-               printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
+               printf("annotating [%p] %30s : [%p] %30s\n",
+                      dso, dso->long_name, sym, sym->name);
 
        sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s|grep -v %s",
-                       (u64)start, (u64)end, filename, filename);
+               map->unmap_ip(map, sym->start), map->unmap_ip(map, sym->end),
+               filename, filename);
 
        if (verbose >= 3)
                printf("doing: %s\n", command);
@@ -918,159 +419,78 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
                return;
 
        while (!feof(file)) {
-               if (parse_line(file, sym, start, len) < 0)
+               if (parse_line(file, he, len) < 0)
                        break;
        }
 
        pclose(file);
        if (print_line)
-               free_source_line(sym, len);
+               free_source_line(he, len);
 }
 
 static void find_annotations(void)
 {
        struct rb_node *nd;
-       struct dso *dso;
-       int count = 0;
-
-       list_for_each_entry(dso, &dsos, node) {
-
-               for (nd = rb_first(&dso->syms); nd; nd = rb_next(nd)) {
-                       struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
-
-                       if (sym->hist) {
-                               annotate_sym(dso, sym);
-                               count++;
-                       }
-               }
-       }
-
-       if (!count)
-               printf(" Error: symbol '%s' not present amongst the samples.\n", sym_hist_filter);
-}
-
-static int __cmd_annotate(void)
-{
-       int ret, rc = EXIT_FAILURE;
-       unsigned long offset = 0;
-       unsigned long head = 0;
-       struct stat input_stat;
-       event_t *event;
-       uint32_t size;
-       char *buf;
-
-       register_idle_thread(&threads, &last_match);
-
-       input = open(input_name, O_RDONLY);
-       if (input < 0) {
-               perror("failed to open file");
-               exit(-1);
-       }
-
-       ret = fstat(input, &input_stat);
-       if (ret < 0) {
-               perror("failed to stat file");
-               exit(-1);
-       }
-
-       if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
-               fprintf(stderr, "file: %s not owned by current user or root\n", input_name);
-               exit(-1);
-       }
-
-       if (!input_stat.st_size) {
-               fprintf(stderr, "zero-sized file, nothing to do!\n");
-               exit(0);
-       }
-
-       if (load_kernel() < 0) {
-               perror("failed to load kernel symbols");
-               return EXIT_FAILURE;
-       }
-
-remap:
-       buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-                          MAP_SHARED, input, offset);
-       if (buf == MAP_FAILED) {
-               perror("failed to mmap file");
-               exit(-1);
-       }
-
-more:
-       event = (event_t *)(buf + head);
 
-       size = event->header.size;
-       if (!size)
-               size = 8;
+       for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
+               struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
+               struct sym_priv *priv;
 
-       if (head + event->header.size >= page_size * mmap_window) {
-               unsigned long shift = page_size * (head / page_size);
-               int munmap_ret;
-
-               munmap_ret = munmap(buf, page_size * mmap_window);
-               assert(munmap_ret == 0);
-
-               offset += shift;
-               head -= shift;
-               goto remap;
-       }
-
-       size = event->header.size;
-
-       dump_printf("%p [%p]: event: %d\n",
-                       (void *)(offset + head),
-                       (void *)(long)event->header.size,
-                       event->header.type);
-
-       if (!size || process_event(event, offset, head) < 0) {
-
-               dump_printf("%p [%p]: skipping unknown header type: %d\n",
-                       (void *)(offset + head),
-                       (void *)(long)(event->header.size),
-                       event->header.type);
+               if (he->sym == NULL)
+                       continue;
 
-               total_unknown++;
+               priv = symbol__priv(he->sym);
+               if (priv->hist == NULL)
+                       continue;
 
+               annotate_sym(he);
                /*
-                * assume we lost track of the stream, check alignment, and
-                * increment a single u64 in the hope to catch on again 'soon'.
+                * Since we have a hist_entry per IP for the same symbol, free
+                * he->sym->hist to signal we already processed this symbol.
                 */
-
-               if (unlikely(head & 7))
-                       head &= ~7ULL;
-
-               size = 8;
+               free(priv->hist);
+               priv->hist = NULL;
        }
+}
 
-       head += size;
+static struct perf_file_handler file_handler = {
+       .process_sample_event   = process_sample_event,
+       .process_mmap_event     = event__process_mmap,
+       .process_comm_event     = event__process_comm,
+       .process_fork_event     = event__process_task,
+};
 
-       if (offset + head < (unsigned long)input_stat.st_size)
-               goto more;
+static int __cmd_annotate(void)
+{
+       struct perf_header *header;
+       struct thread *idle;
+       int ret;
 
-       rc = EXIT_SUCCESS;
-       close(input);
+       idle = register_idle_thread();
+       register_perf_file_handler(&file_handler);
 
-       dump_printf("      IP events: %10ld\n", total);
-       dump_printf("    mmap events: %10ld\n", total_mmap);
-       dump_printf("    comm events: %10ld\n", total_comm);
-       dump_printf("    fork events: %10ld\n", total_fork);
-       dump_printf(" unknown events: %10ld\n", total_unknown);
+       ret = mmap_dispatch_perf_file(&header, input_name, 0, 0,
+                                     &event__cwdlen, &event__cwd);
+       if (ret)
+               return ret;
 
-       if (dump_trace)
+       if (dump_trace) {
+               event__print_totals();
                return 0;
+       }
 
-       if (verbose >= 3)
-               threads__fprintf(stdout, &threads);
+       if (verbose > 3)
+               threads__fprintf(stdout);
 
-       if (verbose >= 2)
+       if (verbose > 2)
                dsos__fprintf(stdout);
 
        collapse__resort();
-       output__resort();
+       output__resort(event__total[0]);
 
        find_annotations();
 
-       return rc;
+       return ret;
 }
 
 static const char * const annotate_usage[] = {
@@ -1088,8 +508,9 @@ static const struct option options[] = {
                    "be more verbose (show symbol address, etc)"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                    "dump raw trace in ASCII"),
-       OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
-       OPT_BOOLEAN('m', "modules", &modules,
+       OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
+                  "file", "vmlinux pathname"),
+       OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
                    "load module symbols - WARNING: use only with -k and LIVE kernel"),
        OPT_BOOLEAN('l', "print-line", &print_line,
                    "print matching source lines (may be slow)"),
@@ -1115,9 +536,8 @@ static void setup_sorting(void)
 
 int cmd_annotate(int argc, const char **argv, const char *prefix __used)
 {
-       symbol__init();
-
-       page_size = getpagesize();
+       if (symbol__init(&symbol_conf) < 0)
+               return -1;
 
        argc = parse_options(argc, argv, options, annotate_usage, 0);
 
@@ -1134,10 +554,13 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
                sym_hist_filter = argv[0];
        }
 
-       if (!sym_hist_filter)
-               usage_with_options(annotate_usage, options);
-
        setup_pager();
 
+       if (field_sep && *field_sep == '.') {
+               fputs("'.' is the only non valid --field-separator argument\n",
+                               stderr);
+               exit(129);
+       }
+
        return __cmd_annotate();
 }
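Note on the reworked annotate code above: per-symbol annotation state now lives in a private area returned by symbol__priv(sym) rather than in struct symbol itself. The definitions sit in a part of the patch not shown here; a minimal sketch of the assumed layout, with field names taken from the hunks above:

/* Assumed layout of the per-symbol private data used by hist_hit(),
 * parse_line() and get_source_line() above; not part of this hunk. */
struct sym_hist {
	u64 sum;	/* total samples that hit this symbol */
	u64 ip[0];	/* one counter per byte offset into the symbol */
};

struct sym_priv {
	struct sym_hist	*hist;	/* sample counts, filled by hist_hit() */
	struct sym_ext	*ext;	/* per-line percent/path, filled by get_source_line() */
};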
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
new file mode 100644 (file)
index 0000000..e043eb8
--- /dev/null
@@ -0,0 +1,196 @@
+/*
+ *
+ * builtin-bench.c
+ *
+ * General benchmarking subsystem provided by perf
+ *
+ * Copyright (C) 2009, Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
+ *
+ */
+
+/*
+ *
+ * Available subsystem list:
+ *  sched ... scheduler and IPC mechanism
+ *  mem   ... memory access performance
+ *
+ */
+
+#include "perf.h"
+#include "util/util.h"
+#include "util/parse-options.h"
+#include "builtin.h"
+#include "bench/bench.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct bench_suite {
+       const char *name;
+       const char *summary;
+       int (*fn)(int, const char **, const char *);
+};
+
+static struct bench_suite sched_suites[] = {
+       { "messaging",
+         "Benchmark for scheduler and IPC mechanisms",
+         bench_sched_messaging },
+       { "pipe",
+         "Flood of communication over pipe() between two processes",
+         bench_sched_pipe      },
+       { NULL,
+         NULL,
+         NULL                  }
+};
+
+static struct bench_suite mem_suites[] = {
+       { "memcpy",
+         "Simple memory copy in various ways",
+         bench_mem_memcpy },
+       { NULL,
+         NULL,
+         NULL             }
+};
+
+struct bench_subsys {
+       const char *name;
+       const char *summary;
+       struct bench_suite *suites;
+};
+
+static struct bench_subsys subsystems[] = {
+       { "sched",
+         "scheduler and IPC mechanism",
+         sched_suites },
+       { "mem",
+         "memory access performance",
+         mem_suites },
+       { NULL,
+         NULL,
+         NULL       }
+};
+
+static void dump_suites(int subsys_index)
+{
+       int i;
+
+       printf("List of available suites for %s...\n\n",
+              subsystems[subsys_index].name);
+
+       for (i = 0; subsystems[subsys_index].suites[i].name; i++)
+               printf("\t%s: %s\n",
+                      subsystems[subsys_index].suites[i].name,
+                      subsystems[subsys_index].suites[i].summary);
+
+       printf("\n");
+       return;
+}
+
+static char *bench_format_str;
+int bench_format = BENCH_FORMAT_DEFAULT;
+
+static const struct option bench_options[] = {
+       OPT_STRING('f', "format", &bench_format_str, "default",
+                   "Specify format style"),
+       OPT_END()
+};
+
+static const char * const bench_usage[] = {
+       "perf bench [<common options>] <subsystem> <suite> [<options>]",
+       NULL
+};
+
+static void print_usage(void)
+{
+       int i;
+
+       printf("Usage: \n");
+       for (i = 0; bench_usage[i]; i++)
+               printf("\t%s\n", bench_usage[i]);
+       printf("\n");
+
+       printf("List of available subsystems...\n\n");
+
+       for (i = 0; subsystems[i].name; i++)
+               printf("\t%s: %s\n",
+                      subsystems[i].name, subsystems[i].summary);
+       printf("\n");
+}
+
+static int bench_str2int(char *str)
+{
+       if (!str)
+               return BENCH_FORMAT_DEFAULT;
+
+       if (!strcmp(str, BENCH_FORMAT_DEFAULT_STR))
+               return BENCH_FORMAT_DEFAULT;
+       else if (!strcmp(str, BENCH_FORMAT_SIMPLE_STR))
+               return BENCH_FORMAT_SIMPLE;
+
+       return BENCH_FORMAT_UNKNOWN;
+}
+
+int cmd_bench(int argc, const char **argv, const char *prefix __used)
+{
+       int i, j, status = 0;
+
+       if (argc < 2) {
+               /* No subsystem specified. */
+               print_usage();
+               goto end;
+       }
+
+       argc = parse_options(argc, argv, bench_options, bench_usage,
+                            PARSE_OPT_STOP_AT_NON_OPTION);
+
+       bench_format = bench_str2int(bench_format_str);
+       if (bench_format == BENCH_FORMAT_UNKNOWN) {
+               printf("Unknown format descriptor:%s\n", bench_format_str);
+               goto end;
+       }
+
+       if (argc < 1) {
+               print_usage();
+               goto end;
+       }
+
+       for (i = 0; subsystems[i].name; i++) {
+               if (strcmp(subsystems[i].name, argv[0]))
+                       continue;
+
+               if (argc < 2) {
+                       /* No suite specified. */
+                       dump_suites(i);
+                       goto end;
+               }
+
+               for (j = 0; subsystems[i].suites[j].name; j++) {
+                       if (strcmp(subsystems[i].suites[j].name, argv[1]))
+                               continue;
+
+                       if (bench_format == BENCH_FORMAT_DEFAULT)
+                               printf("# Running %s/%s benchmark...\n",
+                                      subsystems[i].name,
+                                      subsystems[i].suites[j].name);
+                       status = subsystems[i].suites[j].fn(argc - 1,
+                                                           argv + 1, prefix);
+                       goto end;
+               }
+
+               if (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
+                       dump_suites(i);
+                       goto end;
+               }
+
+               printf("Unknown suite:%s for %s\n", argv[1], argv[0]);
+               status = 1;
+               goto end;
+       }
+
+       printf("Unknown subsystem:%s\n", argv[0]);
+       status = 1;
+
+end:
+       return status;
+}
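The new perf bench command is table-driven: cmd_bench() matches argv[0] against subsystems[] and argv[1] against the matching suites[] array, so adding a benchmark only needs one more bench_suite entry plus its function. A minimal sketch, assuming a hypothetical bench_mem_memset() that is not part of this commit:

/* Hypothetical suite body; illustration only, not added by this commit. */
static int bench_mem_memset(int argc __used, const char **argv __used,
			    const char *prefix __used)
{
	return 0;
}

/* mem_suites[] extended with the hypothetical entry; the NULL sentinel
 * still terminates the walk in dump_suites() and cmd_bench(). */
static struct bench_suite mem_suites_example[] = {
	{ "memcpy", "Simple memory copy in various ways",   bench_mem_memcpy },
	{ "memset", "Fill a memory buffer in various ways", bench_mem_memset },
	{ NULL,     NULL,                                    NULL             }
};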
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
new file mode 100644 (file)
index 0000000..7dee9d1
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * builtin-buildid-list.c
+ *
+ * Builtin buildid-list command: list buildids in perf.data
+ *
+ * Copyright (C) 2009, Red Hat Inc.
+ * Copyright (C) 2009, Arnaldo Carvalho de Melo <acme@redhat.com>
+ */
+#include "builtin.h"
+#include "perf.h"
+#include "util/cache.h"
+#include "util/data_map.h"
+#include "util/debug.h"
+#include "util/header.h"
+#include "util/parse-options.h"
+#include "util/symbol.h"
+
+static char const *input_name = "perf.data";
+static int force;
+
+static const char *const buildid_list_usage[] = {
+       "perf report [<options>]",
+       NULL
+};
+
+static const struct option options[] = {
+       OPT_STRING('i', "input", &input_name, "file",
+                   "input file name"),
+       OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+       OPT_BOOLEAN('v', "verbose", &verbose,
+                   "be more verbose"),
+       OPT_END()
+};
+
+static int perf_file_section__process_buildids(struct perf_file_section *self,
+                                              int feat, int fd)
+{
+       if (feat != HEADER_BUILD_ID)
+               return 0;
+
+       if (lseek(fd, self->offset, SEEK_SET) < 0) {
+               pr_warning("Failed to lseek to %Ld offset for buildids!\n",
+                          self->offset);
+               return -1;
+       }
+
+       if (perf_header__read_build_ids(fd, self->offset, self->size)) {
+               pr_warning("Failed to read buildids!\n");
+               return -1;
+       }
+
+       return 0;
+}
+
+static int __cmd_buildid_list(void)
+{
+       int err = -1;
+       struct perf_header *header;
+       struct perf_file_header f_header;
+       struct stat input_stat;
+       int input = open(input_name, O_RDONLY);
+
+       if (input < 0) {
+               pr_err("failed to open file: %s", input_name);
+               if (!strcmp(input_name, "perf.data"))
+                       pr_err("  (try 'perf record' first)");
+               pr_err("\n");
+               goto out;
+       }
+
+       err = fstat(input, &input_stat);
+       if (err < 0) {
+               perror("failed to stat file");
+               goto out_close;
+       }
+
+       if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
+               pr_err("file %s not owned by current user or root\n",
+                      input_name);
+               goto out_close;
+       }
+
+       if (!input_stat.st_size) {
+               pr_info("zero-sized file, nothing to do!\n");
+               goto out_close;
+       }
+
+       err = -1;
+       header = perf_header__new();
+       if (header == NULL)
+               goto out_close;
+
+       if (perf_file_header__read(&f_header, header, input) < 0) {
+               pr_warning("incompatible file format");
+               goto out_close;
+       }
+
+       err = perf_header__process_sections(header, input,
+                                        perf_file_section__process_buildids);
+
+       if (err < 0)
+               goto out_close;
+
+       dsos__fprintf_buildid(stdout);
+out_close:
+       close(input);
+out:
+       return err;
+}
+
+int cmd_buildid_list(int argc, const char **argv, const char *prefix __used)
+{
+       argc = parse_options(argc, argv, options, buildid_list_usage, 0);
+       setup_pager();
+       return __cmd_buildid_list();
+}
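builtin-buildid-list works by walking the optional feature sections of the perf.data header: perf_header__process_sections() invokes the supplied callback once per feature, and the callback above only acts on HEADER_BUILD_ID. A minimal sketch of another callback with the same signature, for illustration only (not part of this commit):

/* Illustration only: report the file offset and size of every feature
 * section, using the same callback signature as
 * perf_file_section__process_buildids() above. */
static int perf_file_section__dump(struct perf_file_section *self,
				   int feat, int fd __used)
{
	printf("feature %d: offset=%Ld, size=%Ld\n",
	       feat, self->offset, self->size);
	return 0;
}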
index 4fb8734a796e992513cbae3f0cb2a5d8851ec113..9f810b17c25c58bfb1af1522dc6ec3d4eee85362 100644 (file)
@@ -61,8 +61,7 @@ static const char *get_man_viewer_info(const char *name)
 {
        struct man_viewer_info_list *viewer;
 
-       for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
-       {
+       for (viewer = man_viewer_info_list; viewer; viewer = viewer->next) {
                if (!strcasecmp(name, viewer->name))
                        return viewer->info;
        }
@@ -115,7 +114,7 @@ static int check_emacsclient_version(void)
        return 0;
 }
 
-static void exec_woman_emacs(const char* path, const char *page)
+static void exec_woman_emacs(const char *path, const char *page)
 {
        if (!check_emacsclient_version()) {
                /* This works only with emacsclient version >= 22. */
@@ -129,7 +128,7 @@ static void exec_woman_emacs(const char* path, const char *page)
        }
 }
 
-static void exec_man_konqueror(const char* path, const char *page)
+static void exec_man_konqueror(const char *path, const char *page)
 {
        const char *display = getenv("DISPLAY");
        if (display && *display) {
@@ -157,7 +156,7 @@ static void exec_man_konqueror(const char* path, const char *page)
        }
 }
 
-static void exec_man_man(const char* path, const char *page)
+static void exec_man_man(const char *path, const char *page)
 {
        if (!path)
                path = "man";
@@ -180,7 +179,7 @@ static void add_man_viewer(const char *name)
 
        while (*p)
                p = &((*p)->next);
-       *p = calloc(1, (sizeof(**p) + len + 1));
+       *p = zalloc(sizeof(**p) + len + 1);
        strncpy((*p)->name, name, len);
 }
 
@@ -195,7 +194,7 @@ static void do_add_man_viewer_info(const char *name,
                                   size_t len,
                                   const char *value)
 {
-       struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
+       struct man_viewer_info_list *new = zalloc(sizeof(*new) + len + 1);
 
        strncpy(new->name, name, len);
        new->info = strdup(value);
@@ -364,9 +363,8 @@ static void show_man_page(const char *perf_cmd)
 
        setup_man_path();
        for (viewer = man_viewer_list; viewer; viewer = viewer->next)
-       {
                exec_viewer(viewer->name, page); /* will return when unable */
-       }
+
        if (fallback)
                exec_viewer(fallback, page);
        exec_viewer("man", page);
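The builtin-help.c cleanups above replace calloc(1, ...) with zalloc(), perf's shorthand for a zero-filled allocation. Assuming it matches the helper defined elsewhere in the tools (the definition is outside this diff), it amounts to:

/* Assumed definition of the zalloc() helper used above. */
static inline void *zalloc(size_t size)
{
	return calloc(1, size);
}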
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
new file mode 100644 (file)
index 0000000..047fef7
--- /dev/null
@@ -0,0 +1,807 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+
+#include "util/debug.h"
+#include "util/data_map.h"
+
+#include <linux/rbtree.h>
+
+struct alloc_stat;
+typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
+
+static char const              *input_name = "perf.data";
+
+static struct perf_header      *header;
+static u64                     sample_type;
+
+static int                     alloc_flag;
+static int                     caller_flag;
+
+static int                     alloc_lines = -1;
+static int                     caller_lines = -1;
+
+static bool                    raw_ip;
+
+static char                    default_sort_order[] = "frag,hit,bytes";
+
+static int                     *cpunode_map;
+static int                     max_cpu_num;
+
+struct alloc_stat {
+       u64     call_site;
+       u64     ptr;
+       u64     bytes_req;
+       u64     bytes_alloc;
+       u32     hit;
+       u32     pingpong;
+
+       short   alloc_cpu;
+
+       struct rb_node node;
+};
+
+static struct rb_root root_alloc_stat;
+static struct rb_root root_alloc_sorted;
+static struct rb_root root_caller_stat;
+static struct rb_root root_caller_sorted;
+
+static unsigned long total_requested, total_allocated;
+static unsigned long nr_allocs, nr_cross_allocs;
+
+struct raw_event_sample {
+       u32 size;
+       char data[0];
+};
+
+#define PATH_SYS_NODE  "/sys/devices/system/node"
+
+static void init_cpunode_map(void)
+{
+       FILE *fp;
+       int i;
+
+       fp = fopen("/sys/devices/system/cpu/kernel_max", "r");
+       if (!fp) {
+               max_cpu_num = 4096;
+               return;
+       }
+
+       if (fscanf(fp, "%d", &max_cpu_num) < 1)
+               die("Failed to read 'kernel_max' from sysfs");
+       max_cpu_num++;
+
+       cpunode_map = calloc(max_cpu_num, sizeof(int));
+       if (!cpunode_map)
+               die("calloc");
+       for (i = 0; i < max_cpu_num; i++)
+               cpunode_map[i] = -1;
+       fclose(fp);
+}
+
+static void setup_cpunode_map(void)
+{
+       struct dirent *dent1, *dent2;
+       DIR *dir1, *dir2;
+       unsigned int cpu, mem;
+       char buf[PATH_MAX];
+
+       init_cpunode_map();
+
+       dir1 = opendir(PATH_SYS_NODE);
+       if (!dir1)
+               return;
+
+       while (true) {
+               dent1 = readdir(dir1);
+               if (!dent1)
+                       break;
+
+               if (sscanf(dent1->d_name, "node%u", &mem) < 1)
+                       continue;
+
+               snprintf(buf, PATH_MAX, "%s/%s", PATH_SYS_NODE, dent1->d_name);
+               dir2 = opendir(buf);
+               if (!dir2)
+                       continue;
+               while (true) {
+                       dent2 = readdir(dir2);
+                       if (!dent2)
+                               break;
+                       if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
+                               continue;
+                       cpunode_map[cpu] = mem;
+               }
+       }
+}
+
+static void insert_alloc_stat(unsigned long call_site, unsigned long ptr,
+                             int bytes_req, int bytes_alloc, int cpu)
+{
+       struct rb_node **node = &root_alloc_stat.rb_node;
+       struct rb_node *parent = NULL;
+       struct alloc_stat *data = NULL;
+
+       while (*node) {
+               parent = *node;
+               data = rb_entry(*node, struct alloc_stat, node);
+
+               if (ptr > data->ptr)
+                       node = &(*node)->rb_right;
+               else if (ptr < data->ptr)
+                       node = &(*node)->rb_left;
+               else
+                       break;
+       }
+
+       if (data && data->ptr == ptr) {
+               data->hit++;
+               data->bytes_req += bytes_req;
+               data->bytes_alloc += bytes_alloc;
+       } else {
+               data = malloc(sizeof(*data));
+               if (!data)
+                       die("malloc");
+               data->ptr = ptr;
+               data->pingpong = 0;
+               data->hit = 1;
+               data->bytes_req = bytes_req;
+               data->bytes_alloc = bytes_alloc;
+
+               rb_link_node(&data->node, parent, node);
+               rb_insert_color(&data->node, &root_alloc_stat);
+       }
+       data->call_site = call_site;
+       data->alloc_cpu = cpu;
+}
+
+static void insert_caller_stat(unsigned long call_site,
+                             int bytes_req, int bytes_alloc)
+{
+       struct rb_node **node = &root_caller_stat.rb_node;
+       struct rb_node *parent = NULL;
+       struct alloc_stat *data = NULL;
+
+       while (*node) {
+               parent = *node;
+               data = rb_entry(*node, struct alloc_stat, node);
+
+               if (call_site > data->call_site)
+                       node = &(*node)->rb_right;
+               else if (call_site < data->call_site)
+                       node = &(*node)->rb_left;
+               else
+                       break;
+       }
+
+       if (data && data->call_site == call_site) {
+               data->hit++;
+               data->bytes_req += bytes_req;
+               data->bytes_alloc += bytes_alloc;
+       } else {
+               data = malloc(sizeof(*data));
+               if (!data)
+                       die("malloc");
+               data->call_site = call_site;
+               data->pingpong = 0;
+               data->hit = 1;
+               data->bytes_req = bytes_req;
+               data->bytes_alloc = bytes_alloc;
+
+               rb_link_node(&data->node, parent, node);
+               rb_insert_color(&data->node, &root_caller_stat);
+       }
+}
+
+static void process_alloc_event(struct raw_event_sample *raw,
+                               struct event *event,
+                               int cpu,
+                               u64 timestamp __used,
+                               struct thread *thread __used,
+                               int node)
+{
+       unsigned long call_site;
+       unsigned long ptr;
+       int bytes_req;
+       int bytes_alloc;
+       int node1, node2;
+
+       ptr = raw_field_value(event, "ptr", raw->data);
+       call_site = raw_field_value(event, "call_site", raw->data);
+       bytes_req = raw_field_value(event, "bytes_req", raw->data);
+       bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data);
+
+       insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, cpu);
+       insert_caller_stat(call_site, bytes_req, bytes_alloc);
+
+       total_requested += bytes_req;
+       total_allocated += bytes_alloc;
+
+       if (node) {
+               node1 = cpunode_map[cpu];
+               node2 = raw_field_value(event, "node", raw->data);
+               if (node1 != node2)
+                       nr_cross_allocs++;
+       }
+       nr_allocs++;
+}
+
+static int ptr_cmp(struct alloc_stat *, struct alloc_stat *);
+static int callsite_cmp(struct alloc_stat *, struct alloc_stat *);
+
+static struct alloc_stat *search_alloc_stat(unsigned long ptr,
+                                           unsigned long call_site,
+                                           struct rb_root *root,
+                                           sort_fn_t sort_fn)
+{
+       struct rb_node *node = root->rb_node;
+       struct alloc_stat key = { .ptr = ptr, .call_site = call_site };
+
+       while (node) {
+               struct alloc_stat *data;
+               int cmp;
+
+               data = rb_entry(node, struct alloc_stat, node);
+
+               cmp = sort_fn(&key, data);
+               if (cmp < 0)
+                       node = node->rb_left;
+               else if (cmp > 0)
+                       node = node->rb_right;
+               else
+                       return data;
+       }
+       return NULL;
+}
+
+static void process_free_event(struct raw_event_sample *raw,
+                              struct event *event,
+                              int cpu,
+                              u64 timestamp __used,
+                              struct thread *thread __used)
+{
+       unsigned long ptr;
+       struct alloc_stat *s_alloc, *s_caller;
+
+       ptr = raw_field_value(event, "ptr", raw->data);
+
+       s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
+       if (!s_alloc)
+               return;
+
+       if (cpu != s_alloc->alloc_cpu) {
+               s_alloc->pingpong++;
+
+               s_caller = search_alloc_stat(0, s_alloc->call_site,
+                                            &root_caller_stat, callsite_cmp);
+               assert(s_caller);
+               s_caller->pingpong++;
+       }
+       s_alloc->alloc_cpu = -1;
+}
+
+static void
+process_raw_event(event_t *raw_event __used, void *more_data,
+                 int cpu, u64 timestamp, struct thread *thread)
+{
+       struct raw_event_sample *raw = more_data;
+       struct event *event;
+       int type;
+
+       type = trace_parse_common_type(raw->data);
+       event = trace_find_event(type);
+
+       if (!strcmp(event->name, "kmalloc") ||
+           !strcmp(event->name, "kmem_cache_alloc")) {
+               process_alloc_event(raw, event, cpu, timestamp, thread, 0);
+               return;
+       }
+
+       if (!strcmp(event->name, "kmalloc_node") ||
+           !strcmp(event->name, "kmem_cache_alloc_node")) {
+               process_alloc_event(raw, event, cpu, timestamp, thread, 1);
+               return;
+       }
+
+       if (!strcmp(event->name, "kfree") ||
+           !strcmp(event->name, "kmem_cache_free")) {
+               process_free_event(raw, event, cpu, timestamp, thread);
+               return;
+       }
+}
+
+static int process_sample_event(event_t *event)
+{
+       u64 ip = event->ip.ip;
+       u64 timestamp = -1;
+       u32 cpu = -1;
+       u64 period = 1;
+       void *more_data = event->ip.__more_data;
+       struct thread *thread = threads__findnew(event->ip.pid);
+
+       if (sample_type & PERF_SAMPLE_TIME) {
+               timestamp = *(u64 *)more_data;
+               more_data += sizeof(u64);
+       }
+
+       if (sample_type & PERF_SAMPLE_CPU) {
+               cpu = *(u32 *)more_data;
+               more_data += sizeof(u32);
+               more_data += sizeof(u32); /* reserved */
+       }
+
+       if (sample_type & PERF_SAMPLE_PERIOD) {
+               period = *(u64 *)more_data;
+               more_data += sizeof(u64);
+       }
+
+       dump_printf("(IP, %d): %d/%d: %p period: %Ld\n",
+               event->header.misc,
+               event->ip.pid, event->ip.tid,
+               (void *)(long)ip,
+               (long long)period);
+
+       if (thread == NULL) {
+               pr_debug("problem processing %d event, skipping it.\n",
+                        event->header.type);
+               return -1;
+       }
+
+       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+       process_raw_event(event, more_data, cpu, timestamp, thread);
+
+       return 0;
+}
+
+static int sample_type_check(u64 type)
+{
+       sample_type = type;
+
+       if (!(sample_type & PERF_SAMPLE_RAW)) {
+               fprintf(stderr,
+                       "No trace sample to read. Did you call perf record "
+                       "without -R?");
+               return -1;
+       }
+
+       return 0;
+}
+
+static struct perf_file_handler file_handler = {
+       .process_sample_event   = process_sample_event,
+       .process_comm_event     = event__process_comm,
+       .sample_type_check      = sample_type_check,
+};
+
+static int read_events(void)
+{
+       register_idle_thread();
+       register_perf_file_handler(&file_handler);
+
+       return mmap_dispatch_perf_file(&header, input_name, 0, 0,
+                                      &event__cwdlen, &event__cwd);
+}
+
+static double fragmentation(unsigned long n_req, unsigned long n_alloc)
+{
+       if (n_alloc == 0)
+               return 0.0;
+       else
+               return 100.0 - (100.0 * n_req / n_alloc);
+}
+
+static void __print_result(struct rb_root *root, int n_lines, int is_caller)
+{
+       struct rb_node *next;
+
+       printf("%.102s\n", graph_dotted_line);
+       printf(" %-34s |",  is_caller ? "Callsite": "Alloc Ptr");
+       printf(" Total_alloc/Per | Total_req/Per   | Hit   | Ping-pong | Frag\n");
+       printf("%.102s\n", graph_dotted_line);
+
+       next = rb_first(root);
+
+       while (next && n_lines--) {
+               struct alloc_stat *data = rb_entry(next, struct alloc_stat,
+                                                  node);
+               struct symbol *sym = NULL;
+               char buf[BUFSIZ];
+               u64 addr;
+
+               if (is_caller) {
+                       addr = data->call_site;
+                       if (!raw_ip)
+                               sym = thread__find_function(kthread, addr, NULL);
+               } else
+                       addr = data->ptr;
+
+               if (sym != NULL)
+                       snprintf(buf, sizeof(buf), "%s+%Lx", sym->name,
+                                addr - sym->start);
+               else
+                       snprintf(buf, sizeof(buf), "%#Lx", addr);
+               printf(" %-34s |", buf);
+
+               printf(" %9llu/%-5lu | %9llu/%-5lu | %6lu | %8lu | %6.3f%%\n",
+                      (unsigned long long)data->bytes_alloc,
+                      (unsigned long)data->bytes_alloc / data->hit,
+                      (unsigned long long)data->bytes_req,
+                      (unsigned long)data->bytes_req / data->hit,
+                      (unsigned long)data->hit,
+                      (unsigned long)data->pingpong,
+                      fragmentation(data->bytes_req, data->bytes_alloc));
+
+               next = rb_next(next);
+       }
+
+       if (n_lines == -1)
+               printf(" ...                                | ...             | ...             | ...    | ...      | ...   \n");
+
+       printf("%.102s\n", graph_dotted_line);
+}
+
+static void print_summary(void)
+{
+       printf("\nSUMMARY\n=======\n");
+       printf("Total bytes requested: %lu\n", total_requested);
+       printf("Total bytes allocated: %lu\n", total_allocated);
+       printf("Total bytes wasted on internal fragmentation: %lu\n",
+              total_allocated - total_requested);
+       printf("Internal fragmentation: %f%%\n",
+              fragmentation(total_requested, total_allocated));
+       printf("Cross CPU allocations: %lu/%lu\n", nr_cross_allocs, nr_allocs);
+}
+
+static void print_result(void)
+{
+       if (caller_flag)
+               __print_result(&root_caller_sorted, caller_lines, 1);
+       if (alloc_flag)
+               __print_result(&root_alloc_sorted, alloc_lines, 0);
+       print_summary();
+}
+
+struct sort_dimension {
+       const char              name[20];
+       sort_fn_t               cmp;
+       struct list_head        list;
+};
+
+static LIST_HEAD(caller_sort);
+static LIST_HEAD(alloc_sort);
+
+static void sort_insert(struct rb_root *root, struct alloc_stat *data,
+                       struct list_head *sort_list)
+{
+       struct rb_node **new = &(root->rb_node);
+       struct rb_node *parent = NULL;
+       struct sort_dimension *sort;
+
+       while (*new) {
+               struct alloc_stat *this;
+               int cmp = 0;
+
+               this = rb_entry(*new, struct alloc_stat, node);
+               parent = *new;
+
+               list_for_each_entry(sort, sort_list, list) {
+                       cmp = sort->cmp(data, this);
+                       if (cmp)
+                               break;
+               }
+
+               if (cmp > 0)
+                       new = &((*new)->rb_left);
+               else
+                       new = &((*new)->rb_right);
+       }
+
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+}
+
+static void __sort_result(struct rb_root *root, struct rb_root *root_sorted,
+                         struct list_head *sort_list)
+{
+       struct rb_node *node;
+       struct alloc_stat *data;
+
+       for (;;) {
+               node = rb_first(root);
+               if (!node)
+                       break;
+
+               rb_erase(node, root);
+               data = rb_entry(node, struct alloc_stat, node);
+               sort_insert(root_sorted, data, sort_list);
+       }
+}
+
+static void sort_result(void)
+{
+       __sort_result(&root_alloc_stat, &root_alloc_sorted, &alloc_sort);
+       __sort_result(&root_caller_stat, &root_caller_sorted, &caller_sort);
+}
+
+static int __cmd_kmem(void)
+{
+       setup_pager();
+       read_events();
+       sort_result();
+       print_result();
+
+       return 0;
+}
+
+static const char * const kmem_usage[] = {
+       "perf kmem [<options>] {record}",
+       NULL
+};
+
+static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+       if (l->ptr < r->ptr)
+               return -1;
+       else if (l->ptr > r->ptr)
+               return 1;
+       return 0;
+}
+
+static struct sort_dimension ptr_sort_dimension = {
+       .name   = "ptr",
+       .cmp    = ptr_cmp,
+};
+
+static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+       if (l->call_site < r->call_site)
+               return -1;
+       else if (l->call_site > r->call_site)
+               return 1;
+       return 0;
+}
+
+static struct sort_dimension callsite_sort_dimension = {
+       .name   = "callsite",
+       .cmp    = callsite_cmp,
+};
+
+static int hit_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+       if (l->hit < r->hit)
+               return -1;
+       else if (l->hit > r->hit)
+               return 1;
+       return 0;
+}
+
+static struct sort_dimension hit_sort_dimension = {
+       .name   = "hit",
+       .cmp    = hit_cmp,
+};
+
+static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+       if (l->bytes_alloc < r->bytes_alloc)
+               return -1;
+       else if (l->bytes_alloc > r->bytes_alloc)
+               return 1;
+       return 0;
+}
+
+static struct sort_dimension bytes_sort_dimension = {
+       .name   = "bytes",
+       .cmp    = bytes_cmp,
+};
+
+static int frag_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+       double x, y;
+
+       x = fragmentation(l->bytes_req, l->bytes_alloc);
+       y = fragmentation(r->bytes_req, r->bytes_alloc);
+
+       if (x < y)
+               return -1;
+       else if (x > y)
+               return 1;
+       return 0;
+}
+
+static struct sort_dimension frag_sort_dimension = {
+       .name   = "frag",
+       .cmp    = frag_cmp,
+};
+
+static int pingpong_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+       if (l->pingpong < r->pingpong)
+               return -1;
+       else if (l->pingpong > r->pingpong)
+               return 1;
+       return 0;
+}
+
+static struct sort_dimension pingpong_sort_dimension = {
+       .name   = "pingpong",
+       .cmp    = pingpong_cmp,
+};
+
+static struct sort_dimension *avail_sorts[] = {
+       &ptr_sort_dimension,
+       &callsite_sort_dimension,
+       &hit_sort_dimension,
+       &bytes_sort_dimension,
+       &frag_sort_dimension,
+       &pingpong_sort_dimension,
+};
+
+#define NUM_AVAIL_SORTS        \
+       (int)(sizeof(avail_sorts) / sizeof(struct sort_dimension *))
+
+static int sort_dimension__add(const char *tok, struct list_head *list)
+{
+       struct sort_dimension *sort;
+       int i;
+
+       for (i = 0; i < NUM_AVAIL_SORTS; i++) {
+               if (!strcmp(avail_sorts[i]->name, tok)) {
+                       sort = malloc(sizeof(*sort));
+                       if (!sort)
+                               die("malloc");
+                       memcpy(sort, avail_sorts[i], sizeof(*sort));
+                       list_add_tail(&sort->list, list);
+                       return 0;
+               }
+       }
+
+       return -1;
+}
+
+static int setup_sorting(struct list_head *sort_list, const char *arg)
+{
+       char *tok;
+       char *str = strdup(arg);
+
+       if (!str)
+               die("strdup");
+
+       while (true) {
+               tok = strsep(&str, ",");
+               if (!tok)
+                       break;
+               if (sort_dimension__add(tok, sort_list) < 0) {
+                       error("Unknown --sort key: '%s'", tok);
+                       return -1;
+               }
+       }
+
+       free(str);
+       return 0;
+}
+
+static int parse_sort_opt(const struct option *opt __used,
+                         const char *arg, int unset __used)
+{
+       if (!arg)
+               return -1;
+
+       if (caller_flag > alloc_flag)
+               return setup_sorting(&caller_sort, arg);
+       else
+               return setup_sorting(&alloc_sort, arg);
+
+       return 0;
+}
+
+static int parse_stat_opt(const struct option *opt __used,
+                         const char *arg, int unset __used)
+{
+       if (!arg)
+               return -1;
+
+       if (strcmp(arg, "alloc") == 0)
+               alloc_flag = (caller_flag + 1);
+       else if (strcmp(arg, "caller") == 0)
+               caller_flag = (alloc_flag + 1);
+       else
+               return -1;
+       return 0;
+}
+
+static int parse_line_opt(const struct option *opt __used,
+                         const char *arg, int unset __used)
+{
+       int lines;
+
+       if (!arg)
+               return -1;
+
+       lines = strtoul(arg, NULL, 10);
+
+       if (caller_flag > alloc_flag)
+               caller_lines = lines;
+       else
+               alloc_lines = lines;
+
+       return 0;
+}
+
+static const struct option kmem_options[] = {
+       OPT_STRING('i', "input", &input_name, "file",
+                  "input file name"),
+       OPT_CALLBACK(0, "stat", NULL, "<alloc>|<caller>",
+                    "stat selector, Pass 'alloc' or 'caller'.",
+                    parse_stat_opt),
+       OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
+                    "sort by keys: ptr, call_site, bytes, hit, pingpong, frag",
+                    parse_sort_opt),
+       OPT_CALLBACK('l', "line", NULL, "num",
+                    "show n lines",
+                    parse_line_opt),
+       OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
+       OPT_END()
+};
+
+static const char *record_args[] = {
+       "record",
+       "-a",
+       "-R",
+       "-M",
+       "-f",
+       "-c", "1",
+       "-e", "kmem:kmalloc",
+       "-e", "kmem:kmalloc_node",
+       "-e", "kmem:kfree",
+       "-e", "kmem:kmem_cache_alloc",
+       "-e", "kmem:kmem_cache_alloc_node",
+       "-e", "kmem:kmem_cache_free",
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+       unsigned int rec_argc, i, j;
+       const char **rec_argv;
+
+       rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+       rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+       for (i = 0; i < ARRAY_SIZE(record_args); i++)
+               rec_argv[i] = strdup(record_args[i]);
+
+       for (j = 1; j < (unsigned int)argc; j++, i++)
+               rec_argv[i] = argv[j];
+
+       return cmd_record(i, rec_argv, NULL);
+}
+
+int cmd_kmem(int argc, const char **argv, const char *prefix __used)
+{
+       symbol__init(0);
+
+       argc = parse_options(argc, argv, kmem_options, kmem_usage, 0);
+
+       if (argc && !strncmp(argv[0], "rec", 3))
+               return __cmd_record(argc, argv);
+       else if (argc)
+               usage_with_options(kmem_usage, kmem_options);
+
+       if (list_empty(&caller_sort))
+               setup_sorting(&caller_sort, default_sort_order);
+       if (list_empty(&alloc_sort))
+               setup_sorting(&alloc_sort, default_sort_order);
+
+       setup_cpunode_map();
+
+       return __cmd_kmem();
+}
+
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
new file mode 100644 (file)
index 0000000..a58e11b
--- /dev/null
@@ -0,0 +1,242 @@
+/*
+ * builtin-probe.c
+ *
+ * Builtin probe command: Set up probe events by C expression
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#define _GNU_SOURCE
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+
+#undef _GNU_SOURCE
+#include "perf.h"
+#include "builtin.h"
+#include "util/util.h"
+#include "util/event.h"
+#include "util/debug.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h" /* For debugfs_path */
+#include "util/probe-finder.h"
+#include "util/probe-event.h"
+
+/* Default vmlinux search paths */
+#define NR_SEARCH_PATH 3
+const char *default_search_path[NR_SEARCH_PATH] = {
+"/lib/modules/%s/build/vmlinux",               /* Custom build kernel */
+"/usr/lib/debug/lib/modules/%s/vmlinux",       /* Red Hat debuginfo */
+"/boot/vmlinux-debug-%s",                      /* Ubuntu */
+};
+
+#define MAX_PATH_LEN 256
+#define MAX_PROBES 128
+
+/* Session management structure */
+static struct {
+       char *vmlinux;
+       char *release;
+       int need_dwarf;
+       int nr_probe;
+       struct probe_point probes[MAX_PROBES];
+} session;
+
+static bool listing;
+
+/* Parse an event definition. Note that any parse error calls die(). */
+static void parse_probe_event(const char *str)
+{
+       struct probe_point *pp = &session.probes[session.nr_probe];
+
+       pr_debug("probe-definition(%d): %s\n", session.nr_probe, str);
+       if (++session.nr_probe == MAX_PROBES)
+               die("Too many probes (> %d) are specified.", MAX_PROBES);
+
+       /* Parse perf-probe event into probe_point */
+       session.need_dwarf = parse_perf_probe_event(str, pp);
+
+       pr_debug("%d arguments\n", pp->nr_args);
+}
+
+static int opt_add_probe_event(const struct option *opt __used,
+                             const char *str, int unset __used)
+{
+       if (str)
+               parse_probe_event(str);
+       return 0;
+}
+
+#ifndef NO_LIBDWARF
+static int open_default_vmlinux(void)
+{
+       struct utsname uts;
+       char fname[MAX_PATH_LEN];
+       int fd, ret, i;
+
+       ret = uname(&uts);
+       if (ret) {
+               pr_debug("uname() failed.\n");
+               return -errno;
+       }
+       session.release = uts.release;
+       for (i = 0; i < NR_SEARCH_PATH; i++) {
+               ret = snprintf(fname, MAX_PATH_LEN,
+                              default_search_path[i], session.release);
+               if (ret >= MAX_PATH_LEN || ret < 0) {
+                       pr_debug("Filename(%d,%s) is too long.\n", i,
+                               uts.release);
+                       errno = E2BIG;
+                       return -E2BIG;
+               }
+               pr_debug("try to open %s\n", fname);
+               fd = open(fname, O_RDONLY);
+               if (fd >= 0)
+                       break;
+       }
+       return fd;
+}
+#endif
+
+static const char * const probe_usage[] = {
+       "perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
+       "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
+       "perf probe --list",
+       NULL
+};
+
+static const struct option options[] = {
+       OPT_BOOLEAN('v', "verbose", &verbose,
+                   "be more verbose (show parsed arguments, etc)"),
+#ifndef NO_LIBDWARF
+       OPT_STRING('k', "vmlinux", &session.vmlinux, "file",
+               "vmlinux/module pathname"),
+#endif
+       OPT_BOOLEAN('l', "list", &listing, "list currently defined probes"),
+       OPT_CALLBACK('a', "add", NULL,
+#ifdef NO_LIBDWARF
+               "FUNC[+OFFS|%return] [ARG ...]",
+#else
+               "FUNC[+OFFS|%return|:RLN][@SRC]|SRC:ALN [ARG ...]",
+#endif
+               "probe point definition, where\n"
+               "\t\tGRP:\tGroup name (optional)\n"
+               "\t\tNAME:\tEvent name\n"
+               "\t\tFUNC:\tFunction name\n"
+               "\t\tOFFS:\tOffset from function entry (in bytes)\n"
+               "\t\t%return:\tPut the probe at function return\n"
+#ifdef NO_LIBDWARF
+               "\t\tARG:\tProbe argument (only \n"
+#else
+               "\t\tSRC:\tSource code path\n"
+               "\t\tRLN:\tRelative line number from function entry.\n"
+               "\t\tALN:\tAbsolute line number in file.\n"
+               "\t\tARG:\tProbe argument (local variable name or\n"
+#endif
+               "\t\t\tkprobe-tracer argument format.)\n",
+               opt_add_probe_event),
+       OPT_END()
+};
+
+int cmd_probe(int argc, const char **argv, const char *prefix __used)
+{
+       int i, j, ret;
+#ifndef NO_LIBDWARF
+       int fd;
+#endif
+       struct probe_point *pp;
+
+       argc = parse_options(argc, argv, options, probe_usage,
+                            PARSE_OPT_STOP_AT_NON_OPTION);
+       for (i = 0; i < argc; i++)
+               parse_probe_event(argv[i]);
+
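+       /* A probe definition and --list are mutually exclusive; exactly one of them must be given. */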
+       if ((session.nr_probe == 0 && !listing) ||
+           (session.nr_probe != 0 && listing))
+               usage_with_options(probe_usage, options);
+
+       if (listing) {
+               show_perf_probe_events();
+               return 0;
+       }
+
+       if (session.need_dwarf)
+#ifdef NO_LIBDWARF
+               die("Debuginfo-analysis is not supported");
+#else  /* !NO_LIBDWARF */
+               pr_debug("Some probes require debuginfo.\n");
+
+       if (session.vmlinux)
+               fd = open(session.vmlinux, O_RDONLY);
+       else
+               fd = open_default_vmlinux();
+       if (fd < 0) {
+               if (session.need_dwarf)
+                       die("Could not open vmlinux/module file.");
+
+               pr_warning("Could not open vmlinux/module file."
+                          " Falling back to symbols.\n");
+               goto end_dwarf;
+       }
+
+       /* Searching probe points */
+       for (j = 0; j < session.nr_probe; j++) {
+               pp = &session.probes[j];
+               if (pp->found)
+                       continue;
+
+               lseek(fd, 0, SEEK_SET);
+               ret = find_probepoint(fd, pp);
+               if (ret < 0) {
+                       if (session.need_dwarf)
+                               die("Could not analyze debuginfo.");
+
+                       pr_warning("An error occurred in debuginfo analysis. Falling back to symbols.\n");
+                       break;
+               }
+               if (ret == 0)   /* No error but failed to find probe point. */
+                       die("No probe point found.");
+       }
+       close(fd);
+
+end_dwarf:
+#endif /* !NO_LIBDWARF */
+
+       /* Synthesize probes without dwarf */
+       for (j = 0; j < session.nr_probe; j++) {
+               pp = &session.probes[j];
+               if (pp->found)  /* This probe is already found. */
+                       continue;
+
+               ret = synthesize_trace_kprobe_event(pp);
+               if (ret == -E2BIG)
+                       die("Probe point definition is too long.");
+               else if (ret < 0)
+                       die("Failed to synthesize a probe point.");
+       }
+
+       /* Setting up probe points */
+       add_trace_kprobe_events(session.probes, session.nr_probe);
+       return 0;
+}
+
index a4be453fc8a92e3059fabb1a680ac008fe5569b9..0e519c667e3ac47f8fe9576575f62dc7b1991d83 100644 (file)
 #include "util/header.h"
 #include "util/event.h"
 #include "util/debug.h"
-#include "util/trace-event.h"
+#include "util/symbol.h"
 
 #include <unistd.h>
 #include <sched.h>
 
-#define ALIGN(x, a)            __ALIGN_MASK(x, (typeof(x))(a)-1)
-#define __ALIGN_MASK(x, mask)  (((x)+(mask))&~(mask))
-
 static int                     fd[MAX_NR_CPUS][MAX_COUNTERS];
 
-static long                    default_interval                = 100000;
+static long                    default_interval                =      0;
 
-static int                     nr_cpus                         = 0;
+static int                     nr_cpus                         =      0;
 static unsigned int            page_size;
-static unsigned int            mmap_pages                      = 128;
-static int                     freq                            = 0;
+static unsigned int            mmap_pages                      =    128;
+static int                     freq                            =   1000;
 static int                     output;
 static const char              *output_name                    = "perf.data";
-static int                     group                           = 0;
-static unsigned int            realtime_prio                   = 0;
-static int                     raw_samples                     = 0;
-static int                     system_wide                     = 0;
-static int                     profile_cpu                     = -1;
-static pid_t                   target_pid                      = -1;
-static pid_t                   child_pid                       = -1;
-static int                     inherit                         = 1;
-static int                     force                           = 0;
-static int                     append_file                     = 0;
-static int                     call_graph                      = 0;
-static int                     inherit_stat                    = 0;
-static int                     no_samples                      = 0;
-static int                     sample_address                  = 0;
-static int                     multiplex                       = 0;
-static int                     multiplex_fd                    = -1;
-
-static long                    samples;
+static int                     group                           =      0;
+static unsigned int            realtime_prio                   =      0;
+static int                     raw_samples                     =      0;
+static int                     system_wide                     =      0;
+static int                     profile_cpu                     =     -1;
+static pid_t                   target_pid                      =     -1;
+static pid_t                   child_pid                       =     -1;
+static int                     inherit                         =      1;
+static int                     force                           =      0;
+static int                     append_file                     =      0;
+static int                     call_graph                      =      0;
+static int                     inherit_stat                    =      0;
+static int                     no_samples                      =      0;
+static int                     sample_address                  =      0;
+static int                     multiplex                       =      0;
+static int                     multiplex_fd                    =     -1;
+
+static long                    samples                         =      0;
 static struct timeval          last_read;
 static struct timeval          this_read;
 
-static u64                     bytes_written;
+static u64                     bytes_written                   =      0;
 
 static struct pollfd           event_array[MAX_NR_CPUS * MAX_COUNTERS];
 
-static int                     nr_poll;
-static int                     nr_cpu;
+static int                     nr_poll                         =      0;
+static int                     nr_cpu                          =      0;
 
-static int                     file_new = 1;
+static int                     file_new                        =      1;
 
-struct perf_header             *header;
+struct perf_header             *header                         =   NULL;
 
 struct mmap_data {
        int                     counter;
@@ -113,6 +110,24 @@ static void write_output(void *buf, size_t size)
        }
 }
 
+static void write_event(event_t *buf, size_t size)
+{
+       /*
+        * Add it to the list of DSOs, so that when we finish this
+        * record session we can pick the available build-ids.
+        */
+       if (buf->header.type == PERF_RECORD_MMAP)
+               dsos__findnew(buf->mmap.filename);
+
+       write_output(buf, size);
+}
+
+static int process_synthesized_event(event_t *event)
+{
+       write_event(event, event->header.size);
+       return 0;
+}
+
 static void mmap_read(struct mmap_data *md)
 {
        unsigned int head = mmap_read_head(md);
@@ -161,14 +176,14 @@ static void mmap_read(struct mmap_data *md)
                size = md->mask + 1 - (old & md->mask);
                old += size;
 
-               write_output(buf, size);
+               write_event(buf, size);
        }
 
        buf = &data[old & md->mask];
        size = head - old;
        old += size;
 
-       write_output(buf, size);
+       write_event(buf, size);
 
        md->prev = old;
        mmap_write_tail(md, old);
@@ -195,168 +210,6 @@ static void sig_atexit(void)
        kill(getpid(), signr);
 }
 
-static pid_t pid_synthesize_comm_event(pid_t pid, int full)
-{
-       struct comm_event comm_ev;
-       char filename[PATH_MAX];
-       char bf[BUFSIZ];
-       FILE *fp;
-       size_t size = 0;
-       DIR *tasks;
-       struct dirent dirent, *next;
-       pid_t tgid = 0;
-
-       snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
-
-       fp = fopen(filename, "r");
-       if (fp == NULL) {
-               /*
-                * We raced with a task exiting - just return:
-                */
-               if (verbose)
-                       fprintf(stderr, "couldn't open %s\n", filename);
-               return 0;
-       }
-
-       memset(&comm_ev, 0, sizeof(comm_ev));
-       while (!comm_ev.comm[0] || !comm_ev.pid) {
-               if (fgets(bf, sizeof(bf), fp) == NULL)
-                       goto out_failure;
-
-               if (memcmp(bf, "Name:", 5) == 0) {
-                       char *name = bf + 5;
-                       while (*name && isspace(*name))
-                               ++name;
-                       size = strlen(name) - 1;
-                       memcpy(comm_ev.comm, name, size++);
-               } else if (memcmp(bf, "Tgid:", 5) == 0) {
-                       char *tgids = bf + 5;
-                       while (*tgids && isspace(*tgids))
-                               ++tgids;
-                       tgid = comm_ev.pid = atoi(tgids);
-               }
-       }
-
-       comm_ev.header.type = PERF_RECORD_COMM;
-       size = ALIGN(size, sizeof(u64));
-       comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
-
-       if (!full) {
-               comm_ev.tid = pid;
-
-               write_output(&comm_ev, comm_ev.header.size);
-               goto out_fclose;
-       }
-
-       snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
-
-       tasks = opendir(filename);
-       while (!readdir_r(tasks, &dirent, &next) && next) {
-               char *end;
-               pid = strtol(dirent.d_name, &end, 10);
-               if (*end)
-                       continue;
-
-               comm_ev.tid = pid;
-
-               write_output(&comm_ev, comm_ev.header.size);
-       }
-       closedir(tasks);
-
-out_fclose:
-       fclose(fp);
-       return tgid;
-
-out_failure:
-       fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
-               filename);
-       exit(EXIT_FAILURE);
-}
-
-static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid)
-{
-       char filename[PATH_MAX];
-       FILE *fp;
-
-       snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
-
-       fp = fopen(filename, "r");
-       if (fp == NULL) {
-               /*
-                * We raced with a task exiting - just return:
-                */
-               if (verbose)
-                       fprintf(stderr, "couldn't open %s\n", filename);
-               return;
-       }
-       while (1) {
-               char bf[BUFSIZ], *pbf = bf;
-               struct mmap_event mmap_ev = {
-                       .header = { .type = PERF_RECORD_MMAP },
-               };
-               int n;
-               size_t size;
-               if (fgets(bf, sizeof(bf), fp) == NULL)
-                       break;
-
-               /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-               n = hex2u64(pbf, &mmap_ev.start);
-               if (n < 0)
-                       continue;
-               pbf += n + 1;
-               n = hex2u64(pbf, &mmap_ev.len);
-               if (n < 0)
-                       continue;
-               pbf += n + 3;
-               if (*pbf == 'x') { /* vm_exec */
-                       char *execname = strchr(bf, '/');
-
-                       /* Catch VDSO */
-                       if (execname == NULL)
-                               execname = strstr(bf, "[vdso]");
-
-                       if (execname == NULL)
-                               continue;
-
-                       size = strlen(execname);
-                       execname[size - 1] = '\0'; /* Remove \n */
-                       memcpy(mmap_ev.filename, execname, size);
-                       size = ALIGN(size, sizeof(u64));
-                       mmap_ev.len -= mmap_ev.start;
-                       mmap_ev.header.size = (sizeof(mmap_ev) -
-                                              (sizeof(mmap_ev.filename) - size));
-                       mmap_ev.pid = tgid;
-                       mmap_ev.tid = pid;
-
-                       write_output(&mmap_ev, mmap_ev.header.size);
-               }
-       }
-
-       fclose(fp);
-}
-
-static void synthesize_all(void)
-{
-       DIR *proc;
-       struct dirent dirent, *next;
-
-       proc = opendir("/proc");
-
-       while (!readdir_r(proc, &dirent, &next) && next) {
-               char *end;
-               pid_t pid, tgid;
-
-               pid = strtol(dirent.d_name, &end, 10);
-               if (*end) /* only interested in proper numerical dirents */
-                       continue;
-
-               tgid = pid_synthesize_comm_event(pid, 1);
-               pid_synthesize_mmap_samples(pid, tgid);
-       }
-
-       closedir(proc);
-}
-
 static int group_fd;
 
 static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
@@ -367,7 +220,11 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
                h_attr = header->attr[nr];
        } else {
                h_attr = perf_header_attr__new(a);
-               perf_header__add_attr(header, h_attr);
+               if (h_attr != NULL)
+                       if (perf_header__add_attr(header, h_attr) < 0) {
+                               perf_header_attr__delete(h_attr);
+                               h_attr = NULL;
+                       }
        }
 
        return h_attr;
@@ -375,9 +232,11 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
 
 static void create_counter(int counter, int cpu, pid_t pid)
 {
+       char *filter = filters[counter];
        struct perf_event_attr *attr = attrs + counter;
        struct perf_header_attr *h_attr;
        int track = !counter; /* only the first counter needs these */
+       int ret;
        struct {
                u64 count;
                u64 time_enabled;
@@ -448,11 +307,19 @@ try_again:
                printf("\n");
                error("perfcounter syscall returned with %d (%s)\n",
                        fd[nr_cpu][counter], strerror(err));
+
+#if defined(__i386__) || defined(__x86_64__)
+               if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
+                       die("No hardware sampling interrupt available. No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.\n");
+#endif
+
                die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
                exit(-1);
        }
 
        h_attr = get_header_attr(attr, counter);
+       if (h_attr == NULL)
+               die("nomem\n");
 
        if (!file_new) {
                if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
@@ -466,7 +333,10 @@ try_again:
                exit(-1);
        }
 
-       perf_header_attr__add_id(h_attr, read_data.id);
+       if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
+               pr_warning("Not enough memory to add id\n");
+               exit(-1);
+       }
 
        assert(fd[nr_cpu][counter] >= 0);
        fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
@@ -480,7 +350,6 @@ try_again:
                multiplex_fd = fd[nr_cpu][counter];
 
        if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
-               int ret;
 
                ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd);
                assert(ret != -1);
@@ -500,6 +369,16 @@ try_again:
                }
        }
 
+       if (filter != NULL) {
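+               /* If a --filter expression was given for this counter, attach it to the event fd. */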
+               ret = ioctl(fd[nr_cpu][counter],
+                           PERF_EVENT_IOC_SET_FILTER, filter);
+               if (ret) {
+                       error("failed to set filter with %d (%s)\n", errno,
+                             strerror(errno));
+                       exit(-1);
+               }
+       }
+
        ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE);
 }
 
@@ -518,7 +397,7 @@ static void atexit_header(void)
 {
        header->data_size += bytes_written;
 
-       perf_header__write(header, output);
+       perf_header__write(header, output, true);
 }
 
 static int __cmd_record(int argc, const char **argv)
@@ -527,7 +406,7 @@ static int __cmd_record(int argc, const char **argv)
        struct stat st;
        pid_t pid = 0;
        int flags;
-       int ret;
+       int err;
        unsigned long waking = 0;
 
        page_size = sysconf(_SC_PAGE_SIZE);
@@ -561,22 +440,29 @@ static int __cmd_record(int argc, const char **argv)
                exit(-1);
        }
 
-       if (!file_new)
-               header = perf_header__read(output);
-       else
-               header = perf_header__new();
+       header = perf_header__new();
+       if (header == NULL) {
+               pr_err("Not enough memory for reading perf file header\n");
+               return -1;
+       }
 
+       if (!file_new) {
+               err = perf_header__read(header, output);
+               if (err < 0)
+                       return err;
+       }
 
        if (raw_samples) {
-               read_tracing_data(attrs, nr_counters);
+               perf_header__set_feat(header, HEADER_TRACE_INFO);
        } else {
                for (i = 0; i < nr_counters; i++) {
                        if (attrs[i].sample_type & PERF_SAMPLE_RAW) {
-                               read_tracing_data(attrs, nr_counters);
+                               perf_header__set_feat(header, HEADER_TRACE_INFO);
                                break;
                        }
                }
        }
+
        atexit(atexit_header);
 
        if (!system_wide) {
@@ -594,25 +480,36 @@ static int __cmd_record(int argc, const char **argv)
                }
        }
 
-       if (file_new)
-               perf_header__write(header, output);
+       if (file_new) {
+               err = perf_header__write(header, output, false);
+               if (err < 0)
+                       return err;
+       }
 
-       if (!system_wide) {
-               pid_t tgid = pid_synthesize_comm_event(pid, 0);
-               pid_synthesize_mmap_samples(pid, tgid);
-       } else
-               synthesize_all();
+       if (!system_wide)
+               event__synthesize_thread(pid, process_synthesized_event);
+       else
+               event__synthesize_threads(process_synthesized_event);
 
        if (target_pid == -1 && argc) {
                pid = fork();
                if (pid < 0)
-                       perror("failed to fork");
+                       die("failed to fork");
 
                if (!pid) {
                        if (execvp(argv[0], (char **)argv)) {
                                perror(argv[0]);
                                exit(-1);
                        }
+               } else {
+                       /*
+                        * Wait a bit for the execv'ed child to appear
+                        * and be updated in /proc
+                        * FIXME: Is there a less heuristic solution?
+                        */
+                       usleep(1000);
+                       event__synthesize_thread(pid,
+                                                process_synthesized_event);
                }
 
                child_pid = pid;
@@ -623,7 +520,7 @@ static int __cmd_record(int argc, const char **argv)
 
                param.sched_priority = realtime_prio;
                if (sched_setscheduler(0, SCHED_FIFO, &param)) {
-                       printf("Could not set realtime priority.\n");
+                       pr_err("Could not set realtime priority.\n");
                        exit(-1);
                }
        }
@@ -641,7 +538,7 @@ static int __cmd_record(int argc, const char **argv)
                if (hits == samples) {
                        if (done)
                                break;
-                       ret = poll(event_array, nr_poll, -1);
+                       err = poll(event_array, nr_poll, -1);
                        waking++;
                }
 
@@ -677,6 +574,8 @@ static const struct option options[] = {
        OPT_CALLBACK('e', "event", NULL, "event",
                     "event selector. use 'perf list' to list available events",
                     parse_events),
+       OPT_CALLBACK(0, "filter", NULL, "filter",
+                    "event filter", parse_filter),
        OPT_INTEGER('p', "pid", &target_pid,
                    "record events on existing pid"),
        OPT_INTEGER('r', "realtime", &realtime_prio,
@@ -720,6 +619,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 {
        int counter;
 
+       symbol__init(0);
+
        argc = parse_options(argc, argv, options, record_usage,
                PARSE_OPT_STOP_AT_NON_OPTION);
        if (!argc && target_pid == -1 && !system_wide)
@@ -731,6 +632,18 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
                attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
        }
 
+       /*
+        * User specified count overrides default frequency.
+        */
+       if (default_interval)
+               freq = 0;
+       else if (freq) {
+               default_interval = freq;
+       } else {
+               fprintf(stderr, "frequency and count are zero, aborting\n");
+               exit(EXIT_FAILURE);
+       }
+
        for (counter = 0; counter < nr_counters; counter++) {
                if (attrs[counter].sample_period)
                        continue;
index 19669c20088e12a256ebdd736efd0c8253fdd73e..383c4ab4f9af06e6cfaa0e6f6e358aa63e1d1d68 100644 (file)
 #include "util/parse-options.h"
 #include "util/parse-events.h"
 
+#include "util/data_map.h"
 #include "util/thread.h"
+#include "util/sort.h"
+#include "util/hist.h"
 
 static char            const *input_name = "perf.data";
 
-static char            default_sort_order[] = "comm,dso,symbol";
-static char            *sort_order = default_sort_order;
 static char            *dso_list_str, *comm_list_str, *sym_list_str,
                        *col_width_list_str;
 static struct strlist  *dso_list, *comm_list, *sym_list;
-static char            *field_sep;
 
 static int             force;
-static int             input;
-static int             show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int             full_paths;
 static int             show_nr_samples;
@@ -50,374 +48,38 @@ static struct perf_read_values     show_threads_values;
 static char            default_pretty_printing_style[] = "normal";
 static char            *pretty_printing_style = default_pretty_printing_style;
 
-static unsigned long   page_size;
-static unsigned long   mmap_window = 32;
-
-static char            default_parent_pattern[] = "^sys_|^do_page_fault";
-static char            *parent_pattern = default_parent_pattern;
-static regex_t         parent_regex;
-
 static int             exclude_other = 1;
 
 static char            callchain_default_opt[] = "fractal,0.5";
 
-static int             callchain;
-
-static char            __cwd[PATH_MAX];
-static char            *cwd = __cwd;
-static int             cwdlen;
-
-static struct rb_root  threads;
-static struct thread   *last_match;
-
 static struct perf_header *header;
 
-static
-struct callchain_param callchain_param = {
-       .mode   = CHAIN_GRAPH_REL,
-       .min_percent = 0.5
-};
-
 static u64             sample_type;
 
-static int repsep_fprintf(FILE *fp, const char *fmt, ...)
-{
-       int n;
-       va_list ap;
-
-       va_start(ap, fmt);
-       if (!field_sep)
-               n = vfprintf(fp, fmt, ap);
-       else {
-               char *bf = NULL;
-               n = vasprintf(&bf, fmt, ap);
-               if (n > 0) {
-                       char *sep = bf;
-
-                       while (1) {
-                               sep = strchr(sep, *field_sep);
-                               if (sep == NULL)
-                                       break;
-                               *sep = '.';
-                       }
-               }
-               fputs(bf, fp);
-               free(bf);
-       }
-       va_end(ap);
-       return n;
-}
-
-static unsigned int dsos__col_width,
-                   comms__col_width,
-                   threads__col_width;
+struct symbol_conf     symbol_conf;
 
-/*
- * histogram, sorted on item, collects counts
- */
-
-static struct rb_root hist;
-
-struct hist_entry {
-       struct rb_node          rb_node;
-
-       struct thread           *thread;
-       struct map              *map;
-       struct dso              *dso;
-       struct symbol           *sym;
-       struct symbol           *parent;
-       u64                     ip;
-       char                    level;
-       struct callchain_node   callchain;
-       struct rb_root          sorted_chain;
-
-       u64                     count;
-};
-
-/*
- * configurable sorting bits
- */
-
-struct sort_entry {
-       struct list_head list;
-
-       const char *header;
-
-       int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
-       int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
-       size_t  (*print)(FILE *fp, struct hist_entry *, unsigned int width);
-       unsigned int *width;
-       bool    elide;
-};
-
-static int64_t cmp_null(void *l, void *r)
-{
-       if (!l && !r)
-               return 0;
-       else if (!l)
-               return -1;
-       else
-               return 1;
-}
-
-/* --sort pid */
-
-static int64_t
-sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       return right->thread->pid - left->thread->pid;
-}
 
 static size_t
-sort__thread_print(FILE *fp, struct hist_entry *self, unsigned int width)
+callchain__fprintf_left_margin(FILE *fp, int left_margin)
 {
-       return repsep_fprintf(fp, "%*s:%5d", width - 6,
-                             self->thread->comm ?: "", self->thread->pid);
-}
-
-static struct sort_entry sort_thread = {
-       .header = "Command:  Pid",
-       .cmp    = sort__thread_cmp,
-       .print  = sort__thread_print,
-       .width  = &threads__col_width,
-};
-
-/* --sort comm */
-
-static int64_t
-sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       return right->thread->pid - left->thread->pid;
-}
-
-static int64_t
-sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
-{
-       char *comm_l = left->thread->comm;
-       char *comm_r = right->thread->comm;
-
-       if (!comm_l || !comm_r)
-               return cmp_null(comm_l, comm_r);
-
-       return strcmp(comm_l, comm_r);
-}
-
-static size_t
-sort__comm_print(FILE *fp, struct hist_entry *self, unsigned int width)
-{
-       return repsep_fprintf(fp, "%*s", width, self->thread->comm);
-}
-
-static struct sort_entry sort_comm = {
-       .header         = "Command",
-       .cmp            = sort__comm_cmp,
-       .collapse       = sort__comm_collapse,
-       .print          = sort__comm_print,
-       .width          = &comms__col_width,
-};
-
-/* --sort dso */
-
-static int64_t
-sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       struct dso *dso_l = left->dso;
-       struct dso *dso_r = right->dso;
-
-       if (!dso_l || !dso_r)
-               return cmp_null(dso_l, dso_r);
-
-       return strcmp(dso_l->name, dso_r->name);
-}
-
-static size_t
-sort__dso_print(FILE *fp, struct hist_entry *self, unsigned int width)
-{
-       if (self->dso)
-               return repsep_fprintf(fp, "%-*s", width, self->dso->name);
-
-       return repsep_fprintf(fp, "%*llx", width, (u64)self->ip);
-}
-
-static struct sort_entry sort_dso = {
-       .header = "Shared Object",
-       .cmp    = sort__dso_cmp,
-       .print  = sort__dso_print,
-       .width  = &dsos__col_width,
-};
-
-/* --sort symbol */
-
-static int64_t
-sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       u64 ip_l, ip_r;
-
-       if (left->sym == right->sym)
-               return 0;
-
-       ip_l = left->sym ? left->sym->start : left->ip;
-       ip_r = right->sym ? right->sym->start : right->ip;
-
-       return (int64_t)(ip_r - ip_l);
-}
-
-static size_t
-sort__sym_print(FILE *fp, struct hist_entry *self, unsigned int width __used)
-{
-       size_t ret = 0;
+       int i;
+       int ret;
 
-       if (verbose)
-               ret += repsep_fprintf(fp, "%#018llx %c ", (u64)self->ip,
-                                     dso__symtab_origin(self->dso));
+       ret = fprintf(fp, "            ");
 
-       ret += repsep_fprintf(fp, "[%c] ", self->level);
-       if (self->sym) {
-               ret += repsep_fprintf(fp, "%s", self->sym->name);
-
-               if (self->sym->module)
-                       ret += repsep_fprintf(fp, "\t[%s]",
-                                            self->sym->module->name);
-       } else {
-               ret += repsep_fprintf(fp, "%#016llx", (u64)self->ip);
-       }
+       for (i = 0; i < left_margin; i++)
+               ret += fprintf(fp, " ");
 
        return ret;
 }
 
-static struct sort_entry sort_sym = {
-       .header = "Symbol",
-       .cmp    = sort__sym_cmp,
-       .print  = sort__sym_print,
-};
-
-/* --sort parent */
-
-static int64_t
-sort__parent_cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       struct symbol *sym_l = left->parent;
-       struct symbol *sym_r = right->parent;
-
-       if (!sym_l || !sym_r)
-               return cmp_null(sym_l, sym_r);
-
-       return strcmp(sym_l->name, sym_r->name);
-}
-
-static size_t
-sort__parent_print(FILE *fp, struct hist_entry *self, unsigned int width)
-{
-       return repsep_fprintf(fp, "%-*s", width,
-                             self->parent ? self->parent->name : "[other]");
-}
-
-static unsigned int parent_symbol__col_width;
-
-static struct sort_entry sort_parent = {
-       .header = "Parent symbol",
-       .cmp    = sort__parent_cmp,
-       .print  = sort__parent_print,
-       .width  = &parent_symbol__col_width,
-};
-
-static int sort__need_collapse = 0;
-static int sort__has_parent = 0;
-
-struct sort_dimension {
-       const char              *name;
-       struct sort_entry       *entry;
-       int                     taken;
-};
-
-static struct sort_dimension sort_dimensions[] = {
-       { .name = "pid",        .entry = &sort_thread,  },
-       { .name = "comm",       .entry = &sort_comm,    },
-       { .name = "dso",        .entry = &sort_dso,     },
-       { .name = "symbol",     .entry = &sort_sym,     },
-       { .name = "parent",     .entry = &sort_parent,  },
-};
-
-static LIST_HEAD(hist_entry__sort_list);
-
-static int sort_dimension__add(const char *tok)
-{
-       unsigned int i;
-
-       for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
-               struct sort_dimension *sd = &sort_dimensions[i];
-
-               if (sd->taken)
-                       continue;
-
-               if (strncasecmp(tok, sd->name, strlen(tok)))
-                       continue;
-
-               if (sd->entry->collapse)
-                       sort__need_collapse = 1;
-
-               if (sd->entry == &sort_parent) {
-                       int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
-                       if (ret) {
-                               char err[BUFSIZ];
-
-                               regerror(ret, &parent_regex, err, sizeof(err));
-                               fprintf(stderr, "Invalid regex: %s\n%s",
-                                       parent_pattern, err);
-                               exit(-1);
-                       }
-                       sort__has_parent = 1;
-               }
-
-               list_add_tail(&sd->entry->list, &hist_entry__sort_list);
-               sd->taken = 1;
-
-               return 0;
-       }
-
-       return -ESRCH;
-}
-
-static int64_t
-hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
-{
-       struct sort_entry *se;
-       int64_t cmp = 0;
-
-       list_for_each_entry(se, &hist_entry__sort_list, list) {
-               cmp = se->cmp(left, right);
-               if (cmp)
-                       break;
-       }
-
-       return cmp;
-}
-
-static int64_t
-hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
-{
-       struct sort_entry *se;
-       int64_t cmp = 0;
-
-       list_for_each_entry(se, &hist_entry__sort_list, list) {
-               int64_t (*f)(struct hist_entry *, struct hist_entry *);
-
-               f = se->collapse ?: se->cmp;
-
-               cmp = f(left, right);
-               if (cmp)
-                       break;
-       }
-
-       return cmp;
-}
-
-static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask)
+static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask,
+                                         int left_margin)
 {
        int i;
        size_t ret = 0;
 
-       ret += fprintf(fp, "%s", "                ");
+       ret += callchain__fprintf_left_margin(fp, left_margin);
 
        for (i = 0; i < depth; i++)
                if (depth_mask & (1 << i))
@@ -432,12 +94,12 @@ static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask)
 static size_t
 ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain, int depth,
                       int depth_mask, int count, u64 total_samples,
-                      int hits)
+                      int hits, int left_margin)
 {
        int i;
        size_t ret = 0;
 
-       ret += fprintf(fp, "%s", "                ");
+       ret += callchain__fprintf_left_margin(fp, left_margin);
        for (i = 0; i < depth; i++) {
                if (depth_mask & (1 << i))
                        ret += fprintf(fp, "|");
@@ -475,8 +137,9 @@ static void init_rem_hits(void)
 }
 
 static size_t
-callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
-                       u64 total_samples, int depth, int depth_mask)
+__callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
+                          u64 total_samples, int depth, int depth_mask,
+                          int left_margin)
 {
        struct rb_node *node, *next;
        struct callchain_node *child;
@@ -517,7 +180,8 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
                 * But we keep the older depth mask for the line separator
                 * to keep the level link until we reach the last child
                 */
-               ret += ipchain__fprintf_graph_line(fp, depth, depth_mask);
+               ret += ipchain__fprintf_graph_line(fp, depth, depth_mask,
+                                                  left_margin);
                i = 0;
                list_for_each_entry(chain, &child->val, list) {
                        if (chain->ip >= PERF_CONTEXT_MAX)
@@ -525,11 +189,13 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
                        ret += ipchain__fprintf_graph(fp, chain, depth,
                                                      new_depth_mask, i++,
                                                      new_total,
-                                                     cumul);
+                                                     cumul,
+                                                     left_margin);
                }
-               ret += callchain__fprintf_graph(fp, child, new_total,
-                                               depth + 1,
-                                               new_depth_mask | (1 << depth));
+               ret += __callchain__fprintf_graph(fp, child, new_total,
+                                                 depth + 1,
+                                                 new_depth_mask | (1 << depth),
+                                                 left_margin);
                node = next;
        }
 
@@ -543,9 +209,48 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 
                ret += ipchain__fprintf_graph(fp, &rem_hits, depth,
                                              new_depth_mask, 0, new_total,
-                                             remaining);
+                                             remaining, left_margin);
+       }
+
+       return ret;
+}
+
+
+static size_t
+callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
+                        u64 total_samples, int left_margin)
+{
+       struct callchain_list *chain;
+       bool printed = false;
+       int i = 0;
+       int ret = 0;
+
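+       /* Print the top-level chain entries inline, then recurse into the children with an indented margin. */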
+       list_for_each_entry(chain, &self->val, list) {
+               if (chain->ip >= PERF_CONTEXT_MAX)
+                       continue;
+
+               if (!i++ && sort__first_dimension == SORT_SYM)
+                       continue;
+
+               if (!printed) {
+                       ret += callchain__fprintf_left_margin(fp, left_margin);
+                       ret += fprintf(fp, "|\n");
+                       ret += callchain__fprintf_left_margin(fp, left_margin);
+                       ret += fprintf(fp, "---");
+
+                       left_margin += 3;
+                       printed = true;
+               } else
+                       ret += callchain__fprintf_left_margin(fp, left_margin);
+
+               if (chain->sym)
+                       ret += fprintf(fp, " %s\n", chain->sym->name);
+               else
+                       ret += fprintf(fp, " %p\n", (void *)(long)chain->ip);
        }
 
+       ret += __callchain__fprintf_graph(fp, self, total_samples, 1, 1, left_margin);
+
        return ret;
 }
 
@@ -577,7 +282,7 @@ callchain__fprintf_flat(FILE *fp, struct callchain_node *self,
 
 static size_t
 hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
-                             u64 total_samples)
+                             u64 total_samples, int left_margin)
 {
        struct rb_node *rb_node;
        struct callchain_node *chain;
@@ -597,8 +302,8 @@ hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
                        break;
                case CHAIN_GRAPH_ABS: /* Fall through */
                case CHAIN_GRAPH_REL:
-                       ret += callchain__fprintf_graph(fp, chain,
-                                                       total_samples, 1, 1);
+                       ret += callchain__fprintf_graph(fp, chain, total_samples,
+                                                       left_margin);
                case CHAIN_NONE:
                default:
                        break;
@@ -610,7 +315,6 @@ hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
        return ret;
 }
 
-
 static size_t
 hist_entry__fprintf(FILE *fp, struct hist_entry *self, u64 total_samples)
 {
@@ -644,8 +348,19 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, u64 total_samples)
 
        ret += fprintf(fp, "\n");
 
-       if (callchain)
-               hist_entry_callchain__fprintf(fp, self, total_samples);
+       if (callchain) {
+               int left_margin = 0;
+
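+               /* When comm is the first sort key, indent the callchain so it lines up after the comm column. */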
+               if (sort__first_dimension == SORT_COMM) {
+                       se = list_first_entry(&hist_entry__sort_list, typeof(*se),
+                                               list);
+                       left_margin = se->width ? *se->width : 0;
+                       left_margin -= thread__comm_len(self->thread);
+               }
+
+               hist_entry_callchain__fprintf(fp, self, total_samples,
+                                             left_margin);
+       }
 
        return ret;
 }
@@ -693,63 +408,6 @@ static int thread__set_comm_adjust(struct thread *self, const char *comm)
        return 0;
 }
 
-
-static struct symbol *
-resolve_symbol(struct thread *thread, struct map **mapp,
-              struct dso **dsop, u64 *ipp)
-{
-       struct dso *dso = dsop ? *dsop : NULL;
-       struct map *map = mapp ? *mapp : NULL;
-       u64 ip = *ipp;
-
-       if (!thread)
-               return NULL;
-
-       if (dso)
-               goto got_dso;
-
-       if (map)
-               goto got_map;
-
-       map = thread__find_map(thread, ip);
-       if (map != NULL) {
-               /*
-                * We have to do this here as we may have a dso
-                * with no symbol hit that has a name longer than
-                * the ones with symbols sampled.
-                */
-               if (!sort_dso.elide && !map->dso->slen_calculated)
-                       dso__calc_col_width(map->dso);
-
-               if (mapp)
-                       *mapp = map;
-got_map:
-               ip = map->map_ip(map, ip);
-
-               dso = map->dso;
-       } else {
-               /*
-                * If this is outside of all known maps,
-                * and is a negative address, try to look it
-                * up in the kernel dso, as it might be a
-                * vsyscall (which executes in user-mode):
-                */
-               if ((long long)ip < 0)
-               dso = kernel_dso;
-       }
-       dump_printf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
-       dump_printf(" ...... map: %Lx -> %Lx\n", *ipp, ip);
-       *ipp  = ip;
-
-       if (dsop)
-               *dsop = dso;
-
-       if (!dso)
-               return NULL;
-got_dso:
-       return dso->find_symbol(dso, ip);
-}
-
 static int call__match(struct symbol *sym)
 {
        if (sym->name && !regexec(&parent_regex, sym->name, 0, NULL, 0))
@@ -758,11 +416,11 @@ static int call__match(struct symbol *sym)
        return 0;
 }
 
-static struct symbol **
-resolve_callchain(struct thread *thread, struct map *map __used,
-                   struct ip_callchain *chain, struct hist_entry *entry)
+static struct symbol **resolve_callchain(struct thread *thread,
+                                        struct ip_callchain *chain,
+                                        struct symbol **parent)
 {
-       u64 context = PERF_CONTEXT_MAX;
+       u8 cpumode = PERF_RECORD_MISC_USER;
        struct symbol **syms = NULL;
        unsigned int i;
 
@@ -776,34 +434,31 @@ resolve_callchain(struct thread *thread, struct map *map __used,
 
        for (i = 0; i < chain->nr; i++) {
                u64 ip = chain->ips[i];
-               struct dso *dso = NULL;
-               struct symbol *sym;
+               struct addr_location al;
 
                if (ip >= PERF_CONTEXT_MAX) {
-                       context = ip;
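+                       /* A context marker selects the cpumode used to resolve the addresses that follow it. */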
+                       switch (ip) {
+                       case PERF_CONTEXT_HV:
+                               cpumode = PERF_RECORD_MISC_HYPERVISOR;  break;
+                       case PERF_CONTEXT_KERNEL:
+                               cpumode = PERF_RECORD_MISC_KERNEL;      break;
+                       case PERF_CONTEXT_USER:
+                               cpumode = PERF_RECORD_MISC_USER;        break;
+                       default:
+                               break;
+                       }
                        continue;
                }
 
-               switch (context) {
-               case PERF_CONTEXT_HV:
-                       dso = hypervisor_dso;
-                       break;
-               case PERF_CONTEXT_KERNEL:
-                       dso = kernel_dso;
-                       break;
-               default:
-                       break;
-               }
-
-               sym = resolve_symbol(thread, NULL, &dso, &ip);
-
-               if (sym) {
-                       if (sort__has_parent && call__match(sym) &&
-                           !entry->parent)
-                               entry->parent = sym;
+               thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
+                                          ip, &al, NULL);
+               if (al.sym != NULL) {
+                       if (sort__has_parent && !*parent &&
+                           call__match(al.sym))
+                               *parent = al.sym;
                        if (!callchain)
                                break;
-                       syms[i] = sym;
+                       syms[i] = al.sym;
                }
        }
 
@@ -814,178 +469,33 @@ resolve_callchain(struct thread *thread, struct map *map __used,
  * collect histogram counts
  */
 
-static int
-hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-               struct symbol *sym, u64 ip, struct ip_callchain *chain,
-               char level, u64 count)
+static int hist_entry__add(struct addr_location *al,
+                          struct ip_callchain *chain, u64 count)
 {
-       struct rb_node **p = &hist.rb_node;
-       struct rb_node *parent = NULL;
+       struct symbol **syms = NULL, *parent = NULL;
+       bool hit;
        struct hist_entry *he;
-       struct symbol **syms = NULL;
-       struct hist_entry entry = {
-               .thread = thread,
-               .map    = map,
-               .dso    = dso,
-               .sym    = sym,
-               .ip     = ip,
-               .level  = level,
-               .count  = count,
-               .parent = NULL,
-               .sorted_chain = RB_ROOT
-       };
-       int cmp;
 
        if ((sort__has_parent || callchain) && chain)
-               syms = resolve_callchain(thread, map, chain, &entry);
+               syms = resolve_callchain(al->thread, chain, &parent);
 
-       while (*p != NULL) {
-               parent = *p;
-               he = rb_entry(parent, struct hist_entry, rb_node);
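+       /* Look up or create the histogram entry; 'hit' reports whether an existing entry was found. */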
+       he = __hist_entry__add(al, parent, count, &hit);
+       if (he == NULL)
+               return -ENOMEM;
 
-               cmp = hist_entry__cmp(&entry, he);
+       if (hit)
+               he->count += count;
 
-               if (!cmp) {
-                       he->count += count;
-                       if (callchain) {
-                               append_chain(&he->callchain, chain, syms);
-                               free(syms);
-                       }
-                       return 0;
-               }
-
-               if (cmp < 0)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-
-       he = malloc(sizeof(*he));
-       if (!he)
-               return -ENOMEM;
-       *he = entry;
        if (callchain) {
-               callchain_init(&he->callchain);
+               if (!hit)
+                       callchain_init(&he->callchain);
                append_chain(&he->callchain, chain, syms);
                free(syms);
        }
-       rb_link_node(&he->rb_node, parent, p);
-       rb_insert_color(&he->rb_node, &hist);
 
        return 0;
 }
 
-static void hist_entry__free(struct hist_entry *he)
-{
-       free(he);
-}
-
-/*
- * collapse the histogram
- */
-
-static struct rb_root collapse_hists;
-
-static void collapse__insert_entry(struct hist_entry *he)
-{
-       struct rb_node **p = &collapse_hists.rb_node;
-       struct rb_node *parent = NULL;
-       struct hist_entry *iter;
-       int64_t cmp;
-
-       while (*p != NULL) {
-               parent = *p;
-               iter = rb_entry(parent, struct hist_entry, rb_node);
-
-               cmp = hist_entry__collapse(iter, he);
-
-               if (!cmp) {
-                       iter->count += he->count;
-                       hist_entry__free(he);
-                       return;
-               }
-
-               if (cmp < 0)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-
-       rb_link_node(&he->rb_node, parent, p);
-       rb_insert_color(&he->rb_node, &collapse_hists);
-}
-
-static void collapse__resort(void)
-{
-       struct rb_node *next;
-       struct hist_entry *n;
-
-       if (!sort__need_collapse)
-               return;
-
-       next = rb_first(&hist);
-       while (next) {
-               n = rb_entry(next, struct hist_entry, rb_node);
-               next = rb_next(&n->rb_node);
-
-               rb_erase(&n->rb_node, &hist);
-               collapse__insert_entry(n);
-       }
-}
-
-/*
- * reverse the map, sort on count.
- */
-
-static struct rb_root output_hists;
-
-static void output__insert_entry(struct hist_entry *he, u64 min_callchain_hits)
-{
-       struct rb_node **p = &output_hists.rb_node;
-       struct rb_node *parent = NULL;
-       struct hist_entry *iter;
-
-       if (callchain)
-               callchain_param.sort(&he->sorted_chain, &he->callchain,
-                                     min_callchain_hits, &callchain_param);
-
-       while (*p != NULL) {
-               parent = *p;
-               iter = rb_entry(parent, struct hist_entry, rb_node);
-
-               if (he->count > iter->count)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-
-       rb_link_node(&he->rb_node, parent, p);
-       rb_insert_color(&he->rb_node, &output_hists);
-}
-
-static void output__resort(u64 total_samples)
-{
-       struct rb_node *next;
-       struct hist_entry *n;
-       struct rb_root *tree = &hist;
-       u64 min_callchain_hits;
-
-       min_callchain_hits = total_samples * (callchain_param.min_percent / 100);
-
-       if (sort__need_collapse)
-               tree = &collapse_hists;
-
-       next = rb_first(tree);
-
-       while (next) {
-               n = rb_entry(next, struct hist_entry, rb_node);
-               next = rb_next(&n->rb_node);
-
-               rb_erase(&n->rb_node, tree);
-               output__insert_entry(n, min_callchain_hits);
-       }
-}
-
 static size_t output__fprintf(FILE *fp, u64 total_samples)
 {
        struct hist_entry *pos;
@@ -1080,13 +590,6 @@ print_entries:
        return ret;
 }
 
-static unsigned long total = 0,
-                    total_mmap = 0,
-                    total_comm = 0,
-                    total_fork = 0,
-                    total_unknown = 0,
-                    total_lost = 0;
-
 static int validate_chain(struct ip_callchain *chain, event_t *event)
 {
        unsigned int chain_size;
@@ -1100,30 +603,22 @@ static int validate_chain(struct ip_callchain *chain, event_t *event)
        return 0;
 }
 
-static int
-process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+static int process_sample_event(event_t *event)
 {
-       char level;
-       int show = 0;
-       struct dso *dso = NULL;
-       struct thread *thread;
        u64 ip = event->ip.ip;
        u64 period = 1;
-       struct map *map = NULL;
        void *more_data = event->ip.__more_data;
        struct ip_callchain *chain = NULL;
        int cpumode;
-
-       thread = threads__findnew(event->ip.pid, &threads, &last_match);
+       struct addr_location al;
+       struct thread *thread = threads__findnew(event->ip.pid);
 
        if (sample_type & PERF_SAMPLE_PERIOD) {
                period = *(u64 *)more_data;
                more_data += sizeof(u64);
        }
 
-       dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
+       dump_printf("(IP, %d): %d/%d: %p period: %Ld\n",
                event->header.misc,
                event->ip.pid, event->ip.tid,
                (void *)(long)ip,
@@ -1137,7 +632,8 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                dump_printf("... chain: nr:%Lu\n", chain->nr);
 
                if (validate_chain(chain, event) < 0) {
-                       eprintf("call-chain problem with event, skipping it.\n");
+                       pr_debug("call-chain problem with event, "
+                                "skipping it.\n");
                        return 0;
                }
 
@@ -1147,163 +643,64 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                }
        }
 
-       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
        if (thread == NULL) {
-               eprintf("problem processing %d event, skipping it.\n",
+               pr_debug("problem processing %d event, skipping it.\n",
                        event->header.type);
                return -1;
        }
 
+       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
        if (comm_list && !strlist__has_entry(comm_list, thread->comm))
                return 0;
 
        cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
-       if (cpumode == PERF_RECORD_MISC_KERNEL) {
-               show = SHOW_KERNEL;
-               level = 'k';
-
-               dso = kernel_dso;
-
-               dump_printf(" ...... dso: %s\n", dso->name);
-
-       } else if (cpumode == PERF_RECORD_MISC_USER) {
-
-               show = SHOW_USER;
-               level = '.';
-
-       } else {
-               show = SHOW_HV;
-               level = 'H';
-
-               dso = hypervisor_dso;
-
-               dump_printf(" ...... dso: [hypervisor]\n");
-       }
-
-       if (show & show_mask) {
-               struct symbol *sym = resolve_symbol(thread, &map, &dso, &ip);
-
-               if (dso_list && (!dso || !dso->name ||
-                                !strlist__has_entry(dso_list, dso->name)))
-                       return 0;
-
-               if (sym_list && (!sym || !strlist__has_entry(sym_list, sym->name)))
-                       return 0;
-
-               if (hist_entry__add(thread, map, dso, sym, ip, chain, level, period)) {
-                       eprintf("problem incrementing symbol count, skipping event\n");
-                       return -1;
-               }
-       }
-       total += period;
-
-       return 0;
-}
+       thread__find_addr_location(thread, cpumode,
+                                  MAP__FUNCTION, ip, &al, NULL);
+       /*
+        * We have to do this here as we may have a dso with no symbol hit that
+        * has a name longer than the ones with symbols sampled.
+        */
+       if (al.map && !sort_dso.elide && !al.map->dso->slen_calculated)
+               dso__calc_col_width(al.map->dso);
+
+       if (dso_list &&
+           (!al.map || !al.map->dso ||
+            !(strlist__has_entry(dso_list, al.map->dso->short_name) ||
+              (al.map->dso->short_name != al.map->dso->long_name &&
+               strlist__has_entry(dso_list, al.map->dso->long_name)))))
+               return 0;
 
-static int
-process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       struct thread *thread;
-       struct map *map = map__new(&event->mmap, cwd, cwdlen);
-
-       thread = threads__findnew(event->mmap.pid, &threads, &last_match);
-
-       dump_printf("%p [%p]: PERF_RECORD_MMAP %d/%d: [%p(%p) @ %p]: %s\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->mmap.pid,
-               event->mmap.tid,
-               (void *)(long)event->mmap.start,
-               (void *)(long)event->mmap.len,
-               (void *)(long)event->mmap.pgoff,
-               event->mmap.filename);
-
-       if (thread == NULL || map == NULL) {
-               dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n");
+       if (sym_list && al.sym && !strlist__has_entry(sym_list, al.sym->name))
                return 0;
+
+       if (hist_entry__add(&al, chain, period)) {
+               pr_debug("problem incrementing symbol count, skipping event\n");
+               return -1;
        }
 
-       thread__insert_map(thread, map);
-       total_mmap++;
+       event__stats.total += period;
 
        return 0;
 }
 
-static int
-process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+static int process_comm_event(event_t *event)
 {
-       struct thread *thread;
-
-       thread = threads__findnew(event->comm.pid, &threads, &last_match);
+       struct thread *thread = threads__findnew(event->comm.pid);
 
-       dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->comm.comm, event->comm.pid);
+       dump_printf(": %s:%d\n", event->comm.comm, event->comm.pid);
 
        if (thread == NULL ||
            thread__set_comm_adjust(thread, event->comm.comm)) {
                dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
                return -1;
        }
-       total_comm++;
-
-       return 0;
-}
-
-static int
-process_task_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       struct thread *thread;
-       struct thread *parent;
-
-       thread = threads__findnew(event->fork.pid, &threads, &last_match);
-       parent = threads__findnew(event->fork.ppid, &threads, &last_match);
-
-       dump_printf("%p [%p]: PERF_RECORD_%s: (%d:%d):(%d:%d)\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->header.type == PERF_RECORD_FORK ? "FORK" : "EXIT",
-               event->fork.pid, event->fork.tid,
-               event->fork.ppid, event->fork.ptid);
-
-       /*
-        * A thread clone will have the same PID for both
-        * parent and child.
-        */
-       if (thread == parent)
-               return 0;
-
-       if (event->header.type == PERF_RECORD_EXIT)
-               return 0;
-
-       if (!thread || !parent || thread__fork(thread, parent)) {
-               dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n");
-               return -1;
-       }
-       total_fork++;
 
        return 0;
 }
 
-static int
-process_lost_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       dump_printf("%p [%p]: PERF_RECORD_LOST: id:%Ld: lost:%Ld\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->lost.id,
-               event->lost.lost);
-
-       total_lost += event->lost.lost;
-
-       return 0;
-}
-
-static int
-process_read_event(event_t *event, unsigned long offset, unsigned long head)
+static int process_read_event(event_t *event)
 {
        struct perf_event_attr *attr;
 
@@ -1319,238 +716,91 @@ process_read_event(event_t *event, unsigned long offset, unsigned long head)
                                           event->read.value);
        }
 
-       dump_printf("%p [%p]: PERF_RECORD_READ: %d %d %s %Lu\n",
-                       (void *)(offset + head),
-                       (void *)(long)(event->header.size),
-                       event->read.pid,
-                       event->read.tid,
-                       attr ? __event_name(attr->type, attr->config)
-                            : "FAIL",
-                       event->read.value);
-
-       return 0;
-}
-
-static int
-process_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       trace_event(event);
-
-       switch (event->header.type) {
-       case PERF_RECORD_SAMPLE:
-               return process_sample_event(event, offset, head);
-
-       case PERF_RECORD_MMAP:
-               return process_mmap_event(event, offset, head);
-
-       case PERF_RECORD_COMM:
-               return process_comm_event(event, offset, head);
-
-       case PERF_RECORD_FORK:
-       case PERF_RECORD_EXIT:
-               return process_task_event(event, offset, head);
-
-       case PERF_RECORD_LOST:
-               return process_lost_event(event, offset, head);
-
-       case PERF_RECORD_READ:
-               return process_read_event(event, offset, head);
-
-       /*
-        * We dont process them right now but they are fine:
-        */
-
-       case PERF_RECORD_THROTTLE:
-       case PERF_RECORD_UNTHROTTLE:
-               return 0;
-
-       default:
-               return -1;
-       }
+       dump_printf(": %d %d %s %Lu\n", event->read.pid, event->read.tid,
+                   attr ? __event_name(attr->type, attr->config) : "FAIL",
+                   event->read.value);
 
        return 0;
 }
 
-static int __cmd_report(void)
+static int sample_type_check(u64 type)
 {
-       int ret, rc = EXIT_FAILURE;
-       unsigned long offset = 0;
-       unsigned long head, shift;
-       struct stat input_stat;
-       struct thread *idle;
-       event_t *event;
-       uint32_t size;
-       char *buf;
-
-       idle = register_idle_thread(&threads, &last_match);
-       thread__comm_adjust(idle);
-
-       if (show_threads)
-               perf_read_values_init(&show_threads_values);
-
-       input = open(input_name, O_RDONLY);
-       if (input < 0) {
-               fprintf(stderr, " failed to open file: %s", input_name);
-               if (!strcmp(input_name, "perf.data"))
-                       fprintf(stderr, "  (try 'perf record' first)");
-               fprintf(stderr, "\n");
-               exit(-1);
-       }
-
-       ret = fstat(input, &input_stat);
-       if (ret < 0) {
-               perror("failed to stat file");
-               exit(-1);
-       }
-
-       if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
-               fprintf(stderr, "file: %s not owned by current user or root\n", input_name);
-               exit(-1);
-       }
-
-       if (!input_stat.st_size) {
-               fprintf(stderr, "zero-sized file, nothing to do!\n");
-               exit(0);
-       }
-
-       header = perf_header__read(input);
-       head = header->data_offset;
-
-       sample_type = perf_header__sample_type(header);
+       sample_type = type;
 
        if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) {
                if (sort__has_parent) {
                        fprintf(stderr, "selected --sort parent, but no"
                                        " callchain data. Did you call"
                                        " perf record without -g?\n");
-                       exit(-1);
+                       return -1;
                }
                if (callchain) {
                        fprintf(stderr, "selected -g but no callchain data."
                                        " Did you call perf record without"
                                        " -g?\n");
-                       exit(-1);
+                       return -1;
                }
        } else if (callchain_param.mode != CHAIN_NONE && !callchain) {
                        callchain = 1;
                        if (register_callchain_param(&callchain_param) < 0) {
                                fprintf(stderr, "Can't register callchain"
                                                " params\n");
-                               exit(-1);
+                               return -1;
                        }
        }
 
-       if (load_kernel() < 0) {
-               perror("failed to load kernel symbols");
-               return EXIT_FAILURE;
-       }
-
-       if (!full_paths) {
-               if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
-                       perror("failed to get the current directory");
-                       return EXIT_FAILURE;
-               }
-               cwdlen = strlen(cwd);
-       } else {
-               cwd = NULL;
-               cwdlen = 0;
-       }
-
-       shift = page_size * (head / page_size);
-       offset += shift;
-       head -= shift;
-
-remap:
-       buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-                          MAP_SHARED, input, offset);
-       if (buf == MAP_FAILED) {
-               perror("failed to mmap file");
-               exit(-1);
-       }
-
-more:
-       event = (event_t *)(buf + head);
-
-       size = event->header.size;
-       if (!size)
-               size = 8;
-
-       if (head + event->header.size >= page_size * mmap_window) {
-               int munmap_ret;
-
-               shift = page_size * (head / page_size);
-
-               munmap_ret = munmap(buf, page_size * mmap_window);
-               assert(munmap_ret == 0);
-
-               offset += shift;
-               head -= shift;
-               goto remap;
-       }
-
-       size = event->header.size;
-
-       dump_printf("\n%p [%p]: event: %d\n",
-                       (void *)(offset + head),
-                       (void *)(long)event->header.size,
-                       event->header.type);
-
-       if (!size || process_event(event, offset, head) < 0) {
-
-               dump_printf("%p [%p]: skipping unknown header type: %d\n",
-                       (void *)(offset + head),
-                       (void *)(long)(event->header.size),
-                       event->header.type);
-
-               total_unknown++;
-
-               /*
-                * assume we lost track of the stream, check alignment, and
-                * increment a single u64 in the hope to catch on again 'soon'.
-                */
+       return 0;
+}
 
-               if (unlikely(head & 7))
-                       head &= ~7ULL;
+static struct perf_file_handler file_handler = {
+       .process_sample_event   = process_sample_event,
+       .process_mmap_event     = event__process_mmap,
+       .process_comm_event     = process_comm_event,
+       .process_exit_event     = event__process_task,
+       .process_fork_event     = event__process_task,
+       .process_lost_event     = event__process_lost,
+       .process_read_event     = process_read_event,
+       .sample_type_check      = sample_type_check,
+};
 
-               size = 8;
-       }
 
-       head += size;
+static int __cmd_report(void)
+{
+       struct thread *idle;
+       int ret;
 
-       if (offset + head >= header->data_offset + header->data_size)
-               goto done;
+       idle = register_idle_thread();
+       thread__comm_adjust(idle);
 
-       if (offset + head < (unsigned long)input_stat.st_size)
-               goto more;
+       if (show_threads)
+               perf_read_values_init(&show_threads_values);
 
-done:
-       rc = EXIT_SUCCESS;
-       close(input);
+       register_perf_file_handler(&file_handler);
 
-       dump_printf("      IP events: %10ld\n", total);
-       dump_printf("    mmap events: %10ld\n", total_mmap);
-       dump_printf("    comm events: %10ld\n", total_comm);
-       dump_printf("    fork events: %10ld\n", total_fork);
-       dump_printf("    lost events: %10ld\n", total_lost);
-       dump_printf(" unknown events: %10ld\n", total_unknown);
+       ret = mmap_dispatch_perf_file(&header, input_name, force,
+                                     full_paths, &event__cwdlen, &event__cwd);
+       if (ret)
+               return ret;
 
-       if (dump_trace)
+       if (dump_trace) {
+               event__print_totals();
                return 0;
+       }
 
-       if (verbose >= 3)
-               threads__fprintf(stdout, &threads);
+       if (verbose > 3)
+               threads__fprintf(stdout);
 
-       if (verbose >= 2)
+       if (verbose > 2)
                dsos__fprintf(stdout);
 
        collapse__resort();
-       output__resort(total);
-       output__fprintf(stdout, total);
+       output__resort(event__stats.total);
+       output__fprintf(stdout, event__stats.total);
 
        if (show_threads)
                perf_read_values_destroy(&show_threads_values);
 
-       return rc;
+       return ret;
 }
 
 static int
@@ -1606,7 +856,8 @@ setup:
        return 0;
 }
 
-static const char * const report_usage[] = {
+//static const char * const report_usage[] = {
+const char * const report_usage[] = {
        "perf report [<options>] <command>",
        NULL
 };
@@ -1618,9 +869,10 @@ static const struct option options[] = {
                    "be more verbose (show symbol address, etc)"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                    "dump raw trace in ASCII"),
-       OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
+       OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
+                  "file", "vmlinux pathname"),
        OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
-       OPT_BOOLEAN('m', "modules", &modules,
+       OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
                    "load module symbols - WARNING: use only with -k and LIVE kernel"),
        OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples,
                    "Show a column with the number of samples"),
@@ -1690,9 +942,8 @@ static void setup_list(struct strlist **list, const char *list_str,
 
 int cmd_report(int argc, const char **argv, const char *prefix __used)
 {
-       symbol__init();
-
-       page_size = getpagesize();
+       if (symbol__init(&symbol_conf) < 0)
+               return -1;
 
        argc = parse_options(argc, argv, options, report_usage, 0);
 
index ce2d5be4f30ef78bab41a803f868a0ee63708616..26b782f26ee1a97c35f8ce8445b07aa470a35806 100644 (file)
@@ -11,6 +11,7 @@
 #include "util/trace-event.h"
 
 #include "util/debug.h"
+#include "util/data_map.h"
 
 #include <sys/types.h>
 #include <sys/prctl.h>
 #include <math.h>
 
 static char                    const *input_name = "perf.data";
-static int                     input;
-static unsigned long           page_size;
-static unsigned long           mmap_window = 32;
-
-static unsigned long           total_comm = 0;
-
-static struct rb_root          threads;
-static struct thread           *last_match;
 
 static struct perf_header      *header;
 static u64                     sample_type;
@@ -35,11 +28,11 @@ static u64                  sample_type;
 static char                    default_sort_order[] = "avg, max, switch, runtime";
 static char                    *sort_order = default_sort_order;
 
+static int                     profile_cpu = -1;
+
 #define PR_SET_NAME            15               /* Set process name */
 #define MAX_CPUS               4096
 
-#define BUG_ON(x)              assert(!(x))
-
 static u64                     run_measurement_overhead;
 static u64                     sleep_measurement_overhead;
 
@@ -74,6 +67,7 @@ enum sched_event_type {
        SCHED_EVENT_RUN,
        SCHED_EVENT_SLEEP,
        SCHED_EVENT_WAKEUP,
+       SCHED_EVENT_MIGRATION,
 };
 
 struct sched_atom {
@@ -226,7 +220,7 @@ static void calibrate_sleep_measurement_overhead(void)
 static struct sched_atom *
 get_new_event(struct task_desc *task, u64 timestamp)
 {
-       struct sched_atom *event = calloc(1, sizeof(*event));
+       struct sched_atom *event = zalloc(sizeof(*event));
        unsigned long idx = task->nr_events;
        size_t size;
 
@@ -294,7 +288,7 @@ add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
                return;
        }
 
-       wakee_event->wait_sem = calloc(1, sizeof(*wakee_event->wait_sem));
+       wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
        sem_init(wakee_event->wait_sem, 0, 0);
        wakee_event->specific_wait = 1;
        event->wait_sem = wakee_event->wait_sem;
@@ -324,7 +318,7 @@ static struct task_desc *register_pid(unsigned long pid, const char *comm)
        if (task)
                return task;
 
-       task = calloc(1, sizeof(*task));
+       task = zalloc(sizeof(*task));
        task->pid = pid;
        task->nr = nr_tasks;
        strcpy(task->comm, comm);
@@ -398,6 +392,8 @@ process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
                                ret = sem_post(atom->wait_sem);
                        BUG_ON(ret);
                        break;
+               case SCHED_EVENT_MIGRATION:
+                       break;
                default:
                        BUG_ON(1);
        }
@@ -632,29 +628,6 @@ static void test_calibrations(void)
        printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
-static int
-process_comm_event(event_t *event, unsigned long offset, unsigned long head)
-{
-       struct thread *thread;
-
-       thread = threads__findnew(event->comm.pid, &threads, &last_match);
-
-       dump_printf("%p [%p]: perf_event_comm: %s:%d\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->comm.comm, event->comm.pid);
-
-       if (thread == NULL ||
-           thread__set_comm(thread, event->comm.comm)) {
-               dump_printf("problem processing perf_event_comm, skipping event.\n");
-               return -1;
-       }
-       total_comm++;
-
-       return 0;
-}
-
-
 struct raw_event_sample {
        u32 size;
        char data[0];
@@ -745,6 +718,22 @@ struct trace_fork_event {
        u32 child_pid;
 };
 
+struct trace_migrate_task_event {
+       u32 size;
+
+       u16 common_type;
+       u8 common_flags;
+       u8 common_preempt_count;
+       u32 common_pid;
+       u32 common_tgid;
+
+       char comm[16];
+       u32 pid;
+
+       u32 prio;
+       u32 cpu;
+};
+
 struct trace_sched_handler {
        void (*switch_event)(struct trace_switch_event *,
                             struct event *,
@@ -769,6 +758,12 @@ struct trace_sched_handler {
                           int cpu,
                           u64 timestamp,
                           struct thread *thread);
+
+       void (*migrate_task_event)(struct trace_migrate_task_event *,
+                          struct event *,
+                          int cpu,
+                          u64 timestamp,
+                          struct thread *thread);
 };
 
 
@@ -941,9 +936,7 @@ __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
 
 static void thread_atoms_insert(struct thread *thread)
 {
-       struct work_atoms *atoms;
-
-       atoms = calloc(sizeof(*atoms), 1);
+       struct work_atoms *atoms = zalloc(sizeof(*atoms));
        if (!atoms)
                die("No memory");
 
@@ -975,9 +968,7 @@ add_sched_out_event(struct work_atoms *atoms,
                    char run_state,
                    u64 timestamp)
 {
-       struct work_atom *atom;
-
-       atom = calloc(sizeof(*atom), 1);
+       struct work_atom *atom = zalloc(sizeof(*atom));
        if (!atom)
                die("Non memory");
 
@@ -1058,8 +1049,8 @@ latency_switch_event(struct trace_switch_event *switch_event,
                die("hm, delta: %Ld < 0 ?\n", delta);
 
 
-       sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
-       sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
+       sched_out = threads__findnew(switch_event->prev_pid);
+       sched_in = threads__findnew(switch_event->next_pid);
 
        out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
        if (!out_events) {
@@ -1092,13 +1083,10 @@ latency_runtime_event(struct trace_runtime_event *runtime_event,
                     u64 timestamp,
                     struct thread *this_thread __used)
 {
-       struct work_atoms *atoms;
-       struct thread *thread;
+       struct thread *thread = threads__findnew(runtime_event->pid);
+       struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
 
        BUG_ON(cpu >= MAX_CPUS || cpu < 0);
-
-       thread = threads__findnew(runtime_event->pid, &threads, &last_match);
-       atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
        if (!atoms) {
                thread_atoms_insert(thread);
                atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
@@ -1125,7 +1113,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
        if (!wakeup_event->success)
                return;
 
-       wakee = threads__findnew(wakeup_event->pid, &threads, &last_match);
+       wakee = threads__findnew(wakeup_event->pid);
        atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
        if (!atoms) {
                thread_atoms_insert(wakee);
@@ -1139,7 +1127,12 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 
        atom = list_entry(atoms->work_list.prev, struct work_atom, list);
 
-       if (atom->state != THREAD_SLEEPING)
+       /*
+        * You WILL be missing events if you've recorded only
+        * one CPU, or are looking at only one, so don't
+        * make useless noise.
+        */
+       if (profile_cpu == -1 && atom->state != THREAD_SLEEPING)
                nr_state_machine_bugs++;
 
        nr_timestamps++;
@@ -1152,11 +1145,51 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
        atom->wake_up_time = timestamp;
 }
 
+static void
+latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
+                    struct event *__event __used,
+                    int cpu __used,
+                    u64 timestamp,
+                    struct thread *thread __used)
+{
+       struct work_atoms *atoms;
+       struct work_atom *atom;
+       struct thread *migrant;
+
+       /*
+        * Only need to worry about migration when profiling one CPU.
+        */
+       if (profile_cpu == -1)
+               return;
+
+       migrant = threads__findnew(migrate_task_event->pid);
+       atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
+       if (!atoms) {
+               thread_atoms_insert(migrant);
+               register_pid(migrant->pid, migrant->comm);
+               atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
+               if (!atoms)
+                       die("migration-event: Internal tree error");
+               add_sched_out_event(atoms, 'R', timestamp);
+       }
+
+       BUG_ON(list_empty(&atoms->work_list));
+
+       atom = list_entry(atoms->work_list.prev, struct work_atom, list);
+       atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
+
+       nr_timestamps++;
+
+       if (atom->sched_out_time > timestamp)
+               nr_unordered_timestamps++;
+}
+
 static struct trace_sched_handler lat_ops  = {
        .wakeup_event           = latency_wakeup_event,
        .switch_event           = latency_switch_event,
        .runtime_event          = latency_runtime_event,
        .fork_event             = latency_fork_event,
+       .migrate_task_event     = latency_migrate_task_event,
 };
 
 static void output_lat_thread(struct work_atoms *work_list)
@@ -1385,8 +1418,8 @@ map_switch_event(struct trace_switch_event *switch_event,
                die("hm, delta: %Ld < 0 ?\n", delta);
 
 
-       sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
-       sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
+       sched_out = threads__findnew(switch_event->prev_pid);
+       sched_in = threads__findnew(switch_event->next_pid);
 
        curr_thread[this_cpu] = sched_in;
 
@@ -1516,6 +1549,26 @@ process_sched_exit_event(struct event *event,
                printf("sched_exit event %p\n", event);
 }
 
+static void
+process_sched_migrate_task_event(struct raw_event_sample *raw,
+                          struct event *event,
+                          int cpu __used,
+                          u64 timestamp __used,
+                          struct thread *thread __used)
+{
+       struct trace_migrate_task_event migrate_task_event;
+
+       FILL_COMMON_FIELDS(migrate_task_event, event, raw->data);
+
+       FILL_ARRAY(migrate_task_event, comm, event, raw->data);
+       FILL_FIELD(migrate_task_event, pid, event, raw->data);
+       FILL_FIELD(migrate_task_event, prio, event, raw->data);
+       FILL_FIELD(migrate_task_event, cpu, event, raw->data);
+
+       if (trace_handler->migrate_task_event)
+               trace_handler->migrate_task_event(&migrate_task_event, event, cpu, timestamp, thread);
+}
+
 static void
 process_raw_event(event_t *raw_event __used, void *more_data,
                  int cpu, u64 timestamp, struct thread *thread)
@@ -1539,23 +1592,23 @@ process_raw_event(event_t *raw_event __used, void *more_data,
                process_sched_fork_event(raw, event, cpu, timestamp, thread);
        if (!strcmp(event->name, "sched_process_exit"))
                process_sched_exit_event(event, cpu, timestamp, thread);
+       if (!strcmp(event->name, "sched_migrate_task"))
+               process_sched_migrate_task_event(raw, event, cpu, timestamp, thread);
 }
 
-static int
-process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+static int process_sample_event(event_t *event)
 {
-       char level;
-       int show = 0;
-       struct dso *dso = NULL;
        struct thread *thread;
        u64 ip = event->ip.ip;
        u64 timestamp = -1;
        u32 cpu = -1;
        u64 period = 1;
        void *more_data = event->ip.__more_data;
-       int cpumode;
 
-       thread = threads__findnew(event->ip.pid, &threads, &last_match);
+       if (!(sample_type & PERF_SAMPLE_RAW))
+               return 0;
+
+       thread = threads__findnew(event->ip.pid);
 
        if (sample_type & PERF_SAMPLE_TIME) {
                timestamp = *(u64 *)more_data;
@@ -1573,177 +1626,64 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                more_data += sizeof(u64);
        }
 
-       dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
+       dump_printf("(IP, %d): %d/%d: %p period: %Ld\n",
                event->header.misc,
                event->ip.pid, event->ip.tid,
                (void *)(long)ip,
                (long long)period);
 
-       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
        if (thread == NULL) {
-               eprintf("problem processing %d event, skipping it.\n",
-                       event->header.type);
+               pr_debug("problem processing %d event, skipping it.\n",
+                        event->header.type);
                return -1;
        }
 
-       cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
-
-       if (cpumode == PERF_RECORD_MISC_KERNEL) {
-               show = SHOW_KERNEL;
-               level = 'k';
-
-               dso = kernel_dso;
-
-               dump_printf(" ...... dso: %s\n", dso->name);
-
-       } else if (cpumode == PERF_RECORD_MISC_USER) {
-
-               show = SHOW_USER;
-               level = '.';
-
-       } else {
-               show = SHOW_HV;
-               level = 'H';
-
-               dso = hypervisor_dso;
+       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
-               dump_printf(" ...... dso: [hypervisor]\n");
-       }
+       if (profile_cpu != -1 && profile_cpu != (int) cpu)
+               return 0;
 
-       if (sample_type & PERF_SAMPLE_RAW)
-               process_raw_event(event, more_data, cpu, timestamp, thread);
+       process_raw_event(event, more_data, cpu, timestamp, thread);
 
        return 0;
 }
 
-static int
-process_event(event_t *event, unsigned long offset, unsigned long head)
+static int process_lost_event(event_t *event __used)
 {
-       trace_event(event);
-
-       nr_events++;
-       switch (event->header.type) {
-       case PERF_RECORD_MMAP:
-               return 0;
-       case PERF_RECORD_LOST:
-               nr_lost_chunks++;
-               nr_lost_events += event->lost.lost;
-               return 0;
-
-       case PERF_RECORD_COMM:
-               return process_comm_event(event, offset, head);
+       nr_lost_chunks++;
+       nr_lost_events += event->lost.lost;
 
-       case PERF_RECORD_EXIT ... PERF_RECORD_READ:
-               return 0;
+       return 0;
+}
 
-       case PERF_RECORD_SAMPLE:
-               return process_sample_event(event, offset, head);
+static int sample_type_check(u64 type)
+{
+       sample_type = type;
 
-       case PERF_RECORD_MAX:
-       default:
+       if (!(sample_type & PERF_SAMPLE_RAW)) {
+               fprintf(stderr,
+                       "No trace sample to read. Did you call perf record "
+                       "without -R?\n");
                return -1;
        }
 
        return 0;
 }
 
+static struct perf_file_handler file_handler = {
+       .process_sample_event   = process_sample_event,
+       .process_comm_event     = event__process_comm,
+       .process_lost_event     = process_lost_event,
+       .sample_type_check      = sample_type_check,
+};
+
 static int read_events(void)
 {
-       int ret, rc = EXIT_FAILURE;
-       unsigned long offset = 0;
-       unsigned long head = 0;
-       struct stat perf_stat;
-       event_t *event;
-       uint32_t size;
-       char *buf;
-
-       trace_report();
-       register_idle_thread(&threads, &last_match);
-
-       input = open(input_name, O_RDONLY);
-       if (input < 0) {
-               perror("failed to open file");
-               exit(-1);
-       }
-
-       ret = fstat(input, &perf_stat);
-       if (ret < 0) {
-               perror("failed to stat file");
-               exit(-1);
-       }
-
-       if (!perf_stat.st_size) {
-               fprintf(stderr, "zero-sized file, nothing to do!\n");
-               exit(0);
-       }
-       header = perf_header__read(input);
-       head = header->data_offset;
-       sample_type = perf_header__sample_type(header);
-
-       if (!(sample_type & PERF_SAMPLE_RAW))
-               die("No trace sample to read. Did you call perf record "
-                   "without -R?");
-
-       if (load_kernel() < 0) {
-               perror("failed to load kernel symbols");
-               return EXIT_FAILURE;
-       }
-
-remap:
-       buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-                          MAP_SHARED, input, offset);
-       if (buf == MAP_FAILED) {
-               perror("failed to mmap file");
-               exit(-1);
-       }
-
-more:
-       event = (event_t *)(buf + head);
-
-       size = event->header.size;
-       if (!size)
-               size = 8;
-
-       if (head + event->header.size >= page_size * mmap_window) {
-               unsigned long shift = page_size * (head / page_size);
-               int res;
-
-               res = munmap(buf, page_size * mmap_window);
-               assert(res == 0);
-
-               offset += shift;
-               head -= shift;
-               goto remap;
-       }
-
-       size = event->header.size;
-
-
-       if (!size || process_event(event, offset, head) < 0) {
-
-               /*
-                * assume we lost track of the stream, check alignment, and
-                * increment a single u64 in the hope to catch on again 'soon'.
-                */
-
-               if (unlikely(head & 7))
-                       head &= ~7ULL;
-
-               size = 8;
-       }
-
-       head += size;
-
-       if (offset + head < (unsigned long)perf_stat.st_size)
-               goto more;
-
-       rc = EXIT_SUCCESS;
-       close(input);
+       register_idle_thread();
+       register_perf_file_handler(&file_handler);
 
-       return rc;
+       return mmap_dispatch_perf_file(&header, input_name, 0, 0,
+                                      &event__cwdlen, &event__cwd);
 }
 
 static void print_bad_events(void)
@@ -1883,6 +1823,8 @@ static const struct option latency_options[] = {
                   "sort by key(s): runtime, switch, avg, max"),
        OPT_BOOLEAN('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
+       OPT_INTEGER('C', "CPU", &profile_cpu,
+                   "CPU to profile on"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                    "dump raw trace in ASCII"),
        OPT_END()
@@ -1960,8 +1902,7 @@ static int __cmd_record(int argc, const char **argv)
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
-       symbol__init();
-       page_size = getpagesize();
+       symbol__init(0);
 
        argc = parse_options(argc, argv, sched_options, sched_usage,
                             PARSE_OPT_STOP_AT_NON_OPTION);
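
The perf sched diff above adds a migrate_task_event slot to struct trace_sched_handler and fills it from the new sched_migrate_task parsing in process_raw_event(), with lat_ops as the first consumer. Below is a sketch of another possible ops table that reuses the latency callbacks shown in the diff but only counts migrations; count_ops, count_migrate_task_event and nr_migrations are invented names, and selecting the table by pointing trace_handler at it mirrors how lat_ops is used but is not shown in this hunk.

static unsigned long nr_migrations;

static void
count_migrate_task_event(struct trace_migrate_task_event *migrate_event __used,
			 struct event *event __used,
			 int cpu __used,
			 u64 timestamp __used,
			 struct thread *thread __used)
{
	/* only count migrations, leave the latency state machine alone */
	nr_migrations++;
}

static struct trace_sched_handler count_ops = {
	.wakeup_event		= latency_wakeup_event,
	.switch_event		= latency_switch_event,
	.runtime_event		= latency_runtime_event,
	.fork_event		= latency_fork_event,
	.migrate_task_event	= count_migrate_task_event,
};
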
index 3db31e7bf1737a438cc93e4648e987ec0073aeb1..c70d72003557f17f29345b0f219dc5ca9f572d75 100644 (file)
 
 static struct perf_event_attr default_attrs[] = {
 
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK     },
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS    },
-
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES     },
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS   },
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES   },
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK             },
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES       },
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS         },
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS            },
+
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES             },
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS           },
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS    },
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES          },
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES       },
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES           },
 
 };
 
@@ -125,6 +127,7 @@ struct stats                        event_res_stats[MAX_COUNTERS][3];
 struct stats                   runtime_nsecs_stats;
 struct stats                   walltime_nsecs_stats;
 struct stats                   runtime_cycles_stats;
+struct stats                   runtime_branches_stats;
 
 #define MATCH_EVENT(t, c, counter)                     \
        (attrs[counter].type == PERF_TYPE_##t &&        \
@@ -235,6 +238,8 @@ static void read_counter(int counter)
                update_stats(&runtime_nsecs_stats, count[0]);
        if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
                update_stats(&runtime_cycles_stats, count[0]);
+       if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
+               update_stats(&runtime_branches_stats, count[0]);
 }
 
 static int run_perf_stat(int argc __used, const char **argv)
@@ -352,7 +357,16 @@ static void abs_printout(int counter, double avg)
                        ratio = avg / total;
 
                fprintf(stderr, " # %10.3f IPC  ", ratio);
-       } else {
+       } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
+                       runtime_branches_stats.n != 0) {
+               total = avg_stats(&runtime_branches_stats);
+
+               if (total)
+                       ratio = avg * 100 / total;
+
+               fprintf(stderr, " # %10.3f %%    ", ratio);
+
+       } else if (runtime_nsecs_stats.n != 0) {
                total = avg_stats(&runtime_nsecs_stats);
 
                if (total)
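
The perf stat diff above adds branch instructions and branch misses to default_attrs, accumulates branch instructions in runtime_branches_stats, and prints branch misses as a percentage of them (ratio = avg * 100 / total in abs_printout()). A standalone illustration of that arithmetic, with made-up counter values:

#include <stdio.h>

int main(void)
{
	/* made-up counter averages, standing in for avg_stats() results */
	double branch_misses = 1200000.0;	/* HW_BRANCH_MISSES       */
	double branches      = 48000000.0;	/* HW_BRANCH_INSTRUCTIONS */
	double ratio         = 0.0;

	if (branches)				/* same guard as abs_printout() */
		ratio = branch_misses * 100 / branches;

	fprintf(stderr, " # %10.3f %%    \n", ratio);	/* prints 2.500 */
	return 0;
}
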
index e8a510d935e55d2cdae348e532fbdd47bf22cd4d..cb58b6605fcc875fec3d0122e8607b17a408db3c 100644 (file)
 #include "util/header.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
+#include "util/event.h"
+#include "util/data_map.h"
 #include "util/svghelper.h"
 
 static char            const *input_name = "perf.data";
 static char            const *output_name = "output.svg";
 
 
-static unsigned long   page_size;
-static unsigned long   mmap_window = 32;
 static u64             sample_type;
 
 static unsigned int    numcpus;
@@ -49,8 +49,6 @@ static u64            first_time, last_time;
 static int             power_only;
 
 
-static struct perf_header      *header;
-
 struct per_pid;
 struct per_pidcomm;
 
@@ -153,6 +151,17 @@ static struct wake_event     *wake_events;
 
 struct sample_wrapper *all_samples;
 
+
+struct process_filter;
+struct process_filter {
+       char                    *name;
+       int                     pid;
+       struct process_filter   *next;
+};
+
+static struct process_filter *process_filter;
+
+
 static struct per_pid *find_create_pid(int pid)
 {
        struct per_pid *cursor = all_data;
@@ -763,11 +772,11 @@ static void draw_wakeups(void)
                                c = p->all;
                                while (c) {
                                        if (c->Y && c->start_time <= we->time && c->end_time >= we->time) {
-                                               if (p->pid == we->waker) {
+                                               if (p->pid == we->waker && !from) {
                                                        from = c->Y;
                                                        task_from = strdup(c->comm);
                                                }
-                                               if (p->pid == we->wakee) {
+                                               if (p->pid == we->wakee && !to) {
                                                        to = c->Y;
                                                        task_to = strdup(c->comm);
                                                }
@@ -882,12 +891,89 @@ static void draw_process_bars(void)
        }
 }
 
+static void add_process_filter(const char *string)
+{
+       struct process_filter *filt;
+       int pid;
+
+       pid = strtoull(string, NULL, 10);
+       filt = malloc(sizeof(struct process_filter));
+       if (!filt)
+               return;
+
+       filt->name = strdup(string);
+       filt->pid  = pid;
+       filt->next = process_filter;
+
+       process_filter = filt;
+}
+
+static int passes_filter(struct per_pid *p, struct per_pidcomm *c)
+{
+       struct process_filter *filt;
+       if (!process_filter)
+               return 1;
+
+       filt = process_filter;
+       while (filt) {
+               if (filt->pid && p->pid == filt->pid)
+                       return 1;
+               if (strcmp(filt->name, c->comm) == 0)
+                       return 1;
+               filt = filt->next;
+       }
+       return 0;
+}
+
+static int determine_display_tasks_filtered(void)
+{
+       struct per_pid *p;
+       struct per_pidcomm *c;
+       int count = 0;
+
+       p = all_data;
+       while (p) {
+               p->display = 0;
+               if (p->start_time == 1)
+                       p->start_time = first_time;
+
+               /* no exit marker, task kept running to the end */
+               if (p->end_time == 0)
+                       p->end_time = last_time;
+
+               c = p->all;
+
+               while (c) {
+                       c->display = 0;
+
+                       if (c->start_time == 1)
+                               c->start_time = first_time;
+
+                       if (passes_filter(p, c)) {
+                               c->display = 1;
+                               p->display = 1;
+                               count++;
+                       }
+
+                       if (c->end_time == 0)
+                               c->end_time = last_time;
+
+                       c = c->next;
+               }
+               p = p->next;
+       }
+       return count;
+}
+
 static int determine_display_tasks(u64 threshold)
 {
        struct per_pid *p;
        struct per_pidcomm *c;
        int count = 0;
 
+       if (process_filter)
+               return determine_display_tasks_filtered();
+
        p = all_data;
        while (p) {
                p->display = 0;
@@ -957,36 +1043,6 @@ static void write_svg_file(const char *filename)
        svg_close();
 }
 
-static int
-process_event(event_t *event)
-{
-
-       switch (event->header.type) {
-
-       case PERF_RECORD_COMM:
-               return process_comm_event(event);
-       case PERF_RECORD_FORK:
-               return process_fork_event(event);
-       case PERF_RECORD_EXIT:
-               return process_exit_event(event);
-       case PERF_RECORD_SAMPLE:
-               return queue_sample_event(event);
-
-       /*
-        * We dont process them right now but they are fine:
-        */
-       case PERF_RECORD_MMAP:
-       case PERF_RECORD_THROTTLE:
-       case PERF_RECORD_UNTHROTTLE:
-               return 0;
-
-       default:
-               return -1;
-       }
-
-       return 0;
-}
-
 static void process_samples(void)
 {
        struct sample_wrapper *cursor;
@@ -1002,107 +1058,38 @@ static void process_samples(void)
        }
 }
 
-
-static int __cmd_timechart(void)
+static int sample_type_check(u64 type)
 {
-       int ret, rc = EXIT_FAILURE;
-       unsigned long offset = 0;
-       unsigned long head, shift;
-       struct stat statbuf;
-       event_t *event;
-       uint32_t size;
-       char *buf;
-       int input;
-
-       input = open(input_name, O_RDONLY);
-       if (input < 0) {
-               fprintf(stderr, " failed to open file: %s", input_name);
-               if (!strcmp(input_name, "perf.data"))
-                       fprintf(stderr, "  (try 'perf record' first)");
-               fprintf(stderr, "\n");
-               exit(-1);
-       }
-
-       ret = fstat(input, &statbuf);
-       if (ret < 0) {
-               perror("failed to stat file");
-               exit(-1);
-       }
-
-       if (!statbuf.st_size) {
-               fprintf(stderr, "zero-sized file, nothing to do!\n");
-               exit(0);
-       }
-
-       header = perf_header__read(input);
-       head = header->data_offset;
-
-       sample_type = perf_header__sample_type(header);
+       sample_type = type;
 
-       shift = page_size * (head / page_size);
-       offset += shift;
-       head -= shift;
-
-remap:
-       buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-                          MAP_SHARED, input, offset);
-       if (buf == MAP_FAILED) {
-               perror("failed to mmap file");
-               exit(-1);
-       }
-
-more:
-       event = (event_t *)(buf + head);
-
-       size = event->header.size;
-       if (!size)
-               size = 8;
-
-       if (head + event->header.size >= page_size * mmap_window) {
-               int ret2;
-
-               shift = page_size * (head / page_size);
-
-               ret2 = munmap(buf, page_size * mmap_window);
-               assert(ret2 == 0);
-
-               offset += shift;
-               head -= shift;
-               goto remap;
-       }
-
-       size = event->header.size;
-
-       if (!size || process_event(event) < 0) {
-
-               printf("%p [%p]: skipping unknown header type: %d\n",
-                       (void *)(offset + head),
-                       (void *)(long)(event->header.size),
-                       event->header.type);
-
-               /*
-                * assume we lost track of the stream, check alignment, and
-                * increment a single u64 in the hope to catch on again 'soon'.
-                */
-
-               if (unlikely(head & 7))
-                       head &= ~7ULL;
-
-               size = 8;
+       if (!(sample_type & PERF_SAMPLE_RAW)) {
+               fprintf(stderr, "No trace samples found in the file.\n"
+                               "Have you used 'perf timechart record' to record it?\n");
+               return -1;
        }
 
-       head += size;
+       return 0;
+}
 
-       if (offset + head >= header->data_offset + header->data_size)
-               goto done;
+static struct perf_file_handler file_handler = {
+       .process_comm_event     = process_comm_event,
+       .process_fork_event     = process_fork_event,
+       .process_exit_event     = process_exit_event,
+       .process_sample_event   = queue_sample_event,
+       .sample_type_check      = sample_type_check,
+};
 
-       if (offset + head < (unsigned long)statbuf.st_size)
-               goto more;
+static int __cmd_timechart(void)
+{
+       struct perf_header *header;
+       int ret;
 
-done:
-       rc = EXIT_SUCCESS;
-       close(input);
+       register_perf_file_handler(&file_handler);
 
+       ret = mmap_dispatch_perf_file(&header, input_name, 0, 0,
+                                     &event__cwdlen, &event__cwd);
+       if (ret)
+               return EXIT_FAILURE;
 
        process_samples();
 
@@ -1112,9 +1099,10 @@ done:
 
        write_svg_file(output_name);
 
-       printf("Written %2.1f seconds of trace to %s.\n", (last_time - first_time) / 1000000000.0, output_name);
+       pr_info("Written %2.1f seconds of trace to %s.\n",
+               (last_time - first_time) / 1000000000.0, output_name);
 
-       return rc;
+       return EXIT_SUCCESS;
 }
 
 static const char * const timechart_usage[] = {
@@ -1153,6 +1141,14 @@ static int __cmd_record(int argc, const char **argv)
        return cmd_record(i, rec_argv, NULL);
 }
 
+static int
+parse_process(const struct option *opt __used, const char *arg, int __used unset)
+{
+       if (arg)
+               add_process_filter(arg);
+       return 0;
+}
+
 static const struct option options[] = {
        OPT_STRING('i', "input", &input_name, "file",
                    "input file name"),
@@ -1160,17 +1156,18 @@ static const struct option options[] = {
                    "output file name"),
        OPT_INTEGER('w', "width", &svg_page_width,
                    "page width"),
-       OPT_BOOLEAN('p', "power-only", &power_only,
+       OPT_BOOLEAN('P', "power-only", &power_only,
                    "output power data only"),
+       OPT_CALLBACK('p', "process", NULL, "process",
+                     "process selector. Pass a pid or process name.",
+                      parse_process),
        OPT_END()
 };
 
 
 int cmd_timechart(int argc, const char **argv, const char *prefix __used)
 {
-       symbol__init();
-
-       page_size = getpagesize();
+       symbol__init(0);
 
        argc = parse_options(argc, argv, options, timechart_usage,
                        PARSE_OPT_STOP_AT_NON_OPTION);
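
The timechart diff above moves power-only output from -p to -P and repurposes -p as a process selector: every '-p <pid-or-name>' prepends a node to the process_filter list, and determine_display_tasks_filtered() keeps a task if its pid or comm matches any node, or if no filter was given at all. A self-contained model of that matching logic follows; main() and its example pids and names are made up. Compiled on its own it prints "1 1 0" for the three probes.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct process_filter {
	char			*name;
	int			pid;
	struct process_filter	*next;
};

static struct process_filter *process_filter;

/* "-p <pid-or-name>" prepends one node, as add_process_filter() does above */
static void add_process_filter(const char *string)
{
	struct process_filter *filt = malloc(sizeof(*filt));

	if (!filt)
		return;
	filt->name = strdup(string);
	filt->pid  = strtoull(string, NULL, 10);	/* 0 when not numeric */
	filt->next = process_filter;
	process_filter = filt;
}

/* a task is shown when no filter exists, or when its pid or comm matches */
static int passes_filter(int pid, const char *comm)
{
	struct process_filter *filt;

	if (!process_filter)
		return 1;
	for (filt = process_filter; filt; filt = filt->next)
		if ((filt->pid && pid == filt->pid) ||
		    strcmp(filt->name, comm) == 0)
			return 1;
	return 0;
}

int main(void)
{
	add_process_filter("2045");
	add_process_filter("firefox");

	printf("%d %d %d\n",
	       passes_filter(2045, "Xorg"),	/* 1: pid match    */
	       passes_filter(1, "firefox"),	/* 1: comm match   */
	       passes_filter(1, "Xorg"));	/* 0: filtered out */
	return 0;
}
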
index e23bc74e734fbf210312b04e776234f19920cced..e0a374d0e43a8da197177a7ccccbc3ada6349a2e 100644 (file)
@@ -22,6 +22,7 @@
 
 #include "util/symbol.h"
 #include "util/color.h"
+#include "util/thread.h"
 #include "util/util.h"
 #include <linux/rbtree.h>
 #include "util/parse-options.h"
 
 static int                     fd[MAX_NR_CPUS][MAX_COUNTERS];
 
-static int                     system_wide                     =  0;
+static int                     system_wide                     =      0;
 
-static int                     default_interval                = 100000;
+static int                     default_interval                =      0;
 
-static int                     count_filter                    =  5;
-static int                     print_entries                   = 15;
+static int                     count_filter                    =      5;
+static int                     print_entries;
 
-static int                     target_pid                      = -1;
-static int                     inherit                         =  0;
-static int                     profile_cpu                     = -1;
-static int                     nr_cpus                         =  0;
-static unsigned int            realtime_prio                   =  0;
-static int                     group                           =  0;
+static int                     target_pid                      =     -1;
+static int                     inherit                         =      0;
+static int                     profile_cpu                     =     -1;
+static int                     nr_cpus                         =      0;
+static unsigned int            realtime_prio                   =      0;
+static int                     group                           =      0;
 static unsigned int            page_size;
-static unsigned int            mmap_pages                      = 16;
-static int                     freq                            =  0;
+static unsigned int            mmap_pages                      =     16;
+static int                     freq                            =   1000; /* 1 KHz */
 
-static int                     delay_secs                      =  2;
-static int                     zero;
-static int                     dump_symtab;
+static int                     delay_secs                      =      2;
+static int                     zero                            =      0;
+static int                     dump_symtab                     =      0;
+
+static bool                    hide_kernel_symbols             =  false;
+static bool                    hide_user_symbols               =  false;
+static struct winsize          winsize;
+struct symbol_conf             symbol_conf;
 
 /*
  * Source
@@ -86,83 +92,126 @@ struct source_line {
        struct source_line      *next;
 };
 
-static char                    *sym_filter                     =  NULL;
-struct sym_entry               *sym_filter_entry               =  NULL;
-static int                     sym_pcnt_filter                 =  5;
-static int                     sym_counter                     =  0;
-static int                     display_weighted                = -1;
+static char                    *sym_filter                     =   NULL;
+struct sym_entry               *sym_filter_entry               =   NULL;
+static int                     sym_pcnt_filter                 =      5;
+static int                     sym_counter                     =      0;
+static int                     display_weighted                =     -1;
 
 /*
  * Symbols
  */
 
-static u64                     min_ip;
-static u64                     max_ip = -1ll;
+struct sym_entry_source {
+       struct source_line      *source;
+       struct source_line      *lines;
+       struct source_line      **lines_tail;
+       pthread_mutex_t         lock;
+};
 
 struct sym_entry {
        struct rb_node          rb_node;
        struct list_head        node;
-       unsigned long           count[MAX_COUNTERS];
        unsigned long           snap_count;
        double                  weight;
        int                     skip;
-       struct source_line      *source;
-       struct source_line      *lines;
-       struct source_line      **lines_tail;
-       pthread_mutex_t         source_lock;
+       u16                     name_len;
+       u8                      origin;
+       struct map              *map;
+       struct sym_entry_source *src;
+       unsigned long           count[0];
 };
 
 /*
  * Source functions
  */
 
+static inline struct symbol *sym_entry__symbol(struct sym_entry *self)
+{
+       return ((void *)self) + symbol_conf.priv_size;
+}
+
+static void get_term_dimensions(struct winsize *ws)
+{
+       char *s = getenv("LINES");
+
+       if (s != NULL) {
+               ws->ws_row = atoi(s);
+               s = getenv("COLUMNS");
+               if (s != NULL) {
+                       ws->ws_col = atoi(s);
+                       if (ws->ws_row && ws->ws_col)
+                               return;
+               }
+       }
+#ifdef TIOCGWINSZ
+       if (ioctl(1, TIOCGWINSZ, ws) == 0 &&
+           ws->ws_row && ws->ws_col)
+               return;
+#endif
+       ws->ws_row = 25;
+       ws->ws_col = 80;
+}
+
+static void update_print_entries(struct winsize *ws)
+{
+       print_entries = ws->ws_row;
+
+       if (print_entries > 9)
+               print_entries -= 9;
+}
+
+static void sig_winch_handler(int sig __used)
+{
+       get_term_dimensions(&winsize);
+       update_print_entries(&winsize);
+}
+
 static void parse_source(struct sym_entry *syme)
 {
        struct symbol *sym;
-       struct module *module;
-       struct section *section = NULL;
+       struct sym_entry_source *source;
+       struct map *map;
        FILE *file;
        char command[PATH_MAX*2];
-       const char *path = vmlinux_name;
-       u64 start, end, len;
+       const char *path;
+       u64 len;
 
        if (!syme)
                return;
 
-       if (syme->lines) {
-               pthread_mutex_lock(&syme->source_lock);
-               goto out_assign;
+       if (syme->src == NULL) {
+               syme->src = zalloc(sizeof(*source));
+               if (syme->src == NULL)
+                       return;
+               pthread_mutex_init(&syme->src->lock, NULL);
        }
 
-       sym = (struct symbol *)(syme + 1);
-       module = sym->module;
-
-       if (module)
-               path = module->path;
-       if (!path)
-               return;
-
-       start = sym->obj_start;
-       if (!start)
-               start = sym->start;
+       source = syme->src;
 
-       if (module) {
-               section = module->sections->find_section(module->sections, ".text");
-               if (section)
-                       start -= section->vma;
+       if (source->lines) {
+               pthread_mutex_lock(&source->lock);
+               goto out_assign;
        }
 
-       end = start + sym->end - sym->start + 1;
+       sym = sym_entry__symbol(syme);
+       map = syme->map;
+       path = map->dso->long_name;
+
        len = sym->end - sym->start;
 
-       sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", start, end, path);
+       sprintf(command,
+               "objdump --start-address=0x%016Lx "
+                        "--stop-address=0x%016Lx -dS %s",
+               map->unmap_ip(map, sym->start),
+               map->unmap_ip(map, sym->end), path);
 
        file = popen(command, "r");
        if (!file)
                return;
 
-       pthread_mutex_lock(&syme->source_lock);
-       syme->lines_tail = &syme->lines;
+       pthread_mutex_lock(&source->lock);
+       source->lines_tail = &source->lines;
        while (!feof(file)) {
                struct source_line *src;
                size_t dummy = 0;
@@ -182,24 +231,22 @@ static void parse_source(struct sym_entry *syme)
                        *c = 0;
 
                src->next = NULL;
-               *syme->lines_tail = src;
-               syme->lines_tail = &src->next;
+               *source->lines_tail = src;
+               source->lines_tail = &src->next;
 
                if (strlen(src->line)>8 && src->line[8] == ':') {
                        src->eip = strtoull(src->line, NULL, 16);
-                       if (section)
-                               src->eip += section->vma;
+                       src->eip = map->unmap_ip(map, src->eip);
                }
                if (strlen(src->line)>8 && src->line[16] == ':') {
                        src->eip = strtoull(src->line, NULL, 16);
-                       if (section)
-                               src->eip += section->vma;
+                       src->eip = map->unmap_ip(map, src->eip);
                }
        }
        pclose(file);
 out_assign:
        sym_filter_entry = syme;
-       pthread_mutex_unlock(&syme->source_lock);
+       pthread_mutex_unlock(&source->lock);
 }
 
 static void __zero_source_counters(struct sym_entry *syme)
@@ -207,7 +254,7 @@ static void __zero_source_counters(struct sym_entry *syme)
        int i;
        struct source_line *line;
 
-       line = syme->lines;
+       line = syme->src->lines;
        while (line) {
                for (i = 0; i < nr_counters; i++)
                        line->count[i] = 0;
@@ -222,13 +269,13 @@ static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip)
        if (syme != sym_filter_entry)
                return;
 
-       if (pthread_mutex_trylock(&syme->source_lock))
+       if (pthread_mutex_trylock(&syme->src->lock))
                return;
 
-       if (!syme->source)
+       if (syme->src == NULL || syme->src->source == NULL)
                goto out_unlock;
 
-       for (line = syme->lines; line; line = line->next) {
+       for (line = syme->src->lines; line; line = line->next) {
                if (line->eip == ip) {
                        line->count[counter]++;
                        break;
@@ -237,32 +284,25 @@ static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip)
                        break;
        }
 out_unlock:
-       pthread_mutex_unlock(&syme->source_lock);
+       pthread_mutex_unlock(&syme->src->lock);
 }
 
 static void lookup_sym_source(struct sym_entry *syme)
 {
-       struct symbol *symbol = (struct symbol *)(syme + 1);
+       struct symbol *symbol = sym_entry__symbol(syme);
        struct source_line *line;
        char pattern[PATH_MAX];
-       char *idx;
 
        sprintf(pattern, "<%s>:", symbol->name);
 
-       if (symbol->module) {
-               idx = strstr(pattern, "\t");
-               if (idx)
-                       *idx = 0;
-       }
-
-       pthread_mutex_lock(&syme->source_lock);
-       for (line = syme->lines; line; line = line->next) {
+       pthread_mutex_lock(&syme->src->lock);
+       for (line = syme->src->lines; line; line = line->next) {
                if (strstr(line->line, pattern)) {
-                       syme->source = line;
+                       syme->src->source = line;
                        break;
                }
        }
-       pthread_mutex_unlock(&syme->source_lock);
+       pthread_mutex_unlock(&syme->src->lock);
 }
 
 static void show_lines(struct source_line *queue, int count, int total)
@@ -292,24 +332,24 @@ static void show_details(struct sym_entry *syme)
        if (!syme)
                return;
 
-       if (!syme->source)
+       if (!syme->src->source)
                lookup_sym_source(syme);
 
-       if (!syme->source)
+       if (!syme->src->source)
                return;
 
-       symbol = (struct symbol *)(syme + 1);
+       symbol = sym_entry__symbol(syme);
        printf("Showing %s for %s\n", event_name(sym_counter), symbol->name);
        printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);
 
-       pthread_mutex_lock(&syme->source_lock);
-       line = syme->source;
+       pthread_mutex_lock(&syme->src->lock);
+       line = syme->src->source;
        while (line) {
                total += line->count[sym_counter];
                line = line->next;
        }
 
-       line = syme->source;
+       line = syme->src->source;
        while (line) {
                float pcnt = 0.0;
 
@@ -334,13 +374,13 @@ static void show_details(struct sym_entry *syme)
                line->count[sym_counter] = zero ? 0 : line->count[sym_counter] * 7 / 8;
                line = line->next;
        }
-       pthread_mutex_unlock(&syme->source_lock);
+       pthread_mutex_unlock(&syme->src->lock);
        if (more)
                printf("%d lines not displayed, maybe increase display entries [e]\n", more);
 }
 
 /*
- * Symbols will be added here in record_ip and will get out
+ * Symbols will be added here in event__process_sample and will get out
  * after decayed.
  */
 static LIST_HEAD(active_symbols);
@@ -411,6 +451,8 @@ static void print_sym_table(void)
        struct sym_entry *syme, *n;
        struct rb_root tmp = RB_ROOT;
        struct rb_node *nd;
+       int sym_width = 0, dso_width = 0, max_dso_width;
+       const int win_width = winsize.ws_col - 1;
 
        samples = userspace_samples = 0;
 
@@ -422,6 +464,14 @@ static void print_sym_table(void)
        list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
                syme->snap_count = syme->count[snap];
                if (syme->snap_count != 0) {
+
+                       if ((hide_user_symbols &&
+                            syme->origin == PERF_RECORD_MISC_USER) ||
+                           (hide_kernel_symbols &&
+                            syme->origin == PERF_RECORD_MISC_KERNEL)) {
+                               list_remove_active_sym(syme);
+                               continue;
+                       }
                        syme->weight = sym_weight(syme);
                        rb_insert_active_sym(&tmp, syme);
                        sum_ksamples += syme->snap_count;
@@ -434,8 +484,7 @@ static void print_sym_table(void)
 
        puts(CONSOLE_CLEAR);
 
-       printf(
-"------------------------------------------------------------------------------\n");
+       printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
        printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
                samples_per_sec,
                100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
@@ -473,33 +522,57 @@ static void print_sym_table(void)
                        printf(", %d CPUs)\n", nr_cpus);
        }
 
-       printf("------------------------------------------------------------------------------\n\n");
+       printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
 
        if (sym_filter_entry) {
                show_details(sym_filter_entry);
                return;
        }
 
+       /*
+        * Find the longest symbol name that will be displayed
+        */
+       for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
+               syme = rb_entry(nd, struct sym_entry, rb_node);
+               if (++printed > print_entries ||
+                   (int)syme->snap_count < count_filter)
+                       continue;
+
+               if (syme->map->dso->long_name_len > dso_width)
+                       dso_width = syme->map->dso->long_name_len;
+
+               if (syme->name_len > sym_width)
+                       sym_width = syme->name_len;
+       }
+
+       printed = 0;
+
+       max_dso_width = winsize.ws_col - sym_width - 29;
+       if (dso_width > max_dso_width)
+               dso_width = max_dso_width;
+       putchar('\n');
        if (nr_counters == 1)
-               printf("             samples    pcnt");
+               printf("             samples  pcnt");
        else
-               printf("   weight    samples    pcnt");
+               printf("   weight    samples  pcnt");
 
        if (verbose)
                printf("         RIP       ");
-       printf("   kernel function\n");
-       printf("   %s    _______   _____",
+       printf(" %-*.*s DSO\n", sym_width, sym_width, "function");
+       printf("   %s    _______ _____",
               nr_counters == 1 ? "      " : "______");
        if (verbose)
-               printf("   ________________");
-       printf("   _______________\n\n");
+               printf(" ________________");
+       printf(" %-*.*s", sym_width, sym_width, graph_line);
+       printf(" %-*.*s", dso_width, dso_width, graph_line);
+       puts("\n");
 
        for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
                struct symbol *sym;
                double pcnt;
 
                syme = rb_entry(nd, struct sym_entry, rb_node);
-               sym = (struct symbol *)(syme + 1);
+               sym = sym_entry__symbol(syme);
 
                if (++printed > print_entries || (int)syme->snap_count < count_filter)
                        continue;
@@ -508,17 +581,18 @@ static void print_sym_table(void)
                                         sum_ksamples));
 
                if (nr_counters == 1 || !display_weighted)
-                       printf("%20.2f ", syme->weight);
+                       printf("%20.2f ", syme->weight);
                else
-                       printf("%9.1f %10ld ", syme->weight, syme->snap_count);
+                       printf("%9.1f %10ld ", syme->weight, syme->snap_count);
 
                percent_color_fprintf(stdout, "%4.1f%%", pcnt);
                if (verbose)
-                       printf(" - %016llx", sym->start);
-               printf(" : %s", sym->name);
-               if (sym->module)
-                       printf("\t[%s]", sym->module->name);
-               printf("\n");
+                       printf(" %016llx", sym->start);
+               printf(" %-*.*s", sym_width, sym_width, sym->name);
+               printf(" %-*.*s\n", dso_width, dso_width,
+                      dso_width >= syme->map->dso->long_name_len ?
+                                       syme->map->dso->long_name :
+                                       syme->map->dso->short_name);
        }
 }
 
@@ -565,10 +639,10 @@ static void prompt_symbol(struct sym_entry **target, const char *msg)
 
        /* zero counters of active symbol */
        if (syme) {
-               pthread_mutex_lock(&syme->source_lock);
+               pthread_mutex_lock(&syme->src->lock);
                __zero_source_counters(syme);
                *target = NULL;
-               pthread_mutex_unlock(&syme->source_lock);
+               pthread_mutex_unlock(&syme->src->lock);
        }
 
        fprintf(stdout, "\n%s: ", msg);
@@ -584,7 +658,7 @@ static void prompt_symbol(struct sym_entry **target, const char *msg)
        pthread_mutex_unlock(&active_symbols_lock);
 
        list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
-               struct symbol *sym = (struct symbol *)(syme + 1);
+               struct symbol *sym = sym_entry__symbol(syme);
 
                if (!strcmp(buf, sym->name)) {
                        found = syme;
@@ -608,7 +682,7 @@ static void print_mapped_keys(void)
        char *name = NULL;
 
        if (sym_filter_entry) {
-               struct symbol *sym = (struct symbol *)(sym_filter_entry+1);
+               struct symbol *sym = sym_entry__symbol(sym_filter_entry);
                name = sym->name;
        }
 
@@ -621,7 +695,7 @@ static void print_mapped_keys(void)
 
        fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", count_filter);
 
-       if (vmlinux_name) {
+       if (symbol_conf.vmlinux_name) {
                fprintf(stdout, "\t[F]     annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter);
                fprintf(stdout, "\t[s]     annotate symbol.                   \t(%s)\n", name?: "NULL");
                fprintf(stdout, "\t[S]     stop annotation.\n");
@@ -630,6 +704,12 @@ static void print_mapped_keys(void)
        if (nr_counters > 1)
                fprintf(stdout, "\t[w]     toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0);
 
+       fprintf(stdout,
+               "\t[K]     hide kernel symbols.               \t(%s)\n",
+               hide_kernel_symbols ? "yes" : "no");
+       fprintf(stdout,
+               "\t[U]     hide user symbols.               \t(%s)\n",
+               hide_user_symbols ? "yes" : "no");
        fprintf(stdout, "\t[z]     toggle sample zeroing.             \t(%d)\n", zero ? 1 : 0);
        fprintf(stdout, "\t[qQ]    quit.\n");
 }
@@ -643,6 +723,8 @@ static int key_mapped(int c)
                case 'z':
                case 'q':
                case 'Q':
+               case 'K':
+               case 'U':
                        return 1;
                case 'E':
                case 'w':
@@ -650,7 +732,7 @@ static int key_mapped(int c)
                case 'F':
                case 's':
                case 'S':
-                       return vmlinux_name ? 1 : 0;
+                       return symbol_conf.vmlinux_name ? 1 : 0;
                default:
                        break;
        }
@@ -691,6 +773,11 @@ static void handle_keypress(int c)
                        break;
                case 'e':
                        prompt_integer(&print_entries, "Enter display entries (lines)");
+                       if (print_entries == 0) {
+                               sig_winch_handler(SIGWINCH);
+                               signal(SIGWINCH, sig_winch_handler);
+                       } else
+                               signal(SIGWINCH, SIG_DFL);
                        break;
                case 'E':
                        if (nr_counters > 1) {
@@ -715,9 +802,14 @@ static void handle_keypress(int c)
                case 'F':
                        prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)");
                        break;
+               case 'K':
+                       hide_kernel_symbols = !hide_kernel_symbols;
+                       break;
                case 'q':
                case 'Q':
                        printf("exiting.\n");
+                       if (dump_symtab)
+                               dsos__fprintf(stderr);
                        exit(0);
                case 's':
                        prompt_symbol(&sym_filter_entry, "Enter details symbol");
@@ -728,12 +820,15 @@ static void handle_keypress(int c)
                        else {
                                struct sym_entry *syme = sym_filter_entry;
 
-                               pthread_mutex_lock(&syme->source_lock);
+                               pthread_mutex_lock(&syme->src->lock);
                                sym_filter_entry = NULL;
                                __zero_source_counters(syme);
-                               pthread_mutex_unlock(&syme->source_lock);
+                               pthread_mutex_unlock(&syme->src->lock);
                        }
                        break;
+               case 'U':
+                       hide_user_symbols = !hide_user_symbols;
+                       break;
                case 'w':
                        display_weighted = ~display_weighted;
                        break;
@@ -790,7 +885,7 @@ static const char *skip_symbols[] = {
        NULL
 };
 
-static int symbol_filter(struct dso *self, struct symbol *sym)
+static int symbol_filter(struct map *map, struct symbol *sym)
 {
        struct sym_entry *syme;
        const char *name = sym->name;
@@ -812,8 +907,9 @@ static int symbol_filter(struct dso *self, struct symbol *sym)
            strstr(name, "_text_end"))
                return 1;
 
-       syme = dso__sym_priv(self, sym);
-       pthread_mutex_init(&syme->source_lock, NULL);
+       syme = symbol__priv(sym);
+       syme->map = map;
+       syme->src = NULL;
        if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter))
                sym_filter_entry = syme;
 
@@ -824,75 +920,65 @@ static int symbol_filter(struct dso *self, struct symbol *sym)
                }
        }
 
-       return 0;
-}
-
-static int parse_symbols(void)
-{
-       struct rb_node *node;
-       struct symbol  *sym;
-       int use_modules = vmlinux_name ? 1 : 0;
-
-       kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry));
-       if (kernel_dso == NULL)
-               return -1;
-
-       if (dso__load_kernel(kernel_dso, vmlinux_name, symbol_filter, verbose, use_modules) <= 0)
-               goto out_delete_dso;
-
-       node = rb_first(&kernel_dso->syms);
-       sym = rb_entry(node, struct symbol, rb_node);
-       min_ip = sym->start;
-
-       node = rb_last(&kernel_dso->syms);
-       sym = rb_entry(node, struct symbol, rb_node);
-       max_ip = sym->end;
-
-       if (dump_symtab)
-               dso__fprintf(kernel_dso, stderr);
+       if (!syme->skip)
+               syme->name_len = strlen(sym->name);
 
        return 0;
-
-out_delete_dso:
-       dso__delete(kernel_dso);
-       kernel_dso = NULL;
-       return -1;
 }
 
-/*
- * Binary search in the histogram table and record the hit:
- */
-static void record_ip(u64 ip, int counter)
+static void event__process_sample(const event_t *self, int counter)
 {
-       struct symbol *sym = dso__find_symbol(kernel_dso, ip);
-
-       if (sym != NULL) {
-               struct sym_entry *syme = dso__sym_priv(kernel_dso, sym);
-
-               if (!syme->skip) {
-                       syme->count[counter]++;
-                       record_precise_ip(syme, counter, ip);
-                       pthread_mutex_lock(&active_symbols_lock);
-                       if (list_empty(&syme->node) || !syme->node.next)
-                               __list_insert_active_sym(syme);
-                       pthread_mutex_unlock(&active_symbols_lock);
+       u64 ip = self->ip.ip;
+       struct sym_entry *syme;
+       struct addr_location al;
+       u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+
+       switch (origin) {
+       case PERF_RECORD_MISC_USER:
+               if (hide_user_symbols)
                        return;
-               }
+               break;
+       case PERF_RECORD_MISC_KERNEL:
+               if (hide_kernel_symbols)
+                       return;
+               break;
+       default:
+               return;
        }
 
-       samples--;
+       if (event__preprocess_sample(self, &al, symbol_filter) < 0 ||
+           al.sym == NULL)
+               return;
+
+       syme = symbol__priv(al.sym);
+       if (!syme->skip) {
+               syme->count[counter]++;
+               syme->origin = origin;
+               record_precise_ip(syme, counter, ip);
+               pthread_mutex_lock(&active_symbols_lock);
+               if (list_empty(&syme->node) || !syme->node.next)
+                       __list_insert_active_sym(syme);
+               pthread_mutex_unlock(&active_symbols_lock);
+               if (origin == PERF_RECORD_MISC_USER)
+                       ++userspace_samples;
+               ++samples;
+       }
 }
 
-static void process_event(u64 ip, int counter, int user)
+static int event__process(event_t *event)
 {
-       samples++;
-
-       if (user) {
-               userspace_samples++;
-               return;
+       switch (event->header.type) {
+       case PERF_RECORD_COMM:
+               event__process_comm(event);
+               break;
+       case PERF_RECORD_MMAP:
+               event__process_mmap(event);
+               break;
+       default:
+               break;
        }
 
-       record_ip(ip, counter);
+       return 0;
 }
 
 struct mmap_data {
@@ -913,8 +999,6 @@ static unsigned int mmap_read_head(struct mmap_data *md)
        return head;
 }
 
-struct timeval last_read, this_read;
-
 static void mmap_read_counter(struct mmap_data *md)
 {
        unsigned int head = mmap_read_head(md);
@@ -922,8 +1006,6 @@ static void mmap_read_counter(struct mmap_data *md)
        unsigned char *data = md->base + page_size;
        int diff;
 
-       gettimeofday(&this_read, NULL);
-
        /*
         * If we're further behind than half the buffer, there's a chance
         * the writer will bite our tail and mess up the samples under us.
@@ -934,14 +1016,7 @@ static void mmap_read_counter(struct mmap_data *md)
         */
        diff = head - old;
        if (diff > md->mask / 2 || diff < 0) {
-               struct timeval iv;
-               unsigned long msecs;
-
-               timersub(&this_read, &last_read, &iv);
-               msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
-
-               fprintf(stderr, "WARNING: failed to keep up with mmap data."
-                               "  Last read %lu msecs ago.\n", msecs);
+               fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
 
                /*
                 * head points to a known good entry, start there.
@@ -949,8 +1024,6 @@ static void mmap_read_counter(struct mmap_data *md)
                old = head;
        }
 
-       last_read = this_read;
-
        for (; old != head;) {
                event_t *event = (event_t *)&data[old & md->mask];
 
@@ -978,13 +1051,11 @@ static void mmap_read_counter(struct mmap_data *md)
                        event = &event_copy;
                }
 
+               if (event->header.type == PERF_RECORD_SAMPLE)
+                       event__process_sample(event, md->counter);
+               else
+                       event__process(event);
                old += size;
-
-               if (event->header.type == PERF_RECORD_SAMPLE) {
-                       int user =
-       (event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_USER;
-                       process_event(event->ip.ip, md->counter, user);
-               }
        }
 
        md->prev = old;
@@ -1018,8 +1089,15 @@ static void start_counter(int i, int counter)
        attr = attrs + counter;
 
        attr->sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-       attr->freq              = freq;
+
+       if (freq) {
+               attr->sample_type       |= PERF_SAMPLE_PERIOD;
+               attr->freq              = 1;
+               attr->sample_freq       = freq;
+       }
+
        attr->inherit           = (cpu < 0) && inherit;
+       attr->mmap              = 1;
 
 try_again:
        fd[i][counter] = sys_perf_event_open(attr, target_pid, cpu, group_fd, 0);
@@ -1078,6 +1156,11 @@ static int __cmd_top(void)
        int i, counter;
        int ret;
 
+       if (target_pid != -1)
+               event__synthesize_thread(target_pid, event__process);
+       else
+               event__synthesize_threads(event__process);
+
        for (i = 0; i < nr_cpus; i++) {
                group_fd = -1;
                for (counter = 0; counter < nr_counters; counter++)
@@ -1133,7 +1216,10 @@ static const struct option options[] = {
                            "system-wide collection from all CPUs"),
        OPT_INTEGER('C', "CPU", &profile_cpu,
                    "CPU to profile on"),
-       OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
+       OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
+                  "file", "vmlinux pathname"),
+       OPT_BOOLEAN('K', "hide_kernel_symbols", &hide_kernel_symbols,
+                   "hide kernel symbols"),
        OPT_INTEGER('m', "mmap-pages", &mmap_pages,
                    "number of mmap data pages"),
        OPT_INTEGER('r', "realtime", &realtime_prio,
@@ -1156,6 +1242,8 @@ static const struct option options[] = {
                    "profile at this frequency"),
        OPT_INTEGER('E', "entries", &print_entries,
                    "display this many functions"),
+       OPT_BOOLEAN('U', "hide_user_symbols", &hide_user_symbols,
+                   "hide user symbols"),
        OPT_BOOLEAN('v', "verbose", &verbose,
                    "be more verbose (show counter open errors, etc)"),
        OPT_END()
@@ -1165,19 +1253,12 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 {
        int counter;
 
-       symbol__init();
-
        page_size = sysconf(_SC_PAGE_SIZE);
 
        argc = parse_options(argc, argv, options, top_usage, 0);
        if (argc)
                usage_with_options(top_usage, options);
 
-       if (freq) {
-               default_interval = freq;
-               freq = 1;
-       }
-
        /* CPU and PID are mutually exclusive */
        if (target_pid != -1 && profile_cpu != -1) {
                printf("WARNING: PID switch overriding CPU\n");
@@ -1188,12 +1269,30 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
        if (!nr_counters)
                nr_counters = 1;
 
+       symbol_conf.priv_size = (sizeof(struct sym_entry) +
+                                (nr_counters + 1) * sizeof(unsigned long));
+       if (symbol_conf.vmlinux_name == NULL)
+               symbol_conf.try_vmlinux_path = true;
+       if (symbol__init(&symbol_conf) < 0)
+               return -1;
+
        if (delay_secs < 1)
                delay_secs = 1;
 
-       parse_symbols();
        parse_source(sym_filter_entry);
 
+       /*
+        * User specified count overrides default frequency.
+        */
+       if (default_interval)
+               freq = 0;
+       else if (freq) {
+               default_interval = freq;
+       } else {
+               fprintf(stderr, "frequency and count are zero, aborting\n");
+               exit(EXIT_FAILURE);
+       }
+
        /*
         * Fill in the ones not specifically initialized via -c:
         */
@@ -1211,5 +1310,11 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
        if (target_pid != -1 || profile_cpu != -1)
                nr_cpus = 1;
 
+       get_term_dimensions(&winsize);
+       if (print_entries == 0) {
+               update_print_entries(&winsize);
+               signal(SIGWINCH, sig_winch_handler);
+       }
+
        return __cmd_top();
 }
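
The builtin-top.c hunks above replace the old fixed 80-column layout with one sized from the terminal: get_term_dimensions() probes the LINES/COLUMNS environment variables, falls back to the TIOCGWINSZ ioctl, and finally to 80x25, and a SIGWINCH handler re-probes on resize. The standalone sketch below reproduces just that probing order outside of perf; the names probe_term and on_winch are invented for the illustration and are not part of the commit.

        #include <signal.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/ioctl.h>
        #include <unistd.h>

        static struct winsize winsize;

        /* Same probing order as get_term_dimensions() in the diff above. */
        static void probe_term(struct winsize *ws)
        {
                char *s = getenv("LINES");

                if (s != NULL) {
                        ws->ws_row = atoi(s);
                        s = getenv("COLUMNS");
                        if (s != NULL) {
                                ws->ws_col = atoi(s);
                                if (ws->ws_row && ws->ws_col)
                                        return;
                        }
                }
        #ifdef TIOCGWINSZ
                if (ioctl(STDOUT_FILENO, TIOCGWINSZ, ws) == 0 &&
                    ws->ws_row && ws->ws_col)
                        return;
        #endif
                ws->ws_row = 25;        /* conservative fallback, as in the commit */
                ws->ws_col = 80;
        }

        static void on_winch(int sig)
        {
                (void)sig;
                probe_term(&winsize);   /* perf top additionally recomputes print_entries here */
        }

        int main(void)
        {
                probe_term(&winsize);
                signal(SIGWINCH, on_winch);
                printf("%ux%u\n", (unsigned)winsize.ws_col, (unsigned)winsize.ws_row);
                return 0;
        }

Note that perf top only keeps the SIGWINCH handler installed while the user has not forced a line count via -E or the 'e' key, which is why handle_keypress() above re-installs or resets the handler whenever print_entries changes.
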
index 0c5e4f72f2bae827ee0456f7362aff1f0ad20a91..abb914aa7be62e13c18fa6f7eea3c268d295c65d 100644 (file)
@@ -5,66 +5,73 @@
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/header.h"
+#include "util/exec_cmd.h"
+#include "util/trace-event.h"
 
-#include "util/parse-options.h"
+static char const              *script_name;
+static char const              *generate_script_lang;
 
-#include "perf.h"
-#include "util/debug.h"
+static int default_start_script(const char *script __attribute((unused)))
+{
+       return 0;
+}
 
-#include "util/trace-event.h"
+static int default_stop_script(void)
+{
+       return 0;
+}
 
-static char            const *input_name = "perf.data";
-static int             input;
-static unsigned long   page_size;
-static unsigned long   mmap_window = 32;
+static int default_generate_script(const char *outfile __attribute ((unused)))
+{
+       return 0;
+}
 
-static unsigned long   total = 0;
-static unsigned long   total_comm = 0;
+static struct scripting_ops default_scripting_ops = {
+       .start_script           = default_start_script,
+       .stop_script            = default_stop_script,
+       .process_event          = print_event,
+       .generate_script        = default_generate_script,
+};
+
+static struct scripting_ops    *scripting_ops;
 
-static struct rb_root  threads;
-static struct thread   *last_match;
+static void setup_scripting(void)
+{
+       /* make sure PERF_EXEC_PATH is set for scripts */
+       perf_set_argv_exec_path(perf_exec_path());
 
-static struct perf_header *header;
-static u64             sample_type;
+       setup_perl_scripting();
 
+       scripting_ops = &default_scripting_ops;
+}
 
-static int
-process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+static int cleanup_scripting(void)
 {
-       struct thread *thread;
+       return scripting_ops->stop_script();
+}
 
-       thread = threads__findnew(event->comm.pid, &threads, &last_match);
+#include "util/parse-options.h"
 
-       dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
-               event->comm.comm, event->comm.pid);
+#include "perf.h"
+#include "util/debug.h"
 
-       if (thread == NULL ||
-           thread__set_comm(thread, event->comm.comm)) {
-               dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
-               return -1;
-       }
-       total_comm++;
+#include "util/trace-event.h"
+#include "util/data_map.h"
+#include "util/exec_cmd.h"
 
-       return 0;
-}
+static char const              *input_name = "perf.data";
 
-static int
-process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+static struct perf_header      *header;
+static u64                     sample_type;
+
+static int process_sample_event(event_t *event)
 {
-       char level;
-       int show = 0;
-       struct dso *dso = NULL;
-       struct thread *thread;
        u64 ip = event->ip.ip;
        u64 timestamp = -1;
        u32 cpu = -1;
        u64 period = 1;
        void *more_data = event->ip.__more_data;
-       int cpumode;
-
-       thread = threads__findnew(event->ip.pid, &threads, &last_match);
+       struct thread *thread = threads__findnew(event->ip.pid);
 
        if (sample_type & PERF_SAMPLE_TIME) {
                timestamp = *(u64 *)more_data;
@@ -82,45 +89,19 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                more_data += sizeof(u64);
        }
 
-       dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
-               (void *)(offset + head),
-               (void *)(long)(event->header.size),
+       dump_printf("(IP, %d): %d/%d: %p period: %Ld\n",
                event->header.misc,
                event->ip.pid, event->ip.tid,
                (void *)(long)ip,
                (long long)period);
 
-       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
        if (thread == NULL) {
-               eprintf("problem processing %d event, skipping it.\n",
-                       event->header.type);
+               pr_debug("problem processing %d event, skipping it.\n",
+                        event->header.type);
                return -1;
        }
 
-       cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
-
-       if (cpumode == PERF_RECORD_MISC_KERNEL) {
-               show = SHOW_KERNEL;
-               level = 'k';
-
-               dso = kernel_dso;
-
-               dump_printf(" ...... dso: %s\n", dso->name);
-
-       } else if (cpumode == PERF_RECORD_MISC_USER) {
-
-               show = SHOW_USER;
-               level = '.';
-
-       } else {
-               show = SHOW_HV;
-               level = 'H';
-
-               dso = hypervisor_dso;
-
-               dump_printf(" ...... dso: [hypervisor]\n");
-       }
+       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
        if (sample_type & PERF_SAMPLE_RAW) {
                struct {
@@ -133,128 +114,189 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
                 * field, although it should be the same than this perf
                 * event pid
                 */
-               print_event(cpu, raw->data, raw->size, timestamp, thread->comm);
+               scripting_ops->process_event(cpu, raw->data, raw->size,
+                                            timestamp, thread->comm);
        }
-       total += period;
+       event__stats.total += period;
 
        return 0;
 }
 
-static int
-process_event(event_t *event, unsigned long offset, unsigned long head)
+static int sample_type_check(u64 type)
 {
-       trace_event(event);
-
-       switch (event->header.type) {
-       case PERF_RECORD_MMAP ... PERF_RECORD_LOST:
-               return 0;
-
-       case PERF_RECORD_COMM:
-               return process_comm_event(event, offset, head);
-
-       case PERF_RECORD_EXIT ... PERF_RECORD_READ:
-               return 0;
-
-       case PERF_RECORD_SAMPLE:
-               return process_sample_event(event, offset, head);
+       sample_type = type;
 
-       case PERF_RECORD_MAX:
-       default:
+       if (!(sample_type & PERF_SAMPLE_RAW)) {
+               fprintf(stderr,
+                       "No trace sample to read. Did you call perf record "
+                       "without -R?");
                return -1;
        }
 
        return 0;
 }
 
+static struct perf_file_handler file_handler = {
+       .process_sample_event   = process_sample_event,
+       .process_comm_event     = event__process_comm,
+       .sample_type_check      = sample_type_check,
+};
+
 static int __cmd_trace(void)
 {
-       int ret, rc = EXIT_FAILURE;
-       unsigned long offset = 0;
-       unsigned long head = 0;
-       struct stat perf_stat;
-       event_t *event;
-       uint32_t size;
-       char *buf;
-
-       trace_report();
-       register_idle_thread(&threads, &last_match);
-
-       input = open(input_name, O_RDONLY);
-       if (input < 0) {
-               perror("failed to open file");
-               exit(-1);
-       }
+       register_idle_thread();
+       register_perf_file_handler(&file_handler);
 
-       ret = fstat(input, &perf_stat);
-       if (ret < 0) {
-               perror("failed to stat file");
-               exit(-1);
-       }
+       return mmap_dispatch_perf_file(&header, input_name,
+                                      0, 0, &event__cwdlen, &event__cwd);
+}
 
-       if (!perf_stat.st_size) {
-               fprintf(stderr, "zero-sized file, nothing to do!\n");
-               exit(0);
-       }
-       header = perf_header__read(input);
-       head = header->data_offset;
-       sample_type = perf_header__sample_type(header);
+struct script_spec {
+       struct list_head        node;
+       struct scripting_ops    *ops;
+       char                    spec[0];
+};
 
-       if (!(sample_type & PERF_SAMPLE_RAW))
-               die("No trace sample to read. Did you call perf record "
-                   "without -R?");
+LIST_HEAD(script_specs);
 
-       if (load_kernel() < 0) {
-               perror("failed to load kernel symbols");
-               return EXIT_FAILURE;
-       }
+static struct script_spec *script_spec__new(const char *spec,
+                                           struct scripting_ops *ops)
+{
+       struct script_spec *s = malloc(sizeof(*s) + strlen(spec) + 1);
 
-remap:
-       buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-                          MAP_SHARED, input, offset);
-       if (buf == MAP_FAILED) {
-               perror("failed to mmap file");
-               exit(-1);
+       if (s != NULL) {
+               strcpy(s->spec, spec);
+               s->ops = ops;
        }
 
-more:
-       event = (event_t *)(buf + head);
+       return s;
+}
 
-       if (head + event->header.size >= page_size * mmap_window) {
-               unsigned long shift = page_size * (head / page_size);
-               int res;
+static void script_spec__delete(struct script_spec *s)
+{
+       free(s->spec);
+       free(s);
+}
 
-               res = munmap(buf, page_size * mmap_window);
-               assert(res == 0);
+static void script_spec__add(struct script_spec *s)
+{
+       list_add_tail(&s->node, &script_specs);
+}
 
-               offset += shift;
-               head -= shift;
-               goto remap;
-       }
+static struct script_spec *script_spec__find(const char *spec)
+{
+       struct script_spec *s;
 
-       size = event->header.size;
+       list_for_each_entry(s, &script_specs, node)
+               if (strcasecmp(s->spec, spec) == 0)
+                       return s;
+       return NULL;
+}
 
-       if (!size || process_event(event, offset, head) < 0) {
+static struct script_spec *script_spec__findnew(const char *spec,
+                                               struct scripting_ops *ops)
+{
+       struct script_spec *s = script_spec__find(spec);
 
-               /*
-                * assume we lost track of the stream, check alignment, and
-                * increment a single u64 in the hope to catch on again 'soon'.
-                */
+       if (s)
+               return s;
 
-               if (unlikely(head & 7))
-                       head &= ~7ULL;
+       s = script_spec__new(spec, ops);
+       if (!s)
+               goto out_delete_spec;
 
-               size = 8;
-       }
+       script_spec__add(s);
+
+       return s;
 
-       head += size;
+out_delete_spec:
+       script_spec__delete(s);
+
+       return NULL;
+}
 
-       if (offset + head < (unsigned long)perf_stat.st_size)
-               goto more;
+int script_spec_register(const char *spec, struct scripting_ops *ops)
+{
+       struct script_spec *s;
+
+       s = script_spec__find(spec);
+       if (s)
+               return -1;
 
-       rc = EXIT_SUCCESS;
-       close(input);
+       s = script_spec__findnew(spec, ops);
+       if (!s)
+               return -1;
+
+       return 0;
+}
+
+static struct scripting_ops *script_spec__lookup(const char *spec)
+{
+       struct script_spec *s = script_spec__find(spec);
+       if (!s)
+               return NULL;
 
-       return rc;
+       return s->ops;
+}
+
+static void list_available_languages(void)
+{
+       struct script_spec *s;
+
+       fprintf(stderr, "\n");
+       fprintf(stderr, "Scripting language extensions (used in "
+               "perf trace -s [spec:]script.[spec]):\n\n");
+
+       list_for_each_entry(s, &script_specs, node)
+               fprintf(stderr, "  %-42s [%s]\n", s->spec, s->ops->name);
+
+       fprintf(stderr, "\n");
+}
+
+static int parse_scriptname(const struct option *opt __used,
+                           const char *str, int unset __used)
+{
+       char spec[PATH_MAX];
+       const char *script, *ext;
+       int len;
+
+       if (strcmp(str, "list") == 0) {
+               list_available_languages();
+               return 0;
+       }
+
+       script = strchr(str, ':');
+       if (script) {
+               len = script - str;
+               if (len >= PATH_MAX) {
+                       fprintf(stderr, "invalid language specifier");
+                       return -1;
+               }
+               strncpy(spec, str, len);
+               spec[len] = '\0';
+               scripting_ops = script_spec__lookup(spec);
+               if (!scripting_ops) {
+                       fprintf(stderr, "invalid language specifier");
+                       return -1;
+               }
+               script++;
+       } else {
+               script = str;
+               ext = strchr(script, '.');
+               if (!ext) {
+                       fprintf(stderr, "invalid script extension");
+                       return -1;
+               }
+               scripting_ops = script_spec__lookup(++ext);
+               if (!scripting_ops) {
+                       fprintf(stderr, "invalid script extension");
+                       return -1;
+               }
+       }
+
+       script_name = strdup(script);
+
+       return 0;
 }
 
 static const char * const annotate_usage[] = {
@@ -267,13 +309,24 @@ static const struct option options[] = {
                    "dump raw trace in ASCII"),
        OPT_BOOLEAN('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('l', "latency", &latency_format,
+                   "show latency attributes (irqs/preemption disabled, etc)"),
+       OPT_CALLBACK('s', "script", NULL, "name",
+                    "script file name (lang:script name, script name, or *)",
+                    parse_scriptname),
+       OPT_STRING('g', "gen-script", &generate_script_lang, "lang",
+                  "generate perf-trace.xx script in specified language"),
+
        OPT_END()
 };
 
 int cmd_trace(int argc, const char **argv, const char *prefix __used)
 {
-       symbol__init();
-       page_size = getpagesize();
+       int err;
+
+       symbol__init(0);
+
+       setup_scripting();
 
        argc = parse_options(argc, argv, options, annotate_usage, 0);
        if (argc) {
@@ -287,5 +340,50 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
 
        setup_pager();
 
-       return __cmd_trace();
+       if (generate_script_lang) {
+               struct stat perf_stat;
+
+               int input = open(input_name, O_RDONLY);
+               if (input < 0) {
+                       perror("failed to open file");
+                       exit(-1);
+               }
+
+               err = fstat(input, &perf_stat);
+               if (err < 0) {
+                       perror("failed to stat file");
+                       exit(-1);
+               }
+
+               if (!perf_stat.st_size) {
+                       fprintf(stderr, "zero-sized file, nothing to do!\n");
+                       exit(0);
+               }
+
+               scripting_ops = script_spec__lookup(generate_script_lang);
+               if (!scripting_ops) {
+                       fprintf(stderr, "invalid language specifier");
+                       return -1;
+               }
+
+               header = perf_header__new();
+               if (header == NULL)
+                       return -1;
+
+               perf_header__read(header, input);
+               err = scripting_ops->generate_script("perf-trace");
+               goto out;
+       }
+
+       if (script_name) {
+               err = scripting_ops->start_script(script_name);
+               if (err)
+                       goto out;
+       }
+
+       err = __cmd_trace();
+
+       cleanup_scripting();
+out:
+       return err;
 }
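
builtin-trace.c now routes events through pluggable scripting_ops, selected either explicitly as "lang:script" or implicitly from the script's file-name extension (see parse_scriptname() above). The following is a rough standalone sketch of just that selection logic; pick_language(), lookup_lang() and the example script names are made up for the illustration, whereas the real code resolves the specifier through script_spec__lookup() and returns a struct scripting_ops.

        #include <limits.h>     /* PATH_MAX */
        #include <stdio.h>
        #include <string.h>

        /* Stand-in for script_spec__lookup(): just echo the specifier. */
        static const char *lookup_lang(const char *spec)
        {
                return spec;
        }

        static int pick_language(const char *str)
        {
                char spec[PATH_MAX];
                const char *script, *ext;
                int len;

                script = strchr(str, ':');
                if (script) {                           /* explicit "lang:script" form */
                        len = script - str;
                        if (len >= PATH_MAX)
                                return -1;
                        memcpy(spec, str, len);
                        spec[len] = '\0';
                        printf("language '%s', script '%s'\n",
                               lookup_lang(spec), script + 1);
                } else {                                /* infer the language from the extension */
                        ext = strchr(str, '.');
                        if (!ext)
                                return -1;
                        printf("language '%s', script '%s'\n",
                               lookup_lang(ext + 1), str);
                }
                return 0;
        }

        int main(void)
        {
                pick_language("pl:syscall-counts.pl");
                pick_language("rwtop.pl");
                return 0;
        }

Language back ends register themselves with script_spec_register(), so adding, say, another scripting language later only requires registering a new spec string and ops table rather than touching the event-processing path.
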
index e11d8d231c3b3cdea52ccb359141f5e606ffa487..a3d8bf65f26c2a0466bd76ab02d5e48db02ce0dc 100644 (file)
@@ -15,6 +15,8 @@ extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
 extern int cmd_annotate(int argc, const char **argv, const char *prefix);
+extern int cmd_bench(int argc, const char **argv, const char *prefix);
+extern int cmd_buildid_list(int argc, const char **argv, const char *prefix);
 extern int cmd_help(int argc, const char **argv, const char *prefix);
 extern int cmd_sched(int argc, const char **argv, const char *prefix);
 extern int cmd_list(int argc, const char **argv, const char *prefix);
@@ -25,5 +27,7 @@ extern int cmd_timechart(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
+extern int cmd_probe(int argc, const char **argv, const char *prefix);
+extern int cmd_kmem(int argc, const char **argv, const char *prefix);
 
 #endif
index 00326e230d8756bd1b36ab060ffacd9b881eed72..02b09ea17a3ecad604cd970edf28b386fb22a15b 100644 (file)
@@ -3,6 +3,8 @@
 # command name                 category [deprecated] [common]
 #
 perf-annotate                  mainporcelain common
+perf-bench                     mainporcelain common
+perf-buildid-list              mainporcelain common
 perf-list                      mainporcelain common
 perf-sched                     mainporcelain common
 perf-record                    mainporcelain common
@@ -11,3 +13,5 @@ perf-stat                     mainporcelain common
 perf-timechart                 mainporcelain common
 perf-top                       mainporcelain common
 perf-trace                     mainporcelain common
+perf-probe                     mainporcelain common
+perf-kmem                      mainporcelain common
index fdd42a824c9870be6c1cfce297dc552093fa45f6..f000c30877acf3eb153c8a499b61d51660fde917 100644 (file)
@@ -137,6 +137,8 @@ enum sw_event_ids {
        PERF_COUNT_SW_CPU_MIGRATIONS    = 4,
        PERF_COUNT_SW_PAGE_FAULTS_MIN   = 5,
        PERF_COUNT_SW_PAGE_FAULTS_MAJ   = 6,
+       PERF_COUNT_SW_ALIGNMENT_FAULTS  = 7,
+       PERF_COUNT_SW_EMULATION_FAULTS  = 8,
 };
 
 Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
index 19fc7feb9d59c7a0f2ccfb53e0c4630a81f8c85f..cf64049bc9bdd71e4c97c68a59e5b754bf9311ad 100644 (file)
@@ -14,6 +14,7 @@
 #include "util/run-command.h"
 #include "util/parse-events.h"
 #include "util/string.h"
+#include "util/debugfs.h"
 
 const char perf_usage_string[] =
        "perf [--version] [--help] COMMAND [ARGS]";
@@ -89,8 +90,8 @@ static int handle_options(const char*** argv, int* argc, int* envchanged)
                /*
                 * Check remaining flags.
                 */
-               if (!prefixcmp(cmd, "--exec-path")) {
-                       cmd += 11;
+               if (!prefixcmp(cmd, CMD_EXEC_PATH)) {
+                       cmd += strlen(CMD_EXEC_PATH);
                        if (*cmd == '=')
                                perf_set_argv_exec_path(cmd + 1);
                        else {
@@ -117,8 +118,8 @@ static int handle_options(const char*** argv, int* argc, int* envchanged)
                        (*argv)++;
                        (*argc)--;
                        handled++;
-               } else if (!prefixcmp(cmd, "--perf-dir=")) {
-                       setenv(PERF_DIR_ENVIRONMENT, cmd + 10, 1);
+               } else if (!prefixcmp(cmd, CMD_PERF_DIR)) {
+                       setenv(PERF_DIR_ENVIRONMENT, cmd + strlen(CMD_PERF_DIR), 1);
                        if (envchanged)
                                *envchanged = 1;
                } else if (!strcmp(cmd, "--work-tree")) {
@@ -131,8 +132,8 @@ static int handle_options(const char*** argv, int* argc, int* envchanged)
                                *envchanged = 1;
                        (*argv)++;
                        (*argc)--;
-               } else if (!prefixcmp(cmd, "--work-tree=")) {
-                       setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
+               } else if (!prefixcmp(cmd, CMD_WORK_TREE)) {
+                       setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + strlen(CMD_WORK_TREE), 1);
                        if (envchanged)
                                *envchanged = 1;
                } else if (!strcmp(cmd, "--debugfs-dir")) {
@@ -146,8 +147,8 @@ static int handle_options(const char*** argv, int* argc, int* envchanged)
                                *envchanged = 1;
                        (*argv)++;
                        (*argc)--;
-               } else if (!prefixcmp(cmd, "--debugfs-dir=")) {
-                       strncpy(debugfs_mntpt, cmd + 14, MAXPATHLEN);
+               } else if (!prefixcmp(cmd, CMD_DEBUGFS_DIR)) {
+                       strncpy(debugfs_mntpt, cmd + strlen(CMD_DEBUGFS_DIR), MAXPATHLEN);
                        debugfs_mntpt[MAXPATHLEN - 1] = '\0';
                        if (envchanged)
                                *envchanged = 1;
@@ -284,17 +285,21 @@ static void handle_internal_command(int argc, const char **argv)
 {
        const char *cmd = argv[0];
        static struct cmd_struct commands[] = {
-               { "help", cmd_help, 0 },
-               { "list", cmd_list, 0 },
-               { "record", cmd_record, 0 },
-               { "report", cmd_report, 0 },
-               { "stat", cmd_stat, 0 },
-               { "timechart", cmd_timechart, 0 },
-               { "top", cmd_top, 0 },
-               { "annotate", cmd_annotate, 0 },
-               { "version", cmd_version, 0 },
-               { "trace", cmd_trace, 0 },
-               { "sched", cmd_sched, 0 },
+               { "buildid-list", cmd_buildid_list, 0 },
+               { "help",       cmd_help,       0 },
+               { "list",       cmd_list,       0 },
+               { "record",     cmd_record,     0 },
+               { "report",     cmd_report,     0 },
+               { "bench",      cmd_bench,      0 },
+               { "stat",       cmd_stat,       0 },
+               { "timechart",  cmd_timechart,  0 },
+               { "top",        cmd_top,        0 },
+               { "annotate",   cmd_annotate,   0 },
+               { "version",    cmd_version,    0 },
+               { "trace",      cmd_trace,      0 },
+               { "sched",      cmd_sched,      0 },
+               { "probe",      cmd_probe,      0 },
+               { "kmem",       cmd_kmem,       0 },
        };
        unsigned int i;
        static const char ext[] = STRIP_EXTENSION;
@@ -382,45 +387,12 @@ static int run_argv(int *argcp, const char ***argv)
 /* mini /proc/mounts parser: searching for "^blah /mount/point debugfs" */
 static void get_debugfs_mntpt(void)
 {
-       FILE *file;
-       char fs_type[100];
-       char debugfs[MAXPATHLEN];
+       const char *path = debugfs_find_mountpoint();
 
-       /*
-        * try the standard location
-        */
-       if (valid_debugfs_mount("/sys/kernel/debug/") == 0) {
-               strcpy(debugfs_mntpt, "/sys/kernel/debug/");
-               return;
-       }
-
-       /*
-        * try the sane location
-        */
-       if (valid_debugfs_mount("/debug/") == 0) {
-               strcpy(debugfs_mntpt, "/debug/");
-               return;
-       }
-
-       /*
-        * give up and parse /proc/mounts
-        */
-       file = fopen("/proc/mounts", "r");
-       if (file == NULL)
-               return;
-
-       while (fscanf(file, "%*s %"
-                     STR(MAXPATHLEN)
-                     "s %99s %*s %*d %*d\n",
-                     debugfs, fs_type) == 2) {
-               if (strcmp(fs_type, "debugfs") == 0)
-                       break;
-       }
-       fclose(file);
-       if (strcmp(fs_type, "debugfs") == 0) {
-               strncpy(debugfs_mntpt, debugfs, MAXPATHLEN);
-               debugfs_mntpt[MAXPATHLEN - 1] = '\0';
-       }
+       if (path)
+               strncpy(debugfs_mntpt, path, sizeof(debugfs_mntpt));
+       else
+               debugfs_mntpt[0] = '\0';
 }
 
 int main(int argc, const char **argv)
index 8cc4623afd6f677e38ad33eb847b56e56352c3fd..454d5d55f32d9cb30d8206c6f03bfe1e0b5f61fb 100644 (file)
 #define cpu_relax()    asm volatile("":::"memory")
 #endif
 
+#ifdef __alpha__
+#include "../../arch/alpha/include/asm/unistd.h"
+#define rmb()          asm volatile("mb" ::: "memory")
+#define cpu_relax()    asm volatile("" ::: "memory")
+#endif
+
+#ifdef __ia64__
+#include "../../arch/ia64/include/asm/unistd.h"
+#define rmb()          asm volatile ("mf" ::: "memory")
+#define cpu_relax()    asm volatile ("hint @pause" ::: "memory")
+#endif
+
 #include <time.h>
 #include <unistd.h>
 #include <sys/types.h>
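
The new __alpha__ and __ia64__ blocks above supply the rmb() and cpu_relax() barriers the perf tools expect on each architecture when consuming the event mmap ring buffer: the head is loaded, then a read barrier is issued before the event payload behind it is read (see mmap_read_head()/mmap_read_counter() in the builtin-top.c hunks earlier in this diff). The sketch below shows only that reader-side pattern; 'struct ring' and read_head() are stand-ins for the real perf_event_mmap_page handling, and __sync_synchronize() is used here as a portable placeholder for the per-arch rmb().

        #include <stdio.h>

        /* Stand-in for the mmap control page; only the head matters here. */
        struct ring {
                volatile unsigned long long data_head;
                /* event data follows in the real mmap area */
        };

        static unsigned long long read_head(struct ring *r)
        {
                unsigned long long head = r->data_head;

                __sync_synchronize();   /* rmb(): order the head load before data loads */

                return head;
        }

        int main(void)
        {
                struct ring r = { .data_head = 0 };

                printf("head=%llu\n", read_head(&r));
                return 0;
        }

cpu_relax() is the companion hint for busy-wait loops; each architecture block defines both macros so the tools build unmodified on that architecture.
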
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Context.c b/tools/perf/scripts/perl/Perf-Trace-Util/Context.c
new file mode 100644 (file)
index 0000000..af78d9a
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * This file was generated automatically by ExtUtils::ParseXS version 2.18_02 from the
+ * contents of Context.xs. Do not edit this file, edit Context.xs instead.
+ *
+ *     ANY CHANGES MADE HERE WILL BE LOST! 
+ *
+ */
+
+#line 1 "Context.xs"
+/*
+ * Context.xs.  XS interfaces for perf trace.
+ *
+ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+#include "../../../util/trace-event-perl.h"
+
+#ifndef PERL_UNUSED_VAR
+#  define PERL_UNUSED_VAR(var) if (0) var = var
+#endif
+
+#line 41 "Context.c"
+
+XS(XS_Perf__Trace__Context_common_pc); /* prototype to pass -Wmissing-prototypes */
+XS(XS_Perf__Trace__Context_common_pc)
+{
+#ifdef dVAR
+    dVAR; dXSARGS;
+#else
+    dXSARGS;
+#endif
+    if (items != 1)
+       Perl_croak(aTHX_ "Usage: %s(%s)", "Perf::Trace::Context::common_pc", "context");
+    PERL_UNUSED_VAR(cv); /* -W */
+    {
+       struct scripting_context *      context = INT2PTR(struct scripting_context *,SvIV(ST(0)));
+       int     RETVAL;
+       dXSTARG;
+
+       RETVAL = common_pc(context);
+       XSprePUSH; PUSHi((IV)RETVAL);
+    }
+    XSRETURN(1);
+}
+
+
+XS(XS_Perf__Trace__Context_common_flags); /* prototype to pass -Wmissing-prototypes */
+XS(XS_Perf__Trace__Context_common_flags)
+{
+#ifdef dVAR
+    dVAR; dXSARGS;
+#else
+    dXSARGS;
+#endif
+    if (items != 1)
+       Perl_croak(aTHX_ "Usage: %s(%s)", "Perf::Trace::Context::common_flags", "context");
+    PERL_UNUSED_VAR(cv); /* -W */
+    {
+       struct scripting_context *      context = INT2PTR(struct scripting_context *,SvIV(ST(0)));
+       int     RETVAL;
+       dXSTARG;
+
+       RETVAL = common_flags(context);
+       XSprePUSH; PUSHi((IV)RETVAL);
+    }
+    XSRETURN(1);
+}
+
+
+XS(XS_Perf__Trace__Context_common_lock_depth); /* prototype to pass -Wmissing-prototypes */
+XS(XS_Perf__Trace__Context_common_lock_depth)
+{
+#ifdef dVAR
+    dVAR; dXSARGS;
+#else
+    dXSARGS;
+#endif
+    if (items != 1)
+       Perl_croak(aTHX_ "Usage: %s(%s)", "Perf::Trace::Context::common_lock_depth", "context");
+    PERL_UNUSED_VAR(cv); /* -W */
+    {
+       struct scripting_context *      context = INT2PTR(struct scripting_context *,SvIV(ST(0)));
+       int     RETVAL;
+       dXSTARG;
+
+       RETVAL = common_lock_depth(context);
+       XSprePUSH; PUSHi((IV)RETVAL);
+    }
+    XSRETURN(1);
+}
+
+#ifdef __cplusplus
+extern "C"
+#endif
+XS(boot_Perf__Trace__Context); /* prototype to pass -Wmissing-prototypes */
+XS(boot_Perf__Trace__Context)
+{
+#ifdef dVAR
+    dVAR; dXSARGS;
+#else
+    dXSARGS;
+#endif
+    const char* file = __FILE__;
+
+    PERL_UNUSED_VAR(cv); /* -W */
+    PERL_UNUSED_VAR(items); /* -W */
+    XS_VERSION_BOOTCHECK ;
+
+        newXSproto("Perf::Trace::Context::common_pc", XS_Perf__Trace__Context_common_pc, file, "$");
+        newXSproto("Perf::Trace::Context::common_flags", XS_Perf__Trace__Context_common_flags, file, "$");
+        newXSproto("Perf::Trace::Context::common_lock_depth", XS_Perf__Trace__Context_common_lock_depth, file, "$");
+    if (PL_unitcheckav)
+         call_list(PL_scopestack_ix, PL_unitcheckav);
+    XSRETURN_YES;
+}
+
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs b/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs
new file mode 100644 (file)
index 0000000..fb78006
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Context.xs.  XS interfaces for perf trace.
+ *
+ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+#include "../../../util/trace-event-perl.h"
+
+MODULE = Perf::Trace::Context          PACKAGE = Perf::Trace::Context
+PROTOTYPES: ENABLE
+
+int
+common_pc(context)
+       struct scripting_context * context
+
+int
+common_flags(context)
+       struct scripting_context * context
+
+int
+common_lock_depth(context)
+       struct scripting_context * context
+
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Makefile.PL b/tools/perf/scripts/perl/Perf-Trace-Util/Makefile.PL
new file mode 100644 (file)
index 0000000..decdeb0
--- /dev/null
@@ -0,0 +1,17 @@
+use 5.010000;
+use ExtUtils::MakeMaker;
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+WriteMakefile(
+    NAME              => 'Perf::Trace::Context',
+    VERSION_FROM      => 'lib/Perf/Trace/Context.pm', # finds $VERSION
+    PREREQ_PM         => {}, # e.g., Module::Name => 1.1
+    ($] >= 5.005 ?     ## Add these new keywords supported since 5.005
+      (ABSTRACT_FROM  => 'lib/Perf/Trace/Context.pm', # retrieve abstract from module
+       AUTHOR         => 'Tom Zanussi <tzanussi@gmail.com>') : ()),
+    LIBS              => [''], # e.g., '-lm'
+    DEFINE            => '-I ../..', # e.g., '-DHAVE_SOMETHING'
+    INC               => '-I.', # e.g., '-I. -I/usr/include/other'
+       # Un-comment this if you add C files to link with later:
+    OBJECT            => 'Context.o', # link all the C files too
+);
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/README b/tools/perf/scripts/perl/Perf-Trace-Util/README
new file mode 100644 (file)
index 0000000..9a97076
--- /dev/null
@@ -0,0 +1,59 @@
+Perf-Trace-Util version 0.01
+============================
+
+This module contains utility functions for use with perf trace.
+
+Core.pm and Util.pm are pure Perl modules; Core.pm contains routines
+that the core perf support for Perl calls on and should always be
+'used', while Util.pm contains useful but optional utility functions
+that scripts may want to use.  Context.pm contains the Perl->C
+interface that allows scripts to access data in the embedding perf
+executable; scripts wishing to do that should 'use Perf::Trace::Context'.
+
+The Perl->C perf interface is completely driven by Context.xs.  If you
+want to add new Perl functions that end up accessing C data in the
+perf executable, you add descriptions of the new functions here.
+scripting_context is a pointer to the perf data in the perf executable
+that you want to access - it's passed as the second parameter,
+$context, to all handler functions.
+
+After you do that:
+
+  perl Makefile.PL   # to create a Makefile for the next step
+  make               # to create Context.c
+
+  edit Context.c to add const to the char* file = __FILE__ line in
+  XS(boot_Perf__Trace__Context) to silence a warning/error.
+
+  You can delete the Makefile, object files and anything else that was
+  generated, e.g. blib and the shared library, except of course for
+  Context.c.
+
+  You should then be able to run the normal perf make as usual.
+
+INSTALLATION
+
+Building perf with perf trace Perl scripting should install this
+module in the right place.
+
+You should make sure libperl and ExtUtils/Embed.pm are installed first
+e.g. apt-get install libperl-dev or yum install perl-ExtUtils-Embed.
+
+DEPENDENCIES
+
+This module requires these other modules and libraries:
+
+  None
+
+COPYRIGHT AND LICENCE
+
+Copyright (C) 2009 by Tom Zanussi <tzanussi@gmail.com>
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.10.0 or,
+at your option, any later version of Perl 5 you may have available.
+
+Alternatively, this software may be distributed under the terms of the
+GNU General Public License ("GPL") version 2 as published by the Free
+Software Foundation.
+
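As a concrete illustration of the interface described above: a minimal handler
sketch, assuming the trace was recorded with the irq:softirq_entry tracepoint
(as the check-perf-trace-record wrapper below does) and that the script is run
via 'perf trace -s'.  Only the output format is invented; the modules and
accessor functions are the ones added by this patch.

    # perf calls a sub named after the event; $context (the second argument)
    # is the scripting_context pointer that the Context.pm accessors hand
    # back to the embedding perf executable.
    use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
    use lib "./Perf-Trace-Util/lib";
    use Perf::Trace::Core;
    use Perf::Trace::Context;

    sub irq::softirq_entry
    {
        my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
            $common_pid, $common_comm, $vec) = @_;

        printf("cpu=%u pc=%d flags=%s lock_depth=%d vec=%u\n",
               $common_cpu,
               common_pc($context),                     # preempt count
               trace_flag_str(common_flags($context)),  # e.g. "SOFTIRQ"
               common_lock_depth($context),
               $vec);
    }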
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm
new file mode 100644 (file)
index 0000000..6c7f365
--- /dev/null
@@ -0,0 +1,55 @@
+package Perf::Trace::Context;
+
+use 5.010000;
+use strict;
+use warnings;
+
+require Exporter;
+
+our @ISA = qw(Exporter);
+
+our %EXPORT_TAGS = ( 'all' => [ qw(
+) ] );
+
+our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
+
+our @EXPORT = qw(
+       common_pc common_flags common_lock_depth
+);
+
+our $VERSION = '0.01';
+
+require XSLoader;
+XSLoader::load('Perf::Trace::Context', $VERSION);
+
+1;
+__END__
+=head1 NAME
+
+Perf::Trace::Context - Perl extension for accessing functions in perf.
+
+=head1 SYNOPSIS
+
+  use Perf::Trace::Context;
+
+=head1 SEE ALSO
+
+Perf (trace) documentation
+
+=head1 AUTHOR
+
+Tom Zanussi, E<lt>tzanussi@gmail.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2009 by Tom Zanussi
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.10.0 or,
+at your option, any later version of Perl 5 you may have available.
+
+Alternatively, this software may be distributed under the terms of the
+GNU General Public License ("GPL") version 2 as published by the Free
+Software Foundation.
+
+=cut
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm
new file mode 100644 (file)
index 0000000..9df376a
--- /dev/null
@@ -0,0 +1,192 @@
+package Perf::Trace::Core;
+
+use 5.010000;
+use strict;
+use warnings;
+
+require Exporter;
+
+our @ISA = qw(Exporter);
+
+our %EXPORT_TAGS = ( 'all' => [ qw(
+) ] );
+
+our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
+
+our @EXPORT = qw(
+define_flag_field define_flag_value flag_str dump_flag_fields
+define_symbolic_field define_symbolic_value symbol_str dump_symbolic_fields
+trace_flag_str
+);
+
+our $VERSION = '0.01';
+
+my %trace_flags = (0x00 => "NONE",
+                  0x01 => "IRQS_OFF",
+                  0x02 => "IRQS_NOSUPPORT",
+                  0x04 => "NEED_RESCHED",
+                  0x08 => "HARDIRQ",
+                  0x10 => "SOFTIRQ");
+
+sub trace_flag_str
+{
+    my ($value) = @_;
+
+    my $string;
+
+    my $print_delim = 0;
+
+    foreach my $idx (sort {$a <=> $b} keys %trace_flags) {
+       if (!$value && !$idx) {
+           $string .= "NONE";
+           last;
+       }
+
+       if ($idx && ($value & $idx) == $idx) {
+           if ($print_delim) {
+               $string .= " | ";
+           }
+           $string .= "$trace_flags{$idx}";
+           $print_delim = 1;
+           $value &= ~$idx;
+       }
+    }
+
+    return $string;
+}
+
+my %flag_fields;
+my %symbolic_fields;
+
+sub flag_str
+{
+    my ($event_name, $field_name, $value) = @_;
+
+    my $string;
+
+    if ($flag_fields{$event_name}{$field_name}) {
+       my $print_delim = 0;
+       foreach my $idx (sort {$a <=> $b} keys %{$flag_fields{$event_name}{$field_name}{"values"}}) {
+           if (!$value && !$idx) {
+               $string .= "$flag_fields{$event_name}{$field_name}{'values'}{$idx}";
+               last;
+           }
+           if ($idx && ($value & $idx) == $idx) {
+               if ($print_delim && $flag_fields{$event_name}{$field_name}{'delim'}) {
+                   $string .= " $flag_fields{$event_name}{$field_name}{'delim'} ";
+               }
+               $string .= "$flag_fields{$event_name}{$field_name}{'values'}{$idx}";
+               $print_delim = 1;
+               $value &= ~$idx;
+           }
+       }
+    }
+
+    return $string;
+}
+
+sub define_flag_field
+{
+    my ($event_name, $field_name, $delim) = @_;
+
+    $flag_fields{$event_name}{$field_name}{"delim"} = $delim;
+}
+
+sub define_flag_value
+{
+    my ($event_name, $field_name, $value, $field_str) = @_;
+
+    $flag_fields{$event_name}{$field_name}{"values"}{$value} = $field_str;
+}
+
+sub dump_flag_fields
+{
+    for my $event (keys %flag_fields) {
+       print "event $event:\n";
+       for my $field (keys %{$flag_fields{$event}}) {
+           print "    field: $field:\n";
+           print "        delim: $flag_fields{$event}{$field}{'delim'}\n";
+           foreach my $idx (sort {$a <=> $b} keys %{$flag_fields{$event}{$field}{"values"}}) {
+               print "        value $idx: $flag_fields{$event}{$field}{'values'}{$idx}\n";
+           }
+       }
+    }
+}
+
+sub symbol_str
+{
+    my ($event_name, $field_name, $value) = @_;
+
+    if ($symbolic_fields{$event_name}{$field_name}) {
+       foreach my $idx (sort {$a <=> $b} keys %{$symbolic_fields{$event_name}{$field_name}{"values"}}) {
+           if (!$value && !$idx) {
+               return "$symbolic_fields{$event_name}{$field_name}{'values'}{$idx}";
+               last;
+           }
+           if ($value == $idx) {
+               return "$symbolic_fields{$event_name}{$field_name}{'values'}{$idx}";
+           }
+       }
+    }
+
+    return undef;
+}
+
+sub define_symbolic_field
+{
+    my ($event_name, $field_name) = @_;
+
+    # nothing to do, really
+}
+
+sub define_symbolic_value
+{
+    my ($event_name, $field_name, $value, $field_str) = @_;
+
+    $symbolic_fields{$event_name}{$field_name}{"values"}{$value} = $field_str;
+}
+
+sub dump_symbolic_fields
+{
+    for my $event (keys %symbolic_fields) {
+       print "event $event:\n";
+       for my $field (keys %{$symbolic_fields{$event}}) {
+           print "    field: $field:\n";
+           foreach my $idx (sort {$a <=> $b} keys %{$symbolic_fields{$event}{$field}{"values"}}) {
+               print "        value $idx: $symbolic_fields{$event}{$field}{'values'}{$idx}\n";
+           }
+       }
+    }
+}
+
+1;
+__END__
+=head1 NAME
+
+Perf::Trace::Core - Perl extension for perf trace
+
+=head1 SYNOPSIS
+
+  use Perf::Trace::Core;
+
+=head1 SEE ALSO
+
+Perf (trace) documentation
+
+=head1 AUTHOR
+
+Tom Zanussi, E<lt>tzanussi@gmail.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2009 by Tom Zanussi
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.10.0 or,
+at your option, any later version of Perl 5 you may have available.
+
+Alternatively, this software may be distributed under the terms of the
+GNU General Public License ("GPL") version 2 as published by the Free
+Software Foundation.
+
+=cut
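A short usage sketch for the flag and symbol helpers above.  The event name
'myevent', the field names and the bit values are hypothetical; the relative
'use lib' path assumes the snippet is run from tools/perf/scripts/perl.

    use lib "./Perf-Trace-Util/lib";
    use Perf::Trace::Core;

    # Register a flag field and its bit names once (normally in trace_begin),
    # then translate raw values with flag_str().
    define_flag_field("myevent", "flags", "|");
    define_flag_value("myevent", "flags", 0x1, "READ");
    define_flag_value("myevent", "flags", 0x2, "WRITE");
    print flag_str("myevent", "flags", 0x3), "\n";    # prints "READ | WRITE"

    # Symbolic (one-of-N) fields work the same way via symbol_str().
    define_symbolic_value("myevent", "state", 0, "IDLE");
    define_symbolic_value("myevent", "state", 1, "RUNNING");
    print symbol_str("myevent", "state", 1), "\n";    # prints "RUNNING"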
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
new file mode 100644 (file)
index 0000000..052f132
--- /dev/null
@@ -0,0 +1,88 @@
+package Perf::Trace::Util;
+
+use 5.010000;
+use strict;
+use warnings;
+
+require Exporter;
+
+our @ISA = qw(Exporter);
+
+our %EXPORT_TAGS = ( 'all' => [ qw(
+) ] );
+
+our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
+
+our @EXPORT = qw(
+avg nsecs nsecs_secs nsecs_nsecs nsecs_str
+);
+
+our $VERSION = '0.01';
+
+sub avg
+{
+    my ($total, $n) = @_;
+
+    return $total / $n;
+}
+
+my $NSECS_PER_SEC    = 1000000000;
+
+sub nsecs
+{
+    my ($secs, $nsecs) = @_;
+
+    return $secs * $NSECS_PER_SEC + $nsecs;
+}
+
+sub nsecs_secs {
+    my ($nsecs) = @_;
+
+    return $nsecs / $NSECS_PER_SEC;
+}
+
+sub nsecs_nsecs {
+    my ($nsecs) = @_;
+
+    return $nsecs % $NSECS_PER_SEC;
+}
+
+sub nsecs_str {
+    my ($nsecs) = @_;
+
+    my $str = sprintf("%5u.%09u", nsecs_secs($nsecs), nsecs_nsecs($nsecs));
+
+    return $str;
+}
+
+1;
+__END__
+=head1 NAME
+
+Perf::Trace::Util - Perl extension for perf trace
+
+=head1 SYNOPSIS
+
+  use Perf::Trace::Util;
+
+=head1 SEE ALSO
+
+Perf (trace) documentation
+
+=head1 AUTHOR
+
+Tom Zanussi, E<lt>tzanussi@gmail.comE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2009 by Tom Zanussi
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.10.0 or,
+at your option, any later version of Perl 5 you may have available.
+
+Alternatively, this software may be distributed under the terms of the
+GNU General Public License ("GPL") version 2 as published by the Free
+Software Foundation.
+
+=cut
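A small sketch of the time helpers above; the timestamps are invented and the
relative 'use lib' path assumes the snippet is run from tools/perf/scripts/perl.
Handlers normally fold the $common_secs/$common_nsecs pair they receive into
one nanosecond value with nsecs() and report averages with avg(), as
wakeup-latency.pl further down does.

    use lib "./Perf-Trace-Util/lib";
    use Perf::Trace::Util;

    # Fold a (secs, nsecs) pair into a single nanosecond timestamp, then
    # compute a latency and an average over a made-up number of events.
    my $start = nsecs(1234, 500_000_000);
    my $end   = nsecs(1234, 750_000_000);
    my $delta = $end - $start;              # 250000000 ns

    printf("latency: %u ns, avg over 5 events: %u ns\n",
           $delta, avg(5 * $delta, 5));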
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/typemap b/tools/perf/scripts/perl/Perf-Trace-Util/typemap
new file mode 100644 (file)
index 0000000..8408368
--- /dev/null
@@ -0,0 +1 @@
+struct scripting_context * T_PTR
diff --git a/tools/perf/scripts/perl/bin/check-perf-trace-record b/tools/perf/scripts/perl/bin/check-perf-trace-record
new file mode 100644 (file)
index 0000000..c7ec5de
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/bash
+perf record -c 1 -f -a -M -R -e kmem:kmalloc -e irq:softirq_entry
+
+
+
+
+
diff --git a/tools/perf/scripts/perl/bin/check-perf-trace-report b/tools/perf/scripts/perl/bin/check-perf-trace-report
new file mode 100644 (file)
index 0000000..89948b0
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/bash
+perf trace -s ~/libexec/perf-core/scripts/perl/check-perf-trace.pl
+
+
+
diff --git a/tools/perf/scripts/perl/bin/rw-by-file-record b/tools/perf/scripts/perl/bin/rw-by-file-record
new file mode 100644 (file)
index 0000000..b25056e
--- /dev/null
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_enter_write
diff --git a/tools/perf/scripts/perl/bin/rw-by-file-report b/tools/perf/scripts/perl/bin/rw-by-file-report
new file mode 100644 (file)
index 0000000..f5dcf9c
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/bash
+perf trace -s ~/libexec/perf-core/scripts/perl/rw-by-file.pl
+
+
+
diff --git a/tools/perf/scripts/perl/bin/rw-by-pid-record b/tools/perf/scripts/perl/bin/rw-by-pid-record
new file mode 100644 (file)
index 0000000..8903979
--- /dev/null
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write
diff --git a/tools/perf/scripts/perl/bin/rw-by-pid-report b/tools/perf/scripts/perl/bin/rw-by-pid-report
new file mode 100644 (file)
index 0000000..cea16f7
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/bash
+perf trace -s ~/libexec/perf-core/scripts/perl/rw-by-pid.pl
+
+
+
diff --git a/tools/perf/scripts/perl/bin/wakeup-latency-record b/tools/perf/scripts/perl/bin/wakeup-latency-record
new file mode 100644 (file)
index 0000000..6abedda
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/bash
+perf record -c 1 -f -a -M -R -e sched:sched_switch -e sched:sched_wakeup
+
+
+
+
diff --git a/tools/perf/scripts/perl/bin/wakeup-latency-report b/tools/perf/scripts/perl/bin/wakeup-latency-report
new file mode 100644 (file)
index 0000000..85769dc
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/bash
+perf trace -s ~/libexec/perf-core/scripts/perl/wakeup-latency.pl
+
+
+
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-record b/tools/perf/scripts/perl/bin/workqueue-stats-record
new file mode 100644 (file)
index 0000000..fce6637
--- /dev/null
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -c 1 -f -a -M -R -e workqueue:workqueue_creation -e workqueue:workqueue_destruction -e workqueue:workqueue_execution -e workqueue:workqueue_insertion
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-report b/tools/perf/scripts/perl/bin/workqueue-stats-report
new file mode 100644 (file)
index 0000000..aa68435
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/bash
+perf trace -s ~/libexec/perf-core/scripts/perl/workqueue-stats.pl
+
+
+
+
diff --git a/tools/perf/scripts/perl/check-perf-trace.pl b/tools/perf/scripts/perl/check-perf-trace.pl
new file mode 100644 (file)
index 0000000..4e7dc0a
--- /dev/null
@@ -0,0 +1,106 @@
+# perf trace event handlers, generated by perf trace -g perl
+# (c) 2009, Tom Zanussi <tzanussi@gmail.com>
+# Licensed under the terms of the GNU GPL License version 2
+
+# This script tests basic functionality such as flag and symbol
+# strings, common_xxx() calls back into perf, begin, end, unhandled
+# events, etc.  Basically, if this script runs successfully and
+# displays expected results, perl scripting support should be ok.
+
+use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+use lib "./Perf-Trace-Util/lib";
+use Perf::Trace::Core;
+use Perf::Trace::Context;
+use Perf::Trace::Util;
+
+sub trace_begin
+{
+    print "trace_begin\n";
+}
+
+sub trace_end
+{
+    print "trace_end\n";
+
+    print_unhandled();
+}
+
+sub irq::softirq_entry
+{
+       my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+           $common_pid, $common_comm,
+           $vec) = @_;
+
+       print_header($event_name, $common_cpu, $common_secs, $common_nsecs,
+                    $common_pid, $common_comm);
+
+       print_uncommon($context);
+
+       printf("vec=%s\n",
+              symbol_str("irq::softirq_entry", "vec", $vec));
+}
+
+sub kmem::kmalloc
+{
+       my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+           $common_pid, $common_comm,
+           $call_site, $ptr, $bytes_req, $bytes_alloc,
+           $gfp_flags) = @_;
+
+       print_header($event_name, $common_cpu, $common_secs, $common_nsecs,
+                    $common_pid, $common_comm);
+
+       print_uncommon($context);
+
+       printf("call_site=%p, ptr=%p, bytes_req=%u, bytes_alloc=%u, ".
+              "gfp_flags=%s\n",
+              $call_site, $ptr, $bytes_req, $bytes_alloc,
+
+              flag_str("kmem::kmalloc", "gfp_flags", $gfp_flags));
+}
+
+# print trace fields not included in handler args
+sub print_uncommon
+{
+    my ($context) = @_;
+
+    printf("common_preempt_count=%d, common_flags=%s, common_lock_depth=%d, ",
+          common_pc($context), trace_flag_str(common_flags($context)),
+          common_lock_depth($context));
+
+}
+
+my %unhandled;
+
+sub print_unhandled
+{
+    if ((scalar keys %unhandled) == 0) {
+       return;
+    }
+
+    print "\nunhandled events:\n\n";
+
+    printf("%-40s  %10s\n", "event", "count");
+    printf("%-40s  %10s\n", "----------------------------------------",
+          "-----------");
+
+    foreach my $event_name (keys %unhandled) {
+       printf("%-40s  %10d\n", $event_name, $unhandled{$event_name});
+    }
+}
+
+sub trace_unhandled
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm) = @_;
+
+    $unhandled{$event_name}++;
+}
+
+sub print_header
+{
+       my ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;
+
+       printf("%-20s %5u %05u.%09u %8u %-20s ",
+              $event_name, $cpu, $secs, $nsecs, $pid, $comm);
+}
diff --git a/tools/perf/scripts/perl/rw-by-file.pl b/tools/perf/scripts/perl/rw-by-file.pl
new file mode 100644 (file)
index 0000000..61f9156
--- /dev/null
@@ -0,0 +1,105 @@
+#!/usr/bin/perl -w
+# (c) 2009, Tom Zanussi <tzanussi@gmail.com>
+# Licensed under the terms of the GNU GPL License version 2
+
+# Display r/w activity for files read/written to for a given program
+
+# The common_* event handler fields are the most useful fields common to
+# all events.  They don't necessarily correspond to the 'common_*' fields
+# in the status files.  Those fields not available as handler params can
+# be retrieved via the Context.pm functions of the form common_*($context).
+
+use 5.010000;
+use strict;
+use warnings;
+
+use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+use lib "./Perf-Trace-Util/lib";
+use Perf::Trace::Core;
+use Perf::Trace::Util;
+
+# change this to the comm of the program you're interested in
+my $for_comm = "perf";
+
+my %reads;
+my %writes;
+
+sub syscalls::sys_enter_read
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm, $nr, $fd, $buf, $count) = @_;
+
+    if ($common_comm eq $for_comm) {
+       $reads{$fd}{bytes_requested} += $count;
+       $reads{$fd}{total_reads}++;
+    }
+}
+
+sub syscalls::sys_enter_write
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm, $nr, $fd, $buf, $count) = @_;
+
+    if ($common_comm eq $for_comm) {
+       $writes{$fd}{bytes_written} += $count;
+       $writes{$fd}{total_writes}++;
+    }
+}
+
+sub trace_end
+{
+    printf("file read counts for $for_comm:\n\n");
+
+    printf("%6s  %10s  %10s\n", "fd", "# reads", "bytes_requested");
+    printf("%6s  %10s  %10s\n", "------", "----------", "-----------");
+
+    foreach my $fd (sort {$reads{$b}{bytes_requested} <=>
+                             $reads{$a}{bytes_requested}} keys %reads) {
+       my $total_reads = $reads{$fd}{total_reads};
+       my $bytes_requested = $reads{$fd}{bytes_requested};
+       printf("%6u  %10u  %10u\n", $fd, $total_reads, $bytes_requested);
+    }
+
+    printf("\nfile write counts for $for_comm:\n\n");
+
+    printf("%6s  %10s  %10s\n", "fd", "# writes", "bytes_written");
+    printf("%6s  %10s  %10s\n", "------", "----------", "-----------");
+
+    foreach my $fd (sort {$writes{$b}{bytes_written} <=>
+                             $writes{$a}{bytes_written}} keys %writes) {
+       my $total_writes = $writes{$fd}{total_writes};
+       my $bytes_written = $writes{$fd}{bytes_written};
+       printf("%6u  %10u  %10u\n", $fd, $total_writes, $bytes_written);
+    }
+
+    print_unhandled();
+}
+
+my %unhandled;
+
+sub print_unhandled
+{
+    if ((scalar keys %unhandled) == 0) {
+       return;
+    }
+
+    print "\nunhandled events:\n\n";
+
+    printf("%-40s  %10s\n", "event", "count");
+    printf("%-40s  %10s\n", "----------------------------------------",
+          "-----------");
+
+    foreach my $event_name (keys %unhandled) {
+       printf("%-40s  %10d\n", $event_name, $unhandled{$event_name});
+    }
+}
+
+sub trace_unhandled
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm) = @_;
+
+    $unhandled{$event_name}++;
+}
+
+
diff --git a/tools/perf/scripts/perl/rw-by-pid.pl b/tools/perf/scripts/perl/rw-by-pid.pl
new file mode 100644 (file)
index 0000000..da601fa
--- /dev/null
@@ -0,0 +1,170 @@
+#!/usr/bin/perl -w
+# (c) 2009, Tom Zanussi <tzanussi@gmail.com>
+# Licensed under the terms of the GNU GPL License version 2
+
+# Display r/w activity for all processes
+
+# The common_* event handler fields are the most useful fields common to
+# all events.  They don't necessarily correspond to the 'common_*' fields
+# in the status files.  Those fields not available as handler params can
+# be retrieved via the Context.pm functions of the form common_*($context).
+
+use 5.010000;
+use strict;
+use warnings;
+
+use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+use lib "./Perf-Trace-Util/lib";
+use Perf::Trace::Core;
+use Perf::Trace::Util;
+
+my %reads;
+my %writes;
+
+sub syscalls::sys_exit_read
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $nr, $ret) = @_;
+
+    if ($ret > 0) {
+       $reads{$common_pid}{bytes_read} += $ret;
+    } else {
+       if (!defined ($reads{$common_pid}{bytes_read})) {
+           $reads{$common_pid}{bytes_read} = 0;
+       }
+       $reads{$common_pid}{errors}{$ret}++;
+    }
+}
+
+sub syscalls::sys_enter_read
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $nr, $fd, $buf, $count) = @_;
+
+    $reads{$common_pid}{bytes_requested} += $count;
+    $reads{$common_pid}{total_reads}++;
+    $reads{$common_pid}{comm} = $common_comm;
+}
+
+sub syscalls::sys_exit_write
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $nr, $ret) = @_;
+
+    if ($ret <= 0) {
+       $writes{$common_pid}{errors}{$ret}++;
+    }
+}
+
+sub syscalls::sys_enter_write
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $nr, $fd, $buf, $count) = @_;
+
+    $writes{$common_pid}{bytes_written} += $count;
+    $writes{$common_pid}{total_writes}++;
+    $writes{$common_pid}{comm} = $common_comm;
+}
+
+sub trace_end
+{
+    printf("read counts by pid:\n\n");
+
+    printf("%6s  %20s  %10s  %10s  %10s\n", "pid", "comm",
+          "# reads", "bytes_requested", "bytes_read");
+    printf("%6s  %-20s  %10s  %10s  %10s\n", "------", "--------------------",
+          "-----------", "----------", "----------");
+
+    foreach my $pid (sort {$reads{$b}{bytes_read} <=>
+                              $reads{$a}{bytes_read}} keys %reads) {
+       my $comm = $reads{$pid}{comm};
+       my $total_reads = $reads{$pid}{total_reads};
+       my $bytes_requested = $reads{$pid}{bytes_requested};
+       my $bytes_read = $reads{$pid}{bytes_read};
+
+       printf("%6s  %-20s  %10s  %10s  %10s\n", $pid, $comm,
+              $total_reads, $bytes_requested, $bytes_read);
+    }
+
+    printf("\nfailed reads by pid:\n\n");
+
+    printf("%6s  %20s  %6s  %10s\n", "pid", "comm", "error #", "# errors");
+    printf("%6s  %20s  %6s  %10s\n", "------", "--------------------",
+          "------", "----------");
+
+    foreach my $pid (keys %reads) {
+       my $comm = $reads{$pid}{comm};
+       foreach my $err (sort {$a <=> $b}
+                        keys %{$reads{$pid}{errors}}) {
+           my $errors = $reads{$pid}{errors}{$err};
+
+           printf("%6d  %-20s  %6d  %10s\n", $pid, $comm, $err, $errors);
+       }
+    }
+
+    printf("\nwrite counts by pid:\n\n");
+
+    printf("%6s  %20s  %10s  %10s\n", "pid", "comm",
+          "# writes", "bytes_written");
+    printf("%6s  %-20s  %10s  %10s\n", "------", "--------------------",
+          "-----------", "----------");
+
+    foreach my $pid (sort {$writes{$b}{bytes_written} <=>
+                              $writes{$a}{bytes_written}} keys %writes) {
+       my $comm = $writes{$pid}{comm};
+       my $total_writes = $writes{$pid}{total_writes};
+       my $bytes_written = $writes{$pid}{bytes_written};
+
+       printf("%6s  %-20s  %10s  %10s\n", $pid, $comm,
+              $total_writes, $bytes_written);
+    }
+
+    printf("\nfailed writes by pid:\n\n");
+
+    printf("%6s  %20s  %6s  %10s\n", "pid", "comm", "error #", "# errors");
+    printf("%6s  %20s  %6s  %10s\n", "------", "--------------------",
+          "------", "----------");
+
+    foreach my $pid (keys %writes) {
+       my $comm = $writes{$pid}{comm};
+       foreach my $err (sort {$a <=> $b}
+                        keys %{$writes{$pid}{errors}}) {
+           my $errors = $writes{$pid}{errors}{$err};
+
+           printf("%6d  %-20s  %6d  %10s\n", $pid, $comm, $err, $errors);
+       }
+    }
+
+    print_unhandled();
+}
+
+my %unhandled;
+
+sub print_unhandled
+{
+    if ((scalar keys %unhandled) == 0) {
+       return;
+    }
+
+    print "\nunhandled events:\n\n";
+
+    printf("%-40s  %10s\n", "event", "count");
+    printf("%-40s  %10s\n", "----------------------------------------",
+          "-----------");
+
+    foreach my $event_name (keys %unhandled) {
+       printf("%-40s  %10d\n", $event_name, $unhandled{$event_name});
+    }
+}
+
+sub trace_unhandled
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm) = @_;
+
+    $unhandled{$event_name}++;
+}
diff --git a/tools/perf/scripts/perl/wakeup-latency.pl b/tools/perf/scripts/perl/wakeup-latency.pl
new file mode 100644 (file)
index 0000000..ed58ef2
--- /dev/null
@@ -0,0 +1,103 @@
+#!/usr/bin/perl -w
+# (c) 2009, Tom Zanussi <tzanussi@gmail.com>
+# Licensed under the terms of the GNU GPL License version 2
+
+# Display avg/min/max wakeup latency
+
+# The common_* event handler fields are the most useful fields common to
+# all events.  They don't necessarily correspond to the 'common_*' fields
+# in the status files.  Those fields not available as handler params can
+# be retrieved via the Context.pm functions of the form common_*($context).
+
+use 5.010000;
+use strict;
+use warnings;
+
+use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+use lib "./Perf-Trace-Util/lib";
+use Perf::Trace::Core;
+use Perf::Trace::Util;
+
+my %last_wakeup;
+
+my $max_wakeup_latency;
+my $min_wakeup_latency;
+my $total_wakeup_latency;
+my $total_wakeups;
+
+sub sched::sched_switch
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $prev_comm, $prev_pid, $prev_prio, $prev_state, $next_comm, $next_pid,
+       $next_prio) = @_;
+
+    my $wakeup_ts = $last_wakeup{$common_cpu}{ts};
+    if ($wakeup_ts) {
+       my $switch_ts = nsecs($common_secs, $common_nsecs);
+       my $wakeup_latency = $switch_ts - $wakeup_ts;
+       if ($wakeup_latency > $max_wakeup_latency) {
+           $max_wakeup_latency = $wakeup_latency;
+       }
+       if ($wakeup_latency < $min_wakeup_latency) {
+           $min_wakeup_latency = $wakeup_latency;
+       }
+       $total_wakeup_latency += $wakeup_latency;
+       $total_wakeups++;
+    }
+    $last_wakeup{$common_cpu}{ts} = 0;
+}
+
+sub sched::sched_wakeup
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $comm, $pid, $prio, $success, $target_cpu) = @_;
+
+    $last_wakeup{$target_cpu}{ts} = nsecs($common_secs, $common_nsecs);
+}
+
+sub trace_begin
+{
+    $min_wakeup_latency = 1000000000;
+    $max_wakeup_latency = 0;
+}
+
+sub trace_end
+{
+    printf("wakeup_latency stats:\n\n");
+    print "total_wakeups: $total_wakeups\n";
+    printf("avg_wakeup_latency (ns): %u\n",
+          avg($total_wakeup_latency, $total_wakeups));
+    printf("min_wakeup_latency (ns): %u\n", $min_wakeup_latency);
+    printf("max_wakeup_latency (ns): %u\n", $max_wakeup_latency);
+
+    print_unhandled();
+}
+
+my %unhandled;
+
+sub print_unhandled
+{
+    if ((scalar keys %unhandled) == 0) {
+       return;
+    }
+
+    print "\nunhandled events:\n\n";
+
+    printf("%-40s  %10s\n", "event", "count");
+    printf("%-40s  %10s\n", "----------------------------------------",
+          "-----------");
+
+    foreach my $event_name (keys %unhandled) {
+       printf("%-40s  %10d\n", $event_name, $unhandled{$event_name});
+    }
+}
+
+sub trace_unhandled
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm) = @_;
+
+    $unhandled{$event_name}++;
+}
diff --git a/tools/perf/scripts/perl/workqueue-stats.pl b/tools/perf/scripts/perl/workqueue-stats.pl
new file mode 100644 (file)
index 0000000..511302c
--- /dev/null
@@ -0,0 +1,129 @@
+#!/usr/bin/perl -w
+# (c) 2009, Tom Zanussi <tzanussi@gmail.com>
+# Licensed under the terms of the GNU GPL License version 2
+
+# Displays workqueue stats
+#
+# Usage:
+#
+#   perf record -c 1 -f -a -R -e workqueue:workqueue_creation -e
+#     workqueue:workqueue_destruction -e workqueue:workqueue_execution
+#     -e workqueue:workqueue_insertion
+#
+#   perf trace -p -s tools/perf/scripts/perl/workqueue-stats.pl
+
+use 5.010000;
+use strict;
+use warnings;
+
+use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+use lib "./Perf-Trace-Util/lib";
+use Perf::Trace::Core;
+use Perf::Trace::Util;
+
+my @cpus;
+
+sub workqueue::workqueue_destruction
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $thread_comm, $thread_pid) = @_;
+
+    $cpus[$common_cpu]{$thread_pid}{destroyed}++;
+    $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
+}
+
+sub workqueue::workqueue_creation
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $thread_comm, $thread_pid, $cpu) = @_;
+
+    $cpus[$common_cpu]{$thread_pid}{created}++;
+    $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
+}
+
+sub workqueue::workqueue_execution
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $thread_comm, $thread_pid, $func) = @_;
+
+    $cpus[$common_cpu]{$thread_pid}{executed}++;
+    $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
+}
+
+sub workqueue::workqueue_insertion
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm,
+       $thread_comm, $thread_pid, $func) = @_;
+
+    $cpus[$common_cpu]{$thread_pid}{inserted}++;
+    $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
+}
+
+sub trace_end
+{
+    print "workqueue work stats:\n\n";
+    my $cpu = 0;
+    printf("%3s %6s %6s\t%-20s\n", "cpu", "ins", "exec", "name");
+    printf("%3s %6s %6s\t%-20s\n", "---", "---", "----", "----");
+    foreach my $pidhash (@cpus) {
+       while ((my $pid, my $wqhash) = each %$pidhash) {
+           my $ins = $$wqhash{'inserted'};
+           my $exe = $$wqhash{'executed'};
+           my $comm = $$wqhash{'comm'};
+           if ($ins || $exe) {
+               printf("%3u %6u %6u\t%-20s\n", $cpu, $ins, $exe, $comm);
+           }
+       }
+       $cpu++;
+    }
+
+    $cpu = 0;
+    print "\nworkqueue lifecycle stats:\n\n";
+    printf("%3s %6s %6s\t%-20s\n", "cpu", "created", "destroyed", "name");
+    printf("%3s %6s %6s\t%-20s\n", "---", "-------", "---------", "----");
+    foreach my $pidhash (@cpus) {
+       while ((my $pid, my $wqhash) = each %$pidhash) {
+           my $created = $$wqhash{'created'};
+           my $destroyed = $$wqhash{'destroyed'};
+           my $comm = $$wqhash{'comm'};
+           if ($created || $destroyed) {
+               printf("%3u %6u %6u\t%-20s\n", $cpu, $created, $destroyed,
+                      $comm);
+           }
+       }
+       $cpu++;
+    }
+
+    print_unhandled();
+}
+
+my %unhandled;
+
+sub print_unhandled
+{
+    if ((scalar keys %unhandled) == 0) {
+       return;
+    }
+
+    print "\nunhandled events:\n\n";
+
+    printf("%-40s  %10s\n", "event", "count");
+    printf("%-40s  %10s\n", "----------------------------------------",
+          "-----------");
+
+    foreach my $event_name (keys %unhandled) {
+       printf("%-40s  %10d\n", $event_name, $unhandled{$event_name});
+    }
+}
+
+sub trace_unhandled
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+       $common_pid, $common_comm) = @_;
+
+    $unhandled{$event_name}++;
+}
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 6f8ea9d210b6d822f2e2ee57e99a3a520f516b73..918eb376abe3943375cf1ea1843d4f724ac0f621 100644 (file)
@@ -1,10 +1,15 @@
-#ifndef CACHE_H
-#define CACHE_H
+#ifndef __PERF_CACHE_H
+#define __PERF_CACHE_H
 
 #include "util.h"
 #include "strbuf.h"
 #include "../perf.h"
 
+#define CMD_EXEC_PATH "--exec-path"
+#define CMD_PERF_DIR "--perf-dir="
+#define CMD_WORK_TREE "--work-tree="
+#define CMD_DEBUGFS_DIR "--debugfs-dir="
+
 #define PERF_DIR_ENVIRONMENT "PERF_DIR"
 #define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
 #define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
@@ -117,4 +122,4 @@ extern char *perf_pathdup(const char *fmt, ...)
 
 extern size_t strlcpy(char *dest, const char *src, size_t size);
 
-#endif /* CACHE_H */
+#endif /* __PERF_CACHE_H */
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 3b8380f1b478804bfd01f1a2f9484297db4d9934..b3b71258272a902f2e349bd74b3c610260d31c96 100644 (file)
@@ -206,7 +206,7 @@ fill_node(struct callchain_node *node, struct ip_callchain *chain,
        }
        node->val_nr = chain->nr - start;
        if (!node->val_nr)
-               printf("Warning: empty node in callchain tree\n");
+               pr_warning("Warning: empty node in callchain tree\n");
 }
 
 static void
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 43cf3ea9e088fd86953637450036ecb56f3964fd..ad4626de4c2b9cfa8deb5c5b49b42c644a95a761 100644 (file)
@@ -58,4 +58,4 @@ static inline u64 cumul_hits(struct callchain_node *node)
 int register_callchain_param(struct callchain_param *param);
 void append_chain(struct callchain_node *root, struct ip_callchain *chain,
                  struct symbol **syms);
-#endif
+#endif /* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h
index 58d597564b9960f7d43923248c27c7c450917816..24e8809210bbbc8af8e46c8f137b6e1032d199d7 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef COLOR_H
-#define COLOR_H
+#ifndef __PERF_COLOR_H
+#define __PERF_COLOR_H
 
 /* "\033[1;38;5;2xx;48;5;2xxm\0" is 23 bytes */
 #define COLOR_MAXLEN 24
@@ -39,4 +39,4 @@ int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *bu
 int percent_color_fprintf(FILE *fp, const char *fmt, double percent);
 const char *get_percent_color(double percent);
 
-#endif /* COLOR_H */
+#endif /* __PERF_COLOR_H */
diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c
index 0b791bd346bc604df872553b8982f5bf137b8c86..35073621e5de9a83891b7e6e58e7469345e94b71 100644 (file)
@@ -29,3 +29,11 @@ unsigned char sane_ctype[256] = {
        A, A, A, A, A, A, A, A, A, A, A, R, R, P, P, 0,         /* 112..127 */
        /* Nothing in the 128.. range */
 };
+
+const char *graph_line =
+       "_____________________________________________________________________"
+       "_____________________________________________________________________";
+const char *graph_dotted_line =
+       "---------------------------------------------------------------------"
+       "---------------------------------------------------------------------"
+       "---------------------------------------------------------------------";
diff --git a/tools/perf/util/data_map.c b/tools/perf/util/data_map.c
new file mode 100644 (file)
index 0000000..ca0bedf
--- /dev/null
@@ -0,0 +1,291 @@
+#include "data_map.h"
+#include "symbol.h"
+#include "util.h"
+#include "debug.h"
+
+
+static struct perf_file_handler *curr_handler;
+static unsigned long   mmap_window = 32;
+static char            __cwd[PATH_MAX];
+
+static int process_event_stub(event_t *event __used)
+{
+       dump_printf(": unhandled!\n");
+       return 0;
+}
+
+void register_perf_file_handler(struct perf_file_handler *handler)
+{
+       if (!handler->process_sample_event)
+               handler->process_sample_event = process_event_stub;
+       if (!handler->process_mmap_event)
+               handler->process_mmap_event = process_event_stub;
+       if (!handler->process_comm_event)
+               handler->process_comm_event = process_event_stub;
+       if (!handler->process_fork_event)
+               handler->process_fork_event = process_event_stub;
+       if (!handler->process_exit_event)
+               handler->process_exit_event = process_event_stub;
+       if (!handler->process_lost_event)
+               handler->process_lost_event = process_event_stub;
+       if (!handler->process_read_event)
+               handler->process_read_event = process_event_stub;
+       if (!handler->process_throttle_event)
+               handler->process_throttle_event = process_event_stub;
+       if (!handler->process_unthrottle_event)
+               handler->process_unthrottle_event = process_event_stub;
+
+       curr_handler = handler;
+}
+
+static const char *event__name[] = {
+       [0]                      = "TOTAL",
+       [PERF_RECORD_MMAP]       = "MMAP",
+       [PERF_RECORD_LOST]       = "LOST",
+       [PERF_RECORD_COMM]       = "COMM",
+       [PERF_RECORD_EXIT]       = "EXIT",
+       [PERF_RECORD_THROTTLE]   = "THROTTLE",
+       [PERF_RECORD_UNTHROTTLE] = "UNTHROTTLE",
+       [PERF_RECORD_FORK]       = "FORK",
+       [PERF_RECORD_READ]       = "READ",
+       [PERF_RECORD_SAMPLE]     = "SAMPLE",
+};
+
+unsigned long event__total[PERF_RECORD_MAX];
+
+void event__print_totals(void)
+{
+       int i;
+       for (i = 0; i < PERF_RECORD_MAX; ++i)
+               pr_info("%10s events: %10ld\n",
+                       event__name[i], event__total[i]);
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+       trace_event(event);
+
+       if (event->header.type < PERF_RECORD_MAX) {
+               dump_printf("%p [%p]: PERF_RECORD_%s",
+                           (void *)(offset + head),
+                           (void *)(long)(event->header.size),
+                           event__name[event->header.type]);
+               ++event__total[0];
+               ++event__total[event->header.type];
+       }
+
+       switch (event->header.type) {
+       case PERF_RECORD_SAMPLE:
+               return curr_handler->process_sample_event(event);
+       case PERF_RECORD_MMAP:
+               return curr_handler->process_mmap_event(event);
+       case PERF_RECORD_COMM:
+               return curr_handler->process_comm_event(event);
+       case PERF_RECORD_FORK:
+               return curr_handler->process_fork_event(event);
+       case PERF_RECORD_EXIT:
+               return curr_handler->process_exit_event(event);
+       case PERF_RECORD_LOST:
+               return curr_handler->process_lost_event(event);
+       case PERF_RECORD_READ:
+               return curr_handler->process_read_event(event);
+       case PERF_RECORD_THROTTLE:
+               return curr_handler->process_throttle_event(event);
+       case PERF_RECORD_UNTHROTTLE:
+               return curr_handler->process_unthrottle_event(event);
+       default:
+               curr_handler->total_unknown++;
+               return -1;
+       }
+}
+
+int perf_header__read_build_ids(int input, off_t offset, off_t size)
+{
+       struct build_id_event bev;
+       char filename[PATH_MAX];
+       off_t limit = offset + size;
+       int err = -1;
+
+       while (offset < limit) {
+               struct dso *dso;
+               ssize_t len;
+
+               if (read(input, &bev, sizeof(bev)) != sizeof(bev))
+                       goto out;
+
+               len = bev.header.size - sizeof(bev);
+               if (read(input, filename, len) != len)
+                       goto out;
+
+               dso = dsos__findnew(filename);
+               if (dso != NULL)
+                       dso__set_build_id(dso, &bev.build_id);
+
+               offset += bev.header.size;
+       }
+       err = 0;
+out:
+       return err;
+}
+
+int mmap_dispatch_perf_file(struct perf_header **pheader,
+                           const char *input_name,
+                           int force,
+                           int full_paths,
+                           int *cwdlen,
+                           char **cwd)
+{
+       int err;
+       struct perf_header *header;
+       unsigned long head, shift;
+       unsigned long offset = 0;
+       struct stat input_stat;
+       size_t  page_size;
+       u64 sample_type;
+       event_t *event;
+       uint32_t size;
+       int input;
+       char *buf;
+
+       if (curr_handler == NULL) {
+               pr_debug("Forgot to register perf file handler\n");
+               return -EINVAL;
+       }
+
+       page_size = getpagesize();
+
+       input = open(input_name, O_RDONLY);
+       if (input < 0) {
+               pr_err("Failed to open file: %s", input_name);
+               if (!strcmp(input_name, "perf.data"))
+                       pr_err("  (try 'perf record' first)");
+               pr_err("\n");
+               return -errno;
+       }
+
+       if (fstat(input, &input_stat) < 0) {
+               pr_err("failed to stat file");
+               err = -errno;
+               goto out_close;
+       }
+
+       err = -EACCES;
+       if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) {
+               pr_err("file: %s not owned by current user or root\n",
+                       input_name);
+               goto out_close;
+       }
+
+       if (input_stat.st_size == 0) {
+               pr_info("zero-sized file, nothing to do!\n");
+               goto done;
+       }
+
+       err = -ENOMEM;
+       header = perf_header__new();
+       if (header == NULL)
+               goto out_close;
+
+       err = perf_header__read(header, input);
+       if (err < 0)
+               goto out_delete;
+       *pheader = header;
+       head = header->data_offset;
+
+       sample_type = perf_header__sample_type(header);
+
+       err = -EINVAL;
+       if (curr_handler->sample_type_check &&
+           curr_handler->sample_type_check(sample_type) < 0)
+               goto out_delete;
+
+       if (!full_paths) {
+               if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
+                       pr_err("failed to get the current directory\n");
+                       err = -errno;
+                       goto out_delete;
+               }
+               *cwd = __cwd;
+               *cwdlen = strlen(*cwd);
+       } else {
+               *cwd = NULL;
+               *cwdlen = 0;
+       }
+
+       shift = page_size * (head / page_size);
+       offset += shift;
+       head -= shift;
+
+remap:
+       buf = mmap(NULL, page_size * mmap_window, PROT_READ,
+                  MAP_SHARED, input, offset);
+       if (buf == MAP_FAILED) {
+               pr_err("failed to mmap file\n");
+               err = -errno;
+               goto out_delete;
+       }
+
+more:
+       event = (event_t *)(buf + head);
+
+       size = event->header.size;
+       if (!size)
+               size = 8;
+
+       if (head + event->header.size >= page_size * mmap_window) {
+               int munmap_ret;
+
+               shift = page_size * (head / page_size);
+
+               munmap_ret = munmap(buf, page_size * mmap_window);
+               assert(munmap_ret == 0);
+
+               offset += shift;
+               head -= shift;
+               goto remap;
+       }
+
+       size = event->header.size;
+
+       dump_printf("\n%p [%p]: event: %d\n",
+                       (void *)(offset + head),
+                       (void *)(long)event->header.size,
+                       event->header.type);
+
+       if (!size || process_event(event, offset, head) < 0) {
+
+               dump_printf("%p [%p]: skipping unknown header type: %d\n",
+                       (void *)(offset + head),
+                       (void *)(long)(event->header.size),
+                       event->header.type);
+
+               /*
+                * assume we lost track of the stream, check alignment, and
+                * increment a single u64 in the hope to catch on again 'soon'.
+                */
+
+               if (unlikely(head & 7))
+                       head &= ~7ULL;
+
+               size = 8;
+       }
+
+       head += size;
+
+       if (offset + head >= header->data_offset + header->data_size)
+               goto done;
+
+       if (offset + head < (unsigned long)input_stat.st_size)
+               goto more;
+
+done:
+       err = 0;
+out_close:
+       close(input);
+
+       return err;
+out_delete:
+       perf_header__delete(header);
+       goto out_close;
+}
diff --git a/tools/perf/util/data_map.h b/tools/perf/util/data_map.h
new file mode 100644 (file)
index 0000000..3180ff7
--- /dev/null
@@ -0,0 +1,32 @@
+#ifndef __PERF_DATAMAP_H
+#define __PERF_DATAMAP_H
+
+#include "event.h"
+#include "header.h"
+
+typedef int (*event_type_handler_t)(event_t *);
+
+struct perf_file_handler {
+       event_type_handler_t    process_sample_event;
+       event_type_handler_t    process_mmap_event;
+       event_type_handler_t    process_comm_event;
+       event_type_handler_t    process_fork_event;
+       event_type_handler_t    process_exit_event;
+       event_type_handler_t    process_lost_event;
+       event_type_handler_t    process_read_event;
+       event_type_handler_t    process_throttle_event;
+       event_type_handler_t    process_unthrottle_event;
+       int                     (*sample_type_check)(u64 sample_type);
+       unsigned long           total_unknown;
+};
+
+void register_perf_file_handler(struct perf_file_handler *handler);
+int mmap_dispatch_perf_file(struct perf_header **pheader,
+                           const char *input_name,
+                           int force,
+                           int full_paths,
+                           int *cwdlen,
+                           char **cwd);
+int perf_header__read_build_ids(int input, off_t offset, off_t file_size);
+
+#endif
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index e8ca98fe0bd4d856f3444657789af42ef67c00d0..28d520d5a1fbe823cd29f5fc11c2a7cc92735038 100644 (file)
 int verbose = 0;
 int dump_trace = 0;
 
-int eprintf(const char *fmt, ...)
+int eprintf(int level, const char *fmt, ...)
 {
        va_list args;
        int ret = 0;
 
-       if (verbose) {
+       if (verbose >= level) {
                va_start(args, fmt);
                ret = vfprintf(stderr, fmt, args);
                va_end(args);
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 437eea58ce406058bd5037ad7450e6e99de7f302..c6c24c522deaf0cbbef6af3ffd95de706525412d 100644 (file)
@@ -1,8 +1,15 @@
 /* For debugging general purposes */
+#ifndef __PERF_DEBUG_H
+#define __PERF_DEBUG_H
+
+#include "event.h"
 
 extern int verbose;
 extern int dump_trace;
 
-int eprintf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
+int eprintf(int level,
+           const char *fmt, ...) __attribute__((format(printf, 2, 3)));
 int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
 void trace_event(event_t *event);
+
+#endif /* __PERF_DEBUG_H */
diff --git a/tools/perf/util/debugfs.c b/tools/perf/util/debugfs.c
new file mode 100644 (file)
index 0000000..06b73ee
--- /dev/null
@@ -0,0 +1,241 @@
+#include "util.h"
+#include "debugfs.h"
+#include "cache.h"
+
+static int debugfs_premounted;
+static char debugfs_mountpoint[MAX_PATH+1];
+
+static const char *debugfs_known_mountpoints[] = {
+       "/sys/kernel/debug/",
+       "/debug/",
+       0,
+};
+
+/* use this to force a umount */
+void debugfs_force_cleanup(void)
+{
+       debugfs_find_mountpoint();
+       debugfs_premounted = 0;
+       debugfs_umount();
+}
+
+/* construct a full path to a debugfs element */
+int debugfs_make_path(const char *element, char *buffer, int size)
+{
+       int len;
+
+       if (strlen(debugfs_mountpoint) == 0) {
+               buffer[0] = '\0';
+               return -1;
+       }
+
+       len = strlen(debugfs_mountpoint) + strlen(element) + 1;
+       if (len >= size)
+               return len+1;
+
+       snprintf(buffer, size-1, "%s/%s", debugfs_mountpoint, element);
+       return 0;
+}
+
+static int debugfs_found;
+
+/* find the path to the mounted debugfs */
+const char *debugfs_find_mountpoint(void)
+{
+       const char **ptr;
+       char type[100];
+       FILE *fp;
+
+       if (debugfs_found)
+               return (const char *) debugfs_mountpoint;
+
+       ptr = debugfs_known_mountpoints;
+       while (*ptr) {
+               if (debugfs_valid_mountpoint(*ptr) == 0) {
+                       debugfs_found = 1;
+                       strcpy(debugfs_mountpoint, *ptr);
+                       return debugfs_mountpoint;
+               }
+               ptr++;
+       }
+
+       /* give up and parse /proc/mounts */
+       fp = fopen("/proc/mounts", "r");
+       if (fp == NULL)
+               die("Can't open /proc/mounts for read");
+
+       while (fscanf(fp, "%*s %"
+                     STR(MAX_PATH)
+                     "s %99s %*s %*d %*d\n",
+                     debugfs_mountpoint, type) == 2) {
+               if (strcmp(type, "debugfs") == 0)
+                       break;
+       }
+       fclose(fp);
+
+       if (strcmp(type, "debugfs") != 0)
+               return NULL;
+
+       debugfs_found = 1;
+
+       return debugfs_mountpoint;
+}
+
+/* verify that a mountpoint is actually a debugfs instance */
+
+int debugfs_valid_mountpoint(const char *debugfs)
+{
+       struct statfs st_fs;
+
+       if (statfs(debugfs, &st_fs) < 0)
+               return -ENOENT;
+       else if (st_fs.f_type != (long) DEBUGFS_MAGIC)
+               return -ENOENT;
+
+       return 0;
+}
+
+
+int debugfs_valid_entry(const char *path)
+{
+       struct stat st;
+
+       if (stat(path, &st))
+               return -errno;
+
+       return 0;
+}
+
+/* mount the debugfs somewhere */
+
+int debugfs_mount(const char *mountpoint)
+{
+       char mountcmd[128];
+
+       /* see if it's already mounted */
+       if (debugfs_find_mountpoint()) {
+               debugfs_premounted = 1;
+               return 0;
+       }
+
+       /* if not mounted and no argument */
+       if (mountpoint == NULL) {
+               /* see if environment variable set */
+               mountpoint = getenv(PERF_DEBUGFS_ENVIRONMENT);
+               /* if no environment variable, use default */
+               if (mountpoint == NULL)
+                       mountpoint = "/sys/kernel/debug";
+       }
+
+       /* save the mountpoint */
+       strncpy(debugfs_mountpoint, mountpoint, sizeof(debugfs_mountpoint));
+
+       /* mount it */
+       snprintf(mountcmd, sizeof(mountcmd),
+                "/bin/mount -t debugfs debugfs %s", mountpoint);
+       return system(mountcmd);
+}
+
+/* umount the debugfs */
+
+int debugfs_umount(void)
+{
+       char umountcmd[128];
+       int ret;
+
+       /* if it was already mounted, leave it */
+       if (debugfs_premounted)
+               return 0;
+
+       /* make sure it's a valid mount point */
+       ret = debugfs_valid_mountpoint(debugfs_mountpoint);
+       if (ret)
+               return ret;
+
+       snprintf(umountcmd, sizeof(umountcmd),
+                "/bin/umount %s", debugfs_mountpoint);
+       return system(umountcmd);
+}
+
+int debugfs_write(const char *entry, const char *value)
+{
+       char path[MAX_PATH+1];
+       int ret, count;
+       int fd;
+
+       /* construct the path */
+       snprintf(path, sizeof(path), "%s/%s", debugfs_mountpoint, entry);
+
+       /* verify that it exists */
+       ret = debugfs_valid_entry(path);
+       if (ret)
+               return ret;
+
+       /* get how many chars we're going to write */
+       count = strlen(value);
+
+       /* open the debugfs entry */
+       fd = open(path, O_RDWR);
+       if (fd < 0)
+               return -errno;
+
+       while (count > 0) {
+               /* write it */
+               ret = write(fd, value, count);
+               if (ret <= 0) {
+                       if (errno == EAGAIN)
+                               continue;
+                       close(fd);
+                       return -errno;
+               }
+               count -= ret;
+       }
+
+       /* close it */
+       close(fd);
+
+       /* return success */
+       return 0;
+}
+
+/*
+ * read a debugfs entry
+ * returns the number of chars read or a negative errno
+ */
+int debugfs_read(const char *entry, char *buffer, size_t size)
+{
+       char path[MAX_PATH+1];
+       int ret;
+       int fd;
+
+       /* construct the path */
+       snprintf(path, sizeof(path), "%s/%s", debugfs_mountpoint, entry);
+
+       /* verify that it exists */
+       ret = debugfs_valid_entry(path);
+       if (ret)
+               return ret;
+
+       /* open the debugfs entry */
+       fd = open(path, O_RDONLY);
+       if (fd < 0)
+               return -errno;
+
+       do {
+               /* read it */
+               ret = read(fd, buffer, size);
+               if (ret == 0) {
+                       close(fd);
+                       return EOF;
+               }
+       } while (ret < 0 && errno == EAGAIN);
+
+       /* close it */
+       close(fd);
+
+       /* make *sure* there's a null character at the end */
+       buffer[ret] = '\0';
+
+       /* return the number of chars read */
+       return ret;
+}
diff --git a/tools/perf/util/debugfs.h b/tools/perf/util/debugfs.h
new file mode 100644 (file)
index 0000000..3cd14f9
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef __DEBUGFS_H__
+#define __DEBUGFS_H__
+
+#include <sys/mount.h>
+
+#ifndef MAX_PATH
+# define MAX_PATH 256
+#endif
+
+#ifndef STR
+# define _STR(x) #x
+# define STR(x) _STR(x)
+#endif
+
+extern const char *debugfs_find_mountpoint(void);
+extern int debugfs_valid_mountpoint(const char *debugfs);
+extern int debugfs_valid_entry(const char *path);
+extern int debugfs_mount(const char *mountpoint);
+extern int debugfs_umount(void);
+extern int debugfs_write(const char *entry, const char *value);
+extern int debugfs_read(const char *entry, char *buffer, size_t size);
+extern void debugfs_force_cleanup(void);
+extern int debugfs_make_path(const char *element, char *buffer, int size);
+
+#endif /* __DEBUGFS_H__ */
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
new file mode 100644 (file)
index 0000000..414b89d
--- /dev/null
@@ -0,0 +1,312 @@
+#include <linux/types.h>
+#include "event.h"
+#include "debug.h"
+#include "string.h"
+#include "thread.h"
+
+static pid_t event__synthesize_comm(pid_t pid, int full,
+                                   int (*process)(event_t *event))
+{
+       event_t ev;
+       char filename[PATH_MAX];
+       char bf[BUFSIZ];
+       FILE *fp;
+       size_t size = 0;
+       DIR *tasks;
+       struct dirent dirent, *next;
+       pid_t tgid = 0;
+
+       snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
+
+       fp = fopen(filename, "r");
+       if (fp == NULL) {
+out_race:
+               /*
+                * We raced with a task exiting - just return:
+                */
+               pr_debug("couldn't open %s\n", filename);
+               return 0;
+       }
+
+       memset(&ev.comm, 0, sizeof(ev.comm));
+       while (!ev.comm.comm[0] || !ev.comm.pid) {
+               if (fgets(bf, sizeof(bf), fp) == NULL)
+                       goto out_failure;
+
+               if (memcmp(bf, "Name:", 5) == 0) {
+                       char *name = bf + 5;
+                       while (*name && isspace(*name))
+                               ++name;
+                       size = strlen(name) - 1;
+                       memcpy(ev.comm.comm, name, size++);
+               } else if (memcmp(bf, "Tgid:", 5) == 0) {
+                       char *tgids = bf + 5;
+                       while (*tgids && isspace(*tgids))
+                               ++tgids;
+                       tgid = ev.comm.pid = atoi(tgids);
+               }
+       }
+
+       ev.comm.header.type = PERF_RECORD_COMM;
+       size = ALIGN(size, sizeof(u64));
+       ev.comm.header.size = sizeof(ev.comm) - (sizeof(ev.comm.comm) - size);
+
+       if (!full) {
+               ev.comm.tid = pid;
+
+               process(&ev);
+               goto out_fclose;
+       }
+
+       snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
+
+       tasks = opendir(filename);
+       if (tasks == NULL)
+               goto out_race;
+
+       while (!readdir_r(tasks, &dirent, &next) && next) {
+               char *end;
+               pid = strtol(dirent.d_name, &end, 10);
+               if (*end)
+                       continue;
+
+               ev.comm.tid = pid;
+
+               process(&ev);
+       }
+       closedir(tasks);
+
+out_fclose:
+       fclose(fp);
+       return tgid;
+
+out_failure:
+       pr_warning("couldn't get COMM and pgid, malformed %s\n", filename);
+       return -1;
+}
+
+static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
+                                        int (*process)(event_t *event))
+{
+       char filename[PATH_MAX];
+       FILE *fp;
+
+       snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
+
+       fp = fopen(filename, "r");
+       if (fp == NULL) {
+               /*
+                * We raced with a task exiting - just return:
+                */
+               pr_debug("couldn't open %s\n", filename);
+               return -1;
+       }
+
+       while (1) {
+               char bf[BUFSIZ], *pbf = bf;
+               event_t ev = {
+                       .header = { .type = PERF_RECORD_MMAP },
+               };
+               int n;
+               size_t size;
+               if (fgets(bf, sizeof(bf), fp) == NULL)
+                       break;
+
+               /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
+               n = hex2u64(pbf, &ev.mmap.start);
+               if (n < 0)
+                       continue;
+               pbf += n + 1;
+               n = hex2u64(pbf, &ev.mmap.len);
+               if (n < 0)
+                       continue;
+               pbf += n + 3;
+               if (*pbf == 'x') { /* vm_exec */
+                       char *execname = strchr(bf, '/');
+
+                       /* Catch VDSO */
+                       if (execname == NULL)
+                               execname = strstr(bf, "[vdso]");
+
+                       if (execname == NULL)
+                               continue;
+
+                       size = strlen(execname);
+                       execname[size - 1] = '\0'; /* Remove \n */
+                       memcpy(ev.mmap.filename, execname, size);
+                       size = ALIGN(size, sizeof(u64));
+                       ev.mmap.len -= ev.mmap.start;
+                       ev.mmap.header.size = (sizeof(ev.mmap) -
+                                              (sizeof(ev.mmap.filename) - size));
+                       ev.mmap.pid = tgid;
+                       ev.mmap.tid = pid;
+
+                       process(&ev);
+               }
+       }
+
+       fclose(fp);
+       return 0;
+}
+
+int event__synthesize_thread(pid_t pid, int (*process)(event_t *event))
+{
+       pid_t tgid = event__synthesize_comm(pid, 1, process);
+       if (tgid == -1)
+               return -1;
+       return event__synthesize_mmap_events(pid, tgid, process);
+}
+
+void event__synthesize_threads(int (*process)(event_t *event))
+{
+       DIR *proc;
+       struct dirent dirent, *next;
+
+       proc = opendir("/proc");
+
+       while (!readdir_r(proc, &dirent, &next) && next) {
+               char *end;
+               pid_t pid = strtol(dirent.d_name, &end, 10);
+
+               if (*end) /* only interested in proper numerical dirents */
+                       continue;
+
+               event__synthesize_thread(pid, process);
+       }
+
+       closedir(proc);
+}
+
+char *event__cwd;
+int  event__cwdlen;
+
+struct events_stats event__stats;
+
+int event__process_comm(event_t *self)
+{
+       struct thread *thread = threads__findnew(self->comm.pid);
+
+       dump_printf(": %s:%d\n", self->comm.comm, self->comm.pid);
+
+       if (thread == NULL || thread__set_comm(thread, self->comm.comm)) {
+               dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
+               return -1;
+       }
+
+       return 0;
+}
+
+int event__process_lost(event_t *self)
+{
+       dump_printf(": id:%Ld: lost:%Ld\n", self->lost.id, self->lost.lost);
+       event__stats.lost += self->lost.lost;
+       return 0;
+}
+
+int event__process_mmap(event_t *self)
+{
+       struct thread *thread = threads__findnew(self->mmap.pid);
+       struct map *map = map__new(&self->mmap, MAP__FUNCTION,
+                                  event__cwd, event__cwdlen);
+
+       dump_printf(" %d/%d: [%p(%p) @ %p]: %s\n",
+                   self->mmap.pid, self->mmap.tid,
+                   (void *)(long)self->mmap.start,
+                   (void *)(long)self->mmap.len,
+                   (void *)(long)self->mmap.pgoff,
+                   self->mmap.filename);
+
+       if (thread == NULL || map == NULL)
+               dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n");
+       else
+               thread__insert_map(thread, map);
+
+       return 0;
+}
+
+int event__process_task(event_t *self)
+{
+       struct thread *thread = threads__findnew(self->fork.pid);
+       struct thread *parent = threads__findnew(self->fork.ppid);
+
+       dump_printf("(%d:%d):(%d:%d)\n", self->fork.pid, self->fork.tid,
+                   self->fork.ppid, self->fork.ptid);
+       /*
+        * A thread clone will have the same PID for both parent and child.
+        */
+       if (thread == parent)
+               return 0;
+
+       if (self->header.type == PERF_RECORD_EXIT)
+               return 0;
+
+       if (thread == NULL || parent == NULL ||
+           thread__fork(thread, parent) < 0) {
+               dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n");
+               return -1;
+       }
+
+       return 0;
+}
+
+void thread__find_addr_location(struct thread *self, u8 cpumode,
+                               enum map_type type, u64 addr,
+                               struct addr_location *al,
+                               symbol_filter_t filter)
+{
+       struct thread *thread = al->thread = self;
+
+       al->addr = addr;
+
+       if (cpumode & PERF_RECORD_MISC_KERNEL) {
+               al->level = 'k';
+               thread = kthread;
+       } else if (cpumode & PERF_RECORD_MISC_USER)
+               al->level = '.';
+       else {
+               al->level = 'H';
+               al->map = NULL;
+               al->sym = NULL;
+               return;
+       }
+try_again:
+       al->map = thread__find_map(thread, type, al->addr);
+       if (al->map == NULL) {
+               /*
+                * If this is outside of all known maps, and is a negative
+                * address, try to look it up in the kernel dso, as it might be
+                * a vsyscall or vdso (which executes in user-mode).
+                *
+                * XXX This is nasty, we should have a symbol list in the
+                * "[vdso]" dso, but for now lets use the old trick of looking
+                * in the whole kernel symbol list.
+                */
+               if ((long long)al->addr < 0 && thread != kthread) {
+                       thread = kthread;
+                       goto try_again;
+               }
+               al->sym = NULL;
+       } else {
+               al->addr = al->map->map_ip(al->map, al->addr);
+               al->sym = map__find_symbol(al->map, al->addr, filter);
+       }
+}
+
+int event__preprocess_sample(const event_t *self, struct addr_location *al,
+                            symbol_filter_t filter)
+{
+       u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+       struct thread *thread = threads__findnew(self->ip.pid);
+
+       if (thread == NULL)
+               return -1;
+
+       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+       thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
+                                  self->ip.ip, al, filter);
+       dump_printf(" ...... dso: %s\n",
+                   al->map ? al->map->dso->long_name :
+                       al->level == 'H' ? "[hypervisor]" : "<not found>");
+       return 0;
+}
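
event__synthesize_mmap_events() pulls the fields out of each /proc/<pid>/maps line by hand with hex2u64() and pointer stepping (see the sample line in the comment above). For reference, the same record can be split with sscanf in a standalone sketch; this is an independent illustration, not how perf does it, and the variable names are chosen here only for clarity.

#include <stdio.h>

int main(void)
{
        const char *line = "00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat";
        unsigned long long start, len, pgoff;
        char perms[5], path[4096] = "";

        if (sscanf(line, "%llx-%llx %4s %llx %*s %*s %4095[^\n]",
                   &start, &len, perms, &pgoff, path) >= 4) {
                len -= start;   /* same adjustment as ev.mmap.len -= ev.mmap.start */
                printf("start=%#llx len=%#llx perms=%s pgoff=%#llx file=%s\n",
                       start, len, perms, pgoff, path);
        }
        return 0;
}
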
index 2c9c26d6ded0aff8f3679c01b0cb93a386194738..a4cc8105cf675f42285896e674dd81193da7582b 100644 (file)
@@ -1,14 +1,10 @@
 #ifndef __PERF_RECORD_H
 #define __PERF_RECORD_H
+
 #include "../perf.h"
 #include "util.h"
 #include <linux/list.h>
-
-enum {
-       SHOW_KERNEL     = 1,
-       SHOW_USER       = 2,
-       SHOW_HV         = 4,
-};
+#include <linux/rbtree.h>
 
 /*
  * PERF_SAMPLE_IP | PERF_SAMPLE_TID | *
@@ -65,6 +61,13 @@ struct sample_event{
        u64 array[];
 };
 
+#define BUILD_ID_SIZE 20
+
+struct build_id_event {
+       struct perf_event_header header;
+       u8                       build_id[ALIGN(BUILD_ID_SIZE, sizeof(u64))];
+       char                     filename[];
+};
 
 typedef union event_union {
        struct perf_event_header        header;
@@ -77,12 +80,30 @@ typedef union event_union {
        struct sample_event             sample;
 } event_t;
 
+struct events_stats {
+       unsigned long total;
+       unsigned long lost;
+};
+
+void event__print_totals(void);
+
+enum map_type {
+       MAP__FUNCTION = 0,
+
+       MAP__NR_TYPES,
+};
+
 struct map {
-       struct list_head        node;
+       union {
+               struct rb_node  rb_node;
+               struct list_head node;
+       };
        u64                     start;
        u64                     end;
+       enum map_type           type;
        u64                     pgoff;
        u64                     (*map_ip)(struct map *, u64);
+       u64                     (*unmap_ip)(struct map *, u64);
        struct dso              *dso;
 };
 
@@ -91,14 +112,48 @@ static inline u64 map__map_ip(struct map *map, u64 ip)
        return ip - map->start + map->pgoff;
 }
 
-static inline u64 vdso__map_ip(struct map *map __used, u64 ip)
+static inline u64 map__unmap_ip(struct map *map, u64 ip)
+{
+       return ip + map->start - map->pgoff;
+}
+
+static inline u64 identity__map_ip(struct map *map __used, u64 ip)
 {
        return ip;
 }
 
-struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen);
+struct symbol;
+
+typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
+
+void map__init(struct map *self, enum map_type type,
+              u64 start, u64 end, u64 pgoff, struct dso *dso);
+struct map *map__new(struct mmap_event *event, enum map_type,
+                    char *cwd, int cwdlen);
+void map__delete(struct map *self);
 struct map *map__clone(struct map *self);
 int map__overlap(struct map *l, struct map *r);
 size_t map__fprintf(struct map *self, FILE *fp);
+struct symbol *map__find_symbol(struct map *self, u64 addr,
+                               symbol_filter_t filter);
+void map__fixup_start(struct map *self);
+void map__fixup_end(struct map *self);
+
+int event__synthesize_thread(pid_t pid, int (*process)(event_t *event));
+void event__synthesize_threads(int (*process)(event_t *event));
+
+extern char *event__cwd;
+extern int  event__cwdlen;
+extern struct events_stats event__stats;
+extern unsigned long event__total[PERF_RECORD_MAX];
+
+int event__process_comm(event_t *self);
+int event__process_lost(event_t *self);
+int event__process_mmap(event_t *self);
+int event__process_task(event_t *self);
+
+struct addr_location;
+int event__preprocess_sample(const event_t *self, struct addr_location *al,
+                            symbol_filter_t filter);
 
-#endif
+#endif /* __PERF_RECORD_H */
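
map__map_ip()/map__unmap_ip() above are simply inverse translations between a virtual address and a file-relative offset. A tiny self-contained round trip, using a stripped-down stand-in for struct map (not the struct declared above):

#include <assert.h>
#include <stdio.h>

typedef unsigned long long u64;

struct toy_map { u64 start, end, pgoff; };      /* stand-in, not perf's struct map */

static u64 toy_map_ip(const struct toy_map *m, u64 ip)   { return ip - m->start + m->pgoff; }
static u64 toy_unmap_ip(const struct toy_map *m, u64 ip) { return ip + m->start - m->pgoff; }

int main(void)
{
        struct toy_map m = { .start = 0x400000, .end = 0x40c000, .pgoff = 0 };
        u64 ip = 0x400b2d;                      /* an address inside the mapping */
        u64 rel = toy_map_ip(&m, ip);           /* 0xb2d: the offset used for symbol lookup */

        assert(toy_unmap_ip(&m, rel) == ip);    /* the two translations are inverses */
        printf("ip=%#llx -> dso offset=%#llx\n", ip, rel);
        return 0;
}
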
index effe25eb15456dcad9bc6233eb19f38193af2623..31647ac92ed1733e71dade59609c310563193b31 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef PERF_EXEC_CMD_H
-#define PERF_EXEC_CMD_H
+#ifndef __PERF_EXEC_CMD_H
+#define __PERF_EXEC_CMD_H
 
 extern void perf_set_argv_exec_path(const char *exec_path);
 extern const char *perf_extract_argv0_path(const char *path);
@@ -10,4 +10,4 @@ extern int execv_perf_cmd(const char **argv); /* NULL terminated */
 extern int execl_perf_cmd(const char *cmd, ...);
 extern const char *system_path(const char *path);
 
-#endif /* PERF_EXEC_CMD_H */
+#endif /* __PERF_EXEC_CMD_H */
index e306857b2c2b98e3624bf30b96be74130edab831..4805e6dfd23c8a77f2ea7a7bf56bc15c8e5a4544 100644 (file)
@@ -2,9 +2,15 @@
 #include <unistd.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <linux/list.h>
 
 #include "util.h"
 #include "header.h"
+#include "../perf.h"
+#include "trace-event.h"
+#include "symbol.h"
+#include "data_map.h"
+#include "debug.h"
 
 /*
  * Create new perf.data header attribute:
@@ -13,32 +19,43 @@ struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr)
 {
        struct perf_header_attr *self = malloc(sizeof(*self));
 
-       if (!self)
-               die("nomem");
-
-       self->attr = *attr;
-       self->ids = 0;
-       self->size = 1;
-       self->id = malloc(sizeof(u64));
-
-       if (!self->id)
-               die("nomem");
+       if (self != NULL) {
+               self->attr = *attr;
+               self->ids  = 0;
+               self->size = 1;
+               self->id   = malloc(sizeof(u64));
+               if (self->id == NULL) {
+                       free(self);
+                       self = NULL;
+               }
+       }
 
        return self;
 }
 
-void perf_header_attr__add_id(struct perf_header_attr *self, u64 id)
+void perf_header_attr__delete(struct perf_header_attr *self)
+{
+       free(self->id);
+       free(self);
+}
+
+int perf_header_attr__add_id(struct perf_header_attr *self, u64 id)
 {
        int pos = self->ids;
 
        self->ids++;
        if (self->ids > self->size) {
-               self->size *= 2;
-               self->id = realloc(self->id, self->size * sizeof(u64));
-               if (!self->id)
-                       die("nomem");
+               int nsize = self->size * 2;
+               u64 *nid = realloc(self->id, nsize * sizeof(u64));
+
+               if (nid == NULL)
+                       return -1;
+
+               self->size = nsize;
+               self->id = nid;
        }
        self->id[pos] = id;
+       return 0;
 }
 
 /*
@@ -46,42 +63,52 @@ void perf_header_attr__add_id(struct perf_header_attr *self, u64 id)
  */
 struct perf_header *perf_header__new(void)
 {
-       struct perf_header *self = malloc(sizeof(*self));
+       struct perf_header *self = zalloc(sizeof(*self));
 
-       if (!self)
-               die("nomem");
+       if (self != NULL) {
+               self->size = 1;
+               self->attr = malloc(sizeof(void *));
 
-       self->frozen = 0;
+               if (self->attr == NULL) {
+                       free(self);
+                       self = NULL;
+               }
+       }
 
-       self->attrs = 0;
-       self->size = 1;
-       self->attr = malloc(sizeof(void *));
+       return self;
+}
 
-       if (!self->attr)
-               die("nomem");
+void perf_header__delete(struct perf_header *self)
+{
+       int i;
 
-       self->data_offset = 0;
-       self->data_size = 0;
+       for (i = 0; i < self->attrs; ++i)
+               perf_header_attr__delete(self->attr[i]);
 
-       return self;
+       free(self->attr);
+       free(self);
 }
 
-void perf_header__add_attr(struct perf_header *self,
-                          struct perf_header_attr *attr)
+int perf_header__add_attr(struct perf_header *self,
+                         struct perf_header_attr *attr)
 {
-       int pos = self->attrs;
-
        if (self->frozen)
-               die("frozen");
+               return -1;
 
-       self->attrs++;
-       if (self->attrs > self->size) {
-               self->size *= 2;
-               self->attr = realloc(self->attr, self->size * sizeof(void *));
-               if (!self->attr)
-                       die("nomem");
+       if (self->attrs == self->size) {
+               int nsize = self->size * 2;
+               struct perf_header_attr **nattr;
+
+               nattr = realloc(self->attr, nsize * sizeof(void *));
+               if (nattr == NULL)
+                       return -1;
+
+               self->size = nsize;
+               self->attr = nattr;
        }
-       self->attr[pos] = attr;
+
+       self->attr[self->attrs++] = attr;
+       return 0;
 }
 
 #define MAX_EVENT_NAME 64
@@ -97,7 +124,7 @@ static struct perf_trace_event_type *events;
 void perf_header__push_event(u64 id, const char *name)
 {
        if (strlen(name) > MAX_EVENT_NAME)
-               printf("Event %s will be truncated\n", name);
+               pr_warning("Event %s will be truncated\n", name);
 
        if (!events) {
                events = malloc(sizeof(struct perf_trace_event_type));
@@ -128,44 +155,137 @@ static const char *__perf_magic = "PERFFILE";
 
 #define PERF_MAGIC     (*(u64 *)__perf_magic)
 
-struct perf_file_section {
-       u64 offset;
-       u64 size;
-};
-
 struct perf_file_attr {
        struct perf_event_attr  attr;
        struct perf_file_section        ids;
 };
 
-struct perf_file_header {
-       u64                             magic;
-       u64                             size;
-       u64                             attr_size;
-       struct perf_file_section        attrs;
-       struct perf_file_section        data;
-       struct perf_file_section        event_types;
-};
+void perf_header__set_feat(struct perf_header *self, int feat)
+{
+       set_bit(feat, self->adds_features);
+}
 
-static void do_write(int fd, void *buf, size_t size)
+bool perf_header__has_feat(const struct perf_header *self, int feat)
+{
+       return test_bit(feat, self->adds_features);
+}
+
+static int do_write(int fd, const void *buf, size_t size)
 {
        while (size) {
                int ret = write(fd, buf, size);
 
                if (ret < 0)
-                       die("failed to write");
+                       return -errno;
 
                size -= ret;
                buf += ret;
        }
+
+       return 0;
+}
+
+static int __dsos__write_buildid_table(struct list_head *head, int fd)
+{
+       struct dso *pos;
+
+       list_for_each_entry(pos, head, node) {
+               int err;
+               struct build_id_event b;
+               size_t len;
+
+               if (!pos->has_build_id)
+                       continue;
+               len = pos->long_name_len + 1;
+               len = ALIGN(len, 64);
+               memset(&b, 0, sizeof(b));
+               memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id));
+               b.header.size = sizeof(b) + len;
+               err = do_write(fd, &b, sizeof(b));
+               if (err < 0)
+                       return err;
+               err = do_write(fd, pos->long_name, len);
+               if (err < 0)
+                       return err;
+       }
+
+       return 0;
 }
 
-void perf_header__write(struct perf_header *self, int fd)
+static int dsos__write_buildid_table(int fd)
+{
+       int err = __dsos__write_buildid_table(&dsos__kernel, fd);
+       if (err == 0)
+               err = __dsos__write_buildid_table(&dsos__user, fd);
+       return err;
+}
+
+static int perf_header__adds_write(struct perf_header *self, int fd)
+{
+       int nr_sections;
+       struct perf_file_section *feat_sec;
+       int sec_size;
+       u64 sec_start;
+       int idx = 0, err;
+
+       if (dsos__read_build_ids())
+               perf_header__set_feat(self, HEADER_BUILD_ID);
+
+       nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
+       if (!nr_sections)
+               return 0;
+
+       feat_sec = calloc(sizeof(*feat_sec), nr_sections);
+       if (feat_sec == NULL)
+               return -ENOMEM;
+
+       sec_size = sizeof(*feat_sec) * nr_sections;
+
+       sec_start = self->data_offset + self->data_size;
+       lseek(fd, sec_start + sec_size, SEEK_SET);
+
+       if (perf_header__has_feat(self, HEADER_TRACE_INFO)) {
+               struct perf_file_section *trace_sec;
+
+               trace_sec = &feat_sec[idx++];
+
+               /* Write trace info */
+               trace_sec->offset = lseek(fd, 0, SEEK_CUR);
+               read_tracing_data(fd, attrs, nr_counters);
+               trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset;
+       }
+
+
+       if (perf_header__has_feat(self, HEADER_BUILD_ID)) {
+               struct perf_file_section *buildid_sec;
+
+               buildid_sec = &feat_sec[idx++];
+
+               /* Write build-ids */
+               buildid_sec->offset = lseek(fd, 0, SEEK_CUR);
+               err = dsos__write_buildid_table(fd);
+               if (err < 0) {
+                       pr_debug("failed to write buildid table\n");
+                       goto out_free;
+               }
+               buildid_sec->size = lseek(fd, 0, SEEK_CUR) - buildid_sec->offset;
+       }
+
+       lseek(fd, sec_start, SEEK_SET);
+       err = do_write(fd, feat_sec, sec_size);
+       if (err < 0)
+               pr_debug("failed to write feature section\n");
+out_free:
+       free(feat_sec);
+       return err;
+}
+
+int perf_header__write(struct perf_header *self, int fd, bool at_exit)
 {
        struct perf_file_header f_header;
        struct perf_file_attr   f_attr;
        struct perf_header_attr *attr;
-       int i;
+       int i, err;
 
        lseek(fd, sizeof(f_header), SEEK_SET);
 
@@ -174,7 +294,11 @@ void perf_header__write(struct perf_header *self, int fd)
                attr = self->attr[i];
 
                attr->id_offset = lseek(fd, 0, SEEK_CUR);
-               do_write(fd, attr->id, attr->ids * sizeof(u64));
+               err = do_write(fd, attr->id, attr->ids * sizeof(u64));
+               if (err < 0) {
+                       pr_debug("failed to write perf header\n");
+                       return err;
+               }
        }
 
 
@@ -190,17 +314,31 @@ void perf_header__write(struct perf_header *self, int fd)
                                .size   = attr->ids * sizeof(u64),
                        }
                };
-               do_write(fd, &f_attr, sizeof(f_attr));
+               err = do_write(fd, &f_attr, sizeof(f_attr));
+               if (err < 0) {
+                       pr_debug("failed to write perf header attribute\n");
+                       return err;
+               }
        }
 
        self->event_offset = lseek(fd, 0, SEEK_CUR);
        self->event_size = event_count * sizeof(struct perf_trace_event_type);
-       if (events)
-               do_write(fd, events, self->event_size);
-
+       if (events) {
+               err = do_write(fd, events, self->event_size);
+               if (err < 0) {
+                       pr_debug("failed to write perf header events\n");
+                       return err;
+               }
+       }
 
        self->data_offset = lseek(fd, 0, SEEK_CUR);
 
+       if (at_exit) {
+               err = perf_header__adds_write(self, fd);
+               if (err < 0)
+                       return err;
+       }
+
        f_header = (struct perf_file_header){
                .magic     = PERF_MAGIC,
                .size      = sizeof(f_header),
@@ -219,11 +357,18 @@ void perf_header__write(struct perf_header *self, int fd)
                },
        };
 
+       memcpy(&f_header.adds_features, &self->adds_features, sizeof(self->adds_features));
+
        lseek(fd, 0, SEEK_SET);
-       do_write(fd, &f_header, sizeof(f_header));
+       err = do_write(fd, &f_header, sizeof(f_header));
+       if (err < 0) {
+               pr_debug("failed to write perf header\n");
+               return err;
+       }
        lseek(fd, self->data_offset + self->data_size, SEEK_SET);
 
        self->frozen = 1;
+       return 0;
 }
 
 static void do_read(int fd, void *buf, size_t size)
@@ -241,22 +386,109 @@ static void do_read(int fd, void *buf, size_t size)
        }
 }
 
-struct perf_header *perf_header__read(int fd)
+int perf_header__process_sections(struct perf_header *self, int fd,
+                                 int (*process)(struct perf_file_section *self,
+                                                int feat, int fd))
+{
+       struct perf_file_section *feat_sec;
+       int nr_sections;
+       int sec_size;
+       int idx = 0;
+       int err = 0, feat = 1;
+
+       nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
+       if (!nr_sections)
+               return 0;
+
+       feat_sec = calloc(sizeof(*feat_sec), nr_sections);
+       if (!feat_sec)
+               return -1;
+
+       sec_size = sizeof(*feat_sec) * nr_sections;
+
+       lseek(fd, self->data_offset + self->data_size, SEEK_SET);
+
+       do_read(fd, feat_sec, sec_size);
+
+       while (idx < nr_sections && feat < HEADER_LAST_FEATURE) {
+               if (perf_header__has_feat(self, feat)) {
+                       struct perf_file_section *sec = &feat_sec[idx++];
+
+                       err = process(sec, feat, fd);
+                       if (err < 0)
+                               break;
+               }
+               ++feat;
+       }
+
+       free(feat_sec);
+       return err;
+};
+
+int perf_file_header__read(struct perf_file_header *self,
+                          struct perf_header *ph, int fd)
+{
+       lseek(fd, 0, SEEK_SET);
+       do_read(fd, self, sizeof(*self));
+
+       if (self->magic     != PERF_MAGIC ||
+           self->attr_size != sizeof(struct perf_file_attr))
+               return -1;
+
+       if (self->size != sizeof(*self)) {
+               /* Support the previous format */
+               if (self->size == offsetof(typeof(*self), adds_features))
+                       bitmap_zero(self->adds_features, HEADER_FEAT_BITS);
+               else
+                       return -1;
+       }
+
+       memcpy(&ph->adds_features, &self->adds_features,
+              sizeof(self->adds_features));
+
+       ph->event_offset = self->event_types.offset;
+       ph->event_size   = self->event_types.size;
+       ph->data_offset  = self->data.offset;
+       ph->data_size    = self->data.size;
+       return 0;
+}
+
+static int perf_file_section__process(struct perf_file_section *self,
+                                     int feat, int fd)
+{
+       if (lseek(fd, self->offset, SEEK_SET) < 0) {
+               pr_debug("Failed to lseek to %Ld offset for feature %d, "
+                        "continuing...\n", self->offset, feat);
+               return 0;
+       }
+
+       switch (feat) {
+       case HEADER_TRACE_INFO:
+               trace_report(fd);
+               break;
+
+       case HEADER_BUILD_ID:
+               if (perf_header__read_build_ids(fd, self->offset, self->size))
+                       pr_debug("Failed to read buildids, continuing...\n");
+               break;
+       default:
+               pr_debug("unknown feature %d, continuing...\n", feat);
+       }
+
+       return 0;
+}
+
+int perf_header__read(struct perf_header *self, int fd)
 {
-       struct perf_header      *self = perf_header__new();
        struct perf_file_header f_header;
        struct perf_file_attr   f_attr;
        u64                     f_id;
-
        int nr_attrs, nr_ids, i, j;
 
-       lseek(fd, 0, SEEK_SET);
-       do_read(fd, &f_header, sizeof(f_header));
-
-       if (f_header.magic      != PERF_MAGIC           ||
-           f_header.size       != sizeof(f_header)     ||
-           f_header.attr_size  != sizeof(f_attr))
-               die("incompatible file format");
+       if (perf_file_header__read(&f_header, self, fd) < 0) {
+               pr_debug("incompatible file format\n");
+               return -EINVAL;
+       }
 
        nr_attrs = f_header.attrs.size / sizeof(f_attr);
        lseek(fd, f_header.attrs.offset, SEEK_SET);
@@ -269,6 +501,8 @@ struct perf_header *perf_header__read(int fd)
                tmp = lseek(fd, 0, SEEK_CUR);
 
                attr = perf_header_attr__new(&f_attr.attr);
+               if (attr == NULL)
+                       return -ENOMEM;

 
                nr_ids = f_attr.ids.size / sizeof(u64);
                lseek(fd, f_attr.ids.offset, SEEK_SET);
@@ -276,31 +510,34 @@ struct perf_header *perf_header__read(int fd)
                for (j = 0; j < nr_ids; j++) {
                        do_read(fd, &f_id, sizeof(f_id));
 
-                       perf_header_attr__add_id(attr, f_id);
+                       if (perf_header_attr__add_id(attr, f_id) < 0) {
+                               perf_header_attr__delete(attr);
+                               return -ENOMEM;
+                       }
                }
-               perf_header__add_attr(self, attr);
+               if (perf_header__add_attr(self, attr) < 0) {
+                       perf_header_attr__delete(attr);
+                       return -ENOMEM;
+               }
+
                lseek(fd, tmp, SEEK_SET);
        }
 
        if (f_header.event_types.size) {
                lseek(fd, f_header.event_types.offset, SEEK_SET);
                events = malloc(f_header.event_types.size);
-               if (!events)
-                       die("nomem");
+               if (events == NULL)
+                       return -ENOMEM;
                do_read(fd, events, f_header.event_types.size);
                event_count =  f_header.event_types.size / sizeof(struct perf_trace_event_type);
        }
-       self->event_offset = f_header.event_types.offset;
-       self->event_size   = f_header.event_types.size;
 
-       self->data_offset = f_header.data.offset;
-       self->data_size   = f_header.data.size;
+       perf_header__process_sections(self, fd, perf_file_section__process);
 
        lseek(fd, self->data_offset, SEEK_SET);
 
        self->frozen = 1;
-
-       return self;
+       return 0;
 }
 
 u64 perf_header__sample_type(struct perf_header *header)
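
For orientation, the write path above lays the file out roughly as sketched in the comment below: perf_header__adds_write() drops an array of perf_file_section descriptors right after the data section, one per feature bit, with the feature payloads following, and perf_header__process_sections() walks that same array on the read side. This is a summary of the code above, not an authoritative format specification.

/*
 * Rough on-disk layout produced by perf_header__write(self, fd, true):
 *
 *   offset 0                    struct perf_file_header: magic "PERFFILE",
 *                               the attrs/data/event_types sections and the
 *                               adds_features bitmap
 *   ...                         per-attribute id arrays, the perf_file_attr
 *                               array that attrs points at, the optional
 *                               event type table
 *   data.offset                 the recorded event stream (data.size bytes)
 *   data.offset + data.size     struct perf_file_section[nr_features], one
 *                               entry per bit set in adds_features, written
 *                               by perf_header__adds_write()
 *   after that array            the feature payloads themselves: tracing
 *                               data for HEADER_TRACE_INFO, the build-id
 *                               table for HEADER_BUILD_ID
 */
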
index a0761bc7863c7e2ba7ad644751fa668c092a67a6..d1dbe2b79c42bfcbf9a3e90e965076b6a5afc075 100644 (file)
@@ -1,10 +1,13 @@
-#ifndef _PERF_HEADER_H
-#define _PERF_HEADER_H
+#ifndef __PERF_HEADER_H
+#define __PERF_HEADER_H
 
 #include "../../../include/linux/perf_event.h"
 #include <sys/types.h>
+#include <stdbool.h>
 #include "types.h"
 
+#include <linux/bitmap.h>
+
 struct perf_header_attr {
        struct perf_event_attr attr;
        int ids, size;
@@ -12,36 +15,71 @@ struct perf_header_attr {
        off_t id_offset;
 };
 
+enum {
+       HEADER_TRACE_INFO = 1,
+       HEADER_BUILD_ID,
+       HEADER_LAST_FEATURE,
+};
+
+#define HEADER_FEAT_BITS                       256
+
+struct perf_file_section {
+       u64 offset;
+       u64 size;
+};
+
+struct perf_file_header {
+       u64                             magic;
+       u64                             size;
+       u64                             attr_size;
+       struct perf_file_section        attrs;
+       struct perf_file_section        data;
+       struct perf_file_section        event_types;
+       DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
+};
+
+struct perf_header;
+
+int perf_file_header__read(struct perf_file_header *self,
+                          struct perf_header *ph, int fd);
+
 struct perf_header {
-       int frozen;
-       int attrs, size;
+       int                     frozen;
+       int                     attrs, size;
        struct perf_header_attr **attr;
-       s64 attr_offset;
-       u64 data_offset;
-       u64 data_size;
-       u64 event_offset;
-       u64 event_size;
+       s64                     attr_offset;
+       u64                     data_offset;
+       u64                     data_size;
+       u64                     event_offset;
+       u64                     event_size;
+       DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
 };
 
-struct perf_header *perf_header__read(int fd);
-void perf_header__write(struct perf_header *self, int fd);
+struct perf_header *perf_header__new(void);
+void perf_header__delete(struct perf_header *self);
 
-void perf_header__add_attr(struct perf_header *self,
-                          struct perf_header_attr *attr);
+int perf_header__read(struct perf_header *self, int fd);
+int perf_header__write(struct perf_header *self, int fd, bool at_exit);
+
+int perf_header__add_attr(struct perf_header *self,
+                         struct perf_header_attr *attr);
 
 void perf_header__push_event(u64 id, const char *name);
 char *perf_header__find_event(u64 id);
 
+struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr);
+void perf_header_attr__delete(struct perf_header_attr *self);
 
-struct perf_header_attr *
-perf_header_attr__new(struct perf_event_attr *attr);
-void perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
+int perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
 
 u64 perf_header__sample_type(struct perf_header *header);
 struct perf_event_attr *
 perf_header__find_attr(u64 id, struct perf_header *header);
+void perf_header__set_feat(struct perf_header *self, int feat);
+bool perf_header__has_feat(const struct perf_header *self, int feat);
 
+int perf_header__process_sections(struct perf_header *self, int fd,
+                                 int (*process)(struct perf_file_section *self,
+                                                int feat, int fd));
 
-struct perf_header *perf_header__new(void);
-
-#endif /* _PERF_HEADER_H */
+#endif /* __PERF_HEADER_H */
index 7128783637b4a7be720c49a9e74ccb43cba2cac9..7f5c6dedd714ff8492cce6998c866becfd7bf8d3 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef HELP_H
-#define HELP_H
+#ifndef __PERF_HELP_H
+#define __PERF_HELP_H
 
 struct cmdnames {
        size_t alloc;
@@ -26,4 +26,4 @@ int is_in_cmdlist(struct cmdnames *c, const char *s);
 void list_commands(const char *title, struct cmdnames *main_cmds,
                   struct cmdnames *other_cmds);
 
-#endif /* HELP_H */
+#endif /* __PERF_HELP_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
new file mode 100644 (file)
index 0000000..0ebf6ee
--- /dev/null
@@ -0,0 +1,202 @@
+#include "hist.h"
+
+struct rb_root hist;
+struct rb_root collapse_hists;
+struct rb_root output_hists;
+int callchain;
+
+struct callchain_param callchain_param = {
+       .mode   = CHAIN_GRAPH_REL,
+       .min_percent = 0.5
+};
+
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+struct hist_entry *__hist_entry__add(struct addr_location *al,
+                                    struct symbol *sym_parent,
+                                    u64 count, bool *hit)
+{
+       struct rb_node **p = &hist.rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *he;
+       struct hist_entry entry = {
+               .thread = al->thread,
+               .map    = al->map,
+               .sym    = al->sym,
+               .ip     = al->addr,
+               .level  = al->level,
+               .count  = count,
+               .parent = sym_parent,
+       };
+       int cmp;
+
+       while (*p != NULL) {
+               parent = *p;
+               he = rb_entry(parent, struct hist_entry, rb_node);
+
+               cmp = hist_entry__cmp(&entry, he);
+
+               if (!cmp) {
+                       *hit = true;
+                       return he;
+               }
+
+               if (cmp < 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       he = malloc(sizeof(*he));
+       if (!he)
+               return NULL;
+       *he = entry;
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, &hist);
+       *hit = false;
+       return he;
+}
+
+int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       struct sort_entry *se;
+       int64_t cmp = 0;
+
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               cmp = se->cmp(left, right);
+               if (cmp)
+                       break;
+       }
+
+       return cmp;
+}
+
+int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+       struct sort_entry *se;
+       int64_t cmp = 0;
+
+       list_for_each_entry(se, &hist_entry__sort_list, list) {
+               int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+               f = se->collapse ?: se->cmp;
+
+               cmp = f(left, right);
+               if (cmp)
+                       break;
+       }
+
+       return cmp;
+}
+
+void hist_entry__free(struct hist_entry *he)
+{
+       free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+void collapse__insert_entry(struct hist_entry *he)
+{
+       struct rb_node **p = &collapse_hists.rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+       int64_t cmp;
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               cmp = hist_entry__collapse(iter, he);
+
+               if (!cmp) {
+                       iter->count += he->count;
+                       hist_entry__free(he);
+                       return;
+               }
+
+               if (cmp < 0)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+void collapse__resort(void)
+{
+       struct rb_node *next;
+       struct hist_entry *n;
+
+       if (!sort__need_collapse)
+               return;
+
+       next = rb_first(&hist);
+       while (next) {
+               n = rb_entry(next, struct hist_entry, rb_node);
+               next = rb_next(&n->rb_node);
+
+               rb_erase(&n->rb_node, &hist);
+               collapse__insert_entry(n);
+       }
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+void output__insert_entry(struct hist_entry *he, u64 min_callchain_hits)
+{
+       struct rb_node **p = &output_hists.rb_node;
+       struct rb_node *parent = NULL;
+       struct hist_entry *iter;
+
+       if (callchain)
+               callchain_param.sort(&he->sorted_chain, &he->callchain,
+                                     min_callchain_hits, &callchain_param);
+
+       while (*p != NULL) {
+               parent = *p;
+               iter = rb_entry(parent, struct hist_entry, rb_node);
+
+               if (he->count > iter->count)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+
+       rb_link_node(&he->rb_node, parent, p);
+       rb_insert_color(&he->rb_node, &output_hists);
+}
+
+void output__resort(u64 total_samples)
+{
+       struct rb_node *next;
+       struct hist_entry *n;
+       struct rb_root *tree = &hist;
+       u64 min_callchain_hits;
+
+       min_callchain_hits =
+               total_samples * (callchain_param.min_percent / 100);
+
+       if (sort__need_collapse)
+               tree = &collapse_hists;
+
+       next = rb_first(tree);
+
+       while (next) {
+               n = rb_entry(next, struct hist_entry, rb_node);
+               next = rb_next(&n->rb_node);
+
+               rb_erase(&n->rb_node, tree);
+               output__insert_entry(n, min_callchain_hits);
+       }
+}
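
The three trees above implement the usual two-pass histogram: fold entries that compare equal ("collapse"), then re-sort the survivors by count for output. The sketch below shows the same idea independently of perf's rbtrees, using plain arrays and qsort purely to keep it short; the symbol names are made up.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_entry { const char *sym; unsigned long count; };

static int by_count_desc(const void *a, const void *b)
{
        const struct toy_entry *l = a, *r = b;
        return (l->count < r->count) - (l->count > r->count);
}

int main(void)
{
        struct toy_entry in[] = {
                { "do_sys_open", 3 }, { "memcpy", 7 }, { "do_sys_open", 2 },
        };
        struct toy_entry out[3];
        size_t i;
        int j, n = 0;

        /* collapse: fold entries with an equal key, summing their counts */
        for (i = 0; i < sizeof(in) / sizeof(in[0]); i++) {
                for (j = 0; j < n && strcmp(out[j].sym, in[i].sym) != 0; j++)
                        ;
                if (j == n)
                        out[n++] = in[i];
                else
                        out[j].count += in[i].count;
        }

        /* output resort: order by count, highest first */
        qsort(out, n, sizeof(out[0]), by_count_desc);

        for (j = 0; j < n; j++)
                printf("%8lu  %s\n", out[j].count, out[j].sym);
        return 0;
}
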
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
new file mode 100644 (file)
index 0000000..3020db0
--- /dev/null
@@ -0,0 +1,50 @@
+#ifndef __PERF_HIST_H
+#define __PERF_HIST_H
+#include "../builtin.h"
+
+#include "util.h"
+
+#include "color.h"
+#include <linux/list.h>
+#include "cache.h"
+#include <linux/rbtree.h>
+#include "symbol.h"
+#include "string.h"
+#include "callchain.h"
+#include "strlist.h"
+#include "values.h"
+
+#include "../perf.h"
+#include "debug.h"
+#include "header.h"
+
+#include "parse-options.h"
+#include "parse-events.h"
+
+#include "thread.h"
+#include "sort.h"
+
+extern struct rb_root hist;
+extern struct rb_root collapse_hists;
+extern struct rb_root output_hists;
+extern int callchain;
+extern struct callchain_param callchain_param;
+extern unsigned long total;
+extern unsigned long total_mmap;
+extern unsigned long total_comm;
+extern unsigned long total_fork;
+extern unsigned long total_unknown;
+extern unsigned long total_lost;
+
+struct hist_entry *__hist_entry__add(struct addr_location *al,
+                                    struct symbol *parent,
+                                    u64 count, bool *hit);
+extern int64_t hist_entry__cmp(struct hist_entry *, struct hist_entry *);
+extern int64_t hist_entry__collapse(struct hist_entry *, struct hist_entry *);
+extern void hist_entry__free(struct hist_entry *);
+extern void collapse__insert_entry(struct hist_entry *);
+extern void collapse__resort(void);
+extern void output__insert_entry(struct hist_entry *, u64);
+extern void output__resort(u64);
+
+#endif /* __PERF_HIST_H */
diff --git a/tools/perf/util/include/asm/asm-offsets.h b/tools/perf/util/include/asm/asm-offsets.h
new file mode 100644 (file)
index 0000000..ed53894
--- /dev/null
@@ -0,0 +1 @@
+/* stub */
diff --git a/tools/perf/util/include/asm/bitops.h b/tools/perf/util/include/asm/bitops.h
new file mode 100644 (file)
index 0000000..58e9817
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef _PERF_ASM_BITOPS_H_
+#define _PERF_ASM_BITOPS_H_
+
+#include <sys/types.h>
+#include "../../types.h"
+#include <linux/compiler.h>
+
+/* CHECKME: Not sure both always match */
+#define BITS_PER_LONG  __WORDSIZE
+
+#include "../../../../include/asm-generic/bitops/__fls.h"
+#include "../../../../include/asm-generic/bitops/fls.h"
+#include "../../../../include/asm-generic/bitops/fls64.h"
+#include "../../../../include/asm-generic/bitops/__ffs.h"
+#include "../../../../include/asm-generic/bitops/ffz.h"
+#include "../../../../include/asm-generic/bitops/hweight.h"
+
+#endif
diff --git a/tools/perf/util/include/asm/bug.h b/tools/perf/util/include/asm/bug.h
new file mode 100644 (file)
index 0000000..7fcc681
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef _PERF_ASM_GENERIC_BUG_H
+#define _PERF_ASM_GENERIC_BUG_H
+
+#define __WARN_printf(arg...)  do { fprintf(stderr, arg); } while (0)
+
+#define WARN(condition, format...) ({          \
+       int __ret_warn_on = !!(condition);      \
+       if (unlikely(__ret_warn_on))            \
+               __WARN_printf(format);          \
+       unlikely(__ret_warn_on);                \
+})
+
+#define WARN_ONCE(condition, format...)        ({      \
+       static int __warned;                    \
+       int __ret_warn_once = !!(condition);    \
+                                               \
+       if (unlikely(__ret_warn_once))          \
+               if (WARN(!__warned, format))    \
+                       __warned = 1;           \
+       unlikely(__ret_warn_once);              \
+})
+#endif
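
WARN() and WARN_ONCE() here are userspace stand-ins for the kernel macros: they print to stderr and evaluate to the (boolean) condition, so they can sit directly in an if (). A small GCC-only usage sketch; the macros are copied from above, and unlikely() is defined locally only so the snippet builds on its own.

#include <stdio.h>

#define unlikely(x) __builtin_expect(!!(x), 0)  /* local definition for this sketch */
#define __WARN_printf(arg...)  do { fprintf(stderr, arg); } while (0)
#define WARN(condition, format...) ({          \
        int __ret_warn_on = !!(condition);      \
        if (unlikely(__ret_warn_on))            \
                __WARN_printf(format);          \
        unlikely(__ret_warn_on);                \
})
#define WARN_ONCE(condition, format...)        ({      \
        static int __warned;                    \
        int __ret_warn_once = !!(condition);    \
                                                \
        if (unlikely(__ret_warn_once))          \
                if (WARN(!__warned, format))    \
                        __warned = 1;           \
        unlikely(__ret_warn_once);              \
})

static int parse(int fd)
{
        if (WARN_ONCE(fd < 0, "parse called with a bad fd\n"))
                return -1;      /* warns the first time only, still reports the error */
        return 0;
}

int main(void)
{
        parse(-1);      /* prints the warning */
        parse(-1);      /* silent, but still returns -1 */
        return 0;
}
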
diff --git a/tools/perf/util/include/asm/byteorder.h b/tools/perf/util/include/asm/byteorder.h
new file mode 100644 (file)
index 0000000..b722abe
--- /dev/null
@@ -0,0 +1,2 @@
+#include <asm/types.h>
+#include "../../../../include/linux/swab.h"
diff --git a/tools/perf/util/include/asm/swab.h b/tools/perf/util/include/asm/swab.h
new file mode 100644 (file)
index 0000000..ed53894
--- /dev/null
@@ -0,0 +1 @@
+/* stub */
diff --git a/tools/perf/util/include/asm/uaccess.h b/tools/perf/util/include/asm/uaccess.h
new file mode 100644 (file)
index 0000000..d0f72b8
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef _PERF_ASM_UACCESS_H_
+#define _PERF_ASM_UACCESS_H_
+
+#define __get_user(src, dest)                                          \
+({                                                                     \
+       (src) = *dest;                                                  \
+       0;                                                              \
+})
+
+#define get_user       __get_user
+
+#define access_ok(type, addr, size)    1
+
+#endif
diff --git a/tools/perf/util/include/linux/bitmap.h b/tools/perf/util/include/linux/bitmap.h
new file mode 100644 (file)
index 0000000..9450763
--- /dev/null
@@ -0,0 +1,3 @@
+#include "../../../../include/linux/bitmap.h"
+#include "../../../../include/asm-generic/bitops/find.h"
+#include <linux/errno.h>
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h
new file mode 100644 (file)
index 0000000..8d63116
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef _PERF_LINUX_BITOPS_H_
+#define _PERF_LINUX_BITOPS_H_
+
+#define __KERNEL__
+
+#define CONFIG_GENERIC_FIND_NEXT_BIT
+#define CONFIG_GENERIC_FIND_FIRST_BIT
+#include "../../../../include/linux/bitops.h"
+
+#undef __KERNEL__
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+       addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
+}
+
+static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
+{
+       return ((1UL << (nr % BITS_PER_LONG)) &
+               (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
+}
+
+unsigned long generic_find_next_zero_le_bit(const unsigned long *addr, unsigned
+               long size, unsigned long offset);
+
+unsigned long generic_find_next_le_bit(const unsigned long *addr, unsigned
+               long size, unsigned long offset);
+
+#endif
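
These open-coded set_bit()/test_bit() are what perf_header__set_feat()/perf_header__has_feat() apply to the adds_features bitmap. The arithmetic in isolation, with local re-definitions (and an equivalent test_bit form) so the sketch builds outside the perf tree; HEADER_BUILD_ID is bit 2 per the enum added to header.h.

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG           (CHAR_BIT * sizeof(long))
#define BITS_TO_LONGS(n)        (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]

static void set_bit(int nr, unsigned long *addr)
{
        addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_bit(unsigned int nr, const unsigned long *addr)
{
        return (addr[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
        DECLARE_BITMAP(feats, 256) = { 0 };     /* like perf_header.adds_features */

        set_bit(2, feats);                      /* e.g. HEADER_BUILD_ID == 2 */
        printf("build-id feature: %d\n", test_bit(2, feats));
        printf("trace-info feature: %d\n", test_bit(1, feats));
        return 0;
}
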
diff --git a/tools/perf/util/include/linux/compiler.h b/tools/perf/util/include/linux/compiler.h
new file mode 100644 (file)
index 0000000..dfb0713
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef _PERF_LINUX_COMPILER_H_
+#define _PERF_LINUX_COMPILER_H_
+
+#ifndef __always_inline
+#define __always_inline        inline
+#endif
+#define __user
+#define __attribute_const__
+
+#endif
diff --git a/tools/perf/util/include/linux/ctype.h b/tools/perf/util/include/linux/ctype.h
new file mode 100644 (file)
index 0000000..a53d4ee
--- /dev/null
@@ -0,0 +1 @@
+#include "../util.h"
index a6b87390cb52648e3a5d02c5486f4eb2635504cd..21c0274c02fa3a5010c99711dab9fe3cd5e15bff 100644 (file)
@@ -1,6 +1,16 @@
 #ifndef PERF_LINUX_KERNEL_H_
 #define PERF_LINUX_KERNEL_H_
 
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+#define ALIGN(x,a)             __ALIGN_MASK(x,(typeof(x))(a)-1)
+#define __ALIGN_MASK(x,mask)   (((x)+(mask))&~(mask))
+
 #ifndef offsetof
 #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
 #endif
        _max1 > _max2 ? _max1 : _max2; })
 #endif
 
+#ifndef min
+#define min(x, y) ({                           \
+       typeof(x) _min1 = (x);                  \
+       typeof(y) _min2 = (y);                  \
+       (void) (&_min1 == &_min2);              \
+       _min1 < _min2 ? _min1 : _min2; })
+#endif
+
+#ifndef BUG_ON
+#define BUG_ON(cond) assert(!(cond))
+#endif
+
+/*
+ * Both need more care to handle endianness
+ * (Don't use bitmap_copy_le() for now)
+ */
+#define cpu_to_le64(x) (x)
+#define cpu_to_le32(x) (x)
+
+static inline int
+vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
+{
+       int i;
+       ssize_t ssize = size;
+
+       i = vsnprintf(buf, size, fmt, args);
+
+       return (i >= ssize) ? (ssize - 1) : i;
+}
+
+static inline int scnprintf(char * buf, size_t size, const char * fmt, ...)
+{
+       va_list args;
+       ssize_t ssize = size;
+       int i;
+
+       va_start(args, fmt);
+       i = vsnprintf(buf, size, fmt, args);
+       va_end(args);
+
+       return (i >= ssize) ? (ssize - 1) : i;
+}
+
+static inline unsigned long
+simple_strtoul(const char *nptr, char **endptr, int base)
+{
+       return strtoul(nptr, endptr, base);
+}
+
+#ifndef pr_fmt
+#define pr_fmt(fmt) fmt
+#endif
+
+#define pr_err(fmt, ...) \
+       do { fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__); } while (0)
+#define pr_warning(fmt, ...) \
+       do { fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__); } while (0)
+#define pr_info(fmt, ...) \
+       do { fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__); } while (0)
+#define pr_debug(fmt, ...) \
+       eprintf(1, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_debugN(n, fmt, ...) \
+       eprintf(n, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_debug2(fmt, ...) pr_debugN(2, pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_debug3(fmt, ...) pr_debugN(3, pr_fmt(fmt), ##__VA_ARGS__)
+
 #endif
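
ALIGN() rounds a value up to the next multiple of a power of two; it is what the event-synthesis code uses to pad comm/mmap record sizes to a u64 boundary. For example (typeof makes this GCC-specific, as in the header above):

#include <stdio.h>

#define ALIGN(x, a)             __ALIGN_MASK(x, (typeof(x))(a) - 1)
#define __ALIGN_MASK(x, mask)   (((x) + (mask)) & ~(mask))

int main(void)
{
        /* e.g. a 13-byte comm string padded to the next 8-byte boundary */
        size_t len = 13;

        printf("ALIGN(%zu, 8) = %zu\n", len, (size_t)ALIGN(len, 8));    /* 16 */
        printf("ALIGN(16, 8) = %d\n", ALIGN(16, 8));                    /* already aligned: 16 */
        return 0;
}
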
diff --git a/tools/perf/util/include/linux/string.h b/tools/perf/util/include/linux/string.h
new file mode 100644 (file)
index 0000000..3b2f590
--- /dev/null
@@ -0,0 +1 @@
+#include <string.h>
diff --git a/tools/perf/util/include/linux/types.h b/tools/perf/util/include/linux/types.h
new file mode 100644 (file)
index 0000000..196862a
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef _PERF_LINUX_TYPES_H_
+#define _PERF_LINUX_TYPES_H_
+
+#include <asm/types.h>
+
+#define DECLARE_BITMAP(name,bits) \
+       unsigned long name[BITS_TO_LONGS(bits)]
+
+#endif
index 0173abeef52c8dc3930a5c36420ff073ba8ddc87..b0fcb6d8a881d88b021cfcd7e1a08805b006ed3d 100644 (file)
@@ -1,8 +1,8 @@
-#ifndef LEVENSHTEIN_H
-#define LEVENSHTEIN_H
+#ifndef __PERF_LEVENSHTEIN_H
+#define __PERF_LEVENSHTEIN_H
 
 int levenshtein(const char *string1, const char *string2,
        int swap_penalty, int substition_penalty,
        int insertion_penalty, int deletion_penalty);
 
-#endif
+#endif /* __PERF_LEVENSHTEIN_H */
index 804e023827391f2ada3dfcac8a280a2d5d085aed..69f94fe9db20a059e4e00f687bd109a543c7d0e0 100644 (file)
@@ -3,6 +3,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
+#include "debug.h"
 
 static inline int is_anon_memory(const char *filename)
 {
@@ -19,13 +20,28 @@ static int strcommon(const char *pathname, char *cwd, int cwdlen)
        return n;
 }
 
- struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen)
+void map__init(struct map *self, enum map_type type,
+              u64 start, u64 end, u64 pgoff, struct dso *dso)
+{
+       self->type     = type;
+       self->start    = start;
+       self->end      = end;
+       self->pgoff    = pgoff;
+       self->dso      = dso;
+       self->map_ip   = map__map_ip;
+       self->unmap_ip = map__unmap_ip;
+       RB_CLEAR_NODE(&self->rb_node);
+}
+
+struct map *map__new(struct mmap_event *event, enum map_type type,
+                    char *cwd, int cwdlen)
 {
        struct map *self = malloc(sizeof(*self));
 
        if (self != NULL) {
                const char *filename = event->filename;
                char newfilename[PATH_MAX];
+               struct dso *dso;
                int anon;
 
                if (cwd) {
@@ -45,18 +61,15 @@ static int strcommon(const char *pathname, char *cwd, int cwdlen)
                        filename = newfilename;
                }
 
-               self->start = event->start;
-               self->end   = event->start + event->len;
-               self->pgoff = event->pgoff;
-
-               self->dso = dsos__findnew(filename);
-               if (self->dso == NULL)
+               dso = dsos__findnew(filename);
+               if (dso == NULL)
                        goto out_delete;
 
+               map__init(self, type, event->start, event->start + event->len,
+                         event->pgoff, dso);
+
                if (self->dso == vdso || anon)
-                       self->map_ip = vdso__map_ip;
-               else
-                       self->map_ip = map__map_ip;
+                       self->map_ip = self->unmap_ip = identity__map_ip;
        }
        return self;
 out_delete:
@@ -64,6 +77,72 @@ out_delete:
        return NULL;
 }
 
+void map__delete(struct map *self)
+{
+       free(self);
+}
+
+void map__fixup_start(struct map *self)
+{
+       struct rb_root *symbols = &self->dso->symbols[self->type];
+       struct rb_node *nd = rb_first(symbols);
+       if (nd != NULL) {
+               struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
+               self->start = sym->start;
+       }
+}
+
+void map__fixup_end(struct map *self)
+{
+       struct rb_root *symbols = &self->dso->symbols[self->type];
+       struct rb_node *nd = rb_last(symbols);
+       if (nd != NULL) {
+               struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
+               self->end = sym->end;
+       }
+}
+
+#define DSO__DELETED "(deleted)"
+
+struct symbol *map__find_symbol(struct map *self, u64 addr,
+                               symbol_filter_t filter)
+{
+       if (!dso__loaded(self->dso, self->type)) {
+               int nr = dso__load(self->dso, self, filter);
+
+               if (nr < 0) {
+                       if (self->dso->has_build_id) {
+                               char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+                               build_id__sprintf(self->dso->build_id,
+                                                 sizeof(self->dso->build_id),
+                                                 sbuild_id);
+                               pr_warning("%s with build id %s not found",
+                                          self->dso->long_name, sbuild_id);
+                       } else
+                               pr_warning("Failed to open %s",
+                                          self->dso->long_name);
+                       pr_warning(", continuing without symbols\n");
+                       return NULL;
+               } else if (nr == 0) {
+                       const char *name = self->dso->long_name;
+                       const size_t len = strlen(name);
+                       const size_t real_len = len - sizeof(DSO__DELETED);
+
+                       if (len > sizeof(DSO__DELETED) &&
+                           strcmp(name + real_len + 1, DSO__DELETED) == 0) {
+                               pr_warning("%.*s was updated, restart the long running apps that use it!\n",
+                                          (int)real_len, name);
+                       } else {
+                               pr_warning("no symbols found in %s, maybe install a debug package?\n", name);
+                       }
+                       return NULL;
+               }
+       }
+
+       return self->dso->find_symbol(self->dso, self->type, addr);
+}
+
 struct map *map__clone(struct map *self)
 {
        struct map *map = malloc(sizeof(*self));
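
The "(deleted)" handling in map__find_symbol() leans on a small piece of offset arithmetic: sizeof(DSO__DELETED) counts the trailing NUL, so len - sizeof(DSO__DELETED) is exactly the length of the original path when the name ends in " (deleted)". The check in isolation (the library path below is made up):

#include <stdio.h>
#include <string.h>

#define DSO__DELETED "(deleted)"

int main(void)
{
        /* what /proc/<pid>/maps shows after the mapped file was replaced */
        const char *name = "/usr/lib/libfoo-1.0.so (deleted)";
        const size_t len = strlen(name);
        /* sizeof counts the NUL, so this also swallows the separating space */
        const size_t real_len = len - sizeof(DSO__DELETED);

        if (len > sizeof(DSO__DELETED) &&
            strcmp(name + real_len + 1, DSO__DELETED) == 0)
                printf("%.*s was updated, restart the long running apps that use it!\n",
                       (int)real_len, name);
        return 0;
}
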
diff --git a/tools/perf/util/module.c b/tools/perf/util/module.c
deleted file mode 100644 (file)
index 0d8c85d..0000000
+++ /dev/null
@@ -1,545 +0,0 @@
-#include "util.h"
-#include "../perf.h"
-#include "string.h"
-#include "module.h"
-
-#include <libelf.h>
-#include <libgen.h>
-#include <gelf.h>
-#include <elf.h>
-#include <dirent.h>
-#include <sys/utsname.h>
-
-static unsigned int crc32(const char *p, unsigned int len)
-{
-       int i;
-       unsigned int crc = 0;
-
-       while (len--) {
-               crc ^= *p++;
-               for (i = 0; i < 8; i++)
-                       crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
-       }
-       return crc;
-}
-
-/* module section methods */
-
-struct sec_dso *sec_dso__new_dso(const char *name)
-{
-       struct sec_dso *self = malloc(sizeof(*self) + strlen(name) + 1);
-
-       if (self != NULL) {
-               strcpy(self->name, name);
-               self->secs = RB_ROOT;
-               self->find_section = sec_dso__find_section;
-       }
-
-       return self;
-}
-
-static void sec_dso__delete_section(struct section *self)
-{
-       free(((void *)self));
-}
-
-void sec_dso__delete_sections(struct sec_dso *self)
-{
-       struct section *pos;
-       struct rb_node *next = rb_first(&self->secs);
-
-       while (next) {
-               pos = rb_entry(next, struct section, rb_node);
-               next = rb_next(&pos->rb_node);
-               rb_erase(&pos->rb_node, &self->secs);
-               sec_dso__delete_section(pos);
-       }
-}
-
-void sec_dso__delete_self(struct sec_dso *self)
-{
-       sec_dso__delete_sections(self);
-       free(self);
-}
-
-static void sec_dso__insert_section(struct sec_dso *self, struct section *sec)
-{
-       struct rb_node **p = &self->secs.rb_node;
-       struct rb_node *parent = NULL;
-       const u64 hash = sec->hash;
-       struct section *s;
-
-       while (*p != NULL) {
-               parent = *p;
-               s = rb_entry(parent, struct section, rb_node);
-               if (hash < s->hash)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-       rb_link_node(&sec->rb_node, parent, p);
-       rb_insert_color(&sec->rb_node, &self->secs);
-}
-
-struct section *sec_dso__find_section(struct sec_dso *self, const char *name)
-{
-       struct rb_node *n;
-       u64 hash;
-       int len;
-
-       if (self == NULL)
-               return NULL;
-
-       len = strlen(name);
-       hash = crc32(name, len);
-
-       n = self->secs.rb_node;
-
-       while (n) {
-               struct section *s = rb_entry(n, struct section, rb_node);
-
-               if (hash < s->hash)
-                       n = n->rb_left;
-               else if (hash > s->hash)
-                       n = n->rb_right;
-               else {
-                       if (!strcmp(name, s->name))
-                               return s;
-                       else
-                               n = rb_next(&s->rb_node);
-               }
-       }
-
-       return NULL;
-}
-
-static size_t sec_dso__fprintf_section(struct section *self, FILE *fp)
-{
-       return fprintf(fp, "name:%s vma:%llx path:%s\n",
-                      self->name, self->vma, self->path);
-}
-
-size_t sec_dso__fprintf(struct sec_dso *self, FILE *fp)
-{
-       size_t ret = fprintf(fp, "dso: %s\n", self->name);
-
-       struct rb_node *nd;
-       for (nd = rb_first(&self->secs); nd; nd = rb_next(nd)) {
-               struct section *pos = rb_entry(nd, struct section, rb_node);
-               ret += sec_dso__fprintf_section(pos, fp);
-       }
-
-       return ret;
-}
-
-static struct section *section__new(const char *name, const char *path)
-{
-       struct section *self = calloc(1, sizeof(*self));
-
-       if (!self)
-               goto out_failure;
-
-       self->name = calloc(1, strlen(name) + 1);
-       if (!self->name)
-               goto out_failure;
-
-       self->path = calloc(1, strlen(path) + 1);
-       if (!self->path)
-               goto out_failure;
-
-       strcpy(self->name, name);
-       strcpy(self->path, path);
-       self->hash = crc32(self->name, strlen(name));
-
-       return self;
-
-out_failure:
-       if (self) {
-               if (self->name)
-                       free(self->name);
-               if (self->path)
-                       free(self->path);
-               free(self);
-       }
-
-       return NULL;
-}
-
-/* module methods */
-
-struct mod_dso *mod_dso__new_dso(const char *name)
-{
-       struct mod_dso *self = malloc(sizeof(*self) + strlen(name) + 1);
-
-       if (self != NULL) {
-               strcpy(self->name, name);
-               self->mods = RB_ROOT;
-               self->find_module = mod_dso__find_module;
-       }
-
-       return self;
-}
-
-static void mod_dso__delete_module(struct module *self)
-{
-       free(((void *)self));
-}
-
-void mod_dso__delete_modules(struct mod_dso *self)
-{
-       struct module *pos;
-       struct rb_node *next = rb_first(&self->mods);
-
-       while (next) {
-               pos = rb_entry(next, struct module, rb_node);
-               next = rb_next(&pos->rb_node);
-               rb_erase(&pos->rb_node, &self->mods);
-               mod_dso__delete_module(pos);
-       }
-}
-
-void mod_dso__delete_self(struct mod_dso *self)
-{
-       mod_dso__delete_modules(self);
-       free(self);
-}
-
-static void mod_dso__insert_module(struct mod_dso *self, struct module *mod)
-{
-       struct rb_node **p = &self->mods.rb_node;
-       struct rb_node *parent = NULL;
-       const u64 hash = mod->hash;
-       struct module *m;
-
-       while (*p != NULL) {
-               parent = *p;
-               m = rb_entry(parent, struct module, rb_node);
-               if (hash < m->hash)
-                       p = &(*p)->rb_left;
-               else
-                       p = &(*p)->rb_right;
-       }
-       rb_link_node(&mod->rb_node, parent, p);
-       rb_insert_color(&mod->rb_node, &self->mods);
-}
-
-struct module *mod_dso__find_module(struct mod_dso *self, const char *name)
-{
-       struct rb_node *n;
-       u64 hash;
-       int len;
-
-       if (self == NULL)
-               return NULL;
-
-       len = strlen(name);
-       hash = crc32(name, len);
-
-       n = self->mods.rb_node;
-
-       while (n) {
-               struct module *m = rb_entry(n, struct module, rb_node);
-
-               if (hash < m->hash)
-                       n = n->rb_left;
-               else if (hash > m->hash)
-                       n = n->rb_right;
-               else {
-                       if (!strcmp(name, m->name))
-                               return m;
-                       else
-                               n = rb_next(&m->rb_node);
-               }
-       }
-
-       return NULL;
-}
-
-static size_t mod_dso__fprintf_module(struct module *self, FILE *fp)
-{
-       return fprintf(fp, "name:%s path:%s\n", self->name, self->path);
-}
-
-size_t mod_dso__fprintf(struct mod_dso *self, FILE *fp)
-{
-       struct rb_node *nd;
-       size_t ret;
-
-       ret = fprintf(fp, "dso: %s\n", self->name);
-
-       for (nd = rb_first(&self->mods); nd; nd = rb_next(nd)) {
-               struct module *pos = rb_entry(nd, struct module, rb_node);
-
-               ret += mod_dso__fprintf_module(pos, fp);
-       }
-
-       return ret;
-}
-
-static struct module *module__new(const char *name, const char *path)
-{
-       struct module *self = calloc(1, sizeof(*self));
-
-       if (!self)
-               goto out_failure;
-
-       self->name = calloc(1, strlen(name) + 1);
-       if (!self->name)
-               goto out_failure;
-
-       self->path = calloc(1, strlen(path) + 1);
-       if (!self->path)
-               goto out_failure;
-
-       strcpy(self->name, name);
-       strcpy(self->path, path);
-       self->hash = crc32(self->name, strlen(name));
-
-       return self;
-
-out_failure:
-       if (self) {
-               if (self->name)
-                       free(self->name);
-               if (self->path)
-                       free(self->path);
-               free(self);
-       }
-
-       return NULL;
-}
-
-static int mod_dso__load_sections(struct module *mod)
-{
-       int count = 0, path_len;
-       struct dirent *entry;
-       char *line = NULL;
-       char *dir_path;
-       DIR *dir;
-       size_t n;
-
-       path_len = strlen("/sys/module/");
-       path_len += strlen(mod->name);
-       path_len += strlen("/sections/");
-
-       dir_path = calloc(1, path_len + 1);
-       if (dir_path == NULL)
-               goto out_failure;
-
-       strcat(dir_path, "/sys/module/");
-       strcat(dir_path, mod->name);
-       strcat(dir_path, "/sections/");
-
-       dir = opendir(dir_path);
-       if (dir == NULL)
-               goto out_free;
-
-       while ((entry = readdir(dir))) {
-               struct section *section;
-               char *path, *vma;
-               int line_len;
-               FILE *file;
-
-               if (!strcmp(".", entry->d_name) || !strcmp("..", entry->d_name))
-                       continue;
-
-               path = calloc(1, path_len + strlen(entry->d_name) + 1);
-               if (path == NULL)
-                       break;
-               strcat(path, dir_path);
-               strcat(path, entry->d_name);
-
-               file = fopen(path, "r");
-               if (file == NULL) {
-                       free(path);
-                       break;
-               }
-
-               line_len = getline(&line, &n, file);
-               if (line_len < 0) {
-                       free(path);
-                       fclose(file);
-                       break;
-               }
-
-               if (!line) {
-                       free(path);
-                       fclose(file);
-                       break;
-               }
-
-               line[--line_len] = '\0'; /* \n */
-
-               vma = strstr(line, "0x");
-               if (!vma) {
-                       free(path);
-                       fclose(file);
-                       break;
-               }
-               vma += 2;
-
-               section = section__new(entry->d_name, path);
-               if (!section) {
-                       fprintf(stderr, "load_sections: allocation error\n");
-                       free(path);
-                       fclose(file);
-                       break;
-               }
-
-               hex2u64(vma, &section->vma);
-               sec_dso__insert_section(mod->sections, section);
-
-               free(path);
-               fclose(file);
-               count++;
-       }
-
-       closedir(dir);
-       free(line);
-       free(dir_path);
-
-       return count;
-
-out_free:
-       free(dir_path);
-
-out_failure:
-       return count;
-}
-
-static int mod_dso__load_module_paths(struct mod_dso *self)
-{
-       struct utsname uts;
-       int count = 0, len, err = -1;
-       char *line = NULL;
-       FILE *file;
-       char *dpath, *dir;
-       size_t n;
-
-       if (uname(&uts) < 0)
-               return err;
-
-       len = strlen("/lib/modules/");
-       len += strlen(uts.release);
-       len += strlen("/modules.dep");
-
-       dpath = calloc(1, len + 1);
-       if (dpath == NULL)
-               return err;
-
-       strcat(dpath, "/lib/modules/");
-       strcat(dpath, uts.release);
-       strcat(dpath, "/modules.dep");
-
-       file = fopen(dpath, "r");
-       if (file == NULL)
-               goto out_failure;
-
-       dir = dirname(dpath);
-       if (!dir)
-               goto out_failure;
-       strcat(dir, "/");
-
-       while (!feof(file)) {
-               struct module *module;
-               char *name, *path, *tmp;
-               FILE *modfile;
-               int line_len;
-
-               line_len = getline(&line, &n, file);
-               if (line_len < 0)
-                       break;
-
-               if (!line)
-                       break;
-
-               line[--line_len] = '\0'; /* \n */
-
-               path = strchr(line, ':');
-               if (!path)
-                       break;
-               *path = '\0';
-
-               path = strdup(line);
-               if (!path)
-                       break;
-
-               if (!strstr(path, dir)) {
-                       if (strncmp(path, "kernel/", 7))
-                               break;
-
-                       free(path);
-                       path = calloc(1, strlen(dir) + strlen(line) + 1);
-                       if (!path)
-                               break;
-                       strcat(path, dir);
-                       strcat(path, line);
-               }
-
-               modfile = fopen(path, "r");
-               if (modfile == NULL)
-                       break;
-               fclose(modfile);
-
-               name = strdup(path);
-               if (!name)
-                       break;
-
-               name = strtok(name, "/");
-               tmp = name;
-
-               while (tmp) {
-                       tmp = strtok(NULL, "/");
-                       if (tmp)
-                               name = tmp;
-               }
-
-               name = strsep(&name, ".");
-               if (!name)
-                       break;
-
-               /* Quirk: replace '-' with '_' in all modules */
-               for (len = strlen(name); len; len--) {
-                       if (*(name+len) == '-')
-                               *(name+len) = '_';
-               }
-
-               module = module__new(name, path);
-               if (!module)
-                       break;
-               mod_dso__insert_module(self, module);
-
-               module->sections = sec_dso__new_dso("sections");
-               if (!module->sections)
-                       break;
-
-               module->active = mod_dso__load_sections(module);
-
-               if (module->active > 0)
-                       count++;
-       }
-
-       if (feof(file))
-               err = count;
-       else
-               fprintf(stderr, "load_module_paths: modules.dep parsing failure!\n");
-
-out_failure:
-       if (dpath)
-               free(dpath);
-       if (file)
-               fclose(file);
-       if (line)
-               free(line);
-
-       return err;
-}
-
-int mod_dso__load_modules(struct mod_dso *dso)
-{
-       int err;
-
-       err = mod_dso__load_module_paths(dso);
-
-       return err;
-}
diff --git a/tools/perf/util/module.h b/tools/perf/util/module.h
deleted file mode 100644 (file)
index 8a592ef..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef _PERF_MODULE_
-#define _PERF_MODULE_ 1
-
-#include <linux/types.h>
-#include "../types.h"
-#include <linux/list.h>
-#include <linux/rbtree.h>
-
-struct section {
-       struct rb_node  rb_node;
-       u64             hash;
-       u64             vma;
-       char            *name;
-       char            *path;
-};
-
-struct sec_dso {
-       struct list_head node;
-       struct rb_root   secs;
-       struct section    *(*find_section)(struct sec_dso *, const char *name);
-       char             name[0];
-};
-
-struct module {
-       struct rb_node  rb_node;
-       u64             hash;
-       char            *name;
-       char            *path;
-       struct sec_dso  *sections;
-       int             active;
-};
-
-struct mod_dso {
-       struct list_head node;
-       struct rb_root   mods;
-       struct module    *(*find_module)(struct mod_dso *, const char *name);
-       char             name[0];
-};
-
-struct sec_dso *sec_dso__new_dso(const char *name);
-void sec_dso__delete_sections(struct sec_dso *self);
-void sec_dso__delete_self(struct sec_dso *self);
-size_t sec_dso__fprintf(struct sec_dso *self, FILE *fp);
-struct section *sec_dso__find_section(struct sec_dso *self, const char *name);
-
-struct mod_dso *mod_dso__new_dso(const char *name);
-void mod_dso__delete_modules(struct mod_dso *self);
-void mod_dso__delete_self(struct mod_dso *self);
-size_t mod_dso__fprintf(struct mod_dso *self, FILE *fp);
-struct module *mod_dso__find_module(struct mod_dso *self, const char *name);
-int mod_dso__load_modules(struct mod_dso *dso);
-
-#endif /* _PERF_MODULE_ */
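
The module.c/module.h code removed above kept kernel modules and their /sys/module/<name>/sections entries in rb-trees keyed by a local crc32 of the name, falling back to strcmp() when hashes collide. As a reference for how that keying worked, here is a standalone sketch reusing the same simplified CRC-32 variant (polynomial 0xedb88320, no initial or final inversion); the section names are only examples.

#include <stdio.h>
#include <string.h>

/*
 * Same simplified CRC-32 variant as the crc32() deleted above; shown only
 * to illustrate how section/module names were keyed in the removed rb-trees.
 */
static unsigned int crc32(const char *p, unsigned int len)
{
        int i;
        unsigned int crc = 0;

        while (len--) {
                crc ^= *p++;
                for (i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320 : 0);
        }
        return crc;
}

int main(void)
{
        const char *names[] = { ".text", ".init.text", ".exit.text" };
        unsigned int i;

        for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
                printf("%-12s -> 0x%08x\n", names[i],
                       crc32(names[i], strlen(names[i])));
        return 0;
}
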
index 8cfb48cbbea01d8fbd5ba6aee541d07c360b7d3d..9e5dbd66d34d7aa95aad038d2168eaed5ef920b3 100644 (file)
@@ -1,4 +1,4 @@
-
+#include "../../../include/linux/hw_breakpoint.h"
 #include "util.h"
 #include "../perf.h"
 #include "parse-options.h"
@@ -7,10 +7,12 @@
 #include "string.h"
 #include "cache.h"
 #include "header.h"
+#include "debugfs.h"
 
-int                                    nr_counters;
+int                            nr_counters;
 
 struct perf_event_attr         attrs[MAX_COUNTERS];
+char                           *filters[MAX_COUNTERS];
 
 struct event_symbol {
        u8              type;
@@ -46,6 +48,8 @@ static struct event_symbol event_symbols[] = {
   { CSW(PAGE_FAULTS_MAJ),      "major-faults",         ""              },
   { CSW(CONTEXT_SWITCHES),     "context-switches",     "cs"            },
   { CSW(CPU_MIGRATIONS),       "cpu-migrations",       "migrations"    },
+  { CSW(ALIGNMENT_FAULTS),     "alignment-faults",     ""              },
+  { CSW(EMULATION_FAULTS),     "emulation-faults",     ""              },
 };
 
 #define __PERF_EVENT_FIELD(config, name) \
@@ -74,6 +78,8 @@ static const char *sw_event_names[] = {
        "CPU-migrations",
        "minor-faults",
        "major-faults",
+       "alignment-faults",
+       "emulation-faults",
 };
 
 #define MAX_ALIASES 8
@@ -148,16 +154,6 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
 
 #define MAX_EVENT_LENGTH 512
 
-int valid_debugfs_mount(const char *debugfs)
-{
-       struct statfs st_fs;
-
-       if (statfs(debugfs, &st_fs) < 0)
-               return -ENOENT;
-       else if (st_fs.f_type != (long) DEBUGFS_MAGIC)
-               return -ENOENT;
-       return 0;
-}
 
 struct tracepoint_path *tracepoint_id_to_path(u64 config)
 {
@@ -170,7 +166,7 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
        char evt_path[MAXPATHLEN];
        char dir_path[MAXPATHLEN];
 
-       if (valid_debugfs_mount(debugfs_path))
+       if (debugfs_valid_mountpoint(debugfs_path))
                return NULL;
 
        sys_dir = opendir(debugfs_path);
@@ -201,7 +197,7 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
                        if (id == config) {
                                closedir(evt_dir);
                                closedir(sys_dir);
-                               path = calloc(1, sizeof(path));
+                               path = zalloc(sizeof(path));
                                path->system = malloc(MAX_EVENT_LENGTH);
                                if (!path->system) {
                                        free(path);
@@ -509,7 +505,7 @@ static enum event_result parse_tracepoint_event(const char **strp,
        char sys_name[MAX_EVENT_LENGTH];
        unsigned int sys_length, evt_length;
 
-       if (valid_debugfs_mount(debugfs_path))
+       if (debugfs_valid_mountpoint(debugfs_path))
                return 0;
 
        evt_name = strchr(*strp, ':');
@@ -544,6 +540,81 @@ static enum event_result parse_tracepoint_event(const char **strp,
                                                     attr, strp);
 }
 
+static enum event_result
+parse_breakpoint_type(const char *type, const char **strp,
+                     struct perf_event_attr *attr)
+{
+       int i;
+
+       for (i = 0; i < 3; i++) {
+               if (!type[i])
+                       break;
+
+               switch (type[i]) {
+               case 'r':
+                       attr->bp_type |= HW_BREAKPOINT_R;
+                       break;
+               case 'w':
+                       attr->bp_type |= HW_BREAKPOINT_W;
+                       break;
+               case 'x':
+                       attr->bp_type |= HW_BREAKPOINT_X;
+                       break;
+               default:
+                       return EVT_FAILED;
+               }
+       }
+       if (!attr->bp_type) /* Default */
+               attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W;
+
+       *strp = type + i;
+
+       return EVT_HANDLED;
+}
+
+static enum event_result
+parse_breakpoint_event(const char **strp, struct perf_event_attr *attr)
+{
+       const char *target;
+       const char *type;
+       char *endaddr;
+       u64 addr;
+       enum event_result err;
+
+       target = strchr(*strp, ':');
+       if (!target)
+               return EVT_FAILED;
+
+       if (strncmp(*strp, "mem", target - *strp) != 0)
+               return EVT_FAILED;
+
+       target++;
+
+       addr = strtoull(target, &endaddr, 0);
+       if (target == endaddr)
+               return EVT_FAILED;
+
+       attr->bp_addr = addr;
+       *strp = endaddr;
+
+       type = strchr(target, ':');
+
+       /* If no type is defined, just rw as default */
+       if (!type) {
+               attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W;
+       } else {
+               err = parse_breakpoint_type(++type, strp, attr);
+               if (err == EVT_FAILED)
+                       return EVT_FAILED;
+       }
+
+       /* We should find a nice way to override the access type */
+       attr->bp_len = HW_BREAKPOINT_LEN_4;
+       attr->type = PERF_TYPE_BREAKPOINT;
+
+       return EVT_HANDLED;
+}
+
 static int check_events(const char *str, unsigned int i)
 {
        int n;
@@ -677,6 +748,12 @@ parse_event_symbols(const char **str, struct perf_event_attr *attr)
        if (ret != EVT_FAILED)
                goto modifier;
 
+       ret = parse_breakpoint_event(str, attr);
+       if (ret != EVT_FAILED)
+               goto modifier;
+
+       fprintf(stderr, "invalid or unsupported event: '%s'\n", *str);
+       fprintf(stderr, "Run 'perf list' for a list of valid events\n");
        return EVT_FAILED;
 
 modifier:
@@ -708,7 +785,6 @@ static void store_event_type(const char *orgname)
        perf_header__push_event(id, orgname);
 }
 
-
 int parse_events(const struct option *opt __used, const char *str, int unset __used)
 {
        struct perf_event_attr attr;
@@ -745,6 +821,28 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
        return 0;
 }
 
+int parse_filter(const struct option *opt __used, const char *str,
+                int unset __used)
+{
+       int i = nr_counters - 1;
+       int len = strlen(str);
+
+       if (i < 0 || attrs[i].type != PERF_TYPE_TRACEPOINT) {
+               fprintf(stderr,
+                       "-F option should follow a -e tracepoint option\n");
+               return -1;
+       }
+
+       filters[i] = malloc(len + 1);
+       if (!filters[i]) {
+               fprintf(stderr, "not enough memory to hold filter string\n");
+               return -1;
+       }
+       strcpy(filters[i], str);
+
+       return 0;
+}
+
 static const char * const event_type_descriptors[] = {
        "",
        "Hardware event",
@@ -764,7 +862,7 @@ static void print_tracepoint_events(void)
        char evt_path[MAXPATHLEN];
        char dir_path[MAXPATHLEN];
 
-       if (valid_debugfs_mount(debugfs_path))
+       if (debugfs_valid_mountpoint(debugfs_path))
                return;
 
        sys_dir = opendir(debugfs_path);
@@ -782,7 +880,7 @@ static void print_tracepoint_events(void)
                for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
                        snprintf(evt_path, MAXPATHLEN, "%s:%s",
                                 sys_dirent.d_name, evt_dirent.d_name);
-                       fprintf(stderr, "  %-42s [%s]\n", evt_path,
+                       printf("  %-42s [%s]\n", evt_path,
                                event_type_descriptors[PERF_TYPE_TRACEPOINT+1]);
                }
                closedir(evt_dir);
@@ -799,8 +897,8 @@ void print_events(void)
        unsigned int i, type, op, prev_type = -1;
        char name[40];
 
-       fprintf(stderr, "\n");
-       fprintf(stderr, "List of pre-defined events (to be used in -e):\n");
+       printf("\n");
+       printf("List of pre-defined events (to be used in -e):\n");
 
        for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
                type = syms->type + 1;
@@ -808,19 +906,19 @@ void print_events(void)
                        type = 0;
 
                if (type != prev_type)
-                       fprintf(stderr, "\n");
+                       printf("\n");
 
                if (strlen(syms->alias))
                        sprintf(name, "%s OR %s", syms->symbol, syms->alias);
                else
                        strcpy(name, syms->symbol);
-               fprintf(stderr, "  %-42s [%s]\n", name,
+               printf("  %-42s [%s]\n", name,
                        event_type_descriptors[type]);
 
                prev_type = type;
        }
 
-       fprintf(stderr, "\n");
+       printf("\n");
        for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
                for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
                        /* skip invalid cache type */
@@ -828,17 +926,20 @@ void print_events(void)
                                continue;
 
                        for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
-                               fprintf(stderr, "  %-42s [%s]\n",
+                               printf("  %-42s [%s]\n",
                                        event_cache_name(type, op, i),
                                        event_type_descriptors[4]);
                        }
                }
        }
 
-       fprintf(stderr, "\n");
-       fprintf(stderr, "  %-42s [raw hardware event descriptor]\n",
+       printf("\n");
+       printf("  %-42s [raw hardware event descriptor]\n",
                "rNNN");
-       fprintf(stderr, "\n");
+       printf("\n");
+
+       printf("  %-42s [hardware breakpoint]\n", "mem:<addr>[:access]");
+       printf("\n");
 
        print_tracepoint_events();
 
index 30c608112845954bb2de6a8831862830f7f4467b..b8c1f64bc9351ac7f889340c20eb7755614f5227 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _PARSE_EVENTS_H
-#define _PARSE_EVENTS_H
+#ifndef __PERF_PARSE_EVENTS_H
+#define __PERF_PARSE_EVENTS_H
 /*
  * Parse symbolic events/counts passed in as options:
  */
@@ -17,11 +17,13 @@ extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
 extern int                     nr_counters;
 
 extern struct perf_event_attr attrs[MAX_COUNTERS];
+extern char *filters[MAX_COUNTERS];
 
 extern const char *event_name(int ctr);
 extern const char *__event_name(int type, u64 config);
 
 extern int parse_events(const struct option *opt, const char *str, int unset);
+extern int parse_filter(const struct option *opt, const char *str, int unset);
 
 #define EVENTS_HELP_MAX (128*1024)
 
@@ -31,4 +33,4 @@ extern char debugfs_path[];
 extern int valid_debugfs_mount(const char *debugfs);
 
 
-#endif /* _PARSE_EVENTS_H */
+#endif /* __PERF_PARSE_EVENTS_H */
index 2ee248ff27e5c5af9fe315c198308e9a2b9fa584..948805af43c21e45296c6caa8adda25106a66423 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef PARSE_OPTIONS_H
-#define PARSE_OPTIONS_H
+#ifndef __PERF_PARSE_OPTIONS_H
+#define __PERF_PARSE_OPTIONS_H
 
 enum parse_opt_type {
        /* special types */
@@ -174,4 +174,4 @@ extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
 
 extern const char *parse_options_fix_filename(const char *prefix, const char *file);
 
-#endif
+#endif /* __PERF_PARSE_OPTIONS_H */
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
new file mode 100644 (file)
index 0000000..cd7fbda
--- /dev/null
@@ -0,0 +1,484 @@
+/*
+ * probe-event.c : perf-probe definition to kprobe_events format converter
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <limits.h>
+
+#undef _GNU_SOURCE
+#include "event.h"
+#include "string.h"
+#include "strlist.h"
+#include "debug.h"
+#include "parse-events.h"  /* For debugfs_path */
+#include "probe-event.h"
+
+#define MAX_CMDLEN 256
+#define MAX_PROBE_ARGS 128
+#define PERFPROBE_GROUP "probe"
+
+#define semantic_error(msg ...) die("Semantic error :" msg)
+
+/* If there is no space to write, returns -E2BIG. */
+static int e_snprintf(char *str, size_t size, const char *format, ...)
+{
+       int ret;
+       va_list ap;
+       va_start(ap, format);
+       ret = vsnprintf(str, size, format, ap);
+       va_end(ap);
+       if (ret >= (int)size)
+               ret = -E2BIG;
+       return ret;
+}
+
+/* Parse probepoint definition. */
+static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
+{
+       char *ptr, *tmp;
+       char c, nc = 0;
+       /*
+        * <Syntax>
+        * perf probe SRC:LN
+        * perf probe FUNC[+OFFS|%return][@SRC]
+        */
+
+       ptr = strpbrk(arg, ":+@%");
+       if (ptr) {
+               nc = *ptr;
+               *ptr++ = '\0';
+       }
+
+       /* Check arg is function or file and copy it */
+       if (strchr(arg, '.'))   /* File */
+               pp->file = strdup(arg);
+       else                    /* Function */
+               pp->function = strdup(arg);
+       DIE_IF(pp->file == NULL && pp->function == NULL);
+
+       /* Parse other options */
+       while (ptr) {
+               arg = ptr;
+               c = nc;
+               ptr = strpbrk(arg, ":+@%");
+               if (ptr) {
+                       nc = *ptr;
+                       *ptr++ = '\0';
+               }
+               switch (c) {
+               case ':':       /* Line number */
+                       pp->line = strtoul(arg, &tmp, 0);
+                       if (*tmp != '\0')
+                               semantic_error("There is a non-digit character"
+                                               " in line number.");
+                       break;
+               case '+':       /* Byte offset from a symbol */
+                       pp->offset = strtoul(arg, &tmp, 0);
+                       if (*tmp != '\0')
+                               semantic_error("There is a non-digit character"
+                                               " in offset.");
+                       break;
+               case '@':       /* File name */
+                       if (pp->file)
+                               semantic_error("SRC@SRC is not allowed.");
+                       pp->file = strdup(arg);
+                       DIE_IF(pp->file == NULL);
+                       if (ptr)
+                               semantic_error("@SRC must be the last "
+                                              "option.");
+                       break;
+               case '%':       /* Probe places */
+                       if (strcmp(arg, "return") == 0) {
+                               pp->retprobe = 1;
+                       } else  /* Others not supported yet */
+                               semantic_error("%%%s is not supported.", arg);
+                       break;
+               default:
+                       DIE_IF("Program has a bug.");
+                       break;
+               }
+       }
+
+       /* Exclusion check */
+       if (pp->line && pp->offset)
+               semantic_error("Offset can't be used with line number.");
+
+       if (!pp->line && pp->file && !pp->function)
+               semantic_error("File always requires line number.");
+
+       if (pp->offset && !pp->function)
+               semantic_error("Offset requires an entry function.");
+
+       if (pp->retprobe && !pp->function)
+               semantic_error("Return probe requires an entry function.");
+
+       if ((pp->offset || pp->line) && pp->retprobe)
+               semantic_error("Offset/Line can't be used with return probe.");
+
+       pr_debug("symbol:%s file:%s line:%d offset:%d, return:%d\n",
+                pp->function, pp->file, pp->line, pp->offset, pp->retprobe);
+}
+
+/* Parse perf-probe event definition */
+int parse_perf_probe_event(const char *str, struct probe_point *pp)
+{
+       char **argv;
+       int argc, i, need_dwarf = 0;
+
+       argv = argv_split(str, &argc);
+       if (!argv)
+               die("argv_split failed.");
+       if (argc > MAX_PROBE_ARGS + 1)
+               semantic_error("Too many arguments");
+
+       /* Parse probe point */
+       parse_perf_probe_probepoint(argv[0], pp);
+       if (pp->file || pp->line)
+               need_dwarf = 1;
+
+       /* Copy arguments and ensure return probe has no C argument */
+       pp->nr_args = argc - 1;
+       pp->args = zalloc(sizeof(char *) * pp->nr_args);
+       for (i = 0; i < pp->nr_args; i++) {
+               pp->args[i] = strdup(argv[i + 1]);
+               if (!pp->args[i])
+                       die("Failed to copy argument.");
+               if (is_c_varname(pp->args[i])) {
+                       if (pp->retprobe)
+                               semantic_error("You can't specify local"
+                                               " variable for kretprobe");
+                       need_dwarf = 1;
+               }
+       }
+
+       argv_free(argv);
+       return need_dwarf;
+}
+
+/* Parse kprobe_events event into struct probe_point */
+void parse_trace_kprobe_event(const char *str, char **group, char **event,
+                             struct probe_point *pp)
+{
+       char pr;
+       char *p;
+       int ret, i, argc;
+       char **argv;
+
+       pr_debug("Parsing kprobe_events: %s\n", str);
+       argv = argv_split(str, &argc);
+       if (!argv)
+               die("argv_split failed.");
+       if (argc < 2)
+               semantic_error("Too few arguments.");
+
+       /* Scan event and group name. */
+       ret = sscanf(argv[0], "%c:%a[^/ \t]/%a[^ \t]",
+                    &pr, (float *)(void *)group, (float *)(void *)event);
+       if (ret != 3)
+               semantic_error("Failed to parse event name: %s", argv[0]);
+       pr_debug("Group:%s Event:%s probe:%c\n", *group, *event, pr);
+
+       if (!pp)
+               goto end;
+
+       pp->retprobe = (pr == 'r');
+
+       /* Scan function name and offset */
+       ret = sscanf(argv[1], "%a[^+]+%d", (float *)(void *)&pp->function, &pp->offset);
+       if (ret == 1)
+               pp->offset = 0;
+
+       /* kprobe_events doesn't have this information */
+       pp->line = 0;
+       pp->file = NULL;
+
+       pp->nr_args = argc - 2;
+       pp->args = zalloc(sizeof(char *) * pp->nr_args);
+       for (i = 0; i < pp->nr_args; i++) {
+               p = strchr(argv[i + 2], '=');
+               if (p)  /* We don't need to know which register is assigned. */
+                       *p = '\0';
+               pp->args[i] = strdup(argv[i + 2]);
+               if (!pp->args[i])
+                       die("Failed to copy argument.");
+       }
+
+end:
+       argv_free(argv);
+}
+
+int synthesize_perf_probe_event(struct probe_point *pp)
+{
+       char *buf;
+       char offs[64] = "", line[64] = "";
+       int i, len, ret;
+
+       pp->probes[0] = buf = zalloc(MAX_CMDLEN);
+       if (!buf)
+               die("Failed to allocate memory by zalloc.");
+       if (pp->offset) {
+               ret = e_snprintf(offs, 64, "+%d", pp->offset);
+               if (ret <= 0)
+                       goto error;
+       }
+       if (pp->line) {
+               ret = e_snprintf(line, 64, ":%d", pp->line);
+               if (ret <= 0)
+                       goto error;
+       }
+
+       if (pp->function)
+               ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s", pp->function,
+                                offs, pp->retprobe ? "%return" : "", line);
+       else
+               ret = e_snprintf(buf, MAX_CMDLEN, "%s%s", pp->file, line);
+       if (ret <= 0)
+               goto error;
+       len = ret;
+
+       for (i = 0; i < pp->nr_args; i++) {
+               ret = e_snprintf(&buf[len], MAX_CMDLEN - len, " %s",
+                                pp->args[i]);
+               if (ret <= 0)
+                       goto error;
+               len += ret;
+       }
+       pp->found = 1;
+
+       return pp->found;
+error:
+       free(pp->probes[0]);
+
+       return ret;
+}
+
+int synthesize_trace_kprobe_event(struct probe_point *pp)
+{
+       char *buf;
+       int i, len, ret;
+
+       pp->probes[0] = buf = zalloc(MAX_CMDLEN);
+       if (!buf)
+               die("Failed to allocate memory by zalloc.");
+       ret = e_snprintf(buf, MAX_CMDLEN, "%s+%d", pp->function, pp->offset);
+       if (ret <= 0)
+               goto error;
+       len = ret;
+
+       for (i = 0; i < pp->nr_args; i++) {
+               ret = e_snprintf(&buf[len], MAX_CMDLEN - len, " %s",
+                                pp->args[i]);
+               if (ret <= 0)
+                       goto error;
+               len += ret;
+       }
+       pp->found = 1;
+
+       return pp->found;
+error:
+       free(pp->probes[0]);
+
+       return ret;
+}
+
+static int open_kprobe_events(int flags, int mode)
+{
+       char buf[PATH_MAX];
+       int ret;
+
+       ret = e_snprintf(buf, PATH_MAX, "%s/../kprobe_events", debugfs_path);
+       if (ret < 0)
+               die("Failed to make kprobe_events path.");
+
+       ret = open(buf, flags, mode);
+       if (ret < 0) {
+               if (errno == ENOENT)
+                       die("kprobe_events file does not exist -"
+                           " please rebuild with CONFIG_KPROBE_TRACER.");
+               else
+                       die("Could not open kprobe_events file: %s",
+                           strerror(errno));
+       }
+       return ret;
+}
+
+/* Get raw string list of current kprobe_events */
+static struct strlist *get_trace_kprobe_event_rawlist(int fd)
+{
+       int ret, idx;
+       FILE *fp;
+       char buf[MAX_CMDLEN];
+       char *p;
+       struct strlist *sl;
+
+       sl = strlist__new(true, NULL);
+
+       fp = fdopen(dup(fd), "r");
+       while (!feof(fp)) {
+               p = fgets(buf, MAX_CMDLEN, fp);
+               if (!p)
+                       break;
+
+               idx = strlen(p) - 1;
+               if (p[idx] == '\n')
+                       p[idx] = '\0';
+               ret = strlist__add(sl, buf);
+               if (ret < 0)
+                       die("strlist__add failed: %s", strerror(-ret));
+       }
+       fclose(fp);
+
+       return sl;
+}
+
+/* Free and zero clear probe_point */
+static void clear_probe_point(struct probe_point *pp)
+{
+       int i;
+
+       if (pp->function)
+               free(pp->function);
+       if (pp->file)
+               free(pp->file);
+       for (i = 0; i < pp->nr_args; i++)
+               free(pp->args[i]);
+       if (pp->args)
+               free(pp->args);
+       for (i = 0; i < pp->found; i++)
+               free(pp->probes[i]);
+       memset(pp, 0, sizeof(*pp));
+}
+
+/* List the current perf-probe events */
+void show_perf_probe_events(void)
+{
+       unsigned int i;
+       int fd;
+       char *group, *event;
+       struct probe_point pp;
+       struct strlist *rawlist;
+       struct str_node *ent;
+
+       fd = open_kprobe_events(O_RDONLY, 0);
+       rawlist = get_trace_kprobe_event_rawlist(fd);
+       close(fd);
+
+       for (i = 0; i < strlist__nr_entries(rawlist); i++) {
+               ent = strlist__entry(rawlist, i);
+               parse_trace_kprobe_event(ent->s, &group, &event, &pp);
+               synthesize_perf_probe_event(&pp);
+               printf("[%s:%s]\t%s\n", group, event, pp.probes[0]);
+               free(group);
+               free(event);
+               clear_probe_point(&pp);
+       }
+
+       strlist__delete(rawlist);
+}
+
+/* Get current perf-probe event names */
+static struct strlist *get_perf_event_names(int fd)
+{
+       unsigned int i;
+       char *group, *event;
+       struct strlist *sl, *rawlist;
+       struct str_node *ent;
+
+       rawlist = get_trace_kprobe_event_rawlist(fd);
+
+       sl = strlist__new(false, NULL);
+       for (i = 0; i < strlist__nr_entries(rawlist); i++) {
+               ent = strlist__entry(rawlist, i);
+               parse_trace_kprobe_event(ent->s, &group, &event, NULL);
+               strlist__add(sl, event);
+               free(group);
+       }
+
+       strlist__delete(rawlist);
+
+       return sl;
+}
+
+static int write_trace_kprobe_event(int fd, const char *buf)
+{
+       int ret;
+
+       ret = write(fd, buf, strlen(buf));
+       if (ret <= 0)
+               die("Failed to create event.");
+       else
+               printf("Added new event: %s\n", buf);
+
+       return ret;
+}
+
+static void get_new_event_name(char *buf, size_t len, const char *base,
+                              struct strlist *namelist)
+{
+       int i, ret;
+       for (i = 0; i < MAX_EVENT_INDEX; i++) {
+               ret = e_snprintf(buf, len, "%s_%d", base, i);
+               if (ret < 0)
+                       die("snprintf() failed: %s", strerror(-ret));
+               if (!strlist__has_entry(namelist, buf))
+                       break;
+       }
+       if (i == MAX_EVENT_INDEX)
+               die("Too many events are on the same function.");
+}
+
+void add_trace_kprobe_events(struct probe_point *probes, int nr_probes)
+{
+       int i, j, fd;
+       struct probe_point *pp;
+       char buf[MAX_CMDLEN];
+       char event[64];
+       struct strlist *namelist;
+
+       fd = open_kprobe_events(O_RDWR, O_APPEND);
+       /* Get current event names */
+       namelist = get_perf_event_names(fd);
+
+       for (j = 0; j < nr_probes; j++) {
+               pp = probes + j;
+               for (i = 0; i < pp->found; i++) {
+                       /* Get an unused new event name */
+                       get_new_event_name(event, 64, pp->function, namelist);
+                       snprintf(buf, MAX_CMDLEN, "%c:%s/%s %s\n",
+                                pp->retprobe ? 'r' : 'p',
+                                PERFPROBE_GROUP, event,
+                                pp->probes[i]);
+                       write_trace_kprobe_event(fd, buf);
+                       /* Add added event name to namelist */
+                       strlist__add(namelist, event);
+               }
+       }
+       close(fd);
+}
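
probe-event.c above converts a perf-probe definition into the kprobe_events text format, one line per probe of the form "p:GROUP/EVENT FUNC+OFFS ARGS" ("r:" for return probes), with get_new_event_name() appending _0, _1, ... to the function name until an unused event name is found. The sketch below only illustrates the shape of that line; format_kprobe_event() and the schedule+8 example are not code from the patch.

#include <stdio.h>

#define PERFPROBE_GROUP "probe"

/*
 * Illustrative only: composes the kind of line that add_trace_kprobe_events()
 * above writes to the kprobe_events file under debugfs.  In the patch the
 * probe body comes from synthesize_trace_kprobe_event() and the _<idx>
 * suffix from get_new_event_name().
 */
static void format_kprobe_event(char *buf, size_t len, int retprobe,
                                const char *func, int offs, int idx,
                                const char *args)
{
        snprintf(buf, len, "%c:%s/%s_%d %s+%d %s\n",
                 retprobe ? 'r' : 'p', PERFPROBE_GROUP, func, idx,
                 func, offs, args);
}

int main(void)
{
        char buf[256];

        format_kprobe_event(buf, sizeof(buf), 0, "schedule", 8, 0, "prev");
        fputs(buf, stdout);     /* prints: p:probe/schedule_0 schedule+8 prev */
        return 0;
}
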
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
new file mode 100644 (file)
index 0000000..0c6fe56
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef _PROBE_EVENT_H
+#define _PROBE_EVENT_H
+
+#include "probe-finder.h"
+#include "strlist.h"
+
+extern int parse_perf_probe_event(const char *str, struct probe_point *pp);
+extern int synthesize_perf_probe_event(struct probe_point *pp);
+extern void parse_trace_kprobe_event(const char *str, char **group,
+                                    char **event, struct probe_point *pp);
+extern int synthesize_trace_kprobe_event(struct probe_point *pp);
+extern void add_trace_kprobe_events(struct probe_point *probes, int nr_probes);
+extern void show_perf_probe_events(void);
+
+/* Maximum index number of event-name postfix */
+#define MAX_EVENT_INDEX        1024
+
+#endif /*_PROBE_EVENT_H */
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
new file mode 100644 (file)
index 0000000..293cdfc
--- /dev/null
@@ -0,0 +1,732 @@
+/*
+ * probe-finder.c : C expression to kprobe event converter
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <ctype.h>
+
+#include "event.h"
+#include "debug.h"
+#include "util.h"
+#include "probe-finder.h"
+
+
+/* Dwarf_Die Linkage to parent Die */
+struct die_link {
+       struct die_link *parent;        /* Parent die */
+       Dwarf_Die die;                  /* Current die */
+};
+
+static Dwarf_Debug __dw_debug;
+static Dwarf_Error __dw_error;
+
+/*
+ * Generic dwarf analysis helpers
+ */
+
+#define X86_32_MAX_REGS 8
+const char *x86_32_regs_table[X86_32_MAX_REGS] = {
+       "%ax",
+       "%cx",
+       "%dx",
+       "%bx",
+       "$stack",       /* Stack address instead of %sp */
+       "%bp",
+       "%si",
+       "%di",
+};
+
+#define X86_64_MAX_REGS 16
+const char *x86_64_regs_table[X86_64_MAX_REGS] = {
+       "%ax",
+       "%dx",
+       "%cx",
+       "%bx",
+       "%si",
+       "%di",
+       "%bp",
+       "%sp",
+       "%r8",
+       "%r9",
+       "%r10",
+       "%r11",
+       "%r12",
+       "%r13",
+       "%r14",
+       "%r15",
+};
+
+/* TODO: switching by dwarf address size */
+#ifdef __x86_64__
+#define ARCH_MAX_REGS X86_64_MAX_REGS
+#define arch_regs_table x86_64_regs_table
+#else
+#define ARCH_MAX_REGS X86_32_MAX_REGS
+#define arch_regs_table x86_32_regs_table
+#endif
+
+/* Return architecture dependent register string (for kprobe-tracer) */
+static const char *get_arch_regstr(unsigned int n)
+{
+       return (n < ARCH_MAX_REGS) ? arch_regs_table[n] : NULL;
+}
+
+/*
+ * Compare the tail of two strings.
+ * Return 0 if the whole of either string matches the tail of the other.
+ */
+static int strtailcmp(const char *s1, const char *s2)
+{
+       int i1 = strlen(s1);
+       int i2 = strlen(s2);
+       while (--i1 > 0 && --i2 > 0) {
+               if (s1[i1] != s2[i2])
+                       return s1[i1] - s2[i2];
+       }
+       return 0;
+}
+
+/* Find the fileno of the target file. */
+static Dwarf_Unsigned cu_find_fileno(Dwarf_Die cu_die, const char *fname)
+{
+       Dwarf_Signed cnt, i;
+       Dwarf_Unsigned found = 0;
+       char **srcs;
+       int ret;
+
+       if (!fname)
+               return 0;
+
+       ret = dwarf_srcfiles(cu_die, &srcs, &cnt, &__dw_error);
+       if (ret == DW_DLV_OK) {
+               for (i = 0; i < cnt && !found; i++) {
+                       if (strtailcmp(srcs[i], fname) == 0)
+                               found = i + 1;
+                       dwarf_dealloc(__dw_debug, srcs[i], DW_DLA_STRING);
+               }
+               for (; i < cnt; i++)
+                       dwarf_dealloc(__dw_debug, srcs[i], DW_DLA_STRING);
+               dwarf_dealloc(__dw_debug, srcs, DW_DLA_LIST);
+       }
+       if (found)
+               pr_debug("found fno: %d\n", (int)found);
+       return found;
+}
+
+/* Compare diename and tname */
+static int die_compare_name(Dwarf_Die dw_die, const char *tname)
+{
+       char *name;
+       int ret;
+       ret = dwarf_diename(dw_die, &name, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_OK) {
+               ret = strcmp(tname, name);
+               dwarf_dealloc(__dw_debug, name, DW_DLA_STRING);
+       } else
+               ret = -1;
+       return ret;
+}
+
+/* Check the address is in the subprogram(function). */
+static int die_within_subprogram(Dwarf_Die sp_die, Dwarf_Addr addr,
+                                Dwarf_Signed *offs)
+{
+       Dwarf_Addr lopc, hipc;
+       int ret;
+
+       /* TODO: check ranges */
+       ret = dwarf_lowpc(sp_die, &lopc, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_NO_ENTRY)
+               return 0;
+       ret = dwarf_highpc(sp_die, &hipc, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       if (lopc <= addr && addr < hipc) {
+               *offs = addr - lopc;
+               return 1;
+       } else
+               return 0;
+}
+
+/* Check the die is inlined function */
+static Dwarf_Bool die_inlined_subprogram(Dwarf_Die dw_die)
+{
+       /* TODO: check strictly */
+       Dwarf_Bool inl;
+       int ret;
+
+       ret = dwarf_hasattr(dw_die, DW_AT_inline, &inl, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       return inl;
+}
+
+/* Get the offset of abstract_origin */
+static Dwarf_Off die_get_abstract_origin(Dwarf_Die dw_die)
+{
+       Dwarf_Attribute attr;
+       Dwarf_Off cu_offs;
+       int ret;
+
+       ret = dwarf_attr(dw_die, DW_AT_abstract_origin, &attr, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       ret = dwarf_formref(attr, &cu_offs, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
+       return cu_offs;
+}
+
+/* Get entry pc (or low pc, 1st entry of ranges) of the die */
+static Dwarf_Addr die_get_entrypc(Dwarf_Die dw_die)
+{
+       Dwarf_Attribute attr;
+       Dwarf_Addr addr;
+       Dwarf_Off offs;
+       Dwarf_Ranges *ranges;
+       Dwarf_Signed cnt;
+       int ret;
+
+       /* Try to get entry pc */
+       ret = dwarf_attr(dw_die, DW_AT_entry_pc, &attr, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_OK) {
+               ret = dwarf_formaddr(attr, &addr, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+               dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
+               return addr;
+       }
+
+       /* Try to get low pc */
+       ret = dwarf_lowpc(dw_die, &addr, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_OK)
+               return addr;
+
+       /* Try to get ranges */
+       ret = dwarf_attr(dw_die, DW_AT_ranges, &attr, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       ret = dwarf_formref(attr, &offs, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       ret = dwarf_get_ranges(__dw_debug, offs, &ranges, &cnt, NULL,
+                               &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       addr = ranges[0].dwr_addr1;
+       dwarf_ranges_dealloc(__dw_debug, ranges, cnt);
+       return addr;
+}
+
+/*
+ * Search a Die from Die tree.
+ * Note: cur_link->die should be deallocated in this function.
+ */
+static int __search_die_tree(struct die_link *cur_link,
+                            int (*die_cb)(struct die_link *, void *),
+                            void *data)
+{
+       Dwarf_Die new_die;
+       struct die_link new_link;
+       int ret;
+
+       if (!die_cb)
+               return 0;
+
+       /* Check current die */
+       while (!(ret = die_cb(cur_link, data))) {
+               /* Check child die */
+               ret = dwarf_child(cur_link->die, &new_die, &__dw_error);
+               DIE_IF(ret == DW_DLV_ERROR);
+               if (ret == DW_DLV_OK) {
+                       new_link.parent = cur_link;
+                       new_link.die = new_die;
+                       ret = __search_die_tree(&new_link, die_cb, data);
+                       if (ret)
+                               break;
+               }
+
+               /* Move to next sibling */
+               ret = dwarf_siblingof(__dw_debug, cur_link->die, &new_die,
+                                     &__dw_error);
+               DIE_IF(ret == DW_DLV_ERROR);
+               dwarf_dealloc(__dw_debug, cur_link->die, DW_DLA_DIE);
+               cur_link->die = new_die;
+               if (ret == DW_DLV_NO_ENTRY)
+                       return 0;
+       }
+       dwarf_dealloc(__dw_debug, cur_link->die, DW_DLA_DIE);
+       return ret;
+}
+
+/* Search a die in its children's die tree */
+static int search_die_from_children(Dwarf_Die parent_die,
+                                   int (*die_cb)(struct die_link *, void *),
+                                   void *data)
+{
+       struct die_link new_link;
+       int ret;
+
+       new_link.parent = NULL;
+       ret = dwarf_child(parent_die, &new_link.die, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_OK)
+               return __search_die_tree(&new_link, die_cb, data);
+       else
+               return 0;
+}
+
+/* Find a locdesc corresponding to the address */
+static int attr_get_locdesc(Dwarf_Attribute attr, Dwarf_Locdesc *desc,
+                           Dwarf_Addr addr)
+{
+       Dwarf_Signed lcnt;
+       Dwarf_Locdesc **llbuf;
+       int ret, i;
+
+       ret = dwarf_loclist_n(attr, &llbuf, &lcnt, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       ret = DW_DLV_NO_ENTRY;
+       for (i = 0; i < lcnt; ++i) {
+               if (llbuf[i]->ld_lopc <= addr &&
+                   llbuf[i]->ld_hipc > addr) {
+                       memcpy(desc, llbuf[i], sizeof(Dwarf_Locdesc));
+                       desc->ld_s =
+                               malloc(sizeof(Dwarf_Loc) * llbuf[i]->ld_cents);
+                       DIE_IF(desc->ld_s == NULL);
+                       memcpy(desc->ld_s, llbuf[i]->ld_s,
+                               sizeof(Dwarf_Loc) * llbuf[i]->ld_cents);
+                       ret = DW_DLV_OK;
+                       break;
+               }
+               dwarf_dealloc(__dw_debug, llbuf[i]->ld_s, DW_DLA_LOC_BLOCK);
+               dwarf_dealloc(__dw_debug, llbuf[i], DW_DLA_LOCDESC);
+       }
+       /* Releasing loop */
+       for (; i < lcnt; ++i) {
+               dwarf_dealloc(__dw_debug, llbuf[i]->ld_s, DW_DLA_LOC_BLOCK);
+               dwarf_dealloc(__dw_debug, llbuf[i], DW_DLA_LOCDESC);
+       }
+       dwarf_dealloc(__dw_debug, llbuf, DW_DLA_LIST);
+       return ret;
+}
+
+/* Get decl_file attribute value (file number) */
+static Dwarf_Unsigned die_get_decl_file(Dwarf_Die sp_die)
+{
+       Dwarf_Attribute attr;
+       Dwarf_Unsigned fno;
+       int ret;
+
+       ret = dwarf_attr(sp_die, DW_AT_decl_file, &attr, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       dwarf_formudata(attr, &fno, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
+       return fno;
+}
+
+/* Get decl_line attribute value (line number) */
+static Dwarf_Unsigned die_get_decl_line(Dwarf_Die sp_die)
+{
+       Dwarf_Attribute attr;
+       Dwarf_Unsigned lno;
+       int ret;
+
+       ret = dwarf_attr(sp_die, DW_AT_decl_line, &attr, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       dwarf_formudata(attr, &lno, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
+       return lno;
+}
+
+/*
+ * Probe finder related functions
+ */
+
+/* Show a location */
+static void show_location(Dwarf_Loc *loc, struct probe_finder *pf)
+{
+       Dwarf_Small op;
+       Dwarf_Unsigned regn;
+       Dwarf_Signed offs;
+       int deref = 0, ret;
+       const char *regs;
+
+       op = loc->lr_atom;
+
+       /* If this is based on frame buffer, set the offset */
+       if (op == DW_OP_fbreg) {
+               deref = 1;
+               offs = (Dwarf_Signed)loc->lr_number;
+               op = pf->fbloc.ld_s[0].lr_atom;
+               loc = &pf->fbloc.ld_s[0];
+       } else
+               offs = 0;
+
+       if (op >= DW_OP_breg0 && op <= DW_OP_breg31) {
+               regn = op - DW_OP_breg0;
+               offs += (Dwarf_Signed)loc->lr_number;
+               deref = 1;
+       } else if (op >= DW_OP_reg0 && op <= DW_OP_reg31) {
+               regn = op - DW_OP_reg0;
+       } else if (op == DW_OP_bregx) {
+               regn = loc->lr_number;
+               offs += (Dwarf_Signed)loc->lr_number2;
+               deref = 1;
+       } else if (op == DW_OP_regx) {
+               regn = loc->lr_number;
+       } else
+               die("Dwarf_OP %d is not supported.\n", op);
+
+       regs = get_arch_regstr(regn);
+       if (!regs)
+               die("%lld exceeds max register number.\n", regn);
+
+       if (deref)
+               ret = snprintf(pf->buf, pf->len,
+                                " %s=%+lld(%s)", pf->var, offs, regs);
+       else
+               ret = snprintf(pf->buf, pf->len, " %s=%s", pf->var, regs);
+       DIE_IF(ret < 0);
+       DIE_IF(ret >= pf->len);
+}
+
+/* Show a variable in kprobe event format */
+static void show_variable(Dwarf_Die vr_die, struct probe_finder *pf)
+{
+       Dwarf_Attribute attr;
+       Dwarf_Locdesc ld;
+       int ret;
+
+       ret = dwarf_attr(vr_die, DW_AT_location, &attr, &__dw_error);
+       if (ret != DW_DLV_OK)
+               goto error;
+       ret = attr_get_locdesc(attr, &ld, (pf->addr - pf->cu_base));
+       if (ret != DW_DLV_OK)
+               goto error;
+       /* TODO? */
+       DIE_IF(ld.ld_cents != 1);
+       show_location(&ld.ld_s[0], pf);
+       free(ld.ld_s);
+       dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
+       return;
+error:
+       die("Failed to find the location of %s at this address.\n"
+           " Perhaps it has been optimized out.\n", pf->var);
+}
+
+static int variable_callback(struct die_link *dlink, void *data)
+{
+       struct probe_finder *pf = (struct probe_finder *)data;
+       Dwarf_Half tag;
+       int ret;
+
+       ret = dwarf_tag(dlink->die, &tag, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if ((tag == DW_TAG_formal_parameter ||
+            tag == DW_TAG_variable) &&
+           (die_compare_name(dlink->die, pf->var) == 0)) {
+               show_variable(dlink->die, pf);
+               return 1;
+       }
+       /* TODO: Support struct members and arrays */
+       return 0;
+}
+
+/* Find a variable in a subprogram die */
+static void find_variable(Dwarf_Die sp_die, struct probe_finder *pf)
+{
+       int ret;
+
+       if (!is_c_varname(pf->var)) {
+               /* Output raw parameters */
+               ret = snprintf(pf->buf, pf->len, " %s", pf->var);
+               DIE_IF(ret < 0);
+               DIE_IF(ret >= pf->len);
+               return;
+       }
+
+       pr_debug("Searching '%s' variable in context.\n", pf->var);
+       /* Search child DIEs for local variables and parameters. */
+       ret = search_die_from_children(sp_die, variable_callback, pf);
+       if (!ret)
+               die("Failed to find '%s' in this function.\n", pf->var);
+}
+
+/* Get the frame base for the given address */
+static void get_current_frame_base(Dwarf_Die sp_die, struct probe_finder *pf)
+{
+       Dwarf_Attribute attr;
+       int ret;
+
+       ret = dwarf_attr(sp_die, DW_AT_frame_base, &attr, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       ret = attr_get_locdesc(attr, &pf->fbloc, (pf->addr - pf->cu_base));
+       DIE_IF(ret != DW_DLV_OK);
+       dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
+}
+
+static void free_current_frame_base(struct probe_finder *pf)
+{
+       free(pf->fbloc.ld_s);
+       memset(&pf->fbloc, 0, sizeof(Dwarf_Locdesc));
+}
+
+/* Write a probe point to the output buffer */
+static void show_probepoint(Dwarf_Die sp_die, Dwarf_Signed offs,
+                           struct probe_finder *pf)
+{
+       struct probe_point *pp = pf->pp;
+       char *name;
+       char tmp[MAX_PROBE_BUFFER];
+       int ret, i, len;
+
+       /* Output name of probe point */
+       ret = dwarf_diename(sp_die, &name, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_OK) {
+               ret = snprintf(tmp, MAX_PROBE_BUFFER, "%s+%u", name,
+                               (unsigned int)offs);
+               /* Copy the function name if possible */
+               if (!pp->function) {
+                       pp->function = strdup(name);
+                       pp->offset = offs;
+               }
+               dwarf_dealloc(__dw_debug, name, DW_DLA_STRING);
+       } else {
+               /* This function has no name. */
+               ret = snprintf(tmp, MAX_PROBE_BUFFER, "0x%llx", pf->addr);
+               if (!pp->function) {
+                       /* TODO: Use _stext */
+                       pp->function = strdup("");
+                       pp->offset = (int)pf->addr;
+               }
+       }
+       DIE_IF(ret < 0);
+       DIE_IF(ret >= MAX_PROBE_BUFFER);
+       len = ret;
+       pr_debug("Probe point found: %s\n", tmp);
+
+       /* Find each argument */
+       get_current_frame_base(sp_die, pf);
+       for (i = 0; i < pp->nr_args; i++) {
+               pf->var = pp->args[i];
+               pf->buf = &tmp[len];
+               pf->len = MAX_PROBE_BUFFER - len;
+               find_variable(sp_die, pf);
+               len += strlen(pf->buf);
+       }
+       free_current_frame_base(pf);
+
+       pp->probes[pp->found] = strdup(tmp);
+       pp->found++;
+}
+
+static int probeaddr_callback(struct die_link *dlink, void *data)
+{
+       struct probe_finder *pf = (struct probe_finder *)data;
+       Dwarf_Half tag;
+       Dwarf_Signed offs;
+       int ret;
+
+       ret = dwarf_tag(dlink->die, &tag, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       /* Check whether the address is in this subprogram */
+       if (tag == DW_TAG_subprogram &&
+           die_within_subprogram(dlink->die, pf->addr, &offs)) {
+               show_probepoint(dlink->die, offs, pf);
+               return 1;
+       }
+       return 0;
+}
+
+/* Find probe point from its line number */
+static void find_by_line(struct probe_finder *pf)
+{
+       Dwarf_Signed cnt, i, clm;
+       Dwarf_Line *lines;
+       Dwarf_Unsigned lineno = 0;
+       Dwarf_Addr addr;
+       Dwarf_Unsigned fno;
+       int ret;
+
+       ret = dwarf_srclines(pf->cu_die, &lines, &cnt, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+
+       for (i = 0; i < cnt; i++) {
+               ret = dwarf_line_srcfileno(lines[i], &fno, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+               if (fno != pf->fno)
+                       continue;
+
+               ret = dwarf_lineno(lines[i], &lineno, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+               if (lineno != pf->lno)
+                       continue;
+
+               ret = dwarf_lineoff(lines[i], &clm, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+
+               ret = dwarf_lineaddr(lines[i], &addr, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+               pr_debug("Probe line found: line[%d]:%u,%d addr:0x%llx\n",
+                        (int)i, (unsigned)lineno, (int)clm, addr);
+               pf->addr = addr;
+               /* Search for a real subprogram that includes this line. */
+               ret = search_die_from_children(pf->cu_die,
+                                              probeaddr_callback, pf);
+               if (ret == 0)
+                       die("Probe point was not found in any subprogram.\n");
+               /* Keep searching, because the target line might be inlined. */
+       }
+       dwarf_srclines_dealloc(__dw_debug, lines, cnt);
+}
+
+/* Search for a function by its name */
+static int probefunc_callback(struct die_link *dlink, void *data)
+{
+       struct probe_finder *pf = (struct probe_finder *)data;
+       struct probe_point *pp = pf->pp;
+       struct die_link *lk;
+       Dwarf_Signed offs;
+       Dwarf_Half tag;
+       int ret;
+
+       ret = dwarf_tag(dlink->die, &tag, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (tag == DW_TAG_subprogram) {
+               if (die_compare_name(dlink->die, pp->function) == 0) {
+                       if (pp->line) { /* Function relative line */
+                               pf->fno = die_get_decl_file(dlink->die);
+                               pf->lno = die_get_decl_line(dlink->die)
+                                        + pp->line;
+                               find_by_line(pf);
+                               return 1;
+                       }
+                       if (die_inlined_subprogram(dlink->die)) {
+                               /* Inlined function, save it. */
+                               ret = dwarf_die_CU_offset(dlink->die,
+                                                         &pf->inl_offs,
+                                                         &__dw_error);
+                               DIE_IF(ret != DW_DLV_OK);
+                               pr_debug("inline definition offset %lld\n",
+                                        pf->inl_offs);
+                               return 0;       /* Continue to search */
+                       }
+                       /* Get probe address */
+                       pf->addr = die_get_entrypc(dlink->die);
+                       pf->addr += pp->offset;
+                       /* TODO: Check the address in this function */
+                       show_probepoint(dlink->die, pp->offset, pf);
+                       return 1; /* Exit; no other instance of this symbol in this CU. */
+               }
+       } else if (tag == DW_TAG_inlined_subroutine && pf->inl_offs) {
+               if (die_get_abstract_origin(dlink->die) == pf->inl_offs) {
+                       /* Get probe address */
+                       pf->addr = die_get_entrypc(dlink->die);
+                       pf->addr += pp->offset;
+                       pr_debug("found inline addr: 0x%llx\n", pf->addr);
+                       /* Inlined function. Get a real subprogram */
+                       for (lk = dlink->parent; lk != NULL; lk = lk->parent) {
+                               tag = 0;
+                               ret = dwarf_tag(lk->die, &tag, &__dw_error);
+                               DIE_IF(ret == DW_DLV_ERROR);
+                               if (tag == DW_TAG_subprogram &&
+                                   !die_inlined_subprogram(lk->die))
+                                       goto found;
+                       }
+                       die("Failed to find real subprogram.\n");
+found:
+                       /* Get offset from subprogram */
+                       ret = die_within_subprogram(lk->die, pf->addr, &offs);
+                       DIE_IF(!ret);
+                       show_probepoint(lk->die, offs, pf);
+                       /* Continue to search */
+               }
+       }
+       return 0;
+}
+
+static void find_by_func(struct probe_finder *pf)
+{
+       search_die_from_children(pf->cu_die, probefunc_callback, pf);
+}
+
+/* Find a probe point */
+int find_probepoint(int fd, struct probe_point *pp)
+{
+       Dwarf_Half addr_size = 0;
+       Dwarf_Unsigned next_cuh = 0;
+       int cu_number = 0, ret;
+       struct probe_finder pf = {.pp = pp};
+
+       ret = dwarf_init(fd, DW_DLC_READ, 0, 0, &__dw_debug, &__dw_error);
+       if (ret != DW_DLV_OK) {
+               pr_warning("No dwarf info found in the vmlinux - please rebuild with CONFIG_DEBUG_INFO.\n");
+               return -ENOENT;
+       }
+
+       pp->found = 0;
+       while (++cu_number) {
+               /* Search CU (Compilation Unit) */
+               ret = dwarf_next_cu_header(__dw_debug, NULL, NULL, NULL,
+                       &addr_size, &next_cuh, &__dw_error);
+               DIE_IF(ret == DW_DLV_ERROR);
+               if (ret == DW_DLV_NO_ENTRY)
+                       break;
+
+               /* Get the DIE (Debugging Information Entry) of this CU */
+               ret = dwarf_siblingof(__dw_debug, 0, &pf.cu_die, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+
+               /* Check if target file is included. */
+               if (pp->file)
+                       pf.fno = cu_find_fileno(pf.cu_die, pp->file);
+
+               if (!pp->file || pf.fno) {
+                       /* Save CU base address (for frame_base) */
+                       ret = dwarf_lowpc(pf.cu_die, &pf.cu_base, &__dw_error);
+                       DIE_IF(ret == DW_DLV_ERROR);
+                       if (ret == DW_DLV_NO_ENTRY)
+                               pf.cu_base = 0;
+                       if (pp->function)
+                               find_by_func(&pf);
+                       else {
+                               pf.lno = pp->line;
+                               find_by_line(&pf);
+                       }
+               }
+               dwarf_dealloc(__dw_debug, pf.cu_die, DW_DLA_DIE);
+       }
+       ret = dwarf_finish(__dw_debug, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+
+       return pp->found;
+}
+
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
new file mode 100644 (file)
index 0000000..bdebca6
--- /dev/null
@@ -0,0 +1,57 @@
+#ifndef _PROBE_FINDER_H
+#define _PROBE_FINDER_H
+
+#define MAX_PATH_LEN 256
+#define MAX_PROBE_BUFFER 1024
+#define MAX_PROBES 128
+
+static inline int is_c_varname(const char *name)
+{
+       /* TODO */
+       return isalpha(name[0]) || name[0] == '_';
+}
+
+struct probe_point {
+       /* Inputs */
+       char    *file;          /* File name */
+       int     line;           /* Line number */
+
+       char    *function;      /* Function name */
+       int     offset;         /* Offset bytes */
+
+       int     nr_args;        /* Number of arguments */
+       char    **args;         /* Arguments */
+
+       int     retprobe;       /* Return probe */
+
+       /* Output */
+       int     found;          /* Number of found probe points */
+       char    *probes[MAX_PROBES];    /* Output buffers (will be allocated) */
+};
+
+#ifndef NO_LIBDWARF
+extern int find_probepoint(int fd, struct probe_point *pp);
+
+#include <libdwarf/dwarf.h>
+#include <libdwarf/libdwarf.h>
+
+struct probe_finder {
+       struct probe_point      *pp;    /* Target probe point */
+
+       /* For function searching */
+       Dwarf_Addr      addr;           /* Address */
+       Dwarf_Unsigned  fno;            /* File number */
+       Dwarf_Unsigned  lno;            /* Line number */
+       Dwarf_Off       inl_offs;       /* Inline offset */
+       Dwarf_Die       cu_die;         /* Current CU */
+
+       /* For variable searching */
+       Dwarf_Addr      cu_base;        /* Current CU base address */
+       Dwarf_Locdesc   fbloc;          /* Location of Current Frame Base */
+       const char      *var;           /* Current variable name */
+       char            *buf;           /* Current output buffer */
+       int             len;            /* Length of output buffer */
+};
+#endif /* NO_LIBDWARF */
+
+#endif /* _PROBE_FINDER_H */
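For reference, a hedged usage sketch (not part of the patch) of how a caller might fill in struct probe_point and hand it to find_probepoint(). It assumes a libdwarf-enabled build (NO_LIBDWARF unset) and the perf util include paths; the vmlinux path, the probed function ("schedule") and the offset are made-up examples.

#include <ctype.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include "probe-finder.h"

int main(void)
{
        struct probe_point pp = {
                .function = (char *)"schedule", /* probe at schedule+16 */
                .offset   = 16,
        };
        int i, fd = open("vmlinux", O_RDONLY);

        if (fd < 0)
                return 1;
        if (find_probepoint(fd, &pp) > 0)       /* returns pp.found on success */
                for (i = 0; i < pp.found; i++)
                        printf("%s\n", pp.probes[i]);
        close(fd);
        return 0;
}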
index a5454a1d1c137ba3640add2945f1be186727d36c..b6a01973391975193a8d1ef1903674e2dc409016 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef QUOTE_H
-#define QUOTE_H
+#ifndef __PERF_QUOTE_H
+#define __PERF_QUOTE_H
 
 #include <stddef.h>
 #include <stdio.h>
@@ -65,4 +65,4 @@ extern void perl_quote_print(FILE *stream, const char *src);
 extern void python_quote_print(FILE *stream, const char *src);
 extern void tcl_quote_print(FILE *stream, const char *src);
 
-#endif
+#endif /* __PERF_QUOTE_H */
index cc1837deba88af6371785e4721f632529d19face..d79028727ce2c5dfba2317f71c67dcfa7f4f72bf 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef RUN_COMMAND_H
-#define RUN_COMMAND_H
+#ifndef __PERF_RUN_COMMAND_H
+#define __PERF_RUN_COMMAND_H
 
 enum {
        ERR_RUN_COMMAND_FORK = 10000,
@@ -85,4 +85,4 @@ struct async {
 int start_async(struct async *async);
 int finish_async(struct async *async);
 
-#endif
+#endif /* __PERF_RUN_COMMAND_H */
index 618083bce0c66a551fd4d894b31520a67b25bac9..1a53c11265fdda78357112f3c3e7bd72a936afe8 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef SIGCHAIN_H
-#define SIGCHAIN_H
+#ifndef __PERF_SIGCHAIN_H
+#define __PERF_SIGCHAIN_H
 
 typedef void (*sigchain_fun)(int);
 
@@ -8,4 +8,4 @@ int sigchain_pop(int sig);
 
 void sigchain_push_common(sigchain_fun f);
 
-#endif /* SIGCHAIN_H */
+#endif /* __PERF_SIGCHAIN_H */
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
new file mode 100644 (file)
index 0000000..b490354
--- /dev/null
@@ -0,0 +1,290 @@
+#include "sort.h"
+
+regex_t                parent_regex;
+char           default_parent_pattern[] = "^sys_|^do_page_fault";
+char           *parent_pattern = default_parent_pattern;
+char           default_sort_order[] = "comm,dso,symbol";
+char           *sort_order = default_sort_order;
+int            sort__need_collapse = 0;
+int            sort__has_parent = 0;
+
+enum sort_type sort__first_dimension;
+
+unsigned int dsos__col_width;
+unsigned int comms__col_width;
+unsigned int threads__col_width;
+static unsigned int parent_symbol__col_width;
+char *field_sep;
+
+LIST_HEAD(hist_entry__sort_list);
+
+struct sort_entry sort_thread = {
+       .header = "Command:  Pid",
+       .cmp    = sort__thread_cmp,
+       .print  = sort__thread_print,
+       .width  = &threads__col_width,
+};
+
+struct sort_entry sort_comm = {
+       .header         = "Command",
+       .cmp            = sort__comm_cmp,
+       .collapse       = sort__comm_collapse,
+       .print          = sort__comm_print,
+       .width          = &comms__col_width,
+};
+
+struct sort_entry sort_dso = {
+       .header = "Shared Object",
+       .cmp    = sort__dso_cmp,
+       .print  = sort__dso_print,
+       .width  = &dsos__col_width,
+};
+
+struct sort_entry sort_sym = {
+       .header = "Symbol",
+       .cmp    = sort__sym_cmp,
+       .print  = sort__sym_print,
+};
+
+struct sort_entry sort_parent = {
+       .header = "Parent symbol",
+       .cmp    = sort__parent_cmp,
+       .print  = sort__parent_print,
+       .width  = &parent_symbol__col_width,
+};
+
+struct sort_dimension {
+       const char              *name;
+       struct sort_entry       *entry;
+       int                     taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+       { .name = "pid",        .entry = &sort_thread,  },
+       { .name = "comm",       .entry = &sort_comm,    },
+       { .name = "dso",        .entry = &sort_dso,     },
+       { .name = "symbol",     .entry = &sort_sym,     },
+       { .name = "parent",     .entry = &sort_parent,  },
+};
+
+int64_t cmp_null(void *l, void *r)
+{
+       if (!l && !r)
+               return 0;
+       else if (!l)
+               return -1;
+       else
+               return 1;
+}
+
+/* --sort pid */
+
+int64_t
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       return right->thread->pid - left->thread->pid;
+}
+
+int repsep_fprintf(FILE *fp, const char *fmt, ...)
+{
+       int n;
+       va_list ap;
+
+       va_start(ap, fmt);
+       if (!field_sep)
+               n = vfprintf(fp, fmt, ap);
+       else {
+               char *bf = NULL;
+               n = vasprintf(&bf, fmt, ap);
+               if (n > 0) {
+                       char *sep = bf;
+
+                       while (1) {
+                               sep = strchr(sep, *field_sep);
+                               if (sep == NULL)
+                                       break;
+                               *sep = '.';
+                       }
+               }
+               fputs(bf, fp);
+               free(bf);
+       }
+       va_end(ap);
+       return n;
+}
+
+size_t
+sort__thread_print(FILE *fp, struct hist_entry *self, unsigned int width)
+{
+       return repsep_fprintf(fp, "%*s:%5d", width - 6,
+                             self->thread->comm ?: "", self->thread->pid);
+}
+
+size_t
+sort__comm_print(FILE *fp, struct hist_entry *self, unsigned int width)
+{
+       return repsep_fprintf(fp, "%*s", width, self->thread->comm);
+}
+
+/* --sort dso */
+
+int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       struct dso *dso_l = left->map ? left->map->dso : NULL;
+       struct dso *dso_r = right->map ? right->map->dso : NULL;
+       const char *dso_name_l, *dso_name_r;
+
+       if (!dso_l || !dso_r)
+               return cmp_null(dso_l, dso_r);
+
+       if (verbose) {
+               dso_name_l = dso_l->long_name;
+               dso_name_r = dso_r->long_name;
+       } else {
+               dso_name_l = dso_l->short_name;
+               dso_name_r = dso_r->short_name;
+       }
+
+       return strcmp(dso_name_l, dso_name_r);
+}
+
+size_t
+sort__dso_print(FILE *fp, struct hist_entry *self, unsigned int width)
+{
+       if (self->map && self->map->dso) {
+               const char *dso_name = !verbose ? self->map->dso->short_name :
+                                                 self->map->dso->long_name;
+               return repsep_fprintf(fp, "%-*s", width, dso_name);
+       }
+
+       return repsep_fprintf(fp, "%*llx", width, (u64)self->ip);
+}
+
+/* --sort symbol */
+
+int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       u64 ip_l, ip_r;
+
+       if (left->sym == right->sym)
+               return 0;
+
+       ip_l = left->sym ? left->sym->start : left->ip;
+       ip_r = right->sym ? right->sym->start : right->ip;
+
+       return (int64_t)(ip_r - ip_l);
+}
+
+
+size_t
+sort__sym_print(FILE *fp, struct hist_entry *self, unsigned int width __used)
+{
+       size_t ret = 0;
+
+       if (verbose) {
+               char o = self->map ? dso__symtab_origin(self->map->dso) : '!';
+               ret += repsep_fprintf(fp, "%#018llx %c ", (u64)self->ip, o);
+       }
+
+       ret += repsep_fprintf(fp, "[%c] ", self->level);
+       if (self->sym)
+               ret += repsep_fprintf(fp, "%s", self->sym->name);
+       else
+               ret += repsep_fprintf(fp, "%#016llx", (u64)self->ip);
+
+       return ret;
+}
+
+/* --sort comm */
+
+int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       return right->thread->pid - left->thread->pid;
+}
+
+int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+       char *comm_l = left->thread->comm;
+       char *comm_r = right->thread->comm;
+
+       if (!comm_l || !comm_r)
+               return cmp_null(comm_l, comm_r);
+
+       return strcmp(comm_l, comm_r);
+}
+
+/* --sort parent */
+
+int64_t
+sort__parent_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+       struct symbol *sym_l = left->parent;
+       struct symbol *sym_r = right->parent;
+
+       if (!sym_l || !sym_r)
+               return cmp_null(sym_l, sym_r);
+
+       return strcmp(sym_l->name, sym_r->name);
+}
+
+size_t
+sort__parent_print(FILE *fp, struct hist_entry *self, unsigned int width)
+{
+       return repsep_fprintf(fp, "%-*s", width,
+                             self->parent ? self->parent->name : "[other]");
+}
+
+int sort_dimension__add(const char *tok)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+               struct sort_dimension *sd = &sort_dimensions[i];
+
+               if (sd->taken)
+                       continue;
+
+               if (strncasecmp(tok, sd->name, strlen(tok)))
+                       continue;
+
+               if (sd->entry->collapse)
+                       sort__need_collapse = 1;
+
+               if (sd->entry == &sort_parent) {
+                       int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
+                       if (ret) {
+                               char err[BUFSIZ];
+
+                               regerror(ret, &parent_regex, err, sizeof(err));
+                               fprintf(stderr, "Invalid regex: %s\n%s",
+                                       parent_pattern, err);
+                               exit(-1);
+                       }
+                       sort__has_parent = 1;
+               }
+
+               if (list_empty(&hist_entry__sort_list)) {
+                       if (!strcmp(sd->name, "pid"))
+                               sort__first_dimension = SORT_PID;
+                       else if (!strcmp(sd->name, "comm"))
+                               sort__first_dimension = SORT_COMM;
+                       else if (!strcmp(sd->name, "dso"))
+                               sort__first_dimension = SORT_DSO;
+                       else if (!strcmp(sd->name, "symbol"))
+                               sort__first_dimension = SORT_SYM;
+                       else if (!strcmp(sd->name, "parent"))
+                               sort__first_dimension = SORT_PARENT;
+               }
+
+               list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+               sd->taken = 1;
+
+               return 0;
+       }
+
+       return -ESRCH;
+}
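A hedged sketch, not taken from the patch, of how a --sort string such as the default "comm,dso,symbol" could be split and fed to sort_dimension__add(); perf's own option handling may differ, and the helper name here is hypothetical. It assumes compilation within the perf tree so that "sort.h" resolves to the header below.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sort.h"

static void setup_sorting_example(void)
{
        char *str = strdup(sort_order);         /* e.g. "comm,dso,symbol" */
        char *tok, *tmp = str;

        while ((tok = strtok(tmp, ", ")) != NULL) {
                tmp = NULL;                     /* strtok: NULL on later calls */
                if (sort_dimension__add(tok) < 0)
                        fprintf(stderr, "Unknown --sort key: '%s'\n", tok);
        }
        free(str);
}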
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
new file mode 100644 (file)
index 0000000..333e664
--- /dev/null
@@ -0,0 +1,99 @@
+#ifndef __PERF_SORT_H
+#define __PERF_SORT_H
+#include "../builtin.h"
+
+#include "util.h"
+
+#include "color.h"
+#include <linux/list.h>
+#include "cache.h"
+#include <linux/rbtree.h>
+#include "symbol.h"
+#include "string.h"
+#include "callchain.h"
+#include "strlist.h"
+#include "values.h"
+
+#include "../perf.h"
+#include "debug.h"
+#include "header.h"
+
+#include "parse-options.h"
+#include "parse-events.h"
+
+#include "thread.h"
+#include "sort.h"
+
+extern regex_t parent_regex;
+extern char *sort_order;
+extern char default_parent_pattern[];
+extern char *parent_pattern;
+extern char default_sort_order[];
+extern int sort__need_collapse;
+extern int sort__has_parent;
+extern char *field_sep;
+extern struct sort_entry sort_comm;
+extern struct sort_entry sort_dso;
+extern struct sort_entry sort_sym;
+extern struct sort_entry sort_parent;
+extern unsigned int dsos__col_width;
+extern unsigned int comms__col_width;
+extern unsigned int threads__col_width;
+extern enum sort_type sort__first_dimension;
+
+struct hist_entry {
+       struct rb_node          rb_node;
+       u64                     count;
+       struct thread           *thread;
+       struct map              *map;
+       struct symbol           *sym;
+       u64                     ip;
+       char                    level;
+       struct symbol           *parent;
+       struct callchain_node   callchain;
+       struct rb_root          sorted_chain;
+};
+
+enum sort_type {
+       SORT_PID,
+       SORT_COMM,
+       SORT_DSO,
+       SORT_SYM,
+       SORT_PARENT
+};
+
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+       struct list_head list;
+
+       const char *header;
+
+       int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+       int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
+       size_t  (*print)(FILE *fp, struct hist_entry *, unsigned int width);
+       unsigned int *width;
+       bool    elide;
+};
+
+extern struct sort_entry sort_thread;
+extern struct list_head hist_entry__sort_list;
+
+extern int repsep_fprintf(FILE *fp, const char *fmt, ...);
+extern size_t sort__thread_print(FILE *, struct hist_entry *, unsigned int);
+extern size_t sort__comm_print(FILE *, struct hist_entry *, unsigned int);
+extern size_t sort__dso_print(FILE *, struct hist_entry *, unsigned int);
+extern size_t sort__sym_print(FILE *, struct hist_entry *, unsigned int __used);
+extern int64_t cmp_null(void *, void *);
+extern int64_t sort__thread_cmp(struct hist_entry *, struct hist_entry *);
+extern int64_t sort__comm_cmp(struct hist_entry *, struct hist_entry *);
+extern int64_t sort__comm_collapse(struct hist_entry *, struct hist_entry *);
+extern int64_t sort__dso_cmp(struct hist_entry *, struct hist_entry *);
+extern int64_t sort__sym_cmp(struct hist_entry *, struct hist_entry *);
+extern int64_t sort__parent_cmp(struct hist_entry *, struct hist_entry *);
+extern size_t sort__parent_print(FILE *, struct hist_entry *, unsigned int);
+extern int sort_dimension__add(const char *);
+
+#endif /* __PERF_SORT_H */
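A hedged sketch of how the entries registered on hist_entry__sort_list might be walked to print one histogram row; it assumes the perf util headers above are on the include path, and the 20-column fallback width and the helper name are arbitrary examples, not perf's real report code.

#include <stdio.h>
#include "sort.h"

static size_t hist_entry__print_example(struct hist_entry *he, FILE *fp)
{
        struct sort_entry *se;
        size_t ret = 0;

        list_for_each_entry(se, &hist_entry__sort_list, list) {
                if (se->elide)
                        continue;
                ret += fprintf(fp, "  ");
                ret += se->print(fp, he, se->width ? *se->width : 20);
        }
        return ret + fprintf(fp, "\n");
}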
index d2aa86c014c1b9a3882cfd924affaa116a122e79..a3d121d6c83e1d64355ef0cc6bc4c8595110a9fd 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef STRBUF_H
-#define STRBUF_H
+#ifndef __PERF_STRBUF_H
+#define __PERF_STRBUF_H
 
 /*
  * Strbuf's can be use in many ways: as a byte array, or to store arbitrary
@@ -134,4 +134,4 @@ extern int launch_editor(const char *path, struct strbuf *buffer, const char *co
 extern int strbuf_branchname(struct strbuf *sb, const char *name);
 extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
 
-#endif /* STRBUF_H */
+#endif /* __PERF_STRBUF_H */
index c93eca9a7be39f67c5c3638d5d6ddc13770457dd..f24a8cc933d5f1aeede2875f774efc00b8610328 100644 (file)
@@ -1,4 +1,5 @@
 #include "string.h"
+#include "util.h"
 
 static int hex(char ch)
 {
@@ -32,3 +33,196 @@ int hex2u64(const char *ptr, u64 *long_val)
 
        return p - ptr;
 }
+
+char *strxfrchar(char *s, char from, char to)
+{
+       char *p = s;
+
+       while ((p = strchr(p, from)) != NULL)
+               *p++ = to;
+
+       return s;
+}
+
+#define K 1024LL
+/*
+ * perf_atoll()
+ * Parse (\d+)(b|B|kb|KB|mb|MB|gb|GB|tb|TB) (e.g. "256MB")
+ * and return its numeric value
+ */
+s64 perf_atoll(const char *str)
+{
+       unsigned int i;
+       s64 length = -1, unit = 1;
+
+       if (!isdigit(str[0]))
+               goto out_err;
+
+       for (i = 1; i < strlen(str); i++) {
+               switch (str[i]) {
+               case 'B':
+               case 'b':
+                       break;
+               case 'K':
+                       if (str[i + 1] != 'B')
+                               goto out_err;
+                       else
+                               goto kilo;
+               case 'k':
+                       if (str[i + 1] != 'b')
+                               goto out_err;
+kilo:
+                       unit = K;
+                       break;
+               case 'M':
+                       if (str[i + 1] != 'B')
+                               goto out_err;
+                       else
+                               goto mega;
+               case 'm':
+                       if (str[i + 1] != 'b')
+                               goto out_err;
+mega:
+                       unit = K * K;
+                       break;
+               case 'G':
+                       if (str[i + 1] != 'B')
+                               goto out_err;
+                       else
+                               goto giga;
+               case 'g':
+                       if (str[i + 1] != 'b')
+                               goto out_err;
+giga:
+                       unit = K * K * K;
+                       break;
+               case 'T':
+                       if (str[i + 1] != 'B')
+                               goto out_err;
+                       else
+                               goto tera;
+               case 't':
+                       if (str[i + 1] != 'b')
+                               goto out_err;
+tera:
+                       unit = K * K * K * K;
+                       break;
+               case '\0':      /* only digits were given (no unit) */
+                       unit = 1;
+                       break;
+               default:
+                       if (!isdigit(str[i]))
+                               goto out_err;
+                       break;
+               }
+       }
+
+       length = atoll(str) * unit;
+       goto out;
+
+out_err:
+       length = -1;
+out:
+       return length;
+}
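A hedged sketch (not part of the patch) exercising perf_atoll() against the format documented above; it assumes linkage with this string.c and that "string.h" resolves to perf's util/string.h.

#include <assert.h>
#include "string.h"

int main(void)
{
        assert(perf_atoll("256")   == 256);                     /* bare digits  */
        assert(perf_atoll("4kb")   == 4LL * 1024);              /* kb -> 1024   */
        assert(perf_atoll("256MB") == 256LL * 1024 * 1024);     /* MB -> 1024^2 */
        assert(perf_atoll("2zb")   == -1);                      /* bad suffix   */
        return 0;
}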
+
+/*
+ * Helper function for splitting a string into an argv-like array.
+ * Originally copied from lib/argv_split.c.
+ */
+static const char *skip_sep(const char *cp)
+{
+       while (*cp && isspace(*cp))
+               cp++;
+
+       return cp;
+}
+
+static const char *skip_arg(const char *cp)
+{
+       while (*cp && !isspace(*cp))
+               cp++;
+
+       return cp;
+}
+
+static int count_argc(const char *str)
+{
+       int count = 0;
+
+       while (*str) {
+               str = skip_sep(str);
+               if (*str) {
+                       count++;
+                       str = skip_arg(str);
+               }
+       }
+
+       return count;
+}
+
+/**
+ * argv_free - free an argv
+ * @argv: the argument vector to be freed
+ *
+ * Frees an argv and the strings it points to.
+ */
+void argv_free(char **argv)
+{
+       char **p;
+       for (p = argv; *p; p++)
+               free(*p);
+
+       free(argv);
+}
+
+/**
+ * argv_split - split a string at whitespace, returning an argv
+ * @str: the string to be split
+ * @argcp: returned argument count
+ *
+ * Returns an array of pointers to strings which are split out from
+ * @str.  This is performed by strictly splitting on white-space; no
+ * quote processing is performed.  Multiple whitespace characters are
+ * considered to be a single argument separator.  The returned array
+ * is always NULL-terminated.  Returns NULL on memory allocation
+ * failure.
+ */
+char **argv_split(const char *str, int *argcp)
+{
+       int argc = count_argc(str);
+       char **argv = zalloc(sizeof(*argv) * (argc+1));
+       char **argvp;
+
+       if (argv == NULL)
+               goto out;
+
+       if (argcp)
+               *argcp = argc;
+
+       argvp = argv;
+
+       while (*str) {
+               str = skip_sep(str);
+
+               if (*str) {
+                       const char *p = str;
+                       char *t;
+
+                       str = skip_arg(str);
+
+                       t = strndup(p, str-p);
+                       if (t == NULL)
+                               goto fail;
+                       *argvp++ = t;
+               }
+       }
+       *argvp = NULL;
+
+out:
+       return argv;
+
+fail:
+       argv_free(argv);
+       return NULL;
+}
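A hedged usage sketch for argv_split()/argv_free(); the input string is an arbitrary example, and it assumes compilation within the perf tree (so "string.h" resolves to util/string.h) and linkage with this file.

#include <stdio.h>
#include "string.h"

int main(void)
{
        int i, argc;
        char **argv = argv_split("  do_sys_open  flags  mode ", &argc);

        if (argv == NULL)
                return 1;
        for (i = 0; i < argc; i++)      /* "do_sys_open", "flags", "mode" */
                printf("argv[%d] = '%s'\n", i, argv[i]);
        argv_free(argv);
        return 0;
}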
index bf39dfadfd24dd00fe845f1c321bf0994ad768b0..bfecec265a1a423e3e2816b725f0540f234f5be4 100644 (file)
@@ -1,11 +1,15 @@
-#ifndef _PERF_STRING_H_
-#define _PERF_STRING_H_
+#ifndef __PERF_STRING_H_
+#define __PERF_STRING_H_
 
 #include "types.h"
 
 int hex2u64(const char *ptr, u64 *val);
+char *strxfrchar(char *s, char from, char to);
+s64 perf_atoll(const char *str);
+char **argv_split(const char *str, int *argcp);
+void argv_free(char **argv);
 
 #define _STR(x) #x
 #define STR(x) _STR(x)
 
-#endif
+#endif /* __PERF_STRING_H_ */
index 921818e44a542e9aa1f8398d950121cb39fb5c24..cb4659306d7bd02221091716e6cc4151d560d1ef 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef STRLIST_H_
-#define STRLIST_H_
+#ifndef __PERF_STRLIST_H
+#define __PERF_STRLIST_H
 
 #include <linux/rbtree.h>
 #include <stdbool.h>
@@ -36,4 +36,4 @@ static inline unsigned int strlist__nr_entries(const struct strlist *self)
 }
 
 int strlist__parse_list(struct strlist *self, const char *s);
-#endif /* STRLIST_H_ */
+#endif /* __PERF_STRLIST_H */
index cd93195aedb39dd9cc40abd5ccec0b58b0c8010f..e0781989cc31f5d1dd004dc66c22a584d371189d 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _INCLUDE_GUARD_SVG_HELPER_
-#define _INCLUDE_GUARD_SVG_HELPER_
+#ifndef __PERF_SVGHELPER_H
+#define __PERF_SVGHELPER_H
 
 #include "types.h"
 
@@ -25,4 +25,4 @@ extern void svg_close(void);
 
 extern int svg_page_width;
 
-#endif
+#endif /* __PERF_SVGHELPER_H */
index 226f44a2357de7baf900271a58b12cddffed50c0..fffcb937cdcb207f470f1bf335f1e316e283bc12 100644 (file)
@@ -2,14 +2,20 @@
 #include "../perf.h"
 #include "string.h"
 #include "symbol.h"
+#include "thread.h"
 
 #include "debug.h"
 
+#include <asm/bug.h>
 #include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
+#include <limits.h>
+#include <sys/utsname.h>
 
-const char *sym_hist_filter;
+#ifndef NT_GNU_BUILD_ID
+#define NT_GNU_BUILD_ID 3
+#endif
 
 enum dso_origin {
        DSO__ORIG_KERNEL = 0,
@@ -18,94 +24,189 @@ enum dso_origin {
        DSO__ORIG_UBUNTU,
        DSO__ORIG_BUILDID,
        DSO__ORIG_DSO,
+       DSO__ORIG_KMODULE,
        DSO__ORIG_NOT_FOUND,
 };
 
-static struct symbol *symbol__new(u64 start, u64 len,
-                                 const char *name, unsigned int priv_size,
-                                 u64 obj_start, int v)
+static void dsos__add(struct list_head *head, struct dso *dso);
+static struct map *thread__find_map_by_name(struct thread *self, char *name);
+static struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
+struct symbol *dso__find_symbol(struct dso *self, enum map_type type, u64 addr);
+static int dso__load_kernel_sym(struct dso *self, struct map *map,
+                               struct thread *thread, symbol_filter_t filter);
+unsigned int symbol__priv_size;
+static int vmlinux_path__nr_entries;
+static char **vmlinux_path;
+
+static struct symbol_conf symbol_conf__defaults = {
+       .use_modules      = true,
+       .try_vmlinux_path = true,
+};
+
+static struct thread kthread_mem;
+struct thread *kthread = &kthread_mem;
+
+bool dso__loaded(const struct dso *self, enum map_type type)
 {
-       size_t namelen = strlen(name) + 1;
-       struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
+       return self->loaded & (1 << type);
+}
 
-       if (!self)
-               return NULL;
+static void dso__set_loaded(struct dso *self, enum map_type type)
+{
+       self->loaded |= (1 << type);
+}
 
-       if (v >= 2)
-               printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
-                       (u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start);
+static void symbols__fixup_end(struct rb_root *self)
+{
+       struct rb_node *nd, *prevnd = rb_first(self);
+       struct symbol *curr, *prev;
+
+       if (prevnd == NULL)
+               return;
 
-       self->obj_start= obj_start;
-       self->hist = NULL;
-       self->hist_sum = 0;
+       curr = rb_entry(prevnd, struct symbol, rb_node);
 
-       if (sym_hist_filter && !strcmp(name, sym_hist_filter))
-               self->hist = calloc(sizeof(u64), len);
+       for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
+               prev = curr;
+               curr = rb_entry(nd, struct symbol, rb_node);
 
-       if (priv_size) {
-               memset(self, 0, priv_size);
-               self = ((void *)self) + priv_size;
+               if (prev->end == prev->start)
+                       prev->end = curr->start - 1;
        }
+
+       /* Last entry */
+       if (curr->end == curr->start)
+               curr->end = roundup(curr->start, 4096);
+}
+
+static void __thread__fixup_maps_end(struct thread *self, enum map_type type)
+{
+       struct map *prev, *curr;
+       struct rb_node *nd, *prevnd = rb_first(&self->maps[type]);
+
+       if (prevnd == NULL)
+               return;
+
+       curr = rb_entry(prevnd, struct map, rb_node);
+
+       for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
+               prev = curr;
+               curr = rb_entry(nd, struct map, rb_node);
+               prev->end = curr->start - 1;
+       }
+
+       /*
+        * We still haven't the actual symbols, so guess the
+        * We still don't have the actual symbols, so guess the
+        * last map's final address.
+       curr->end = ~0UL;
+}
+
+static void thread__fixup_maps_end(struct thread *self)
+{
+       int i;
+       for (i = 0; i < MAP__NR_TYPES; ++i)
+               __thread__fixup_maps_end(self, i);
+}
+
+static struct symbol *symbol__new(u64 start, u64 len, const char *name)
+{
+       size_t namelen = strlen(name) + 1;
+       struct symbol *self = zalloc(symbol__priv_size +
+                                    sizeof(*self) + namelen);
+       if (self == NULL)
+               return NULL;
+
+       if (symbol__priv_size)
+               self = ((void *)self) + symbol__priv_size;
+
        self->start = start;
        self->end   = len ? start + len - 1 : start;
+
+       pr_debug3("%s: %s %#Lx-%#Lx\n", __func__, name, start, self->end);
+
        memcpy(self->name, name, namelen);
 
        return self;
 }
 
-static void symbol__delete(struct symbol *self, unsigned int priv_size)
+static void symbol__delete(struct symbol *self)
 {
-       free(((void *)self) - priv_size);
+       free(((void *)self) - symbol__priv_size);
 }
 
 static size_t symbol__fprintf(struct symbol *self, FILE *fp)
 {
-       if (!self->module)
-               return fprintf(fp, " %llx-%llx %s\n",
+       return fprintf(fp, " %llx-%llx %s\n",
                       self->start, self->end, self->name);
-       else
-               return fprintf(fp, " %llx-%llx %s \t[%s]\n",
-                      self->start, self->end, self->name, self->module->name);
 }
 
-struct dso *dso__new(const char *name, unsigned int sym_priv_size)
+static void dso__set_long_name(struct dso *self, char *name)
+{
+       if (name == NULL)
+               return;
+       self->long_name = name;
+       self->long_name_len = strlen(name);
+}
+
+static void dso__set_basename(struct dso *self)
+{
+       self->short_name = basename(self->long_name);
+}
+
+struct dso *dso__new(const char *name)
 {
        struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
 
        if (self != NULL) {
+               int i;
                strcpy(self->name, name);
-               self->syms = RB_ROOT;
-               self->sym_priv_size = sym_priv_size;
+               dso__set_long_name(self, self->name);
+               self->short_name = self->name;
+               for (i = 0; i < MAP__NR_TYPES; ++i)
+                       self->symbols[i] = RB_ROOT;
                self->find_symbol = dso__find_symbol;
                self->slen_calculated = 0;
                self->origin = DSO__ORIG_NOT_FOUND;
+               self->loaded = 0;
+               self->has_build_id = 0;
        }
 
        return self;
 }
 
-static void dso__delete_symbols(struct dso *self)
+static void symbols__delete(struct rb_root *self)
 {
        struct symbol *pos;
-       struct rb_node *next = rb_first(&self->syms);
+       struct rb_node *next = rb_first(self);
 
        while (next) {
                pos = rb_entry(next, struct symbol, rb_node);
                next = rb_next(&pos->rb_node);
-               rb_erase(&pos->rb_node, &self->syms);
-               symbol__delete(pos, self->sym_priv_size);
+               rb_erase(&pos->rb_node, self);
+               symbol__delete(pos);
        }
 }
 
 void dso__delete(struct dso *self)
 {
-       dso__delete_symbols(self);
+       int i;
+       for (i = 0; i < MAP__NR_TYPES; ++i)
+               symbols__delete(&self->symbols[i]);
+       if (self->long_name != self->name)
+               free(self->long_name);
        free(self);
 }
 
-static void dso__insert_symbol(struct dso *self, struct symbol *sym)
+void dso__set_build_id(struct dso *self, void *build_id)
 {
-       struct rb_node **p = &self->syms.rb_node;
+       memcpy(self->build_id, build_id, sizeof(self->build_id));
+       self->has_build_id = 1;
+}
+
+static void symbols__insert(struct rb_root *self, struct symbol *sym)
+{
+       struct rb_node **p = &self->rb_node;
        struct rb_node *parent = NULL;
        const u64 ip = sym->start;
        struct symbol *s;
@@ -119,17 +220,17 @@ static void dso__insert_symbol(struct dso *self, struct symbol *sym)
                        p = &(*p)->rb_right;
        }
        rb_link_node(&sym->rb_node, parent, p);
-       rb_insert_color(&sym->rb_node, &self->syms);
+       rb_insert_color(&sym->rb_node, self);
 }
 
-struct symbol *dso__find_symbol(struct dso *self, u64 ip)
+static struct symbol *symbols__find(struct rb_root *self, u64 ip)
 {
        struct rb_node *n;
 
        if (self == NULL)
                return NULL;
 
-       n = self->syms.rb_node;
+       n = self->rb_node;
 
        while (n) {
                struct symbol *s = rb_entry(n, struct symbol, rb_node);
@@ -145,12 +246,42 @@ struct symbol *dso__find_symbol(struct dso *self, u64 ip)
        return NULL;
 }
 
-size_t dso__fprintf(struct dso *self, FILE *fp)
+struct symbol *dso__find_symbol(struct dso *self, enum map_type type, u64 addr)
 {
-       size_t ret = fprintf(fp, "dso: %s\n", self->name);
+       return symbols__find(&self->symbols[type], addr);
+}
+
+int build_id__sprintf(u8 *self, int len, char *bf)
+{
+       char *bid = bf;
+       u8 *raw = self;
+       int i;
+
+       for (i = 0; i < len; ++i) {
+               sprintf(bid, "%02x", *raw);
+               ++raw;
+               bid += 2;
+       }
+
+       return raw - self;
+}
+
+size_t dso__fprintf_buildid(struct dso *self, FILE *fp)
+{
+       char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+       build_id__sprintf(self->build_id, sizeof(self->build_id), sbuild_id);
+       return fprintf(fp, "%s", sbuild_id);
+}
 
+size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp)
+{
        struct rb_node *nd;
-       for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
+       size_t ret = fprintf(fp, "dso: %s (", self->short_name);
+
+       ret += dso__fprintf_buildid(self, fp);
+       ret += fprintf(fp, ")\n");
+       for (nd = rb_first(&self->symbols[type]); nd; nd = rb_next(nd)) {
                struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
                ret += symbol__fprintf(pos, fp);
        }
@@ -158,13 +289,17 @@ size_t dso__fprintf(struct dso *self, FILE *fp)
        return ret;
 }
 
-static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int v)
+/*
+ * Loads the function entries in /proc/kallsyms into kernel_map->dso,
+ * so that in the next step we can set each symbol's ->end address and then
+ * call dso__split_kallsyms.
+ */
+static int dso__load_all_kallsyms(struct dso *self, struct map *map)
 {
-       struct rb_node *nd, *prevnd;
        char *line = NULL;
        size_t n;
+       struct rb_root *root = &self->symbols[map->type];
        FILE *file = fopen("/proc/kallsyms", "r");
-       int count = 0;
 
        if (file == NULL)
                goto out_failure;
@@ -174,6 +309,7 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int v)
                struct symbol *sym;
                int line_len, len;
                char symbol_type;
+               char *symbol_name;
 
                line_len = getline(&line, &n, file);
                if (line_len < 0)
@@ -196,44 +332,26 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int v)
                 */
                if (symbol_type != 'T' && symbol_type != 'W')
                        continue;
+
+               symbol_name = line + len + 2;
                /*
-                * Well fix up the end later, when we have all sorted.
+                * Will fix up the end later, when we have all symbols sorted.
                 */
-               sym = symbol__new(start, 0xdead, line + len + 2,
-                                 self->sym_priv_size, 0, v);
+               sym = symbol__new(start, 0, symbol_name);
 
                if (sym == NULL)
                        goto out_delete_line;
-
-               if (filter && filter(self, sym))
-                       symbol__delete(sym, self->sym_priv_size);
-               else {
-                       dso__insert_symbol(self, sym);
-                       count++;
-               }
-       }
-
-       /*
-        * Now that we have all sorted out, just set the ->end of all
-        * symbols
-        */
-       prevnd = rb_first(&self->syms);
-
-       if (prevnd == NULL)
-               goto out_delete_line;
-
-       for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
-               struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
-                             *curr = rb_entry(nd, struct symbol, rb_node);
-
-               prev->end = curr->start - 1;
-               prevnd = nd;
+               /*
+                * We will pass the symbols to the filter later, in
+                * dso__split_kallsyms, when we have split the maps per module.
+                */
+               symbols__insert(root, sym);
        }
 
        free(line);
        fclose(file);
 
-       return count;
+       return 0;
 
 out_delete_line:
        free(line);
@@ -241,14 +359,114 @@ out_failure:
        return -1;
 }
 
-static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int v)
+/*
+ * Split the symbols into maps, making sure there are no overlaps, i.e. the
+ * kernel range is broken into several maps, named [kernel].N, as we don't have
+ * the original ELF section names that vmlinux has.
+ */
+static int dso__split_kallsyms(struct dso *self, struct map *map, struct thread *thread,
+                              symbol_filter_t filter)
+{
+       struct map *curr_map = map;
+       struct symbol *pos;
+       int count = 0;
+       struct rb_root *root = &self->symbols[map->type];
+       struct rb_node *next = rb_first(root);
+       int kernel_range = 0;
+
+       while (next) {
+               char *module;
+
+               pos = rb_entry(next, struct symbol, rb_node);
+               next = rb_next(&pos->rb_node);
+
+               module = strchr(pos->name, '\t');
+               if (module) {
+                       if (!thread->use_modules)
+                               goto discard_symbol;
+
+                       *module++ = '\0';
+
+                       if (strcmp(self->name, module)) {
+                               curr_map = thread__find_map_by_name(thread, module);
+                               if (curr_map == NULL) {
+                                       pr_debug("/proc/{kallsyms,modules} "
+                                                "inconsistency!\n");
+                                       return -1;
+                               }
+                       }
+                       /*
+                        * So that we look just like we get from .ko files,
+                        * i.e. not prelinked, relative to map->start.
+                        */
+                       pos->start = curr_map->map_ip(curr_map, pos->start);
+                       pos->end   = curr_map->map_ip(curr_map, pos->end);
+               } else if (curr_map != map) {
+                       char dso_name[PATH_MAX];
+                       struct dso *dso;
+
+                       snprintf(dso_name, sizeof(dso_name), "[kernel].%d",
+                                kernel_range++);
+
+                       dso = dso__new(dso_name);
+                       if (dso == NULL)
+                               return -1;
+
+                       curr_map = map__new2(pos->start, dso, map->type);
+                       if (curr_map == NULL) {
+                               dso__delete(dso);
+                               return -1;
+                       }
+
+                       curr_map->map_ip = curr_map->unmap_ip = identity__map_ip;
+                       __thread__insert_map(thread, curr_map);
+                       ++kernel_range;
+               }
+
+               if (filter && filter(curr_map, pos)) {
+discard_symbol:                rb_erase(&pos->rb_node, root);
+                       symbol__delete(pos);
+               } else {
+                       if (curr_map != map) {
+                               rb_erase(&pos->rb_node, root);
+                               symbols__insert(&curr_map->dso->symbols[curr_map->type], pos);
+                       }
+                       count++;
+               }
+       }
+
+       return count;
+}
+
+
+static int dso__load_kallsyms(struct dso *self, struct map *map,
+                             struct thread *thread, symbol_filter_t filter)
+{
+       if (dso__load_all_kallsyms(self, map) < 0)
+               return -1;
+
+       symbols__fixup_end(&self->symbols[map->type]);
+       self->origin = DSO__ORIG_KERNEL;
+
+       return dso__split_kallsyms(self, map, thread, filter);
+}
+
+size_t kernel_maps__fprintf(FILE *fp)
+{
+       size_t printed = fprintf(fp, "Kernel maps:\n");
+       printed += thread__fprintf_maps(kthread, fp);
+       return printed + fprintf(fp, "END kernel maps\n");
+}
+
+static int dso__load_perf_map(struct dso *self, struct map *map,
+                             symbol_filter_t filter)
 {
        char *line = NULL;
        size_t n;
        FILE *file;
        int nr_syms = 0;
 
-       file = fopen(self->name, "r");
+       file = fopen(self->long_name, "r");
        if (file == NULL)
                goto out_failure;
 
@@ -278,16 +496,15 @@ static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int v)
                if (len + 2 >= line_len)
                        continue;
 
-               sym = symbol__new(start, size, line + len,
-                                 self->sym_priv_size, start, v);
+               sym = symbol__new(start, size, line + len);
 
                if (sym == NULL)
                        goto out_delete_line;
 
-               if (filter && filter(self, sym))
-                       symbol__delete(sym, self->sym_priv_size);
+               if (filter && filter(map, sym))
+                       symbol__delete(sym);
                else {
-                       dso__insert_symbol(self, sym);
+                       symbols__insert(&self->symbols[map->type], sym);
                        nr_syms++;
                }
        }
@@ -393,7 +610,8 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
  * And always look at the original dso, not at debuginfo packages, that
  * have the PLT data stripped out (shdr_rel_plt.sh_type == SHT_NOBITS).
  */
-static int dso__synthesize_plt_symbols(struct  dso *self, int v)
+static int dso__synthesize_plt_symbols(struct  dso *self, struct map *map,
+                                      symbol_filter_t filter)
 {
        uint32_t nr_rel_entries, idx;
        GElf_Sym sym;
@@ -409,7 +627,7 @@ static int dso__synthesize_plt_symbols(struct  dso *self, int v)
        Elf *elf;
        int nr = 0, symidx, fd, err = 0;
 
-       fd = open(self->name, O_RDONLY);
+       fd = open(self->long_name, O_RDONLY);
        if (fd < 0)
                goto out;
 
@@ -477,12 +695,16 @@ static int dso__synthesize_plt_symbols(struct  dso *self, int v)
                                 "%s@plt", elf_sym__name(&sym, symstrs));
 
                        f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-                                       sympltname, self->sym_priv_size, 0, v);
+                                       sympltname);
                        if (!f)
                                goto out_elf_end;
 
-                       dso__insert_symbol(self, f);
-                       ++nr;
+                       if (filter && filter(map, f))
+                               symbol__delete(f);
+                       else {
+                               symbols__insert(&self->symbols[map->type], f);
+                               ++nr;
+                       }
                }
        } else if (shdr_rel_plt.sh_type == SHT_REL) {
                GElf_Rel pos_mem, *pos;
@@ -495,12 +717,16 @@ static int dso__synthesize_plt_symbols(struct  dso *self, int v)
                                 "%s@plt", elf_sym__name(&sym, symstrs));
 
                        f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-                                       sympltname, self->sym_priv_size, 0, v);
+                                       sympltname);
                        if (!f)
                                goto out_elf_end;
 
-                       dso__insert_symbol(self, f);
-                       ++nr;
+                       if (filter && filter(map, f))
+                               symbol__delete(f);
+                       else {
+                               symbols__insert(&self->symbols[map->type], f);
+                               ++nr;
+                       }
                }
        }
 
@@ -513,14 +739,18 @@ out_close:
        if (err == 0)
                return nr;
 out:
-       fprintf(stderr, "%s: problems reading %s PLT info.\n",
-               __func__, self->name);
+       pr_warning("%s: problems reading %s PLT info.\n",
+                  __func__, self->long_name);
        return 0;
 }
 
-static int dso__load_sym(struct dso *self, int fd, const char *name,
-                        symbol_filter_t filter, int v, struct module *mod)
+static int dso__load_sym(struct dso *self, struct map *map,
+                        struct thread *thread, const char *name, int fd,
+                        symbol_filter_t filter, int kernel, int kmodule)
 {
+       struct map *curr_map = map;
+       struct dso *curr_dso = self;
+       size_t dso_name_len = strlen(self->short_name);
        Elf_Data *symstrs, *secstrs;
        uint32_t nr_syms;
        int err = -1;
@@ -531,19 +761,16 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
        GElf_Sym sym;
        Elf_Scn *sec, *sec_strndx;
        Elf *elf;
-       int nr = 0, kernel = !strcmp("[kernel]", self->name);
+       int nr = 0;
 
        elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
        if (elf == NULL) {
-               if (v)
-                       fprintf(stderr, "%s: cannot read %s ELF file.\n",
-                               __func__, name);
+               pr_err("%s: cannot read %s ELF file.\n", __func__, name);
                goto out_close;
        }
 
        if (gelf_getehdr(elf, &ehdr) == NULL) {
-               if (v)
-                       fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+               pr_err("%s: cannot get elf header.\n", __func__);
                goto out_elf_end;
        }
 
@@ -587,9 +814,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
        elf_symtab__for_each_symbol(syms, nr_syms, idx, sym) {
                struct symbol *f;
                const char *elf_name;
-               char *demangled;
-               u64 obj_start;
-               struct section *section = NULL;
+               char *demangled = NULL;
                int is_label = elf_sym__is_label(&sym);
                const char *section_name;
 
@@ -605,52 +830,85 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
                if (is_label && !elf_sec__is_text(&shdr, secstrs))
                        continue;
 
+               elf_name = elf_sym__name(&sym, symstrs);
                section_name = elf_sec__name(&shdr, secstrs);
-               obj_start = sym.st_value;
 
-               if (self->adjust_symbols) {
-                       if (v >= 2)
-                               printf("adjusting symbol: st_value: %Lx sh_addr: %Lx sh_offset: %Lx\n",
-                                       (u64)sym.st_value, (u64)shdr.sh_addr, (u64)shdr.sh_offset);
+               if (kernel || kmodule) {
+                       char dso_name[PATH_MAX];
 
-                       sym.st_value -= shdr.sh_addr - shdr.sh_offset;
-               }
+                       if (strcmp(section_name,
+                                  curr_dso->short_name + dso_name_len) == 0)
+                               goto new_symbol;
 
-               if (mod) {
-                       section = mod->sections->find_section(mod->sections, section_name);
-                       if (section)
-                               sym.st_value += section->vma;
-                       else {
-                               fprintf(stderr, "dso__load_sym() module %s lookup of %s failed\n",
-                                       mod->name, section_name);
-                               goto out_elf_end;
+                       if (strcmp(section_name, ".text") == 0) {
+                               curr_map = map;
+                               curr_dso = self;
+                               goto new_symbol;
                        }
+
+                       snprintf(dso_name, sizeof(dso_name),
+                                "%s%s", self->short_name, section_name);
+
+                       curr_map = thread__find_map_by_name(thread, dso_name);
+                       if (curr_map == NULL) {
+                               u64 start = sym.st_value;
+
+                               if (kmodule)
+                                       start += map->start + shdr.sh_offset;
+
+                               curr_dso = dso__new(dso_name);
+                               if (curr_dso == NULL)
+                                       goto out_elf_end;
+                               curr_map = map__new2(start, curr_dso,
+                                                    MAP__FUNCTION);
+                               if (curr_map == NULL) {
+                                       dso__delete(curr_dso);
+                                       goto out_elf_end;
+                               }
+                               curr_map->map_ip = identity__map_ip;
+                               curr_map->unmap_ip = identity__map_ip;
+                               curr_dso->origin = DSO__ORIG_KERNEL;
+                               __thread__insert_map(kthread, curr_map);
+                               dsos__add(&dsos__kernel, curr_dso);
+                       } else
+                               curr_dso = curr_map->dso;
+
+                       goto new_symbol;
+               }
+
+               if (curr_dso->adjust_symbols) {
+                       pr_debug2("adjusting symbol: st_value: %Lx sh_addr: "
+                                 "%Lx sh_offset: %Lx\n", (u64)sym.st_value,
+                                 (u64)shdr.sh_addr, (u64)shdr.sh_offset);
+                       sym.st_value -= shdr.sh_addr - shdr.sh_offset;
                }
                /*
                 * We need to figure out if the object was created from C++ sources
                 * DWARF DW_compile_unit has this, but we don't always have access
                 * to it...
                 */
-               elf_name = elf_sym__name(&sym, symstrs);
                demangled = bfd_demangle(NULL, elf_name, DMGL_PARAMS | DMGL_ANSI);
                if (demangled != NULL)
                        elf_name = demangled;
-
-               f = symbol__new(sym.st_value, sym.st_size, elf_name,
-                               self->sym_priv_size, obj_start, v);
+new_symbol:
+               f = symbol__new(sym.st_value, sym.st_size, elf_name);
                free(demangled);
                if (!f)
                        goto out_elf_end;
 
-               if (filter && filter(self, f))
-                       symbol__delete(f, self->sym_priv_size);
+               if (filter && filter(curr_map, f))
+                       symbol__delete(f);
                else {
-                       f->module = mod;
-                       dso__insert_symbol(self, f);
+                       symbols__insert(&curr_dso->symbols[curr_map->type], f);
                        nr++;
                }
        }
 
+       /*
+        * For misannotated, zeroed, ASM function sizes.
+        */
+       if (nr > 0)
+               symbols__fixup_end(&self->symbols[map->type]);
        err = nr;
 out_elf_end:
        elf_end(elf);
@@ -658,63 +916,153 @@ out_close:
        return err;
 }
 
-#define BUILD_ID_SIZE 128
+static bool dso__build_id_equal(const struct dso *self, u8 *build_id)
+{
+       return memcmp(self->build_id, build_id, sizeof(self->build_id)) == 0;
+}
 
-static char *dso__read_build_id(struct dso *self, int v)
+static bool __dsos__read_build_ids(struct list_head *head)
 {
-       int i;
+       bool have_build_id = false;
+       struct dso *pos;
+
+       list_for_each_entry(pos, head, node)
+               if (filename__read_build_id(pos->long_name, pos->build_id,
+                                           sizeof(pos->build_id)) > 0) {
+                       have_build_id     = true;
+                       pos->has_build_id = true;
+               }
+
+       return have_build_id;
+}
+
+bool dsos__read_build_ids(void)
+{
+       return __dsos__read_build_ids(&dsos__kernel) ||
+              __dsos__read_build_ids(&dsos__user);
+}
+
+/*
+ * Align offset to 4 bytes as needed for note name and descriptor data.
+ */
+#define NOTE_ALIGN(n) (((n) + 3) & -4U)
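The note name and descriptor in an ELF note section are each padded to a 4-byte boundary; as a standalone sanity check of the rounding (illustration only, the macro is duplicated so the snippet compiles on its own):

#include <assert.h>

#define NOTE_ALIGN(n) (((n) + 3) & -4U)

int main(void)
{
	/* adding 3 and clearing the two low bits rounds up to a multiple of 4 */
	assert(NOTE_ALIGN(1) == 4);
	assert(NOTE_ALIGN(4) == 4);
	assert(NOTE_ALIGN(5) == 8);
	return 0;
}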
+
+int filename__read_build_id(const char *filename, void *bf, size_t size)
+{
+       int fd, err = -1;
        GElf_Ehdr ehdr;
        GElf_Shdr shdr;
-       Elf_Data *build_id_data;
+       Elf_Data *data;
        Elf_Scn *sec;
-       char *build_id = NULL, *bid;
-       unsigned char *raw;
+       Elf_Kind ek;
+       void *ptr;
        Elf *elf;
-       int fd = open(self->name, O_RDONLY);
 
+       if (size < BUILD_ID_SIZE)
+               goto out;
+
+       fd = open(filename, O_RDONLY);
        if (fd < 0)
                goto out;
 
        elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
        if (elf == NULL) {
-               if (v)
-                       fprintf(stderr, "%s: cannot read %s ELF file.\n",
-                               __func__, self->name);
+               pr_debug2("%s: cannot read %s ELF file.\n", __func__, filename);
                goto out_close;
        }
 
+       ek = elf_kind(elf);
+       if (ek != ELF_K_ELF)
+               goto out_elf_end;
+
        if (gelf_getehdr(elf, &ehdr) == NULL) {
-               if (v)
-                       fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+               pr_err("%s: cannot get elf header.\n", __func__);
                goto out_elf_end;
        }
 
-       sec = elf_section_by_name(elf, &ehdr, &shdr, ".note.gnu.build-id", NULL);
-       if (sec == NULL)
-               goto out_elf_end;
+       sec = elf_section_by_name(elf, &ehdr, &shdr,
+                                 ".note.gnu.build-id", NULL);
+       if (sec == NULL) {
+               sec = elf_section_by_name(elf, &ehdr, &shdr,
+                                         ".notes", NULL);
+               if (sec == NULL)
+                       goto out_elf_end;
+       }
 
-       build_id_data = elf_getdata(sec, NULL);
-       if (build_id_data == NULL)
-               goto out_elf_end;
-       build_id = malloc(BUILD_ID_SIZE);
-       if (build_id == NULL)
+       data = elf_getdata(sec, NULL);
+       if (data == NULL)
                goto out_elf_end;
-       raw = build_id_data->d_buf + 16;
-       bid = build_id;
 
-       for (i = 0; i < 20; ++i) {
-               sprintf(bid, "%02x", *raw);
-               ++raw;
-               bid += 2;
+       ptr = data->d_buf;
+       while (ptr < (data->d_buf + data->d_size)) {
+               GElf_Nhdr *nhdr = ptr;
+               int namesz = NOTE_ALIGN(nhdr->n_namesz),
+                   descsz = NOTE_ALIGN(nhdr->n_descsz);
+               const char *name;
+
+               ptr += sizeof(*nhdr);
+               name = ptr;
+               ptr += namesz;
+               if (nhdr->n_type == NT_GNU_BUILD_ID &&
+                   nhdr->n_namesz == sizeof("GNU")) {
+                       if (memcmp(name, "GNU", sizeof("GNU")) == 0) {
+                               memcpy(bf, ptr, BUILD_ID_SIZE);
+                               err = BUILD_ID_SIZE;
+                               break;
+                       }
+               }
+               ptr += descsz;
        }
-       if (v >= 2)
-               printf("%s(%s): %s\n", __func__, self->name, build_id);
 out_elf_end:
        elf_end(elf);
 out_close:
        close(fd);
 out:
-       return build_id;
+       return err;
+}
+
+int sysfs__read_build_id(const char *filename, void *build_id, size_t size)
+{
+       int fd, err = -1;
+
+       if (size < BUILD_ID_SIZE)
+               goto out;
+
+       fd = open(filename, O_RDONLY);
+       if (fd < 0)
+               goto out;
+
+       while (1) {
+               char bf[BUFSIZ];
+               GElf_Nhdr nhdr;
+               int namesz, descsz;
+
+               if (read(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr))
+                       break;
+
+               namesz = NOTE_ALIGN(nhdr.n_namesz);
+               descsz = NOTE_ALIGN(nhdr.n_descsz);
+               if (nhdr.n_type == NT_GNU_BUILD_ID &&
+                   nhdr.n_namesz == sizeof("GNU")) {
+                       if (read(fd, bf, namesz) != namesz)
+                               break;
+                       if (memcmp(bf, "GNU", sizeof("GNU")) == 0) {
+                               if (read(fd, build_id,
+                                   BUILD_ID_SIZE) == BUILD_ID_SIZE) {
+                                       err = 0;
+                                       break;
+                               }
+                       } else if (read(fd, bf, descsz) != descsz)
+                               break;
+               } else {
+                       int n = namesz + descsz;
+                       if (read(fd, bf, n) != n)
+                               break;
+               }
+       }
+       close(fd);
+out:
+       return err;
 }
 
 char dso__symtab_origin(const struct dso *self)
@@ -726,6 +1074,7 @@ char dso__symtab_origin(const struct dso *self)
                [DSO__ORIG_UBUNTU] =   'u',
                [DSO__ORIG_BUILDID] =  'b',
                [DSO__ORIG_DSO] =      'd',
+               [DSO__ORIG_KMODULE] =  'K',
        };
 
        if (self == NULL || self->origin == DSO__ORIG_NOT_FOUND)
@@ -733,20 +1082,27 @@ char dso__symtab_origin(const struct dso *self)
        return origin[self->origin];
 }
 
-int dso__load(struct dso *self, symbol_filter_t filter, int v)
+int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
 {
        int size = PATH_MAX;
-       char *name = malloc(size), *build_id = NULL;
+       char *name;
+       u8 build_id[BUILD_ID_SIZE];
        int ret = -1;
        int fd;
 
+       dso__set_loaded(self, map->type);
+
+       if (self->kernel)
+               return dso__load_kernel_sym(self, map, kthread, filter);
+
+       name = malloc(size);
        if (!name)
                return -1;
 
        self->adjust_symbols = 0;
 
        if (strncmp(self->name, "/tmp/perf-", 10) == 0) {
-               ret = dso__load_perf_map(self, filter, v);
+               ret = dso__load_perf_map(self, map, filter);
                self->origin = ret > 0 ? DSO__ORIG_JAVA_JIT :
                                         DSO__ORIG_NOT_FOUND;
                return ret;
@@ -759,34 +1115,50 @@ more:
                self->origin++;
                switch (self->origin) {
                case DSO__ORIG_FEDORA:
-                       snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
+                       snprintf(name, size, "/usr/lib/debug%s.debug",
+                                self->long_name);
                        break;
                case DSO__ORIG_UBUNTU:
-                       snprintf(name, size, "/usr/lib/debug%s", self->name);
+                       snprintf(name, size, "/usr/lib/debug%s",
+                                self->long_name);
                        break;
                case DSO__ORIG_BUILDID:
-                       build_id = dso__read_build_id(self, v);
-                       if (build_id != NULL) {
+                       if (filename__read_build_id(self->long_name, build_id,
+                                                   sizeof(build_id))) {
+                               char build_id_hex[BUILD_ID_SIZE * 2 + 1];
+
+                               build_id__sprintf(build_id, sizeof(build_id),
+                                                 build_id_hex);
                                snprintf(name, size,
                                         "/usr/lib/debug/.build-id/%.2s/%s.debug",
-                                       build_id, build_id + 2);
-                               free(build_id);
+                                       build_id_hex, build_id_hex + 2);
+                               if (self->has_build_id)
+                                       goto compare_build_id;
                                break;
                        }
                        self->origin++;
                        /* Fall thru */
                case DSO__ORIG_DSO:
-                       snprintf(name, size, "%s", self->name);
+                       snprintf(name, size, "%s", self->long_name);
                        break;
 
                default:
                        goto out;
                }
 
+               if (self->has_build_id) {
+                       if (filename__read_build_id(name, build_id,
+                                                   sizeof(build_id)) < 0)
+                               goto more;
+compare_build_id:
+                       if (!dso__build_id_equal(self, build_id))
+                               goto more;
+               }
+
                fd = open(name, O_RDONLY);
        } while (fd < 0);
 
-       ret = dso__load_sym(self, fd, name, filter, v, NULL);
+       ret = dso__load_sym(self, map, NULL, name, fd, filter, 0, 0);
        close(fd);
 
        /*
@@ -796,7 +1168,7 @@ more:
                goto more;
 
        if (ret > 0) {
-               int nr_plt = dso__synthesize_plt_symbols(self, v);
+               int nr_plt = dso__synthesize_plt_symbols(self, map, filter);
                if (nr_plt > 0)
                        ret += nr_plt;
        }
@@ -807,151 +1179,279 @@ out:
        return ret;
 }
 
-static int dso__load_module(struct dso *self, struct mod_dso *mods, const char *name,
-                            symbol_filter_t filter, int v)
+static struct map *thread__find_map_by_name(struct thread *self, char *name)
 {
-       struct module *mod = mod_dso__find_module(mods, name);
-       int err = 0, fd;
+       struct rb_node *nd;
 
-       if (mod == NULL || !mod->active)
-               return err;
+       for (nd = rb_first(&self->maps[MAP__FUNCTION]); nd; nd = rb_next(nd)) {
+               struct map *map = rb_entry(nd, struct map, rb_node);
 
-       fd = open(mod->path, O_RDONLY);
+               if (map->dso && strcmp(map->dso->name, name) == 0)
+                       return map;
+       }
 
-       if (fd < 0)
-               return err;
+       return NULL;
+}
 
-       err = dso__load_sym(self, fd, name, filter, v, mod);
-       close(fd);
+static int dsos__set_modules_path_dir(char *dirname)
+{
+       struct dirent *dent;
+       DIR *dir = opendir(dirname);
 
-       return err;
+       if (!dir) {
+               pr_debug("%s: cannot open %s dir\n", __func__, dirname);
+               return -1;
+       }
+
+       while ((dent = readdir(dir)) != NULL) {
+               char path[PATH_MAX];
+
+               if (dent->d_type == DT_DIR) {
+                       if (!strcmp(dent->d_name, ".") ||
+                           !strcmp(dent->d_name, ".."))
+                               continue;
+
+                       snprintf(path, sizeof(path), "%s/%s",
+                                dirname, dent->d_name);
+                       if (dsos__set_modules_path_dir(path) < 0)
+                               goto failure;
+               } else {
+                       char *dot = strrchr(dent->d_name, '.'),
+                            dso_name[PATH_MAX];
+                       struct map *map;
+                       char *long_name;
+
+                       if (dot == NULL || strcmp(dot, ".ko"))
+                               continue;
+                       snprintf(dso_name, sizeof(dso_name), "[%.*s]",
+                                (int)(dot - dent->d_name), dent->d_name);
+
+                       strxfrchar(dso_name, '-', '_');
+                       map = thread__find_map_by_name(kthread, dso_name);
+                       if (map == NULL)
+                               continue;
+
+                       snprintf(path, sizeof(path), "%s/%s",
+                                dirname, dent->d_name);
+
+                       long_name = strdup(path);
+                       if (long_name == NULL)
+                               goto failure;
+                       dso__set_long_name(map->dso, long_name);
+               }
+       }
+
+       return 0;
+failure:
+       closedir(dir);
+       return -1;
 }
 
-int dso__load_modules(struct dso *self, symbol_filter_t filter, int v)
+static int dsos__set_modules_path(void)
 {
-       struct mod_dso *mods = mod_dso__new_dso("modules");
-       struct module *pos;
-       struct rb_node *next;
-       int err, count = 0;
+       struct utsname uts;
+       char modules_path[PATH_MAX];
 
-       err = mod_dso__load_modules(mods);
-
-       if (err <= 0)
-               return err;
+       if (uname(&uts) < 0)
+               return -1;
 
-       /*
-        * Iterate over modules, and load active symbols.
-        */
-       next = rb_first(&mods->mods);
-       while (next) {
-               pos = rb_entry(next, struct module, rb_node);
-               err = dso__load_module(self, mods, pos->name, filter, v);
+       snprintf(modules_path, sizeof(modules_path), "/lib/modules/%s/kernel",
+                uts.release);
 
-               if (err < 0)
-                       break;
+       return dsos__set_modules_path_dir(modules_path);
+}
 
-               next = rb_next(&pos->rb_node);
-               count += err;
-       }
+/*
+ * Constructor variant for modules (where we know from /proc/modules where
+ * they are loaded) and for vmlinux, where only after we load all the
+ * symbols we'll know where it starts and ends.
+ */
+static struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
+{
+       struct map *self = malloc(sizeof(*self));
 
-       if (err < 0) {
-               mod_dso__delete_modules(mods);
-               mod_dso__delete_self(mods);
-               return err;
+       if (self != NULL) {
+               /*
+                * ->end will be filled after we load all the symbols
+                */
+               map__init(self, type, start, 0, 0, dso);
        }
 
-       return count;
+       return self;
 }
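Kernel and module maps created this way get identity__map_ip() plugged into ->map_ip/->unmap_ip, while regular dso maps translate between runtime addresses and dso-relative ones. identity__map_ip() itself lives in map.c, outside this hunk, so the sketch below is only an approximation of the two styles (tiny_map and the function names are illustrative):

/* pared-down view of the two translation hooks */
struct tiny_map {
	unsigned long long start, end, pgoff;
};

/* kernel/module maps: symbols are keyed by the address as sampled */
static unsigned long long identity_ip(struct tiny_map *m, unsigned long long ip)
{
	(void)m;
	return ip;
}

/* ordinary dso maps: turn a runtime address into a dso-relative offset */
static unsigned long long dso_relative_ip(struct tiny_map *m, unsigned long long ip)
{
	return ip - m->start + m->pgoff;
}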
 
-static inline void dso__fill_symbol_holes(struct dso *self)
+static int thread__create_module_maps(struct thread *self)
 {
-       struct symbol *prev = NULL;
-       struct rb_node *nd;
+       char *line = NULL;
+       size_t n;
+       FILE *file = fopen("/proc/modules", "r");
+       struct map *map;
 
-       for (nd = rb_last(&self->syms); nd; nd = rb_prev(nd)) {
-               struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
+       if (file == NULL)
+               return -1;
 
-               if (prev) {
-                       u64 hole = 0;
-                       int alias = pos->start == prev->start;
+       while (!feof(file)) {
+               char name[PATH_MAX];
+               u64 start;
+               struct dso *dso;
+               char *sep;
+               int line_len;
 
-                       if (!alias)
-                               hole = prev->start - pos->end - 1;
+               line_len = getline(&line, &n, file);
+               if (line_len < 0)
+                       break;
 
-                       if (hole || alias) {
-                               if (alias)
-                                       pos->end = prev->end;
-                               else if (hole)
-                                       pos->end = prev->start - 1;
-                       }
+               if (!line)
+                       goto out_failure;
+
+               line[--line_len] = '\0'; /* \n */
+
+               sep = strrchr(line, 'x');
+               if (sep == NULL)
+                       continue;
+
+               hex2u64(sep + 1, &start);
+
+               sep = strchr(line, ' ');
+               if (sep == NULL)
+                       continue;
+
+               *sep = '\0';
+
+               snprintf(name, sizeof(name), "[%s]", line);
+               dso = dso__new(name);
+
+               if (dso == NULL)
+                       goto out_delete_line;
+
+               map = map__new2(start, dso, MAP__FUNCTION);
+               if (map == NULL) {
+                       dso__delete(dso);
+                       goto out_delete_line;
                }
-               prev = pos;
+
+               snprintf(name, sizeof(name),
+                        "/sys/module/%s/notes/.note.gnu.build-id", line);
+               if (sysfs__read_build_id(name, dso->build_id,
+                                        sizeof(dso->build_id)) == 0)
+                       dso->has_build_id = true;
+
+               dso->origin = DSO__ORIG_KMODULE;
+               __thread__insert_map(self, map);
+               dsos__add(&dsos__kernel, dso);
        }
+
+       free(line);
+       fclose(file);
+
+       return dsos__set_modules_path();
+
+out_delete_line:
+       free(line);
+out_failure:
+       return -1;
 }
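For reference, a /proc/modules line has the shape below (module name and values made up); the string handling above relies on the load address being the last field:

/*
 *   nf_conntrack 79643 2 nf_nat,xt_state, Live 0xffffffffa0038000
 *
 * strrchr(line, 'x') lands on the "0x" of that trailing address (no later
 * field can contain an 'x'), hex2u64() parses the digits after it,
 * strchr(line, ' ') terminates the module name, and the resulting dso is
 * called "[nf_conntrack]" with its map starting at that address.
 */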
 
-static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
-                            symbol_filter_t filter, int v)
+static int dso__load_vmlinux(struct dso *self, struct map *map, struct thread *thread,
+                            const char *vmlinux, symbol_filter_t filter)
 {
-       int err, fd = open(vmlinux, O_RDONLY);
+       int err = -1, fd;
 
-       if (fd < 0)
-               return -1;
+       if (self->has_build_id) {
+               u8 build_id[BUILD_ID_SIZE];
 
-       err = dso__load_sym(self, fd, vmlinux, filter, v, NULL);
+               if (filename__read_build_id(vmlinux, build_id,
+                                           sizeof(build_id)) < 0) {
+                       pr_debug("No build_id in %s, ignoring it\n", vmlinux);
+                       return -1;
+               }
+               if (!dso__build_id_equal(self, build_id)) {
+                       char expected_build_id[BUILD_ID_SIZE * 2 + 1],
+                            vmlinux_build_id[BUILD_ID_SIZE * 2 + 1];
+
+                       build_id__sprintf(self->build_id,
+                                         sizeof(self->build_id),
+                                         expected_build_id);
+                       build_id__sprintf(build_id, sizeof(build_id),
+                                         vmlinux_build_id);
+                       pr_debug("build_id in %s is %s while expected is %s, "
+                                "ignoring it\n", vmlinux, vmlinux_build_id,
+                                expected_build_id);
+                       return -1;
+               }
+       }
 
-       if (err > 0)
-               dso__fill_symbol_holes(self);
+       fd = open(vmlinux, O_RDONLY);
+       if (fd < 0)
+               return -1;
 
+       dso__set_loaded(self, map->type);
+       err = dso__load_sym(self, map, thread, self->long_name, fd, filter, 1, 0);
        close(fd);
 
        return err;
 }
 
-int dso__load_kernel(struct dso *self, const char *vmlinux,
-                    symbol_filter_t filter, int v, int use_modules)
+static int dso__load_kernel_sym(struct dso *self, struct map *map,
+                               struct thread *thread, symbol_filter_t filter)
 {
-       int err = -1;
-
-       if (vmlinux) {
-               err = dso__load_vmlinux(self, vmlinux, filter, v);
-               if (err > 0 && use_modules) {
-                       int syms = dso__load_modules(self, filter, v);
-
-                       if (syms < 0) {
-                               fprintf(stderr, "dso__load_modules failed!\n");
-                               return syms;
+       int err;
+       bool is_kallsyms;
+
+       if (vmlinux_path != NULL) {
+               int i;
+               pr_debug("Looking at the vmlinux_path (%d entries long)\n",
+                        vmlinux_path__nr_entries);
+               for (i = 0; i < vmlinux_path__nr_entries; ++i) {
+                       err = dso__load_vmlinux(self, map, thread,
+                                               vmlinux_path[i], filter);
+                       if (err > 0) {
+                               pr_debug("Using %s for symbols\n",
+                                        vmlinux_path[i]);
+                               dso__set_long_name(self,
+                                                  strdup(vmlinux_path[i]));
+                               goto out_fixup;
                        }
-                       err += syms;
                }
        }
 
-       if (err <= 0)
-               err = dso__load_kallsyms(self, filter, v);
+       is_kallsyms = self->long_name[0] == '[';
+       if (is_kallsyms)
+               goto do_kallsyms;
 
-       if (err > 0)
-               self->origin = DSO__ORIG_KERNEL;
+       err = dso__load_vmlinux(self, map, thread, self->long_name, filter);
+       if (err <= 0) {
+               pr_info("The file %s cannot be used, "
+                       "trying to use /proc/kallsyms...", self->long_name);
+do_kallsyms:
+               err = dso__load_kallsyms(self, map, thread, filter);
+               if (err > 0 && !is_kallsyms)
+                        dso__set_long_name(self, strdup("[kernel.kallsyms]"));
+       }
+
+       if (err > 0) {
+out_fixup:
+               map__fixup_start(map);
+               map__fixup_end(map);
+       }
 
        return err;
 }
 
-LIST_HEAD(dsos);
-struct dso     *kernel_dso;
-struct dso     *vdso;
-struct dso     *hypervisor_dso;
-
-const char     *vmlinux_name = "vmlinux";
-int            modules;
+LIST_HEAD(dsos__user);
+LIST_HEAD(dsos__kernel);
+struct dso *vdso;
 
-static void dsos__add(struct dso *dso)
+static void dsos__add(struct list_head *head, struct dso *dso)
 {
-       list_add_tail(&dso->node, &dsos);
+       list_add_tail(&dso->node, head);
 }
 
-static struct dso *dsos__find(const char *name)
+static struct dso *dsos__find(struct list_head *head, const char *name)
 {
        struct dso *pos;
 
-       list_for_each_entry(pos, &dsos, node)
+       list_for_each_entry(pos, head, node)
                if (strcmp(pos->name, name) == 0)
                        return pos;
        return NULL;
@@ -959,79 +1459,170 @@ static struct dso *dsos__find(const char *name)
 
 struct dso *dsos__findnew(const char *name)
 {
-       struct dso *dso = dsos__find(name);
-       int nr;
-
-       if (dso)
-               return dso;
-
-       dso = dso__new(name, 0);
-       if (!dso)
-               goto out_delete_dso;
+       struct dso *dso = dsos__find(&dsos__user, name);
 
-       nr = dso__load(dso, NULL, verbose);
-       if (nr < 0) {
-               eprintf("Failed to open: %s\n", name);
-               goto out_delete_dso;
+       if (!dso) {
+               dso = dso__new(name);
+               if (dso != NULL) {
+                       dsos__add(&dsos__user, dso);
+                       dso__set_basename(dso);
+               }
        }
-       if (!nr)
-               eprintf("No symbols found in: %s, maybe install a debug package?\n", name);
-
-       dsos__add(dso);
 
        return dso;
+}
 
-out_delete_dso:
-       dso__delete(dso);
-       return NULL;
+static void __dsos__fprintf(struct list_head *head, FILE *fp)
+{
+       struct dso *pos;
+
+       list_for_each_entry(pos, head, node) {
+               int i;
+               for (i = 0; i < MAP__NR_TYPES; ++i)
+                       dso__fprintf(pos, i, fp);
+       }
 }
 
 void dsos__fprintf(FILE *fp)
+{
+       __dsos__fprintf(&dsos__kernel, fp);
+       __dsos__fprintf(&dsos__user, fp);
+}
+
+static size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp)
 {
        struct dso *pos;
+       size_t ret = 0;
 
-       list_for_each_entry(pos, &dsos, node)
-               dso__fprintf(pos, fp);
+       list_for_each_entry(pos, head, node) {
+               ret += dso__fprintf_buildid(pos, fp);
+               ret += fprintf(fp, " %s\n", pos->long_name);
+       }
+       return ret;
 }
 
-static struct symbol *vdso__find_symbol(struct dso *dso, u64 ip)
+size_t dsos__fprintf_buildid(FILE *fp)
 {
-       return dso__find_symbol(dso, ip);
+       return (__dsos__fprintf_buildid(&dsos__kernel, fp) +
+               __dsos__fprintf_buildid(&dsos__user, fp));
 }
 
-int load_kernel(void)
+static int thread__create_kernel_map(struct thread *self, const char *vmlinux)
 {
-       int err;
+       struct map *kmap;
+       struct dso *kernel = dso__new(vmlinux ?: "[kernel.kallsyms]");
 
-       kernel_dso = dso__new("[kernel]", 0);
-       if (!kernel_dso)
+       if (kernel == NULL)
                return -1;
 
-       err = dso__load_kernel(kernel_dso, vmlinux_name, NULL, verbose, modules);
-       if (err <= 0) {
-               dso__delete(kernel_dso);
-               kernel_dso = NULL;
-       } else
-               dsos__add(kernel_dso);
+       kmap = map__new2(0, kernel, MAP__FUNCTION);
+       if (kmap == NULL)
+               goto out_delete_kernel_dso;
 
-       vdso = dso__new("[vdso]", 0);
-       if (!vdso)
-               return -1;
+       kmap->map_ip       = kmap->unmap_ip = identity__map_ip;
+       kernel->short_name = "[kernel]";
+       kernel->kernel     = 1;
 
-       vdso->find_symbol = vdso__find_symbol;
+       vdso = dso__new("[vdso]");
+       if (vdso == NULL)
+               goto out_delete_kernel_map;
+       dso__set_loaded(vdso, MAP__FUNCTION);
 
-       dsos__add(vdso);
+       if (sysfs__read_build_id("/sys/kernel/notes", kernel->build_id,
+                                sizeof(kernel->build_id)) == 0)
+               kernel->has_build_id = true;
 
-       hypervisor_dso = dso__new("[hypervisor]", 0);
-       if (!hypervisor_dso)
-               return -1;
-       dsos__add(hypervisor_dso);
+       __thread__insert_map(self, kmap);
+       dsos__add(&dsos__kernel, kernel);
+       dsos__add(&dsos__user, vdso);
 
-       return err;
+       return 0;
+
+out_delete_kernel_map:
+       map__delete(kmap);
+out_delete_kernel_dso:
+       dso__delete(kernel);
+       return -1;
+}
+
+static void vmlinux_path__exit(void)
+{
+       while (--vmlinux_path__nr_entries >= 0) {
+               free(vmlinux_path[vmlinux_path__nr_entries]);
+               vmlinux_path[vmlinux_path__nr_entries] = NULL;
+       }
+
+       free(vmlinux_path);
+       vmlinux_path = NULL;
 }
 
+static int vmlinux_path__init(void)
+{
+       struct utsname uts;
+       char bf[PATH_MAX];
+
+       if (uname(&uts) < 0)
+               return -1;
+
+       vmlinux_path = malloc(sizeof(char *) * 5);
+       if (vmlinux_path == NULL)
+               return -1;
+
+       vmlinux_path[vmlinux_path__nr_entries] = strdup("vmlinux");
+       if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+               goto out_fail;
+       ++vmlinux_path__nr_entries;
+       vmlinux_path[vmlinux_path__nr_entries] = strdup("/boot/vmlinux");
+       if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+               goto out_fail;
+       ++vmlinux_path__nr_entries;
+       snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", uts.release);
+       vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
+       if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+               goto out_fail;
+       ++vmlinux_path__nr_entries;
+       snprintf(bf, sizeof(bf), "/lib/modules/%s/build/vmlinux", uts.release);
+       vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
+       if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+               goto out_fail;
+       ++vmlinux_path__nr_entries;
+       snprintf(bf, sizeof(bf), "/usr/lib/debug/lib/modules/%s/vmlinux",
+                uts.release);
+       vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
+       if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
+               goto out_fail;
+       ++vmlinux_path__nr_entries;
+
+       return 0;
+
+out_fail:
+       vmlinux_path__exit();
+       return -1;
+}
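The resulting search order, with uts.release expanded for a hypothetical 2.6.32 machine, is:

  vmlinux
  /boot/vmlinux
  /boot/vmlinux-2.6.32
  /lib/modules/2.6.32/build/vmlinux
  /usr/lib/debug/lib/modules/2.6.32/vmlinux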
 
-void symbol__init(void)
+int symbol__init(struct symbol_conf *conf)
 {
+       const struct symbol_conf *pconf = conf ?: &symbol_conf__defaults;
+
        elf_version(EV_CURRENT);
+       symbol__priv_size = pconf->priv_size;
+       thread__init(kthread, 0);
+
+       if (pconf->try_vmlinux_path && vmlinux_path__init() < 0)
+               return -1;
+
+       if (thread__create_kernel_map(kthread, pconf->vmlinux_name) < 0) {
+               vmlinux_path__exit();
+               return -1;
+       }
+
+       kthread->use_modules = pconf->use_modules;
+       if (pconf->use_modules && thread__create_module_maps(kthread) < 0)
+               pr_debug("Failed to load list of modules in use, "
+                        "continuing...\n");
+       /*
+        * Now that we have all the maps created, just set the ->end of them:
+        */
+       thread__fixup_maps_end(kthread);
+       return 0;
 }
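For callers, initialization now looks roughly like the sketch below; my_sym_priv, setup_symbols and vmlinux_from_cmdline are placeholder names, and perf's builtins pass their own sizes and flags:

#include <stdbool.h>
#include "symbol.h"

/* hypothetical per-symbol private data a builtin might hang off each symbol */
struct my_sym_priv {
	unsigned long hits;
};

static int setup_symbols(const char *vmlinux_from_cmdline)
{
	struct symbol_conf conf = {
		.priv_size        = sizeof(struct my_sym_priv),
		.try_vmlinux_path = true,
		.use_modules      = true,
		.vmlinux_name     = vmlinux_from_cmdline,	/* may be NULL */
	};

	return symbol__init(&conf);
}

Passing NULL to symbol__init() instead falls back to symbol_conf__defaults, as the ?: above shows.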
index 829da9edba649f9ed7bd01ef6e17a9aaec731ca4..17003efa0b39ab072f6656109cddd434fe248d9a 100644 (file)
@@ -1,11 +1,11 @@
-#ifndef _PERF_SYMBOL_
-#define _PERF_SYMBOL_ 1
+#ifndef __PERF_SYMBOL
+#define __PERF_SYMBOL 1
 
 #include <linux/types.h>
+#include <stdbool.h>
 #include "types.h"
 #include <linux/list.h>
 #include <linux/rbtree.h>
-#include "module.h"
 #include "event.h"
 
 #ifdef HAVE_CPLUS_DEMANGLE
@@ -46,57 +46,75 @@ struct symbol {
        struct rb_node  rb_node;
        u64             start;
        u64             end;
-       u64             obj_start;
-       u64             hist_sum;
-       u64             *hist;
-       struct module   *module;
-       void            *priv;
        char            name[0];
 };
 
+struct symbol_conf {
+       unsigned short  priv_size;
+       bool            try_vmlinux_path,
+                       use_modules;
+       const char      *vmlinux_name;
+};
+
+extern unsigned int symbol__priv_size;
+
+static inline void *symbol__priv(struct symbol *self)
+{
+       return ((void *)self) - symbol__priv_size;
+}
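The private area sits immediately in front of the struct symbol, which is why symbol__priv() subtracts symbol__priv_size; the allocation that makes this work is done by symbol__new() in symbol.c, outside this excerpt, so the layout sketch below is an assumption drawn from this accessor:

/*
 *   zalloc(symbol__priv_size + sizeof(struct symbol) + name_len)
 *
 *   [ private bytes ............. ][ struct symbol | name[] ... ]
 *   ^ symbol__priv(sym)             ^ sym
 */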
+
+struct addr_location {
+       struct thread *thread;
+       struct map    *map;
+       struct symbol *sym;
+       u64           addr;
+       char          level;
+};
+
 struct dso {
        struct list_head node;
-       struct rb_root   syms;
-       struct symbol    *(*find_symbol)(struct dso *, u64 ip);
-       unsigned int     sym_priv_size;
-       unsigned char    adjust_symbols;
-       unsigned char    slen_calculated;
+       struct rb_root   symbols[MAP__NR_TYPES];
+       struct symbol    *(*find_symbol)(struct dso *self,
+                                        enum map_type type, u64 addr);
+       u8               adjust_symbols:1;
+       u8               slen_calculated:1;
+       u8               has_build_id:1;
+       u8               kernel:1;
        unsigned char    origin;
+       u8               loaded;
+       u8               build_id[BUILD_ID_SIZE];
+       u16              long_name_len;
+       const char       *short_name;
+       char             *long_name;
        char             name[0];
 };
 
-extern const char *sym_hist_filter;
-
-typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
-
-struct dso *dso__new(const char *name, unsigned int sym_priv_size);
+struct dso *dso__new(const char *name);
 void dso__delete(struct dso *self);
 
-static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
-{
-       return ((void *)sym) - self->sym_priv_size;
-}
-
-struct symbol *dso__find_symbol(struct dso *self, u64 ip);
+bool dso__loaded(const struct dso *self, enum map_type type);
 
-int dso__load_kernel(struct dso *self, const char *vmlinux,
-                    symbol_filter_t filter, int verbose, int modules);
-int dso__load_modules(struct dso *self, symbol_filter_t filter, int verbose);
-int dso__load(struct dso *self, symbol_filter_t filter, int verbose);
 struct dso *dsos__findnew(const char *name);
+int dso__load(struct dso *self, struct map *map, symbol_filter_t filter);
 void dsos__fprintf(FILE *fp);
+size_t dsos__fprintf_buildid(FILE *fp);
 
-size_t dso__fprintf(struct dso *self, FILE *fp);
+size_t dso__fprintf_buildid(struct dso *self, FILE *fp);
+size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp);
 char dso__symtab_origin(const struct dso *self);
+void dso__set_build_id(struct dso *self, void *build_id);
+
+int filename__read_build_id(const char *filename, void *bf, size_t size);
+int sysfs__read_build_id(const char *filename, void *bf, size_t size);
+bool dsos__read_build_ids(void);
+int build_id__sprintf(u8 *self, int len, char *bf);
 
-int load_kernel(void);
+size_t kernel_maps__fprintf(FILE *fp);
 
-void symbol__init(void);
+int symbol__init(struct symbol_conf *conf);
 
-extern struct list_head dsos;
-extern struct dso *kernel_dso;
+struct thread;
+struct thread *kthread;
+extern struct list_head dsos__user, dsos__kernel;
 extern struct dso *vdso;
-extern struct dso *hypervisor_dso;
-extern const char *vmlinux_name;
-extern int   modules;
-#endif /* _PERF_SYMBOL_ */
+#endif /* __PERF_SYMBOL */
index 45efb5db0d19819f281f75b17541dd9ebb76b8ad..603f5610861b841cc1a526dc1f1566a0c943d56f 100644 (file)
@@ -6,16 +6,29 @@
 #include "util.h"
 #include "debug.h"
 
+static struct rb_root threads;
+static struct thread *last_match;
+
+void thread__init(struct thread *self, pid_t pid)
+{
+       int i;
+       self->pid = pid;
+       self->comm = NULL;
+       for (i = 0; i < MAP__NR_TYPES; ++i) {
+               self->maps[i] = RB_ROOT;
+               INIT_LIST_HEAD(&self->removed_maps[i]);
+       }
+}
+
 static struct thread *thread__new(pid_t pid)
 {
-       struct thread *self = calloc(1, sizeof(*self));
+       struct thread *self = zalloc(sizeof(*self));
 
        if (self != NULL) {
-               self->pid = pid;
+               thread__init(self, pid);
                self->comm = malloc(32);
                if (self->comm)
                        snprintf(self->comm, 32, ":%d", self->pid);
-               INIT_LIST_HEAD(&self->maps);
        }
 
        return self;
@@ -29,21 +42,84 @@ int thread__set_comm(struct thread *self, const char *comm)
        return self->comm ? 0 : -ENOMEM;
 }
 
-static size_t thread__fprintf(struct thread *self, FILE *fp)
+int thread__comm_len(struct thread *self)
+{
+       if (!self->comm_len) {
+               if (!self->comm)
+                       return 0;
+               self->comm_len = strlen(self->comm);
+       }
+
+       return self->comm_len;
+}
+
+static const char *map_type__name[MAP__NR_TYPES] = {
+       [MAP__FUNCTION] = "Functions",
+};
+
+static size_t __thread__fprintf_maps(struct thread *self,
+                                    enum map_type type, FILE *fp)
+{
+       size_t printed = fprintf(fp, "%s:\n", map_type__name[type]);
+       struct rb_node *nd;
+
+       for (nd = rb_first(&self->maps[type]); nd; nd = rb_next(nd)) {
+               struct map *pos = rb_entry(nd, struct map, rb_node);
+               printed += fprintf(fp, "Map:");
+               printed += map__fprintf(pos, fp);
+               if (verbose > 1) {
+                       printed += dso__fprintf(pos->dso, type, fp);
+                       printed += fprintf(fp, "--\n");
+               }
+       }
+
+       return printed;
+}
+
+size_t thread__fprintf_maps(struct thread *self, FILE *fp)
+{
+       size_t printed = 0, i;
+       for (i = 0; i < MAP__NR_TYPES; ++i)
+               printed += __thread__fprintf_maps(self, i, fp);
+       return printed;
+}
+
+static size_t __thread__fprintf_removed_maps(struct thread *self,
+                                            enum map_type type, FILE *fp)
 {
        struct map *pos;
-       size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+       size_t printed = 0;
+
+       list_for_each_entry(pos, &self->removed_maps[type], node) {
+               printed += fprintf(fp, "Map:");
+               printed += map__fprintf(pos, fp);
+               if (verbose > 1) {
+                       printed += dso__fprintf(pos->dso, type, fp);
+                       printed += fprintf(fp, "--\n");
+               }
+       }
+       return printed;
+}
 
-       list_for_each_entry(pos, &self->maps, node)
-               ret += map__fprintf(pos, fp);
+static size_t thread__fprintf_removed_maps(struct thread *self, FILE *fp)
+{
+       size_t printed = 0, i;
+       for (i = 0; i < MAP__NR_TYPES; ++i)
+               printed += __thread__fprintf_removed_maps(self, i, fp);
+       return printed;
+}
 
-       return ret;
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+       size_t printed = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+       printed += thread__fprintf_maps(self, fp);
+       printed += fprintf(fp, "Removed maps:\n");
+       return printed + thread__fprintf_removed_maps(self, fp);
 }
 
-struct thread *
-threads__findnew(pid_t pid, struct rb_root *threads, struct thread **last_match)
+struct thread *threads__findnew(pid_t pid)
 {
-       struct rb_node **p = &threads->rb_node;
+       struct rb_node **p = &threads.rb_node;
        struct rb_node *parent = NULL;
        struct thread *th;
 
@@ -52,15 +128,15 @@ threads__findnew(pid_t pid, struct rb_root *threads, struct thread **last_match)
         * so most of the time we dont have to look up
         * the full rbtree:
         */
-       if (*last_match && (*last_match)->pid == pid)
-               return *last_match;
+       if (last_match && last_match->pid == pid)
+               return last_match;
 
        while (*p != NULL) {
                parent = *p;
                th = rb_entry(parent, struct thread, rb_node);
 
                if (th->pid == pid) {
-                       *last_match = th;
+                       last_match = th;
                        return th;
                }
 
@@ -73,17 +149,16 @@ threads__findnew(pid_t pid, struct rb_root *threads, struct thread **last_match)
        th = thread__new(pid);
        if (th != NULL) {
                rb_link_node(&th->rb_node, parent, p);
-               rb_insert_color(&th->rb_node, threads);
-               *last_match = th;
+               rb_insert_color(&th->rb_node, &threads);
+               last_match = th;
        }
 
        return th;
 }
 
-struct thread *
-register_idle_thread(struct rb_root *threads, struct thread **last_match)
+struct thread *register_idle_thread(void)
 {
-       struct thread *thread = threads__findnew(0, threads, last_match);
+       struct thread *thread = threads__findnew(0);
 
        if (!thread || thread__set_comm(thread, "swapper")) {
                fprintf(stderr, "problem inserting idle task.\n");
@@ -93,79 +168,116 @@ register_idle_thread(struct rb_root *threads, struct thread **last_match)
        return thread;
 }
 
-void thread__insert_map(struct thread *self, struct map *map)
+static void thread__remove_overlappings(struct thread *self, struct map *map)
 {
-       struct map *pos, *tmp;
+       struct rb_root *root = &self->maps[map->type];
+       struct rb_node *next = rb_first(root);
 
-       list_for_each_entry_safe(pos, tmp, &self->maps, node) {
-               if (map__overlap(pos, map)) {
-                       if (verbose >= 2) {
-                               printf("overlapping maps:\n");
-                               map__fprintf(map, stdout);
-                               map__fprintf(pos, stdout);
-                       }
+       while (next) {
+               struct map *pos = rb_entry(next, struct map, rb_node);
+               next = rb_next(&pos->rb_node);
 
-                       if (map->start <= pos->start && map->end > pos->start)
-                               pos->start = map->end;
+               if (!map__overlap(pos, map))
+                       continue;
 
-                       if (map->end >= pos->end && map->start < pos->end)
-                               pos->end = map->start;
+               if (verbose >= 2) {
+                       fputs("overlapping maps:\n", stderr);
+                       map__fprintf(map, stderr);
+                       map__fprintf(pos, stderr);
+               }
 
-                       if (verbose >= 2) {
-                               printf("after collision:\n");
-                               map__fprintf(pos, stdout);
-                       }
+               rb_erase(&pos->rb_node, root);
+               /*
+                * We may have references to this map, for instance in some
+                * hist_entry instances, so just move them to a separate
+                * list.
+                */
+               list_add_tail(&pos->node, &self->removed_maps[map->type]);
+       }
+}
 
-                       if (pos->start >= pos->end) {
-                               list_del_init(&pos->node);
-                               free(pos);
-                       }
-               }
+void maps__insert(struct rb_root *maps, struct map *map)
+{
+       struct rb_node **p = &maps->rb_node;
+       struct rb_node *parent = NULL;
+       const u64 ip = map->start;
+       struct map *m;
+
+       while (*p != NULL) {
+               parent = *p;
+               m = rb_entry(parent, struct map, rb_node);
+               if (ip < m->start)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
        }
 
-       list_add_tail(&map->node, &self->maps);
+       rb_link_node(&map->rb_node, parent, p);
+       rb_insert_color(&map->rb_node, maps);
 }
 
-int thread__fork(struct thread *self, struct thread *parent)
+struct map *maps__find(struct rb_root *maps, u64 ip)
 {
-       struct map *map;
+       struct rb_node **p = &maps->rb_node;
+       struct rb_node *parent = NULL;
+       struct map *m;
 
-       if (self->comm)
-               free(self->comm);
-       self->comm = strdup(parent->comm);
-       if (!self->comm)
-               return -ENOMEM;
+       while (*p != NULL) {
+               parent = *p;
+               m = rb_entry(parent, struct map, rb_node);
+               if (ip < m->start)
+                       p = &(*p)->rb_left;
+               else if (ip > m->end)
+                       p = &(*p)->rb_right;
+               else
+                       return m;
+       }
+
+       return NULL;
+}
+
+void thread__insert_map(struct thread *self, struct map *map)
+{
+       thread__remove_overlappings(self, map);
+       maps__insert(&self->maps[map->type], map);
+}
 
-       list_for_each_entry(map, &parent->maps, node) {
+static int thread__clone_maps(struct thread *self, struct thread *parent,
+                             enum map_type type)
+{
+       struct rb_node *nd;
+       for (nd = rb_first(&parent->maps[type]); nd; nd = rb_next(nd)) {
+               struct map *map = rb_entry(nd, struct map, rb_node);
                struct map *new = map__clone(map);
-               if (!new)
+               if (new == NULL)
                        return -ENOMEM;
                thread__insert_map(self, new);
        }
-
        return 0;
 }
 
-struct map *thread__find_map(struct thread *self, u64 ip)
+int thread__fork(struct thread *self, struct thread *parent)
 {
-       struct map *pos;
+       int i;
 
-       if (self == NULL)
-               return NULL;
-
-       list_for_each_entry(pos, &self->maps, node)
-               if (ip >= pos->start && ip <= pos->end)
-                       return pos;
+       if (self->comm)
+               free(self->comm);
+       self->comm = strdup(parent->comm);
+       if (!self->comm)
+               return -ENOMEM;
 
-       return NULL;
+       for (i = 0; i < MAP__NR_TYPES; ++i)
+               if (thread__clone_maps(self, parent, i) < 0)
+                       return -ENOMEM;
+       return 0;
 }
 
-size_t threads__fprintf(FILE *fp, struct rb_root *threads)
+size_t threads__fprintf(FILE *fp)
 {
        size_t ret = 0;
        struct rb_node *nd;
 
-       for (nd = rb_first(threads); nd; nd = rb_next(nd)) {
+       for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
                struct thread *pos = rb_entry(nd, struct thread, rb_node);
 
                ret += thread__fprintf(pos, fp);
@@ -173,3 +285,15 @@ size_t threads__fprintf(FILE *fp, struct rb_root *threads)
 
        return ret;
 }
+
+struct symbol *thread__find_symbol(struct thread *self,
+                                  enum map_type type, u64 addr,
+                                  symbol_filter_t filter)
+{
+       struct map *map = thread__find_map(self, type, addr);
+
+       if (map != NULL)
+               return map__find_symbol(map, map->map_ip(map, addr), filter);
+
+       return NULL;
+}
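Putting the pieces together, resolving a sampled address now goes through the per-type map tree and the map's own ->map_ip hook; a minimal sketch (print_sample_symbol and its pid/ip arguments are illustrative):

#include <stdio.h>
#include "thread.h"

static void print_sample_symbol(pid_t pid, u64 ip)
{
	struct thread *t = threads__findnew(pid);
	struct symbol *sym = thread__find_symbol(t, MAP__FUNCTION, ip, NULL);

	/* thread__find_symbol copes with t == NULL and unknown addresses */
	printf("%#Lx is %s\n", ip, sym ? sym->name : "<unresolved>");
}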
index 32aea3c1c2ad6efd174f6f67843134d033853fba..686d6e914d9e3711bfb1ea8e53e6b275715f2837 100644 (file)
@@ -1,22 +1,56 @@
+#ifndef __PERF_THREAD_H
+#define __PERF_THREAD_H
+
 #include <linux/rbtree.h>
-#include <linux/list.h>
 #include <unistd.h>
 #include "symbol.h"
 
 struct thread {
        struct rb_node          rb_node;
-       struct list_head        maps;
+       struct rb_root          maps[MAP__NR_TYPES];
+       struct list_head        removed_maps[MAP__NR_TYPES];
        pid_t                   pid;
+       bool                    use_modules;
        char                    shortname[3];
        char                    *comm;
+       int                     comm_len;
 };
 
+void thread__init(struct thread *self, pid_t pid);
 int thread__set_comm(struct thread *self, const char *comm);
-struct thread *
-threads__findnew(pid_t pid, struct rb_root *threads, struct thread **last_match);
-struct thread *
-register_idle_thread(struct rb_root *threads, struct thread **last_match);
+int thread__comm_len(struct thread *self);
+struct thread *threads__findnew(pid_t pid);
+struct thread *register_idle_thread(void);
 void thread__insert_map(struct thread *self, struct map *map);
 int thread__fork(struct thread *self, struct thread *parent);
-struct map *thread__find_map(struct thread *self, u64 ip);
-size_t threads__fprintf(FILE *fp, struct rb_root *threads);
+size_t thread__fprintf_maps(struct thread *self, FILE *fp);
+size_t threads__fprintf(FILE *fp);
+
+void maps__insert(struct rb_root *maps, struct map *map);
+struct map *maps__find(struct rb_root *maps, u64 addr);
+
+static inline struct map *thread__find_map(struct thread *self,
+                                          enum map_type type, u64 addr)
+{
+       return self ? maps__find(&self->maps[type], addr) : NULL;
+}
+
+static inline void __thread__insert_map(struct thread *self, struct map *map)
+{
+        maps__insert(&self->maps[map->type], map);
+}
+
+void thread__find_addr_location(struct thread *self, u8 cpumode,
+                               enum map_type type, u64 addr,
+                               struct addr_location *al,
+                               symbol_filter_t filter);
+struct symbol *thread__find_symbol(struct thread *self,
+                                  enum map_type type, u64 addr,
+                                  symbol_filter_t filter);
+
+static inline struct symbol *
+thread__find_function(struct thread *self, u64 addr, symbol_filter_t filter)
+{
+       return thread__find_symbol(self, MAP__FUNCTION, addr, filter);
+}
+#endif /* __PERF_THREAD_H */
index af4b0573b37fb3dfaeed2f57c9028bdbc62a1809..cace35595530a1dedeb283b0adb651a3af32cd4c 100644 (file)
 #include <ctype.h>
 #include <errno.h>
 #include <stdbool.h>
+#include <linux/kernel.h>
 
 #include "../perf.h"
 #include "trace-event.h"
 
-
 #define VERSION "0.5"
 
 #define _STR(x) #x
@@ -483,27 +483,33 @@ static struct tracepoint_path *
 get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
 {
        struct tracepoint_path path, *ppath = &path;
-       int i;
+       int i, nr_tracepoints = 0;
 
        for (i = 0; i < nb_events; i++) {
                if (pattrs[i].type != PERF_TYPE_TRACEPOINT)
                        continue;
+               ++nr_tracepoints;
                ppath->next = tracepoint_id_to_path(pattrs[i].config);
                if (!ppath->next)
                        die("%s\n", "No memory to alloc tracepoints list");
                ppath = ppath->next;
        }
 
-       return path.next;
+       return nr_tracepoints > 0 ? path.next : NULL;
 }
-void read_tracing_data(struct perf_event_attr *pattrs, int nb_events)
+
+int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
 {
        char buf[BUFSIZ];
-       struct tracepoint_path *tps;
+       struct tracepoint_path *tps = get_tracepoints_path(pattrs, nb_events);
+
+       /*
+        * What? No tracepoints? No sense writing anything here, bail out.
+        */
+       if (tps == NULL)
+               return -1;
 
-       output_fd = open(output_file, O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE, 0644);
-       if (output_fd < 0)
-               die("creating file '%s'", output_file);
+       output_fd = fd;
 
        buf[0] = 23;
        buf[1] = 8;
@@ -530,11 +536,11 @@ void read_tracing_data(struct perf_event_attr *pattrs, int nb_events)
        page_size = getpagesize();
        write_or_die(&page_size, 4);
 
-       tps = get_tracepoints_path(pattrs, nb_events);
-
        read_header_files();
        read_ftrace_files(tps);
        read_event_files(tps);
        read_proc_kallsyms();
        read_ftrace_printk();
+
+       return 0;
 }
index 55c9659a56e2271b3dcad7056bbde78ee240df3d..0302405aa2ca7b0f6e7b77cde32c4b49462ff8c9 100644 (file)
@@ -40,12 +40,19 @@ int header_page_size_size;
 int header_page_data_offset;
 int header_page_data_size;
 
+int latency_format;
+
 static char *input_buf;
 static unsigned long long input_buf_ptr;
 static unsigned long long input_buf_siz;
 
 static int cpus;
 static int long_size;
+static int is_flag_field;
+static int is_symbolic_field;
+
+static struct format_field *
+find_any_field(struct event *event, const char *name);
 
 static void init_input_buf(char *buf, unsigned long long size)
 {
@@ -284,18 +291,19 @@ void parse_ftrace_printk(char *file, unsigned int size __unused)
        char *line;
        char *next = NULL;
        char *addr_str;
-       int ret;
        int i;
 
        line = strtok_r(file, "\n", &next);
        while (line) {
+               addr_str = strsep(&line, ":");
+               if (!line) {
+                       warning("error parsing print strings");
+                       break;
+               }
                item = malloc_or_die(sizeof(*item));
-               ret = sscanf(line, "%as : %as",
-                            (float *)(void *)&addr_str, /* workaround gcc warning */
-                            (float *)(void *)&item->printk);
                item->addr = strtoull(addr_str, NULL, 16);
-               free(addr_str);
-
+               /* fmt still has a space, skip it */
+               item->printk = strdup(line+1);
                item->next = list;
                list = item;
                line = strtok_r(NULL, "\n", &next);
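A line in the ftrace printk_formats file looks like the sample below (address and format made up); strsep() cuts it at the colon and the line + 1 above skips the single space left in front of the quote:

/*
 *   0xffffffff8150a2f0 : "reading %s returned %d\n"
 *
 * addr_str ends up holding the hex address (strtoull() accepts the 0x
 * prefix with base 16) and item->printk keeps the quoted format string.
 */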
@@ -522,7 +530,10 @@ static enum event_type __read_token(char **tok)
                        last_ch = ch;
                        ch = __read_char();
                        buf[i++] = ch;
-               } while (ch != quote_ch && last_ch != '\\');
+                       /* the '\' '\' will cancel itself */
+                       if (ch == '\\' && last_ch == '\\')
+                               last_ch = 0;
+               } while (ch != quote_ch || last_ch == '\\');
                /* remove the last quote */
                i--;
                goto out;
@@ -610,7 +621,7 @@ static enum event_type read_token_item(char **tok)
 static int test_type(enum event_type type, enum event_type expect)
 {
        if (type != expect) {
-               die("Error: expected type %d but read %d",
+               warning("Error: expected type %d but read %d",
                    expect, type);
                return -1;
        }
@@ -621,13 +632,13 @@ static int test_type_token(enum event_type type, char *token,
                    enum event_type expect, const char *expect_tok)
 {
        if (type != expect) {
-               die("Error: expected type %d but read %d",
+               warning("Error: expected type %d but read %d",
                    expect, type);
                return -1;
        }
 
        if (strcmp(token, expect_tok) != 0) {
-               die("Error: expected '%s' but read '%s'",
+               warning("Error: expected '%s' but read '%s'",
                    expect_tok, token);
                return -1;
        }
@@ -665,7 +676,7 @@ static int __read_expected(enum event_type expect, const char *str, int newline_
 
        free_token(token);
 
-       return 0;
+       return ret;
 }
 
 static int read_expected(enum event_type expect, const char *str)
@@ -682,10 +693,10 @@ static char *event_read_name(void)
 {
        char *token;
 
-       if (read_expected(EVENT_ITEM, (char *)"name") < 0)
+       if (read_expected(EVENT_ITEM, "name") < 0)
                return NULL;
 
-       if (read_expected(EVENT_OP, (char *)":") < 0)
+       if (read_expected(EVENT_OP, ":") < 0)
                return NULL;
 
        if (read_expect_type(EVENT_ITEM, &token) < 0)
@@ -703,10 +714,10 @@ static int event_read_id(void)
        char *token;
        int id;
 
-       if (read_expected_item(EVENT_ITEM, (char *)"ID") < 0)
+       if (read_expected_item(EVENT_ITEM, "ID") < 0)
                return -1;
 
-       if (read_expected(EVENT_OP, (char *)":") < 0)
+       if (read_expected(EVENT_OP, ":") < 0)
                return -1;
 
        if (read_expect_type(EVENT_ITEM, &token) < 0)
@@ -721,6 +732,24 @@ static int event_read_id(void)
        return -1;
 }
 
+static int field_is_string(struct format_field *field)
+{
+       if ((field->flags & FIELD_IS_ARRAY) &&
+           (!strstr(field->type, "char") || !strstr(field->type, "u8") ||
+            !strstr(field->type, "s8")))
+               return 1;
+
+       return 0;
+}
+
+static int field_is_dynamic(struct format_field *field)
+{
+       if (!strcmp(field->type, "__data_loc"))
+               return 1;
+
+       return 0;
+}
+
 static int event_read_fields(struct event *event, struct format_field **fields)
 {
        struct format_field *field = NULL;
@@ -738,7 +767,7 @@ static int event_read_fields(struct event *event, struct format_field **fields)
 
                count++;
 
-               if (test_type_token(type, token, EVENT_ITEM, (char *)"field"))
+               if (test_type_token(type, token, EVENT_ITEM, "field"))
                        goto fail;
                free_token(token);
 
@@ -753,7 +782,7 @@ static int event_read_fields(struct event *event, struct format_field **fields)
                        type = read_token(&token);
                }
 
-               if (test_type_token(type, token, EVENT_OP, (char *)":") < 0)
+               if (test_type_token(type, token, EVENT_OP, ":") < 0)
                        return -1;
 
                if (read_expect_type(EVENT_ITEM, &token) < 0)
@@ -865,14 +894,20 @@ static int event_read_fields(struct event *event, struct format_field **fields)
                        free(brackets);
                }
 
-               if (test_type_token(type, token,  EVENT_OP, (char *)";"))
+               if (field_is_string(field)) {
+                       field->flags |= FIELD_IS_STRING;
+                       if (field_is_dynamic(field))
+                               field->flags |= FIELD_IS_DYNAMIC;
+               }
+
+               if (test_type_token(type, token,  EVENT_OP, ";"))
                        goto fail;
                free_token(token);
 
-               if (read_expected(EVENT_ITEM, (char *)"offset") < 0)
+               if (read_expected(EVENT_ITEM, "offset") < 0)
                        goto fail_expect;
 
-               if (read_expected(EVENT_OP, (char *)":") < 0)
+               if (read_expected(EVENT_OP, ":") < 0)
                        goto fail_expect;
 
                if (read_expect_type(EVENT_ITEM, &token))
@@ -880,13 +915,13 @@ static int event_read_fields(struct event *event, struct format_field **fields)
                field->offset = strtoul(token, NULL, 0);
                free_token(token);
 
-               if (read_expected(EVENT_OP, (char *)";") < 0)
+               if (read_expected(EVENT_OP, ";") < 0)
                        goto fail_expect;
 
-               if (read_expected(EVENT_ITEM, (char *)"size") < 0)
+               if (read_expected(EVENT_ITEM, "size") < 0)
                        goto fail_expect;
 
-               if (read_expected(EVENT_OP, (char *)":") < 0)
+               if (read_expected(EVENT_OP, ":") < 0)
                        goto fail_expect;
 
                if (read_expect_type(EVENT_ITEM, &token))
@@ -894,11 +929,34 @@ static int event_read_fields(struct event *event, struct format_field **fields)
                field->size = strtoul(token, NULL, 0);
                free_token(token);
 
-               if (read_expected(EVENT_OP, (char *)";") < 0)
+               if (read_expected(EVENT_OP, ";") < 0)
                        goto fail_expect;
 
-               if (read_expect_type(EVENT_NEWLINE, &token) < 0)
-                       goto fail;
+               type = read_token(&token);
+               if (type != EVENT_NEWLINE) {
+                       /* newer versions of the kernel have a "signed" type */
+                       if (test_type_token(type, token, EVENT_ITEM, "signed"))
+                               goto fail;
+
+                       free_token(token);
+
+                       if (read_expected(EVENT_OP, ":") < 0)
+                               goto fail_expect;
+
+                       if (read_expect_type(EVENT_ITEM, &token))
+                               goto fail;
+
+                       if (strtoul(token, NULL, 0))
+                               field->flags |= FIELD_IS_SIGNED;
+
+                       free_token(token);
+                       if (read_expected(EVENT_OP, ";") < 0)
+                               goto fail_expect;
+
+                       if (read_expect_type(EVENT_NEWLINE, &token))
+                               goto fail;
+               }
+
                free_token(token);
 
                *fields = field;
@@ -921,10 +979,10 @@ static int event_read_format(struct event *event)
        char *token;
        int ret;
 
-       if (read_expected_item(EVENT_ITEM, (char *)"format") < 0)
+       if (read_expected_item(EVENT_ITEM, "format") < 0)
                return -1;
 
-       if (read_expected(EVENT_OP, (char *)":") < 0)
+       if (read_expected(EVENT_OP, ":") < 0)
                return -1;
 
        if (read_expect_type(EVENT_NEWLINE, &token))
@@ -984,7 +1042,7 @@ process_cond(struct event *event, struct print_arg *top, char **tok)
 
        *tok = NULL;
        type = process_arg(event, left, &token);
-       if (test_type_token(type, token, EVENT_OP, (char *)":"))
+       if (test_type_token(type, token, EVENT_OP, ":"))
                goto out_free;
 
        arg->op.op = token;
@@ -1004,6 +1062,35 @@ out_free:
        return EVENT_ERROR;
 }
 
+static enum event_type
+process_array(struct event *event, struct print_arg *top, char **tok)
+{
+       struct print_arg *arg;
+       enum event_type type;
+       char *token = NULL;
+
+       arg = malloc_or_die(sizeof(*arg));
+       memset(arg, 0, sizeof(*arg));
+
+       *tok = NULL;
+       type = process_arg(event, arg, &token);
+       if (test_type_token(type, token, EVENT_OP, "]"))
+               goto out_free;
+
+       top->op.right = arg;
+
+       free_token(token);
+       type = read_token_item(&token);
+       *tok = token;
+
+       return type;
+
+out_free:
+       free_token(*tok);
+       free_arg(arg);
+       return EVENT_ERROR;
+}
+
 static int get_op_prio(char *op)
 {
        if (!op[1]) {
@@ -1128,6 +1215,8 @@ process_op(struct event *event, struct print_arg *arg, char **tok)
                   strcmp(token, "*") == 0 ||
                   strcmp(token, "^") == 0 ||
                   strcmp(token, "/") == 0 ||
+                  strcmp(token, "<") == 0 ||
+                  strcmp(token, ">") == 0 ||
                   strcmp(token, "==") == 0 ||
                   strcmp(token, "!=") == 0) {
 
@@ -1144,17 +1233,46 @@ process_op(struct event *event, struct print_arg *arg, char **tok)
 
                right = malloc_or_die(sizeof(*right));
 
-               type = process_arg(event, right, tok);
+               type = read_token_item(&token);
+               *tok = token;
+
+               /* could just be a type pointer */
+               if ((strcmp(arg->op.op, "*") == 0) &&
+                   type == EVENT_DELIM && (strcmp(token, ")") == 0)) {
+                       if (left->type != PRINT_ATOM)
+                               die("bad pointer type");
+                       left->atom.atom = realloc(left->atom.atom,
+                                           sizeof(left->atom.atom) + 3);
+                       strcat(left->atom.atom, " *");
+                       *arg = *left;
+                       free(arg);
+
+                       return type;
+               }
+
+               type = process_arg_token(event, right, tok, type);
 
                arg->op.right = right;
 
+       } else if (strcmp(token, "[") == 0) {
+
+               left = malloc_or_die(sizeof(*left));
+               *left = *arg;
+
+               arg->type = PRINT_OP;
+               arg->op.op = token;
+               arg->op.left = left;
+
+               arg->op.prio = 0;
+               type = process_array(event, arg, tok);
+
        } else {
-               die("unknown op '%s'", token);
+               warning("unknown op '%s'", token);
+               event->flags |= EVENT_FL_FAILED;
                /* the arg is now the left side */
                return EVENT_NONE;
        }
 
-
        if (type == EVENT_OP) {
                int prio;
 
@@ -1178,7 +1296,7 @@ process_entry(struct event *event __unused, struct print_arg *arg,
        char *field;
        char *token;
 
-       if (read_expected(EVENT_OP, (char *)"->") < 0)
+       if (read_expected(EVENT_OP, "->") < 0)
                return EVENT_ERROR;
 
        if (read_expect_type(EVENT_ITEM, &token) < 0)
@@ -1188,6 +1306,16 @@ process_entry(struct event *event __unused, struct print_arg *arg,
        arg->type = PRINT_FIELD;
        arg->field.name = field;
 
+       if (is_flag_field) {
+               arg->field.field = find_any_field(event, arg->field.name);
+               arg->field.field->flags |= FIELD_IS_FLAG;
+               is_flag_field = 0;
+       } else if (is_symbolic_field) {
+               arg->field.field = find_any_field(event, arg->field.name);
+               arg->field.field->flags |= FIELD_IS_SYMBOLIC;
+               is_symbolic_field = 0;
+       }
+
        type = read_token(&token);
        *tok = token;
 
@@ -1338,14 +1466,14 @@ process_fields(struct event *event, struct print_flag_sym **list, char **tok)
        do {
                free_token(token);
                type = read_token_item(&token);
-               if (test_type_token(type, token, EVENT_OP, (char *)"{"))
+               if (test_type_token(type, token, EVENT_OP, "{"))
                        break;
 
                arg = malloc_or_die(sizeof(*arg));
 
                free_token(token);
                type = process_arg(event, arg, &token);
-               if (test_type_token(type, token, EVENT_DELIM, (char *)","))
+               if (test_type_token(type, token, EVENT_DELIM, ","))
                        goto out_free;
 
                field = malloc_or_die(sizeof(*field));
@@ -1356,7 +1484,7 @@ process_fields(struct event *event, struct print_flag_sym **list, char **tok)
 
                free_token(token);
                type = process_arg(event, arg, &token);
-               if (test_type_token(type, token, EVENT_OP, (char *)"}"))
+               if (test_type_token(type, token, EVENT_OP, "}"))
                        goto out_free;
 
                value = arg_eval(arg);
@@ -1391,13 +1519,13 @@ process_flags(struct event *event, struct print_arg *arg, char **tok)
        memset(arg, 0, sizeof(*arg));
        arg->type = PRINT_FLAGS;
 
-       if (read_expected_item(EVENT_DELIM, (char *)"(") < 0)
+       if (read_expected_item(EVENT_DELIM, "(") < 0)
                return EVENT_ERROR;
 
        field = malloc_or_die(sizeof(*field));
 
        type = process_arg(event, field, &token);
-       if (test_type_token(type, token, EVENT_DELIM, (char *)","))
+       if (test_type_token(type, token, EVENT_DELIM, ","))
                goto out_free;
 
        arg->flags.field = field;
@@ -1408,11 +1536,11 @@ process_flags(struct event *event, struct print_arg *arg, char **tok)
                type = read_token_item(&token);
        }
 
-       if (test_type_token(type, token, EVENT_DELIM, (char *)","))
+       if (test_type_token(type, token, EVENT_DELIM, ","))
                goto out_free;
 
        type = process_fields(event, &arg->flags.flags, &token);
-       if (test_type_token(type, token, EVENT_DELIM, (char *)")"))
+       if (test_type_token(type, token, EVENT_DELIM, ")"))
                goto out_free;
 
        free_token(token);
@@ -1434,19 +1562,19 @@ process_symbols(struct event *event, struct print_arg *arg, char **tok)
        memset(arg, 0, sizeof(*arg));
        arg->type = PRINT_SYMBOL;
 
-       if (read_expected_item(EVENT_DELIM, (char *)"(") < 0)
+       if (read_expected_item(EVENT_DELIM, "(") < 0)
                return EVENT_ERROR;
 
        field = malloc_or_die(sizeof(*field));
 
        type = process_arg(event, field, &token);
-       if (test_type_token(type, token, EVENT_DELIM, (char *)","))
+       if (test_type_token(type, token, EVENT_DELIM, ","))
                goto out_free;
 
        arg->symbol.field = field;
 
        type = process_fields(event, &arg->symbol.symbols, &token);
-       if (test_type_token(type, token, EVENT_DELIM, (char *)")"))
+       if (test_type_token(type, token, EVENT_DELIM, ")"))
                goto out_free;
 
        free_token(token);
@@ -1463,7 +1591,6 @@ process_paren(struct event *event, struct print_arg *arg, char **tok)
 {
        struct print_arg *item_arg;
        enum event_type type;
-       int ptr_cast = 0;
        char *token;
 
        type = process_arg(event, arg, &token);
@@ -1471,28 +1598,13 @@ process_paren(struct event *event, struct print_arg *arg, char **tok)
        if (type == EVENT_ERROR)
                return EVENT_ERROR;
 
-       if (type == EVENT_OP) {
-               /* handle the ptr casts */
-               if (!strcmp(token, "*")) {
-                       /*
-                        * FIXME: should we zapp whitespaces before ')' ?
-                        * (may require a peek_token_item())
-                        */
-                       if (__peek_char() == ')') {
-                               ptr_cast = 1;
-                               free_token(token);
-                               type = read_token_item(&token);
-                       }
-               }
-               if (!ptr_cast) {
-                       type = process_op(event, arg, &token);
+       if (type == EVENT_OP)
+               type = process_op(event, arg, &token);
 
-                       if (type == EVENT_ERROR)
-                               return EVENT_ERROR;
-               }
-       }
+       if (type == EVENT_ERROR)
+               return EVENT_ERROR;
 
-       if (test_type_token(type, token, EVENT_DELIM, (char *)")")) {
+       if (test_type_token(type, token, EVENT_DELIM, ")")) {
                free_token(token);
                return EVENT_ERROR;
        }
@@ -1516,13 +1628,6 @@ process_paren(struct event *event, struct print_arg *arg, char **tok)
                item_arg = malloc_or_die(sizeof(*item_arg));
 
                arg->type = PRINT_TYPE;
-               if (ptr_cast) {
-                       char *old = arg->atom.atom;
-
-                       arg->atom.atom = malloc_or_die(strlen(old + 3));
-                       sprintf(arg->atom.atom, "%s *", old);
-                       free(old);
-               }
                arg->typecast.type = arg->atom.atom;
                arg->typecast.item = item_arg;
                type = process_arg_token(event, item_arg, &token, type);
@@ -1540,7 +1645,7 @@ process_str(struct event *event __unused, struct print_arg *arg, char **tok)
        enum event_type type;
        char *token;
 
-       if (read_expected(EVENT_DELIM, (char *)"(") < 0)
+       if (read_expected(EVENT_DELIM, "(") < 0)
                return EVENT_ERROR;
 
        if (read_expect_type(EVENT_ITEM, &token) < 0)
@@ -1550,7 +1655,7 @@ process_str(struct event *event __unused, struct print_arg *arg, char **tok)
        arg->string.string = token;
        arg->string.offset = -1;
 
-       if (read_expected(EVENT_DELIM, (char *)")") < 0)
+       if (read_expected(EVENT_DELIM, ")") < 0)
                return EVENT_ERROR;
 
        type = read_token(&token);
@@ -1578,9 +1683,11 @@ process_arg_token(struct event *event, struct print_arg *arg,
                        type = process_entry(event, arg, &token);
                } else if (strcmp(token, "__print_flags") == 0) {
                        free_token(token);
+                       is_flag_field = 1;
                        type = process_flags(event, arg, &token);
                } else if (strcmp(token, "__print_symbolic") == 0) {
                        free_token(token);
+                       is_symbolic_field = 1;
                        type = process_symbols(event, arg, &token);
                } else if (strcmp(token, "__get_str") == 0) {
                        free_token(token);
@@ -1637,12 +1744,18 @@ process_arg_token(struct event *event, struct print_arg *arg,
 
 static int event_read_print_args(struct event *event, struct print_arg **list)
 {
-       enum event_type type;
+       enum event_type type = EVENT_ERROR;
        struct print_arg *arg;
        char *token;
        int args = 0;
 
        do {
+               if (type == EVENT_NEWLINE) {
+                       free_token(token);
+                       type = read_token_item(&token);
+                       continue;
+               }
+
                arg = malloc_or_die(sizeof(*arg));
                memset(arg, 0, sizeof(*arg));
 
@@ -1683,18 +1796,19 @@ static int event_read_print(struct event *event)
        char *token;
        int ret;
 
-       if (read_expected_item(EVENT_ITEM, (char *)"print") < 0)
+       if (read_expected_item(EVENT_ITEM, "print") < 0)
                return -1;
 
-       if (read_expected(EVENT_ITEM, (char *)"fmt") < 0)
+       if (read_expected(EVENT_ITEM, "fmt") < 0)
                return -1;
 
-       if (read_expected(EVENT_OP, (char *)":") < 0)
+       if (read_expected(EVENT_OP, ":") < 0)
                return -1;
 
        if (read_expect_type(EVENT_DQUOTE, &token) < 0)
                goto fail;
 
+ concat:
        event->print_fmt.format = token;
        event->print_fmt.args = NULL;
 
@@ -1704,7 +1818,22 @@ static int event_read_print(struct event *event)
        if (type == EVENT_NONE)
                return 0;
 
-       if (test_type_token(type, token, EVENT_DELIM, (char *)","))
+       /* Handle concatenation of print lines */
+       if (type == EVENT_DQUOTE) {
+               char *cat;
+
+               cat = malloc_or_die(strlen(event->print_fmt.format) +
+                                   strlen(token) + 1);
+               strcpy(cat, event->print_fmt.format);
+               strcat(cat, token);
+               free_token(token);
+               free_token(event->print_fmt.format);
+               event->print_fmt.format = NULL;
+               token = cat;
+               goto concat;
+       }
+
+       if (test_type_token(type, token, EVENT_DELIM, ","))
                goto fail;
 
        free_token(token);
@@ -1713,7 +1842,7 @@ static int event_read_print(struct event *event)
        if (ret < 0)
                return -1;
 
-       return 0;
+       return ret;
 
  fail:
        free_token(token);
@@ -1759,7 +1888,7 @@ find_any_field(struct event *event, const char *name)
        return find_field(event, name);
 }
 
-static unsigned long long read_size(void *ptr, int size)
+unsigned long long read_size(void *ptr, int size)
 {
        switch (size) {
        case 1:
@@ -1822,37 +1951,67 @@ static int get_common_info(const char *type, int *offset, int *size)
        return 0;
 }
 
-int trace_parse_common_type(void *data)
+static int __parse_common(void *data, int *size, int *offset,
+                         const char *name)
 {
-       static int type_offset;
-       static int type_size;
        int ret;
 
-       if (!type_size) {
-               ret = get_common_info("common_type",
-                                     &type_offset,
-                                     &type_size);
+       if (!*size) {
+               ret = get_common_info(name, offset, size);
                if (ret < 0)
                        return ret;
        }
-       return read_size(data + type_offset, type_size);
+       return read_size(data + *offset, *size);
+}
+
+int trace_parse_common_type(void *data)
+{
+       static int type_offset;
+       static int type_size;
+
+       return __parse_common(data, &type_size, &type_offset,
+                             "common_type");
 }
 
-static int parse_common_pid(void *data)
+int trace_parse_common_pid(void *data)
 {
        static int pid_offset;
        static int pid_size;
+
+       return __parse_common(data, &pid_size, &pid_offset,
+                             "common_pid");
+}
+
+int parse_common_pc(void *data)
+{
+       static int pc_offset;
+       static int pc_size;
+
+       return __parse_common(data, &pc_size, &pc_offset,
+                             "common_preempt_count");
+}
+
+int parse_common_flags(void *data)
+{
+       static int flags_offset;
+       static int flags_size;
+
+       return __parse_common(data, &flags_size, &flags_offset,
+                             "common_flags");
+}
+
+int parse_common_lock_depth(void *data)
+{
+       static int ld_offset;
+       static int ld_size;
        int ret;
 
-       if (!pid_size) {
-               ret = get_common_info("common_pid",
-                                     &pid_offset,
-                                     &pid_size);
-               if (ret < 0)
-                       return ret;
-       }
+       ret = __parse_common(data, &ld_size, &ld_offset,
+                            "common_lock_depth");
+       if (ret < 0)
+               return -1;
 
-       return read_size(data + pid_offset, pid_size);
+       return ret;
 }
 
 struct event *trace_find_event(int id)
@@ -1866,11 +2025,20 @@ struct event *trace_find_event(int id)
        return event;
 }
 
+struct event *trace_find_next_event(struct event *event)
+{
+       if (!event)
+               return event_list;
+
+       return event->next;
+}
+
 static unsigned long long eval_num_arg(void *data, int size,
                                   struct event *event, struct print_arg *arg)
 {
        unsigned long long val = 0;
        unsigned long long left, right;
+       struct print_arg *larg;
 
        switch (arg->type) {
        case PRINT_NULL:
@@ -1897,6 +2065,26 @@ static unsigned long long eval_num_arg(void *data, int size,
                return 0;
                break;
        case PRINT_OP:
+               if (strcmp(arg->op.op, "[") == 0) {
+                       /*
+                        * Arrays are special, since we don't want
+                        * to read the arg as is.
+                        */
+                       if (arg->op.left->type != PRINT_FIELD)
+                               goto default_op; /* oops, all bets off */
+                       larg = arg->op.left;
+                       if (!larg->field.field) {
+                               larg->field.field =
+                                       find_any_field(event, larg->field.name);
+                               if (!larg->field.field)
+                                       die("field %s not found", larg->field.name);
+                       }
+                       right = eval_num_arg(data, size, event, arg->op.right);
+                       val = read_size(data + larg->field.field->offset +
+                                       right * long_size, long_size);
+                       break;
+               }
+ default_op:
                left = eval_num_arg(data, size, event, arg->op.left);
                right = eval_num_arg(data, size, event, arg->op.right);
                switch (arg->op.op[0]) {
@@ -1947,6 +2135,12 @@ static unsigned long long eval_num_arg(void *data, int size,
                                die("unknown op '%s'", arg->op.op);
                        val = left == right;
                        break;
+               case '-':
+                       val = left - right;
+                       break;
+               case '+':
+                       val = left + right;
+                       break;
                default:
                        die("unknown op '%s'", arg->op.op);
                }
@@ -1978,7 +2172,7 @@ static const struct flag flags[] = {
        { "HRTIMER_RESTART", 1 },
 };
 
-static unsigned long long eval_flag(const char *flag)
+unsigned long long eval_flag(const char *flag)
 {
        int i;
 
@@ -2145,8 +2339,9 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc
                        case 'u':
                        case 'x':
                        case 'i':
-                               bptr = (void *)(((unsigned long)bptr + (long_size - 1)) &
-                                               ~(long_size - 1));
+                               /* the pointers are always 4 bytes aligned */
+                               bptr = (void *)(((unsigned long)bptr + 3) &
+                                               ~3);
                                switch (ls) {
                                case 0:
                                case 1:
@@ -2270,7 +2465,27 @@ static void pretty_print(void *data, int size, struct event *event)
 
        for (; *ptr; ptr++) {
                ls = 0;
-               if (*ptr == '%') {
+               if (*ptr == '\\') {
+                       ptr++;
+                       switch (*ptr) {
+                       case 'n':
+                               printf("\n");
+                               break;
+                       case 't':
+                               printf("\t");
+                               break;
+                       case 'r':
+                               printf("\r");
+                               break;
+                       case '\\':
+                               printf("\\");
+                               break;
+                       default:
+                               printf("%c", *ptr);
+                               break;
+                       }
+
+               } else if (*ptr == '%') {
                        saveptr = ptr;
                        show_func = 0;
  cont_process:
@@ -2377,6 +2592,41 @@ static inline int log10_cpu(int nb)
        return 1;
 }
 
+static void print_lat_fmt(void *data, int size __unused)
+{
+       unsigned int lat_flags;
+       unsigned int pc;
+       int lock_depth;
+       int hardirq;
+       int softirq;
+
+       lat_flags = parse_common_flags(data);
+       pc = parse_common_pc(data);
+       lock_depth = parse_common_lock_depth(data);
+
+       hardirq = lat_flags & TRACE_FLAG_HARDIRQ;
+       softirq = lat_flags & TRACE_FLAG_SOFTIRQ;
+
+       printf("%c%c%c",
+              (lat_flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+              (lat_flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
+              'X' : '.',
+              (lat_flags & TRACE_FLAG_NEED_RESCHED) ?
+              'N' : '.',
+              (hardirq && softirq) ? 'H' :
+              hardirq ? 'h' : softirq ? 's' : '.');
+
+       if (pc)
+               printf("%x", pc);
+       else
+               printf(".");
+
+       if (lock_depth < 0)
+               printf(".");
+       else
+               printf("%d", lock_depth);
+}
+
 /* taken from Linux, written by Frederic Weisbecker */
 static void print_graph_cpu(int cpu)
 {
@@ -2452,7 +2702,7 @@ get_return_for_leaf(int cpu, int cur_pid, unsigned long long cur_func,
        if (!(event->flags & EVENT_FL_ISFUNCRET))
                return NULL;
 
-       pid = parse_common_pid(next->data);
+       pid = trace_parse_common_pid(next->data);
        field = find_field(event, "func");
        if (!field)
                die("function return does not have field func");
@@ -2620,6 +2870,11 @@ pretty_print_func_ent(void *data, int size, struct event *event,
 
        printf(" | ");
 
+       if (latency_format) {
+               print_lat_fmt(data, size);
+               printf(" | ");
+       }
+
        field = find_field(event, "func");
        if (!field)
                die("function entry does not have func field");
@@ -2663,6 +2918,11 @@ pretty_print_func_ret(void *data, int size __unused, struct event *event,
 
        printf(" | ");
 
+       if (latency_format) {
+               print_lat_fmt(data, size);
+               printf(" | ");
+       }
+
        field = find_field(event, "rettime");
        if (!field)
                die("can't find rettime in return graph");
@@ -2724,19 +2984,30 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs,
 
        event = trace_find_event(type);
        if (!event) {
-               printf("ug! no event found for type %d\n", type);
+               warning("ug! no event found for type %d", type);
                return;
        }
 
-       pid = parse_common_pid(data);
+       pid = trace_parse_common_pid(data);
 
        if (event->flags & (EVENT_FL_ISFUNCENT | EVENT_FL_ISFUNCRET))
                return pretty_print_func_graph(data, size, event, cpu,
                                               pid, comm, secs, usecs);
 
-       printf("%16s-%-5d [%03d] %5lu.%09Lu: %s: ",
-              comm, pid,  cpu,
-              secs, nsecs, event->name);
+       if (latency_format) {
+               printf("%8.8s-%-5d %3d",
+                      comm, pid, cpu);
+               print_lat_fmt(data, size);
+       } else
+               printf("%16s-%-5d [%03d]", comm, pid,  cpu);
+
+       printf(" %5lu.%06lu: %s: ", secs, usecs, event->name);
+
+       if (event->flags & EVENT_FL_FAILED) {
+               printf("EVENT '%s' FAILED TO PARSE\n",
+                      event->name);
+               return;
+       }
 
        pretty_print(data, size, event);
        printf("\n");
@@ -2807,46 +3078,71 @@ static void print_args(struct print_arg *args)
        }
 }
 
-static void parse_header_field(char *type,
+static void parse_header_field(const char *field,
                               int *offset, int *size)
 {
        char *token;
+       int type;
 
-       if (read_expected(EVENT_ITEM, (char *)"field") < 0)
+       if (read_expected(EVENT_ITEM, "field") < 0)
                return;
-       if (read_expected(EVENT_OP, (char *)":") < 0)
+       if (read_expected(EVENT_OP, ":") < 0)
                return;
+
        /* type */
        if (read_expect_type(EVENT_ITEM, &token) < 0)
-               return;
+               goto fail;
        free_token(token);
 
-       if (read_expected(EVENT_ITEM, type) < 0)
+       if (read_expected(EVENT_ITEM, field) < 0)
                return;
-       if (read_expected(EVENT_OP, (char *)";") < 0)
+       if (read_expected(EVENT_OP, ";") < 0)
                return;
-       if (read_expected(EVENT_ITEM, (char *)"offset") < 0)
+       if (read_expected(EVENT_ITEM, "offset") < 0)
                return;
-       if (read_expected(EVENT_OP, (char *)":") < 0)
+       if (read_expected(EVENT_OP, ":") < 0)
                return;
        if (read_expect_type(EVENT_ITEM, &token) < 0)
-               return;
+               goto fail;
        *offset = atoi(token);
        free_token(token);
-       if (read_expected(EVENT_OP, (char *)";") < 0)
+       if (read_expected(EVENT_OP, ";") < 0)
                return;
-       if (read_expected(EVENT_ITEM, (char *)"size") < 0)
+       if (read_expected(EVENT_ITEM, "size") < 0)
                return;
-       if (read_expected(EVENT_OP, (char *)":") < 0)
+       if (read_expected(EVENT_OP, ":") < 0)
                return;
        if (read_expect_type(EVENT_ITEM, &token) < 0)
-               return;
+               goto fail;
        *size = atoi(token);
        free_token(token);
-       if (read_expected(EVENT_OP, (char *)";") < 0)
-               return;
-       if (read_expect_type(EVENT_NEWLINE, &token) < 0)
+       if (read_expected(EVENT_OP, ";") < 0)
                return;
+       type = read_token(&token);
+       if (type != EVENT_NEWLINE) {
+               /* newer versions of the kernel have a "signed" type */
+               if (type != EVENT_ITEM)
+                       goto fail;
+
+               if (strcmp(token, "signed") != 0)
+                       goto fail;
+
+               free_token(token);
+
+               if (read_expected(EVENT_OP, ":") < 0)
+                       return;
+
+               if (read_expect_type(EVENT_ITEM, &token))
+                       goto fail;
+
+               free_token(token);
+               if (read_expected(EVENT_OP, ";") < 0)
+                       return;
+
+               if (read_expect_type(EVENT_NEWLINE, &token))
+                       goto fail;
+       }
+ fail:
        free_token(token);
 }
 
@@ -2854,11 +3150,11 @@ int parse_header_page(char *buf, unsigned long size)
 {
        init_input_buf(buf, size);
 
-       parse_header_field((char *)"timestamp", &header_page_ts_offset,
+       parse_header_field("timestamp", &header_page_ts_offset,
                           &header_page_ts_size);
-       parse_header_field((char *)"commit", &header_page_size_offset,
+       parse_header_field("commit", &header_page_size_offset,
                           &header_page_size_size);
-       parse_header_field((char *)"data", &header_page_data_offset,
+       parse_header_field("data", &header_page_data_offset,
                           &header_page_data_size);
 
        return 0;
@@ -2909,6 +3205,9 @@ int parse_ftrace_file(char *buf, unsigned long size)
        if (ret < 0)
                die("failed to read ftrace event print fmt");
 
+       /* New ftrace handles args */
+       if (ret > 0)
+               return 0;
        /*
         * The arguments for ftrace files are parsed by the fields.
         * Set up the fields as their arguments.
@@ -2926,7 +3225,7 @@ int parse_ftrace_file(char *buf, unsigned long size)
        return 0;
 }
 
-int parse_event_file(char *buf, unsigned long size, char *system__unused __unused)
+int parse_event_file(char *buf, unsigned long size, char *sys)
 {
        struct event *event;
        int ret;
@@ -2946,12 +3245,18 @@ int parse_event_file(char *buf, unsigned long size, char *system__unused __unuse
                die("failed to read event id");
 
        ret = event_read_format(event);
-       if (ret < 0)
-               die("failed to read event format");
+       if (ret < 0) {
+               warning("failed to read event format for %s", event->name);
+               goto event_failed;
+       }
 
        ret = event_read_print(event);
-       if (ret < 0)
-               die("failed to read event print fmt");
+       if (ret < 0) {
+               warning("failed to read event print fmt for %s", event->name);
+               goto event_failed;
+       }
+
+       event->system = strdup(sys);
 
 #define PRINT_ARGS 0
        if (PRINT_ARGS && event->print_fmt.args)
@@ -2959,6 +3264,12 @@ int parse_event_file(char *buf, unsigned long size, char *system__unused __unuse
 
        add_event(event);
        return 0;
+
+ event_failed:
+       event->flags |= EVENT_FL_FAILED;
+       /* still add it even if it failed */
+       add_event(event);
+       return -1;
 }
 
 void parse_set_info(int nr_cpus, int long_sz)
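
Besides the more error-tolerant parsing, the changes above add
trace_find_next_event() for walking the whole event list and record the
originating subsystem in event->system. An illustrative loop (not from the
patch) that dumps every parsed event:

    struct event *event = NULL;

    /* trace_find_next_event(NULL) yields the head of the list; each
     * subsequent call returns the following event until it runs out. */
    while ((event = trace_find_next_event(event)))
            printf("%s:%s\n", event->system, event->name);
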
diff --git a/tools/perf/util/trace-event-perl.c b/tools/perf/util/trace-event-perl.c
new file mode 100644 (file)
index 0000000..51e833f
--- /dev/null
@@ -0,0 +1,598 @@
+/*
+ * trace-event-perl.  Feed perf trace events to an embedded Perl interpreter.
+ *
+ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+
+#include "../perf.h"
+#include "util.h"
+#include "trace-event.h"
+#include "trace-event-perl.h"
+
+void xs_init(pTHX);
+
+void boot_Perf__Trace__Context(pTHX_ CV *cv);
+void boot_DynaLoader(pTHX_ CV *cv);
+
+void xs_init(pTHX)
+{
+       const char *file = __FILE__;
+       dXSUB_SYS;
+
+       newXS("Perf::Trace::Context::bootstrap", boot_Perf__Trace__Context,
+             file);
+       newXS("DynaLoader::boot_DynaLoader", boot_DynaLoader, file);
+}
+
+INTERP my_perl;
+
+#define FTRACE_MAX_EVENT                               \
+       ((1 << (sizeof(unsigned short) * 8)) - 1)
+
+struct event *events[FTRACE_MAX_EVENT];
+
+static struct scripting_context *scripting_context;
+
+static char *cur_field_name;
+static int zero_flag_atom;
+
+static void define_symbolic_value(const char *ev_name,
+                                 const char *field_name,
+                                 const char *field_value,
+                                 const char *field_str)
+{
+       unsigned long long value;
+       dSP;
+
+       value = eval_flag(field_value);
+
+       ENTER;
+       SAVETMPS;
+       PUSHMARK(SP);
+
+       XPUSHs(sv_2mortal(newSVpv(ev_name, 0)));
+       XPUSHs(sv_2mortal(newSVpv(field_name, 0)));
+       XPUSHs(sv_2mortal(newSVuv(value)));
+       XPUSHs(sv_2mortal(newSVpv(field_str, 0)));
+
+       PUTBACK;
+       if (get_cv("main::define_symbolic_value", 0))
+               call_pv("main::define_symbolic_value", G_SCALAR);
+       SPAGAIN;
+       PUTBACK;
+       FREETMPS;
+       LEAVE;
+}
+
+static void define_symbolic_values(struct print_flag_sym *field,
+                                  const char *ev_name,
+                                  const char *field_name)
+{
+       define_symbolic_value(ev_name, field_name, field->value, field->str);
+       if (field->next)
+               define_symbolic_values(field->next, ev_name, field_name);
+}
+
+static void define_symbolic_field(const char *ev_name,
+                                 const char *field_name)
+{
+       dSP;
+
+       ENTER;
+       SAVETMPS;
+       PUSHMARK(SP);
+
+       XPUSHs(sv_2mortal(newSVpv(ev_name, 0)));
+       XPUSHs(sv_2mortal(newSVpv(field_name, 0)));
+
+       PUTBACK;
+       if (get_cv("main::define_symbolic_field", 0))
+               call_pv("main::define_symbolic_field", G_SCALAR);
+       SPAGAIN;
+       PUTBACK;
+       FREETMPS;
+       LEAVE;
+}
+
+static void define_flag_value(const char *ev_name,
+                             const char *field_name,
+                             const char *field_value,
+                             const char *field_str)
+{
+       unsigned long long value;
+       dSP;
+
+       value = eval_flag(field_value);
+
+       ENTER;
+       SAVETMPS;
+       PUSHMARK(SP);
+
+       XPUSHs(sv_2mortal(newSVpv(ev_name, 0)));
+       XPUSHs(sv_2mortal(newSVpv(field_name, 0)));
+       XPUSHs(sv_2mortal(newSVuv(value)));
+       XPUSHs(sv_2mortal(newSVpv(field_str, 0)));
+
+       PUTBACK;
+       if (get_cv("main::define_flag_value", 0))
+               call_pv("main::define_flag_value", G_SCALAR);
+       SPAGAIN;
+       PUTBACK;
+       FREETMPS;
+       LEAVE;
+}
+
+static void define_flag_values(struct print_flag_sym *field,
+                              const char *ev_name,
+                              const char *field_name)
+{
+       define_flag_value(ev_name, field_name, field->value, field->str);
+       if (field->next)
+               define_flag_values(field->next, ev_name, field_name);
+}
+
+static void define_flag_field(const char *ev_name,
+                             const char *field_name,
+                             const char *delim)
+{
+       dSP;
+
+       ENTER;
+       SAVETMPS;
+       PUSHMARK(SP);
+
+       XPUSHs(sv_2mortal(newSVpv(ev_name, 0)));
+       XPUSHs(sv_2mortal(newSVpv(field_name, 0)));
+       XPUSHs(sv_2mortal(newSVpv(delim, 0)));
+
+       PUTBACK;
+       if (get_cv("main::define_flag_field", 0))
+               call_pv("main::define_flag_field", G_SCALAR);
+       SPAGAIN;
+       PUTBACK;
+       FREETMPS;
+       LEAVE;
+}
+
+static void define_event_symbols(struct event *event,
+                                const char *ev_name,
+                                struct print_arg *args)
+{
+       switch (args->type) {
+       case PRINT_NULL:
+               break;
+       case PRINT_ATOM:
+               define_flag_value(ev_name, cur_field_name, "0",
+                                 args->atom.atom);
+               zero_flag_atom = 0;
+               break;
+       case PRINT_FIELD:
+               if (cur_field_name)
+                       free(cur_field_name);
+               cur_field_name = strdup(args->field.name);
+               break;
+       case PRINT_FLAGS:
+               define_event_symbols(event, ev_name, args->flags.field);
+               define_flag_field(ev_name, cur_field_name, args->flags.delim);
+               define_flag_values(args->flags.flags, ev_name, cur_field_name);
+               break;
+       case PRINT_SYMBOL:
+               define_event_symbols(event, ev_name, args->symbol.field);
+               define_symbolic_field(ev_name, cur_field_name);
+               define_symbolic_values(args->symbol.symbols, ev_name,
+                                      cur_field_name);
+               break;
+       case PRINT_STRING:
+               break;
+       case PRINT_TYPE:
+               define_event_symbols(event, ev_name, args->typecast.item);
+               break;
+       case PRINT_OP:
+               if (strcmp(args->op.op, ":") == 0)
+                       zero_flag_atom = 1;
+               define_event_symbols(event, ev_name, args->op.left);
+               define_event_symbols(event, ev_name, args->op.right);
+               break;
+       default:
+               /* we should warn... */
+               return;
+       }
+
+       if (args->next)
+               define_event_symbols(event, ev_name, args->next);
+}
+
+static inline struct event *find_cache_event(int type)
+{
+       static char ev_name[256];
+       struct event *event;
+
+       if (events[type])
+               return events[type];
+
+       events[type] = event = trace_find_event(type);
+       if (!event)
+               return NULL;
+
+       sprintf(ev_name, "%s::%s", event->system, event->name);
+
+       define_event_symbols(event, ev_name, event->print_fmt.args);
+
+       return event;
+}
+
+int common_pc(struct scripting_context *context)
+{
+       int pc;
+
+       pc = parse_common_pc(context->event_data);
+
+       return pc;
+}
+
+int common_flags(struct scripting_context *context)
+{
+       int flags;
+
+       flags = parse_common_flags(context->event_data);
+
+       return flags;
+}
+
+int common_lock_depth(struct scripting_context *context)
+{
+       int lock_depth;
+
+       lock_depth = parse_common_lock_depth(context->event_data);
+
+       return lock_depth;
+}
+
+static void perl_process_event(int cpu, void *data,
+                              int size __attribute((unused)),
+                              unsigned long long nsecs, char *comm)
+{
+       struct format_field *field;
+       static char handler[256];
+       unsigned long long val;
+       unsigned long s, ns;
+       struct event *event;
+       int type;
+       int pid;
+
+       dSP;
+
+       type = trace_parse_common_type(data);
+
+       event = find_cache_event(type);
+       if (!event)
+               die("ug! no event found for type %d", type);
+
+       pid = trace_parse_common_pid(data);
+
+       sprintf(handler, "%s::%s", event->system, event->name);
+
+       s = nsecs / NSECS_PER_SEC;
+       ns = nsecs - s * NSECS_PER_SEC;
+
+       scripting_context->event_data = data;
+
+       ENTER;
+       SAVETMPS;
+       PUSHMARK(SP);
+
+       XPUSHs(sv_2mortal(newSVpv(handler, 0)));
+       XPUSHs(sv_2mortal(newSViv(PTR2IV(scripting_context))));
+       XPUSHs(sv_2mortal(newSVuv(cpu)));
+       XPUSHs(sv_2mortal(newSVuv(s)));
+       XPUSHs(sv_2mortal(newSVuv(ns)));
+       XPUSHs(sv_2mortal(newSViv(pid)));
+       XPUSHs(sv_2mortal(newSVpv(comm, 0)));
+
+       /* common fields other than pid can be accessed via xsub fns */
+
+       for (field = event->format.fields; field; field = field->next) {
+               if (field->flags & FIELD_IS_STRING) {
+                       int offset;
+                       if (field->flags & FIELD_IS_DYNAMIC) {
+                               offset = *(int *)(data + field->offset);
+                               offset &= 0xffff;
+                       } else
+                               offset = field->offset;
+                       XPUSHs(sv_2mortal(newSVpv((char *)data + offset, 0)));
+               } else { /* FIELD_IS_NUMERIC */
+                       val = read_size(data + field->offset, field->size);
+                       if (field->flags & FIELD_IS_SIGNED) {
+                               XPUSHs(sv_2mortal(newSViv(val)));
+                       } else {
+                               XPUSHs(sv_2mortal(newSVuv(val)));
+                       }
+               }
+       }
+
+       PUTBACK;
+
+       if (get_cv(handler, 0))
+               call_pv(handler, G_SCALAR);
+       else if (get_cv("main::trace_unhandled", 0)) {
+               XPUSHs(sv_2mortal(newSVpv(handler, 0)));
+               XPUSHs(sv_2mortal(newSViv(PTR2IV(scripting_context))));
+               XPUSHs(sv_2mortal(newSVuv(cpu)));
+               XPUSHs(sv_2mortal(newSVuv(nsecs)));
+               XPUSHs(sv_2mortal(newSViv(pid)));
+               XPUSHs(sv_2mortal(newSVpv(comm, 0)));
+               call_pv("main::trace_unhandled", G_SCALAR);
+       }
+       SPAGAIN;
+       PUTBACK;
+       FREETMPS;
+       LEAVE;
+}
+
+static void run_start_sub(void)
+{
+       dSP; /* access to Perl stack */
+       PUSHMARK(SP);
+
+       if (get_cv("main::trace_begin", 0))
+               call_pv("main::trace_begin", G_DISCARD | G_NOARGS);
+}
+
+/*
+ * Start trace script
+ */
+static int perl_start_script(const char *script)
+{
+       const char *command_line[2] = { "", NULL };
+
+       command_line[1] = script;
+
+       my_perl = perl_alloc();
+       perl_construct(my_perl);
+
+       if (perl_parse(my_perl, xs_init, 2, (char **)command_line,
+                      (char **)NULL))
+               return -1;
+
+       perl_run(my_perl);
+       if (SvTRUE(ERRSV))
+               return -1;
+
+       run_start_sub();
+
+       fprintf(stderr, "perf trace started with Perl script %s\n\n", script);
+
+       return 0;
+}
+
+/*
+ * Stop trace script
+ */
+static int perl_stop_script(void)
+{
+       dSP; /* access to Perl stack */
+       PUSHMARK(SP);
+
+       if (get_cv("main::trace_end", 0))
+               call_pv("main::trace_end", G_DISCARD | G_NOARGS);
+
+       perl_destruct(my_perl);
+       perl_free(my_perl);
+
+       fprintf(stderr, "\nperf trace Perl script stopped\n");
+
+       return 0;
+}
+
+static int perl_generate_script(const char *outfile)
+{
+       struct event *event = NULL;
+       struct format_field *f;
+       char fname[PATH_MAX];
+       int not_first, count;
+       FILE *ofp;
+
+       sprintf(fname, "%s.pl", outfile);
+       ofp = fopen(fname, "w");
+       if (ofp == NULL) {
+               fprintf(stderr, "couldn't open %s\n", fname);
+               return -1;
+       }
+
+       fprintf(ofp, "# perf trace event handlers, "
+               "generated by perf trace -g perl\n");
+
+       fprintf(ofp, "# Licensed under the terms of the GNU GPL"
+               " License version 2\n\n");
+
+       fprintf(ofp, "# The common_* event handler fields are the most useful "
+               "fields common to\n");
+
+       fprintf(ofp, "# all events.  They don't necessarily correspond to "
+               "the 'common_*' fields\n");
+
+       fprintf(ofp, "# in the format files.  Those fields not available as "
+               "handler params can\n");
+
+       fprintf(ofp, "# be retrieved using Perl functions of the form "
+               "common_*($context).\n");
+
+       fprintf(ofp, "# See Context.pm for the list of available "
+               "functions.\n\n");
+
+       fprintf(ofp, "use lib \"$ENV{'PERF_EXEC_PATH'}/scripts/perl/"
+               "Perf-Trace-Util/lib\";\n");
+
+       fprintf(ofp, "use lib \"./Perf-Trace-Util/lib\";\n");
+       fprintf(ofp, "use Perf::Trace::Core;\n");
+       fprintf(ofp, "use Perf::Trace::Context;\n");
+       fprintf(ofp, "use Perf::Trace::Util;\n\n");
+
+       fprintf(ofp, "sub trace_begin\n{\n\t# optional\n}\n\n");
+       fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n\n");
+
+       while ((event = trace_find_next_event(event))) {
+               fprintf(ofp, "sub %s::%s\n{\n", event->system, event->name);
+               fprintf(ofp, "\tmy (");
+
+               fprintf(ofp, "$event_name, ");
+               fprintf(ofp, "$context, ");
+               fprintf(ofp, "$common_cpu, ");
+               fprintf(ofp, "$common_secs, ");
+               fprintf(ofp, "$common_nsecs,\n");
+               fprintf(ofp, "\t    $common_pid, ");
+               fprintf(ofp, "$common_comm,\n\t    ");
+
+               not_first = 0;
+               count = 0;
+
+               for (f = event->format.fields; f; f = f->next) {
+                       if (not_first++)
+                               fprintf(ofp, ", ");
+                       if (++count % 5 == 0)
+                               fprintf(ofp, "\n\t    ");
+
+                       fprintf(ofp, "$%s", f->name);
+               }
+               fprintf(ofp, ") = @_;\n\n");
+
+               fprintf(ofp, "\tprint_header($event_name, $common_cpu, "
+                       "$common_secs, $common_nsecs,\n\t             "
+                       "$common_pid, $common_comm);\n\n");
+
+               fprintf(ofp, "\tprintf(\"");
+
+               not_first = 0;
+               count = 0;
+
+               for (f = event->format.fields; f; f = f->next) {
+                       if (not_first++)
+                               fprintf(ofp, ", ");
+                       if (count && count % 4 == 0) {
+                               fprintf(ofp, "\".\n\t       \"");
+                       }
+                       count++;
+
+                       fprintf(ofp, "%s=", f->name);
+                       if (f->flags & FIELD_IS_STRING ||
+                           f->flags & FIELD_IS_FLAG ||
+                           f->flags & FIELD_IS_SYMBOLIC)
+                               fprintf(ofp, "%%s");
+                       else if (f->flags & FIELD_IS_SIGNED)
+                               fprintf(ofp, "%%d");
+                       else
+                               fprintf(ofp, "%%u");
+               }
+
+               fprintf(ofp, "\\n\",\n\t       ");
+
+               not_first = 0;
+               count = 0;
+
+               for (f = event->format.fields; f; f = f->next) {
+                       if (not_first++)
+                               fprintf(ofp, ", ");
+
+                       if (++count % 5 == 0)
+                               fprintf(ofp, "\n\t       ");
+
+                       if (f->flags & FIELD_IS_FLAG) {
+                               if ((count - 1) % 5 != 0) {
+                                       fprintf(ofp, "\n\t       ");
+                                       count = 4;
+                               }
+                               fprintf(ofp, "flag_str(\"");
+                               fprintf(ofp, "%s::%s\", ", event->system,
+                                       event->name);
+                               fprintf(ofp, "\"%s\", $%s)", f->name,
+                                       f->name);
+                       } else if (f->flags & FIELD_IS_SYMBOLIC) {
+                               if ((count - 1) % 5 != 0) {
+                                       fprintf(ofp, "\n\t       ");
+                                       count = 4;
+                               }
+                               fprintf(ofp, "symbol_str(\"");
+                               fprintf(ofp, "%s::%s\", ", event->system,
+                                       event->name);
+                               fprintf(ofp, "\"%s\", $%s)", f->name,
+                                       f->name);
+                       } else
+                               fprintf(ofp, "$%s", f->name);
+               }
+
+               fprintf(ofp, ");\n");
+               fprintf(ofp, "}\n\n");
+       }
+
+       fprintf(ofp, "sub trace_unhandled\n{\n\tmy ($event_name, $context, "
+               "$common_cpu, $common_secs, $common_nsecs,\n\t    "
+               "$common_pid, $common_comm) = @_;\n\n");
+
+       fprintf(ofp, "\tprint_header($event_name, $common_cpu, "
+               "$common_secs, $common_nsecs,\n\t             $common_pid, "
+               "$common_comm);\n}\n\n");
+
+       fprintf(ofp, "sub print_header\n{\n"
+               "\tmy ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;\n\n"
+               "\tprintf(\"%%-20s %%5u %%05u.%%09u %%8u %%-20s \",\n\t       "
+               "$event_name, $cpu, $secs, $nsecs, $pid, $comm);\n}");
+
+       fclose(ofp);
+
+       fprintf(stderr, "generated Perl script: %s\n", fname);
+
+       return 0;
+}
+
+struct scripting_ops perl_scripting_ops = {
+       .name = "Perl",
+       .start_script = perl_start_script,
+       .stop_script = perl_stop_script,
+       .process_event = perl_process_event,
+       .generate_script = perl_generate_script,
+};
+
+#ifdef NO_LIBPERL
+void setup_perl_scripting(void)
+{
+       fprintf(stderr, "Perl scripting not supported."
+               "  Install libperl and rebuild perf to enable it.  e.g. "
+               "apt-get install libperl-dev (ubuntu), yum install "
+               "perl-ExtUtils-Embed (Fedora), etc.\n");
+}
+#else
+void setup_perl_scripting(void)
+{
+       int err;
+       err = script_spec_register("Perl", &perl_scripting_ops);
+       if (err)
+               die("error registering Perl script extension");
+
+       err = script_spec_register("pl", &perl_scripting_ops);
+       if (err)
+               die("error registering pl script extension");
+
+       scripting_context = malloc(sizeof(struct scripting_context));
+}
+#endif
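
perl_scripting_ops above packages the interpreter hooks behind a generic
vector so the trace code does not have to call Perl directly. A hedged sketch
of the intended call sequence (the script name and the surrounding control
flow are assumptions, not taken from the patch):

    struct scripting_ops *ops = &perl_scripting_ops;

    if (ops->start_script("handlers.pl") == 0) {
            /* once per sample, in timestamp order */
            ops->process_event(cpu, data, size, nsecs, comm);

            /* after the last sample has been processed */
            ops->stop_script();
    }
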
diff --git a/tools/perf/util/trace-event-perl.h b/tools/perf/util/trace-event-perl.h
new file mode 100644 (file)
index 0000000..8fe0d86
--- /dev/null
@@ -0,0 +1,51 @@
+#ifndef __PERF_TRACE_EVENT_PERL_H
+#define __PERF_TRACE_EVENT_PERL_H
+#ifdef NO_LIBPERL
+typedef int INTERP;
+#define dSP
+#define ENTER
+#define SAVETMPS
+#define PUTBACK
+#define SPAGAIN
+#define FREETMPS
+#define LEAVE
+#define SP
+#define ERRSV
+#define G_SCALAR               (0)
+#define G_DISCARD              (0)
+#define G_NOARGS               (0)
+#define PUSHMARK(a)
+#define SvTRUE(a)              (0)
+#define XPUSHs(s)
+#define sv_2mortal(a)
+#define newSVpv(a,b)
+#define newSVuv(a)
+#define newSViv(a)
+#define get_cv(a,b)            (0)
+#define call_pv(a,b)           (0)
+#define perl_alloc()           (0)
+#define perl_construct(a)      (0)
+#define perl_parse(a,b,c,d,e)  (0)
+#define perl_run(a)            (0)
+#define perl_destruct(a)       (0)
+#define perl_free(a)           (0)
+#define pTHX                   void
+#define CV                     void
+#define dXSUB_SYS
+#define pTHX_
+static inline void newXS(const char *a, void *b, const char *c) {}
+#else
+#include <EXTERN.h>
+#include <perl.h>
+typedef PerlInterpreter * INTERP;
+#endif
+
+struct scripting_context {
+       void *event_data;
+};
+
+int common_pc(struct scripting_context *context);
+int common_flags(struct scripting_context *context);
+int common_lock_depth(struct scripting_context *context);
+
+#endif /* __PERF_TRACE_EVENT_PERL_H */
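
The NO_LIBPERL half of this header uses a standard trick for optional dependencies: every Perl API symbol the .c file touches is replaced by a no-op macro or a stub type, so the same translation unit builds whether or not libperl is installed. A stripped-down sketch of the pattern with hypothetical names (HAVE_MYLIB, mylib_*), not taken from perf itself:

/* optional-dep.h: hypothetical illustration of the NO_LIBPERL approach */
#ifdef HAVE_MYLIB
#include <mylib.h>			/* the real API when the library is present */
typedef mylib_handle_t HANDLE_T;
#else
typedef int HANDLE_T;			/* placeholder, like "typedef int INTERP" above */
#define mylib_open(path)	(0)	/* calls collapse to harmless constants... */
#define mylib_feed(h, buf, len)	(0)
#define mylib_close(h)			/* ...or to nothing at all */
#endif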
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index 1b5c847d2c223f96a42f9f4a49a5a1afbb4aead4..342dfdd43f875117a0306cb7cd4bb5f7618f82db 100644 (file)
@@ -458,9 +458,8 @@ struct record *trace_read_data(int cpu)
        return data;
 }
 
-void trace_report(void)
+void trace_report(int fd)
 {
-       const char *input_file = "trace.info";
        char buf[BUFSIZ];
        char test[] = { 23, 8, 68 };
        char *version;
@@ -468,17 +467,15 @@ void trace_report(void)
        int show_funcs = 0;
        int show_printk = 0;
 
-       input_fd = open(input_file, O_RDONLY);
-       if (input_fd < 0)
-               die("opening '%s'\n", input_file);
+       input_fd = fd;
 
        read_or_die(buf, 3);
        if (memcmp(buf, test, 3) != 0)
-               die("not an trace data file");
+               die("no trace data in the file");
 
        read_or_die(buf, 7);
        if (memcmp(buf, "tracing", 7) != 0)
-               die("not a trace file (missing tracing)");
+               die("not a trace file (missing 'tracing' tag)");
 
        version = read_string();
        if (show_version)
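
After this change trace_report() no longer opens a hard-coded "trace.info" file; the caller passes in an already-open descriptor and the function only validates and parses what it reads from it. A minimal sketch of a caller, assuming just the trace_report(int fd) prototype from trace-event.h below; the file name and error handling here are illustrative, not perf's actual call site:

#include <fcntl.h>
#include <unistd.h>

static void report_trace_file(const char *path)
{
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		die("opening '%s'\n", path);
	trace_report(fd);	/* checks the 23/8/68 magic and "tracing" tag first */
	close(fd);
}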
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 693f815c9429ca375f74bbfc96901232aa02c48f..81698d5e65039b1c299e92f413dd5f2bf844af7c 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _TRACE_EVENTS_H
-#define _TRACE_EVENTS_H
+#ifndef __PERF_TRACE_EVENTS_H
+#define __PERF_TRACE_EVENTS_H
 
 #include "parse-events.h"
 
@@ -26,6 +26,11 @@ enum {
 enum format_flags {
        FIELD_IS_ARRAY          = 1,
        FIELD_IS_POINTER        = 2,
+       FIELD_IS_SIGNED         = 4,
+       FIELD_IS_STRING         = 8,
+       FIELD_IS_DYNAMIC        = 16,
+       FIELD_IS_FLAG           = 32,
+       FIELD_IS_SYMBOLIC       = 64,
 };
 
 struct format_field {
@@ -132,15 +137,18 @@ struct event {
        int                     flags;
        struct format           format;
        struct print_fmt        print_fmt;
+       char                    *system;
 };
 
 enum {
-       EVENT_FL_ISFTRACE       = 1,
-       EVENT_FL_ISPRINT        = 2,
-       EVENT_FL_ISBPRINT       = 4,
-       EVENT_FL_ISFUNC         = 8,
-       EVENT_FL_ISFUNCENT      = 16,
-       EVENT_FL_ISFUNCRET      = 32,
+       EVENT_FL_ISFTRACE       = 0x01,
+       EVENT_FL_ISPRINT        = 0x02,
+       EVENT_FL_ISBPRINT       = 0x04,
+       EVENT_FL_ISFUNC         = 0x08,
+       EVENT_FL_ISFUNCENT      = 0x10,
+       EVENT_FL_ISFUNCRET      = 0x20,
+
+       EVENT_FL_FAILED         = 0x80000000
 };
 
 struct record {
@@ -154,7 +162,7 @@ struct record *trace_read_data(int cpu);
 
 void parse_set_info(int nr_cpus, int long_sz);
 
-void trace_report(void);
+void trace_report(int fd);
 
 void *malloc_or_die(unsigned int size);
 
@@ -166,7 +174,7 @@ void print_funcs(void);
 void print_printk(void);
 
 int parse_ftrace_file(char *buf, unsigned long size);
-int parse_event_file(char *buf, unsigned long size, char *system);
+int parse_event_file(char *buf, unsigned long size, char *sys);
 void print_event(int cpu, void *data, int size, unsigned long long nsecs,
                  char *comm);
 
@@ -233,13 +241,45 @@ extern int header_page_size_size;
 extern int header_page_data_offset;
 extern int header_page_data_size;
 
+extern int latency_format;
+
 int parse_header_page(char *buf, unsigned long size);
 int trace_parse_common_type(void *data);
+int trace_parse_common_pid(void *data);
+int parse_common_pc(void *data);
+int parse_common_flags(void *data);
+int parse_common_lock_depth(void *data);
 struct event *trace_find_event(int id);
+struct event *trace_find_next_event(struct event *event);
+unsigned long long read_size(void *ptr, int size);
 unsigned long long
 raw_field_value(struct event *event, const char *name, void *data);
 void *raw_field_ptr(struct event *event, const char *name, void *data);
+unsigned long long eval_flag(const char *flag);
+
+int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events);
+
+/* taken from kernel/trace/trace.h */
+enum trace_flag_type {
+       TRACE_FLAG_IRQS_OFF             = 0x01,
+       TRACE_FLAG_IRQS_NOSUPPORT       = 0x02,
+       TRACE_FLAG_NEED_RESCHED         = 0x04,
+       TRACE_FLAG_HARDIRQ              = 0x08,
+       TRACE_FLAG_SOFTIRQ              = 0x10,
+};
+
+struct scripting_ops {
+       const char *name;
+       int (*start_script) (const char *);
+       int (*stop_script) (void);
+       void (*process_event) (int cpu, void *data, int size,
+                              unsigned long long nsecs, char *comm);
+       int (*generate_script) (const char *outfile);
+};
+
+int script_spec_register(const char *spec, struct scripting_ops *ops);
 
-void read_tracing_data(struct perf_event_attr *pattrs, int nb_events);
+extern struct scripting_ops perl_scripting_ops;
+void setup_perl_scripting(void);
 
-#endif /* _TRACE_EVENTS_H */
+#endif /* __PERF_TRACE_EVENTS_H */
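
Because trace_flag_type is copied verbatim from kernel/trace/trace.h, a backend can decode an event's common_flags field with the helpers declared above. A small sketch, assuming the data pointer is the event payload handed to process_event():

/* Nonzero if the event was emitted from hard- or soft-interrupt context. */
static int in_interrupt_context(void *data)
{
	int flags = parse_common_flags(data);

	return (flags & (TRACE_FLAG_HARDIRQ | TRACE_FLAG_SOFTIRQ)) != 0;
}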
diff --git a/tools/perf/util/types.h b/tools/perf/util/types.h
index 5e75f9005940ac497f32ea1e12b84a9389b77ef3..7d6b8331f8984764e23260eff33d4a13ea09317f 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _PERF_TYPES_H
-#define _PERF_TYPES_H
+#ifndef __PERF_TYPES_H
+#define __PERF_TYPES_H
 
 /*
  * We define u64 as unsigned long long for every architecture
@@ -14,4 +14,4 @@ typedef signed short     s16;
 typedef unsigned char     u8;
 typedef signed char       s8;
 
-#endif /* _PERF_TYPES_H */
+#endif /* __PERF_TYPES_H */
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 9de2329dd44d3ea0f7809c7504d41b594307c6c5..c673d8825883ce2bec91b03f5c1e474a756fde19 100644 (file)
@@ -84,6 +84,9 @@
 #include <iconv.h>
 #endif
 
+extern const char *graph_line;
+extern const char *graph_dotted_line;
+
 /* On most systems <limits.h> would have given us this, but
  * not on some systems (e.g. GNU/Hurd).
  */
@@ -134,6 +137,15 @@ extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1,
 extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
 extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
 
+#include "../../../include/linux/stringify.h"
+
+#define DIE_IF(cnd)    \
+       do { if (cnd)   \
+               die(" at (" __FILE__ ":" __stringify(__LINE__) "): "    \
+                   __stringify(cnd) "\n");                             \
+       } while (0)
+
+
 extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
 
 extern int prefixcmp(const char *str, const char *prefix);
@@ -278,17 +290,15 @@ static inline char *gitstrchrnul(const char *s, int c)
  * Wrappers:
  */
 extern char *xstrdup(const char *str);
-extern void *xmalloc(size_t size);
+extern void *xmalloc(size_t size) __attribute__((weak));
 extern void *xmemdupz(const void *data, size_t len);
 extern char *xstrndup(const char *str, size_t len);
-extern void *xrealloc(void *ptr, size_t size);
-extern void *xcalloc(size_t nmemb, size_t size);
-extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
-extern ssize_t xread(int fd, void *buf, size_t len);
-extern ssize_t xwrite(int fd, const void *buf, size_t len);
-extern int xdup(int fd);
-extern FILE *xfdopen(int fd, const char *mode);
-extern int xmkstemp(char *template);
+extern void *xrealloc(void *ptr, size_t size) __attribute__((weak));
+
+static inline void *zalloc(size_t size)
+{
+       return calloc(1, size);
+}
 
 static inline size_t xsize_t(off_t len)
 {
@@ -306,6 +316,7 @@ static inline int has_extension(const char *filename, const char *ext)
 #undef isascii
 #undef isspace
 #undef isdigit
+#undef isxdigit
 #undef isalpha
 #undef isprint
 #undef isalnum
@@ -323,6 +334,8 @@ extern unsigned char sane_ctype[256];
 #define isascii(x) (((x) & ~0x7f) == 0)
 #define isspace(x) sane_istest(x,GIT_SPACE)
 #define isdigit(x) sane_istest(x,GIT_DIGIT)
+#define isxdigit(x)    \
+       (sane_istest(toupper(x), GIT_ALPHA | GIT_DIGIT) && toupper(x) < 'G')
 #define isalpha(x) sane_istest(x,GIT_ALPHA)
 #define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
 #define isprint(x) sane_istest(x,GIT_PRINT)
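
Two details in this util.h hunk are worth spelling out. isxdigit() works because sane_istest(..., GIT_ALPHA | GIT_DIGIT) accepts any letter or digit, and the extra toupper(x) < 'G' test narrows the letters to A-F. And tagging xmalloc()/xrealloc() with __attribute__((weak)) lets another object file in the link supply its own definitions, which silently replace these die-on-failure versions. A minimal sketch of the weak-symbol mechanism with a hypothetical symbol, not perf code:

/* default.c */
#include <stdlib.h>

__attribute__((weak)) void *alloc_hook(size_t size)
{
	return malloc(size);		/* used only if no strong definition is linked */
}

/* override.c: merely linking this object replaces the weak fallback above */
#include <stdlib.h>

void *alloc_hook(size_t size)
{
	return calloc(1, size);		/* the strong definition wins at link time */
}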
diff --git a/tools/perf/util/values.h b/tools/perf/util/values.h
index cadf8cf2a590c7169a2f3be58fdd9aded515cd43..2fa967e1a88aaa0a4a4f6dfe98517be6c08c8f68 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef _PERF_VALUES_H
-#define _PERF_VALUES_H
+#ifndef __PERF_VALUES_H
+#define __PERF_VALUES_H
 
 #include "types.h"
 
@@ -24,4 +24,4 @@ void perf_read_values_add_value(struct perf_read_values *values,
 void perf_read_values_display(FILE *fp, struct perf_read_values *values,
                              int raw);
 
-#endif /* _PERF_VALUES_H */
+#endif /* __PERF_VALUES_H */
diff --git a/tools/perf/util/wrapper.c b/tools/perf/util/wrapper.c
index 4574ac28396f6779fcecacfafe570dc0a5e01dc6..bf44ca85d23be29334d69ef6718d9b613f06a9bb 100644 (file)
@@ -79,43 +79,12 @@ void *xrealloc(void *ptr, size_t size)
        return ret;
 }
 
-void *xcalloc(size_t nmemb, size_t size)
-{
-       void *ret = calloc(nmemb, size);
-       if (!ret && (!nmemb || !size))
-               ret = calloc(1, 1);
-       if (!ret) {
-               release_pack_memory(nmemb * size, -1);
-               ret = calloc(nmemb, size);
-               if (!ret && (!nmemb || !size))
-                       ret = calloc(1, 1);
-               if (!ret)
-                       die("Out of memory, calloc failed");
-       }
-       return ret;
-}
-
-void *xmmap(void *start, size_t length,
-       int prot, int flags, int fd, off_t offset)
-{
-       void *ret = mmap(start, length, prot, flags, fd, offset);
-       if (ret == MAP_FAILED) {
-               if (!length)
-                       return NULL;
-               release_pack_memory(length, fd);
-               ret = mmap(start, length, prot, flags, fd, offset);
-               if (ret == MAP_FAILED)
-                       die("Out of memory? mmap failed: %s", strerror(errno));
-       }
-       return ret;
-}
-
 /*
  * xread() is the same a read(), but it automatically restarts read()
  * operations with a recoverable error (EAGAIN and EINTR). xread()
  * DOES NOT GUARANTEE that "len" bytes is read even if the data is available.
  */
-ssize_t xread(int fd, void *buf, size_t len)
+static ssize_t xread(int fd, void *buf, size_t len)
 {
        ssize_t nr;
        while (1) {
@@ -131,7 +100,7 @@ ssize_t xread(int fd, void *buf, size_t len)
  * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
  * GUARANTEE that "len" bytes is written even if the operation is successful.
  */
-ssize_t xwrite(int fd, const void *buf, size_t len)
+static ssize_t xwrite(int fd, const void *buf, size_t len)
 {
        ssize_t nr;
        while (1) {
@@ -179,29 +148,3 @@ ssize_t write_in_full(int fd, const void *buf, size_t count)
 
        return total;
 }
-
-int xdup(int fd)
-{
-       int ret = dup(fd);
-       if (ret < 0)
-               die("dup failed: %s", strerror(errno));
-       return ret;
-}
-
-FILE *xfdopen(int fd, const char *mode)
-{
-       FILE *stream = fdopen(fd, mode);
-       if (stream == NULL)
-               die("Out of memory? fdopen failed: %s", strerror(errno));
-       return stream;
-}
-
-int xmkstemp(char *template)
-{
-       int fd;
-
-       fd = mkstemp(template);
-       if (fd < 0)
-               die("Unable to create temporary file: %s", strerror(errno));
-       return fd;
-}