4 * Builtin 'trace' command:
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
16 * Released under the GPL v2. (and only v2, not any later version)
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace/beauty/beauty.h"
35 #include "trace-event.h"
36 #include "util/parse-events.h"
37 #include "util/bpf-loader.h"
38 #include "callchain.h"
39 #include "syscalltbl.h"
40 #include "rb_resort.h"
42 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
45 #include <linux/err.h>
46 #include <linux/filter.h>
47 #include <linux/audit.h>
48 #include <linux/random.h>
49 #include <linux/stringify.h>
50 #include <linux/time64.h>
53 # define O_CLOEXEC 02000000
/*
 * NOTE(review): elided listing -- the enclosing 'struct trace {' header and
 * several members are not visible; comments below cover only the shown fields.
 */
57 struct perf_tool tool; /* event-processing callbacks (see trace__tool_process) */
58 struct syscalltbl *sctbl; /* syscall id <-> name table (see trace__read_syscall_info) */
61 struct syscall *table; /* per-syscall-id descriptors, grown on demand */
63 struct perf_evsel *sys_enter, /* raw_syscalls:sys_{enter,exit} evsels */
67 struct record_opts opts;
68 struct perf_evlist *evlist;
70 struct thread *current; /* thread whose events are currently being printed */
73 unsigned long nr_events;
74 struct strlist *ev_qualifier; /* syscall-name filter list */
83 double duration_filter; /* in ms (see trace__filter_duration) */
89 unsigned int max_stack;
90 unsigned int min_stack;
91 bool not_ev_qualifier; /* true when the qualifier list is negated */
95 bool multiple_threads; /* prefix lines with tid (see trace__fprintf_entry_head) */
101 bool kernel_syscallchains;
/*
 * Accessors for a 'struct tp_field' (struct header elided): bound by
 * tp_field__init_uint() / tp_field__init_ptr() respectively, depending on
 * whether the tracepoint field is read as an integer or as a raw pointer
 * into the sample payload.
 */
111 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
112 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
/*
 * TP_UINT_FIELD(bits) generates tp_field__u<bits>(): memcpy the value out of
 * the raw sample payload (it may be unaligned) and widen it to u64.  The
 * __SWAPPED variants additionally byte-swap, for samples recorded on a
 * machine of the opposite endianness.
 * NOTE(review): listing elided -- braces, returns and the non-swapped
 * instantiations are not visible here; no comments may be inserted below
 * because the macro lines are '\'-continued.
 */
116 #define TP_UINT_FIELD(bits) \
117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
120 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
129 #define TP_UINT_FIELD__SWAPPED(bits) \
130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
133 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
134 return bswap_##bits(value);\
137 TP_UINT_FIELD__SWAPPED(16);
138 TP_UINT_FIELD__SWAPPED(32);
139 TP_UINT_FIELD__SWAPPED(64);
/*
 * Bind the right fixed-width integer reader to 'field', based on the
 * tracepoint format's field size and whether the data needs byte-swapping.
 * NOTE(review): listing elided -- the 'case' labels, default branch and
 * return statement are not visible here.
 */
141 static int tp_field__init_uint(struct tp_field *field,
142 struct format_field *format_field,
145 field->offset = format_field->offset;
147 switch (format_field->size) {
149 field->integer = tp_field__u8; /* single byte: no swapped variant needed */
152 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
155 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
158 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
/* Return a pointer into the sample's raw payload at this field's offset. */
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
169 return sample->raw_data + field->offset;
/* Set up 'field' as a pointer-style accessor for 'format_field'. */
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
174 field->offset = format_field->offset;
175 field->pointer = tp_field__ptr;
/* Fragment of 'struct syscall_tp' (struct header and 'id' member elided). */
182 struct tp_field args, ret;
/*
 * Look up tracepoint field 'name' in the evsel's format and bind an integer
 * accessor, honoring the evsel's endianness.  (Error return elided.)
 */
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187 struct tp_field *field,
190 struct format_field *format_field = perf_evsel__field(evsel, name);
192 if (format_field == NULL)
195 return tp_field__init_uint(field, format_field, evsel->needs_swap);
/* Convenience: init the syscall_tp member whose name matches the tp field. */
198 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
199 ({ struct syscall_tp *sc = evsel->priv;\
200 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
/* As above, but binding a pointer-style accessor. */
202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
203 struct tp_field *field,
206 struct format_field *format_field = perf_evsel__field(evsel, name);
208 if (format_field == NULL)
211 return tp_field__init_ptr(field, format_field);
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
214 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
215 ({ struct syscall_tp *sc = evsel->priv;\
216 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
/* Free the evsel's private per-syscall state, then the evsel itself. */
218 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
221 perf_evsel__delete(evsel);
/*
 * Allocate evsel->priv (a struct syscall_tp), resolve the common "id"
 * tracepoint field, and install 'handler' as the sample callback.
 * (Error paths elided from this listing.)
 */
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
226 evsel->priv = malloc(sizeof(struct syscall_tp));
227 if (evsel->priv != NULL) {
228 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
231 evsel->handler = handler;
/*
 * Create the raw_syscalls:<direction> tracepoint evsel, falling back to the
 * legacy "syscalls" subsystem name on older kernels, and wire up its
 * syscall_tp state; the evsel is destroyed again if that init fails.
 */
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
244 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
246 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
248 evsel = perf_evsel__newtp("syscalls", direction);
253 if (perf_evsel__init_syscall_tp(evsel, handler))
259 perf_evsel__delete_priv(evsel);
/* Fetch the named integer field from a sample via its bound accessor. */
263 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
264 ({ struct syscall_tp *fields = evsel->priv; \
265 fields->name.integer(&fields->name, sample); })
/* Ditto, for pointer-style fields (returns a pointer into the raw data). */
267 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
268 ({ struct syscall_tp *fields = evsel->priv; \
269 fields->name.pointer(&fields->name, sample); })
/* Fragment of 'struct strarray' (offset/nr_entries members elided). */
274 const char **entries;
/*
 * DEFINE_STRARRAY*: build a strarray over a string table; the _OFFSET
 * variant is for tables whose first entry maps to value 'off' rather than 0.
 * NOTE(review): the '.entries' initializers and closing braces are elided
 * from this listing; no comments may be inserted between the '\'-continued
 * macro lines below.
 */
277 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
278 .nr_entries = ARRAY_SIZE(array), \
282 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
284 .nr_entries = ARRAY_SIZE(array), \
/*
 * Print arg->val via a string table: in-range values print the table entry,
 * out-of-range values fall back to the numeric format 'intfmt'.
 */
288 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
290 struct syscall_arg *arg)
292 struct strarray *sa = arg->parm;
293 int idx = arg->val - sa->offset; /* tables need not start at value 0 */
295 if (idx < 0 || idx >= sa->nr_entries)
296 return scnprintf(bf, size, intfmt, arg->val);
298 return scnprintf(bf, size, "%s", sa->entries[idx]);
/* Decimal-fallback variant, referenced through SCA_STRARRAY below. */
301 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
302 struct syscall_arg *arg)
304 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
307 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
309 #if defined(__i386__) || defined(__x86_64__)
311 * FIXME: Make this available to all arches as soon as the ioctl beautifier
312 * gets rewritten to support all arches.
314 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
315 struct syscall_arg *arg)
317 return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
320 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
321 #endif /* defined(__i386__) || defined(__x86_64__) */
323 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
324 struct syscall_arg *arg);
326 #define SCA_FD syscall_arg__scnprintf_fd
329 #define AT_FDCWD -100
/*
 * Beautify *at() dirfd arguments: AT_FDCWD prints as "CWD", anything else
 * as a regular fd.  (The AT_FDCWD comparison line is elided here.)
 */
332 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
333 struct syscall_arg *arg)
338 return scnprintf(bf, size, "CWD");
340 return syscall_arg__scnprintf_fd(bf, size, arg);
343 #define SCA_FDAT syscall_arg__scnprintf_fd_at
345 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
346 struct syscall_arg *arg);
348 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
/* Print an argument in hex ("%#lx") -- used for pointers/addresses. */
350 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
351 struct syscall_arg *arg)
353 return scnprintf(bf, size, "%#lx", arg->val);
356 #define SCA_HEX syscall_arg__scnprintf_hex
/* Print an argument as a signed decimal integer. */
358 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
359 struct syscall_arg *arg)
361 return scnprintf(bf, size, "%d", arg->val);
364 #define SCA_INT syscall_arg__scnprintf_int
366 static const char *bpf_cmd[] = {
367 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
368 "MAP_GET_NEXT_KEY", "PROG_LOAD",
370 static DEFINE_STRARRAY(bpf_cmd);
372 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
373 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
375 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
376 static DEFINE_STRARRAY(itimers);
378 static const char *keyctl_options[] = {
379 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
380 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
381 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
382 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
383 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
385 static DEFINE_STRARRAY(keyctl_options);
387 static const char *whences[] = { "SET", "CUR", "END",
395 static DEFINE_STRARRAY(whences);
397 static const char *fcntl_cmds[] = {
398 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
399 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
400 "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
403 static DEFINE_STRARRAY(fcntl_cmds);
405 static const char *rlimit_resources[] = {
406 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
407 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
410 static DEFINE_STRARRAY(rlimit_resources);
412 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
413 static DEFINE_STRARRAY(sighow);
415 static const char *clockid[] = {
416 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
417 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
418 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
420 static DEFINE_STRARRAY(clockid);
422 static const char *socket_families[] = {
423 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
424 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
425 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
426 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
427 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
428 "ALG", "NFC", "VSOCK",
430 static DEFINE_STRARRAY(socket_families);
432 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
433 struct syscall_arg *arg)
438 if (mode == F_OK) /* 0 */
439 return scnprintf(bf, size, "F");
441 if (mode & n##_OK) { \
442 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
452 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
457 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
459 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
460 struct syscall_arg *arg);
462 #define SCA_FILENAME syscall_arg__scnprintf_filename
464 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
465 struct syscall_arg *arg)
467 int printed = 0, flags = arg->val;
470 if (flags & O_##n) { \
471 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
480 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
485 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
487 #if defined(__i386__) || defined(__x86_64__)
489 * FIXME: Make this available to all arches.
491 #define TCGETS 0x5401
493 static const char *tioctls[] = {
494 "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
495 "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
496 "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
497 "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
498 "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
499 "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
500 "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
501 "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
502 "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
503 "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
504 "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
505 [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
506 "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
507 "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
508 "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
511 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
512 #endif /* defined(__i386__) || defined(__x86_64__) */
514 #ifndef GRND_NONBLOCK
515 #define GRND_NONBLOCK 0x0001
518 #define GRND_RANDOM 0x0002
521 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
522 struct syscall_arg *arg)
524 int printed = 0, flags = arg->val;
527 if (flags & GRND_##n) { \
528 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
529 flags &= ~GRND_##n; \
537 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
542 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
544 #define STRARRAY(arg, name, array) \
545 .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
546 .arg_parm = { [arg] = &strarray__##array, }
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
563 static struct syscall_fmt {
566 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
573 { .name = "access", .errmsg = true,
574 .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
575 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
576 { .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
577 { .name = "brk", .hexret = true,
578 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
579 { .name = "chdir", .errmsg = true, },
580 { .name = "chmod", .errmsg = true, },
581 { .name = "chroot", .errmsg = true, },
582 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
583 { .name = "clone", .errpid = true, },
584 { .name = "close", .errmsg = true,
585 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
586 { .name = "connect", .errmsg = true, },
587 { .name = "creat", .errmsg = true, },
588 { .name = "dup", .errmsg = true, },
589 { .name = "dup2", .errmsg = true, },
590 { .name = "dup3", .errmsg = true, },
591 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
592 { .name = "eventfd2", .errmsg = true,
593 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
594 { .name = "faccessat", .errmsg = true, },
595 { .name = "fadvise64", .errmsg = true, },
596 { .name = "fallocate", .errmsg = true, },
597 { .name = "fchdir", .errmsg = true, },
598 { .name = "fchmod", .errmsg = true, },
599 { .name = "fchmodat", .errmsg = true,
600 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
601 { .name = "fchown", .errmsg = true, },
602 { .name = "fchownat", .errmsg = true,
603 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
604 { .name = "fcntl", .errmsg = true,
605 .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
606 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
607 { .name = "fdatasync", .errmsg = true, },
608 { .name = "flock", .errmsg = true,
609 .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
610 { .name = "fsetxattr", .errmsg = true, },
611 { .name = "fstat", .errmsg = true, .alias = "newfstat", },
612 { .name = "fstatat", .errmsg = true, .alias = "newfstatat", },
613 { .name = "fstatfs", .errmsg = true, },
614 { .name = "fsync", .errmsg = true, },
615 { .name = "ftruncate", .errmsg = true, },
616 { .name = "futex", .errmsg = true,
617 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
618 { .name = "futimesat", .errmsg = true,
619 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
620 { .name = "getdents", .errmsg = true, },
621 { .name = "getdents64", .errmsg = true, },
622 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
623 { .name = "getpid", .errpid = true, },
624 { .name = "getpgid", .errpid = true, },
625 { .name = "getppid", .errpid = true, },
626 { .name = "getrandom", .errmsg = true,
627 .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
628 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
629 { .name = "getxattr", .errmsg = true, },
630 { .name = "inotify_add_watch", .errmsg = true, },
631 { .name = "ioctl", .errmsg = true,
633 #if defined(__i386__) || defined(__x86_64__)
635 * FIXME: Make this available to all arches.
637 [1] = SCA_STRHEXARRAY, /* cmd */
638 [2] = SCA_HEX, /* arg */ },
639 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
641 [2] = SCA_HEX, /* arg */ }, },
643 { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), },
644 { .name = "kill", .errmsg = true,
645 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
646 { .name = "lchown", .errmsg = true, },
647 { .name = "lgetxattr", .errmsg = true, },
648 { .name = "linkat", .errmsg = true,
649 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
650 { .name = "listxattr", .errmsg = true, },
651 { .name = "llistxattr", .errmsg = true, },
652 { .name = "lremovexattr", .errmsg = true, },
653 { .name = "lseek", .errmsg = true,
654 .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
655 .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
656 { .name = "lsetxattr", .errmsg = true, },
657 { .name = "lstat", .errmsg = true, .alias = "newlstat", },
658 { .name = "lsxattr", .errmsg = true, },
659 { .name = "madvise", .errmsg = true,
660 .arg_scnprintf = { [0] = SCA_HEX, /* start */
661 [2] = SCA_MADV_BHV, /* behavior */ }, },
662 { .name = "mkdir", .errmsg = true, },
663 { .name = "mkdirat", .errmsg = true,
664 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
665 { .name = "mknod", .errmsg = true, },
666 { .name = "mknodat", .errmsg = true,
667 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
668 { .name = "mlock", .errmsg = true,
669 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
670 { .name = "mlockall", .errmsg = true,
671 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
672 { .name = "mmap", .hexret = true,
673 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
674 [2] = SCA_MMAP_PROT, /* prot */
675 [3] = SCA_MMAP_FLAGS, /* flags */ }, },
676 { .name = "mprotect", .errmsg = true,
677 .arg_scnprintf = { [0] = SCA_HEX, /* start */
678 [2] = SCA_MMAP_PROT, /* prot */ }, },
679 { .name = "mq_unlink", .errmsg = true,
680 .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
681 { .name = "mremap", .hexret = true,
682 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
683 [3] = SCA_MREMAP_FLAGS, /* flags */
684 [4] = SCA_HEX, /* new_addr */ }, },
685 { .name = "munlock", .errmsg = true,
686 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
687 { .name = "munmap", .errmsg = true,
688 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
689 { .name = "name_to_handle_at", .errmsg = true,
690 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
691 { .name = "newfstatat", .errmsg = true,
692 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
693 { .name = "open", .errmsg = true,
694 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
695 { .name = "open_by_handle_at", .errmsg = true,
696 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
697 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
698 { .name = "openat", .errmsg = true,
699 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
700 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
701 { .name = "perf_event_open", .errmsg = true,
702 .arg_scnprintf = { [2] = SCA_INT, /* cpu */
703 [3] = SCA_FD, /* group_fd */
704 [4] = SCA_PERF_FLAGS, /* flags */ }, },
705 { .name = "pipe2", .errmsg = true,
706 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
707 { .name = "poll", .errmsg = true, .timeout = true, },
708 { .name = "ppoll", .errmsg = true, .timeout = true, },
709 { .name = "pread", .errmsg = true, .alias = "pread64", },
710 { .name = "preadv", .errmsg = true, .alias = "pread", },
711 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
712 { .name = "pwrite", .errmsg = true, .alias = "pwrite64", },
713 { .name = "pwritev", .errmsg = true, },
714 { .name = "read", .errmsg = true, },
715 { .name = "readlink", .errmsg = true, },
716 { .name = "readlinkat", .errmsg = true,
717 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
718 { .name = "readv", .errmsg = true, },
719 { .name = "recvfrom", .errmsg = true,
720 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
721 { .name = "recvmmsg", .errmsg = true,
722 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
723 { .name = "recvmsg", .errmsg = true,
724 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
725 { .name = "removexattr", .errmsg = true, },
726 { .name = "renameat", .errmsg = true,
727 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
728 { .name = "rmdir", .errmsg = true, },
729 { .name = "rt_sigaction", .errmsg = true,
730 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
731 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
732 { .name = "rt_sigqueueinfo", .errmsg = true,
733 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
734 { .name = "rt_tgsigqueueinfo", .errmsg = true,
735 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
736 { .name = "sched_getattr", .errmsg = true, },
737 { .name = "sched_setattr", .errmsg = true, },
738 { .name = "sched_setscheduler", .errmsg = true,
739 .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
740 { .name = "seccomp", .errmsg = true,
741 .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
742 [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
743 { .name = "select", .errmsg = true, .timeout = true, },
744 { .name = "sendmmsg", .errmsg = true,
745 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
746 { .name = "sendmsg", .errmsg = true,
747 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
748 { .name = "sendto", .errmsg = true,
749 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
750 { .name = "set_tid_address", .errpid = true, },
751 { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
752 { .name = "setpgid", .errmsg = true, },
753 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
754 { .name = "setxattr", .errmsg = true, },
755 { .name = "shutdown", .errmsg = true, },
756 { .name = "socket", .errmsg = true,
757 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
758 [1] = SCA_SK_TYPE, /* type */ },
759 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
760 { .name = "socketpair", .errmsg = true,
761 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
762 [1] = SCA_SK_TYPE, /* type */ },
763 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
764 { .name = "stat", .errmsg = true, .alias = "newstat", },
765 { .name = "statfs", .errmsg = true, },
766 { .name = "statx", .errmsg = true,
767 .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
768 [2] = SCA_STATX_FLAGS, /* flags */
769 [3] = SCA_STATX_MASK, /* mask */ }, },
770 { .name = "swapoff", .errmsg = true,
771 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
772 { .name = "swapon", .errmsg = true,
773 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
774 { .name = "symlinkat", .errmsg = true,
775 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
776 { .name = "tgkill", .errmsg = true,
777 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
778 { .name = "tkill", .errmsg = true,
779 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
780 { .name = "truncate", .errmsg = true, },
781 { .name = "uname", .errmsg = true, .alias = "newuname", },
782 { .name = "unlinkat", .errmsg = true,
783 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
784 { .name = "utime", .errmsg = true, },
785 { .name = "utimensat", .errmsg = true,
786 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
787 { .name = "utimes", .errmsg = true, },
788 { .name = "vmsplice", .errmsg = true, },
789 { .name = "wait4", .errpid = true,
790 .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
791 { .name = "waitid", .errpid = true,
792 .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
793 { .name = "write", .errmsg = true, },
794 { .name = "writev", .errmsg = true, },
/* bsearch() comparator: compare a syscall name against a table entry. */
797 static int syscall_fmt__cmp(const void *name, const void *fmtp)
799 const struct syscall_fmt *fmt = fmtp;
800 return strcmp(name, fmt->name);
/* Binary-search syscall_fmts[] -- which must stay sorted by name -- for 'name'. */
803 static struct syscall_fmt *syscall_fmt__find(const char *name)
805 const int nmemb = ARRAY_SIZE(syscall_fmts);
806 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
/* Fragment of 'struct syscall' (name, nr_args and other members elided). */
810 struct event_format *tp_format;
812 struct format_field *args;
815 struct syscall_fmt *fmt;
816 size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
821 * We need to have this 'calculated' boolean because in some cases we really
822 * don't know what is the duration of a syscall, for instance, when we start
823 * a session and some threads are waiting for a syscall to finish, say 'poll',
824 * in which case all we can do is to print "( ? ) for duration and for the
/*
 * Print "(  d.ddd ms): ", color-coding the duration by magnitude
 * (>= 1 ms red, >= 0.01 ms yellow, otherwise normal), or " ? " when the
 * duration could not be computed (see the 'calculated' comment above).
 */
827 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
829 double duration = (double)t / NSEC_PER_MSEC; /* 't' arrives in nanoseconds */
830 size_t printed = fprintf(fp, "(");
833 printed += fprintf(fp, " ? ");
834 else if (duration >= 1.0)
835 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
836 else if (duration >= 0.01)
837 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
839 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
840 return printed + fprintf(fp, "): ");
844 * filename.ptr: The filename char pointer that will be vfs_getname'd
845 * filename.entry_str_pos: Where to insert the string translated from
846 * filename.ptr by the vfs_getname tracepoint/kprobe.
/* Per-thread tracing state (several members elided from this listing). */
848 struct thread_trace {
851 unsigned long nr_events;
852 unsigned long pfmaj, pfmin; /* major/minor page-fault counters */
857 short int entry_str_pos;
859 unsigned int namelen;
867 struct intlist *syscall_stats;
/* Allocate a zeroed thread_trace; paths.max == -1 means "no fds cached yet". */
870 static struct thread_trace *thread_trace__new(void)
872 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
875 ttrace->paths.max = -1;
877 ttrace->syscall_stats = intlist__new(NULL);
/*
 * Lazily attach a thread_trace to 'thread' via its priv pointer; on
 * allocation failure print the "dropping samples" warning to 'fp' and
 * (per the elided tail of this function) return NULL.
 */
882 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
884 struct thread_trace *ttrace;
889 if (thread__priv(thread) == NULL)
890 thread__set_priv(thread, thread_trace__new());
892 if (thread__priv(thread) == NULL) /* still NULL: thread_trace__new() failed */
895 ttrace = thread__priv(thread);
900 color_fprintf(fp, PERF_COLOR_RED,
901 "WARNING: not enough memory, dropping samples!\n");
905 #define TRACE_PFMAJ (1 << 0)
906 #define TRACE_PFMIN (1 << 1)
908 static const size_t trace__entry_str_size = 2048;
/*
 * Record 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table on demand and zeroing the newly exposed slots.  Returns 0 on
 * success, -1 when the strdup() failed.  (The realloc-failure path is
 * elided from this listing.)
 */
910 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
912 struct thread_trace *ttrace = thread__priv(thread);
914 if (fd > ttrace->paths.max) {
915 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
920 if (ttrace->paths.max != -1) {
921 memset(npath + ttrace->paths.max + 1, 0,
922 (fd - ttrace->paths.max) * sizeof(char *));
924 memset(npath, 0, (fd + 1) * sizeof(char *)); /* first growth: whole table is new */
927 ttrace->paths.table = npath;
928 ttrace->paths.max = fd;
931 ttrace->paths.table[fd] = strdup(pathname);
933 return ttrace->paths.table[fd] != NULL ? 0 : -1;
/*
 * Resolve 'fd' to a filesystem path by readlink()ing the thread's
 * /proc/<pid>/fd/<fd> symlink (or the /proc/<pid>/task/<tid>/fd/<fd> form
 * for non-leader threads) and cache the result via trace__set_fd_pathname().
 */
936 static int thread__read_fd_path(struct thread *thread, int fd)
938 char linkname[PATH_MAX], pathname[PATH_MAX];
942 if (thread->pid_ == thread->tid) { /* thread-group leader: no /task/ component */
943 scnprintf(linkname, sizeof(linkname),
944 "/proc/%d/fd/%d", thread->pid_, fd);
946 scnprintf(linkname, sizeof(linkname),
947 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
950 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
953 ret = readlink(linkname, pathname, sizeof(pathname));
955 if (ret < 0 || ret > st.st_size) /* link changed between lstat() and readlink() */
958 pathname[ret] = '\0'; /* readlink() does not NUL-terminate */
959 return trace__set_fd_pathname(thread, fd, pathname);
/*
 * Return the cached path for 'fd', reading it from /proc on first use and
 * counting those lookups in trace->stats.proc_getname.  (Early-return
 * guard clauses are elided from this listing.)
 */
962 static const char *thread__fd_path(struct thread *thread, int fd,
965 struct thread_trace *ttrace = thread__priv(thread);
973 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
976 ++trace->stats.proc_getname;
977 if (thread__read_fd_path(thread, fd))
981 return ttrace->paths.table[fd];
/* Print an fd argument as "%d<path>" when its path is known. */
984 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
985 struct syscall_arg *arg)
988 size_t printed = scnprintf(bf, size, "%d", fd);
989 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
992 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
/*
 * close(fd) beautifier: print like a regular fd, then drop the cached path,
 * since the fd number may later be reused for a different file.
 */
997 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
998 struct syscall_arg *arg)
1001 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1002 struct thread_trace *ttrace = thread__priv(arg->thread);
1004 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1005 zfree(&ttrace->paths.table[fd]);
/*
 * Remember where in the thread's entry string this filename argument was
 * printed, so a later vfs_getname event can splice the resolved name in.
 */
1010 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1013 struct thread_trace *ttrace = thread__priv(thread);
1015 ttrace->filename.ptr = ptr;
1016 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
/*
 * Filename beautifier: without the vfs_getname probe just print the
 * user-space pointer; with it, record the buffer position for later fixup.
 */
1019 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1020 struct syscall_arg *arg)
1022 unsigned long ptr = arg->val;
1024 if (!arg->trace->vfs_getname)
1025 return scnprintf(bf, size, "%#x", ptr);
1027 thread__set_filename_pos(arg->thread, bf, ptr);
/* True when the syscall took less than the duration_filter threshold (ms). */
1031 static bool trace__filter_duration(struct trace *trace, double t)
1033 return t < (trace->duration_filter * NSEC_PER_MSEC);
/* Print a timestamp in ms, relative to trace->base_time. */
1036 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1038 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1040 return fprintf(fp, "%10.3f ", ts);
1044 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1045 * using ttrace->entry_time for a thread that receives a sys_exit without
1046 * first having received a sys_enter ("poll" issued before tracing session
1047 * starts, lost sys_enter exit due to ring buffer overflow).
1049 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1052 return __trace__fprintf_tstamp(trace, tstamp, fp);
1054 return fprintf(fp, " ? "); /* undefined tstamp: align with a placeholder */
/* Main-loop termination flags, set from the signal handler below. */
1057 static bool done = false;
1058 static bool interrupted = false;
/*
 * Signal handler: flags termination; 'interrupted' is set only for SIGINT
 * (the 'done = true' line is elided from this listing).
 */
1060 static void sig_handler(int sig)
1063 interrupted = sig == SIGINT;
/*
 * Print the common per-event line prefix: timestamp, duration and -- when
 * tracing multiple threads -- the tid, with the comm prepended when
 * trace->show_comm is set.
 */
1066 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1067 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1069 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1070 printed += fprintf_duration(duration, duration_calculated, fp);
1072 if (trace->multiple_threads) {
1073 if (trace->show_comm)
1074 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1075 printed += fprintf(fp, "%d ", thread->tid);
/*
 * Dispatch a side-band event to the machine code, shouting (in red) about
 * PERF_RECORD_LOST events since that means samples were dropped.
 */
1081 static int trace__process_event(struct trace *trace, struct machine *machine,
1082 union perf_event *event, struct perf_sample *sample)
1086 switch (event->header.type) {
1087 case PERF_RECORD_LOST:
1088 color_fprintf(trace->output, PERF_COLOR_RED,
1089 "LOST %" PRIu64 " events!\n", event->lost.lost);
1090 ret = machine__process_lost_event(machine, event, sample);
1093 ret = machine__process_event(machine, event, sample);
/* perf_tool callback shim: recover 'struct trace' from the embedded tool. */
1100 static int trace__tool_process(struct perf_tool *tool,
1101 union perf_event *event,
1102 struct perf_sample *sample,
1103 struct machine *machine)
1105 struct trace *trace = container_of(tool, struct trace, tool);
1106 return trace__process_event(trace, machine, event, sample);
/*
 * Kernel-address resolver wrapper: warn once per machine when
 * kptr_restrict hides /proc/kallsyms addresses, then delegate to
 * machine__resolve_kernel_addr().
 */
1109 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1111 struct machine *machine = vmachine;
1113 if (machine->kptr_restrict_warned)
1116 if (symbol_conf.kptr_restrict) {
1117 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1118 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1119 "Kernel samples will not be resolved.\n");
1120 machine->kptr_restrict_warned = true;
1124 return machine__resolve_kernel_addr(vmachine, addrp, modp);
/*
 * Initialize symbol handling, create the host machine, register the
 * kptr_restrict-aware resolver and synthesize records for the threads in
 * 'evlist'.  (Error-path labels and returns are elided from this listing.)
 */
1127 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1129 int err = symbol__init(NULL);
1134 trace->host = machine__new_host();
1135 if (trace->host == NULL)
1138 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1141 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1142 evlist->threads, trace__tool_process, false,
1143 trace->opts.proc_map_timeout);
/*
 * Allocate and fill sc->arg_scnprintf, one pretty-printer per syscall
 * argument. A per-syscall table (sc->fmt) takes precedence; otherwise
 * printers are picked heuristically from the tracepoint field's type
 * and name: "const char *" filename-like args, raw pointers (hex),
 * pid_t, umode_t, and integer fields whose name ends in "fd".
 */
1150 static int syscall__set_arg_fmts(struct syscall *sc)
1152 struct format_field *field;
1155 sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1156 if (sc->arg_scnprintf == NULL)
1160 sc->arg_parm = sc->fmt->arg_parm;
1162 for (field = sc->args; field; field = field->next) {
/* Explicit per-syscall formatter wins over heuristics */
1163 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1164 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1165 else if (strcmp(field->type, "const char *") == 0 &&
1166 (strcmp(field->name, "filename") == 0 ||
1167 strcmp(field->name, "path") == 0 ||
1168 strcmp(field->name, "pathname") == 0))
1169 sc->arg_scnprintf[idx] = SCA_FILENAME;
1170 else if (field->flags & FIELD_IS_POINTER)
1171 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1172 else if (strcmp(field->type, "pid_t") == 0)
1173 sc->arg_scnprintf[idx] = SCA_PID;
1174 else if (strcmp(field->type, "umode_t") == 0)
1175 sc->arg_scnprintf[idx] = SCA_MODE_T;
/* Integer fields named "*fd": render as file descriptors */
1176 else if ((strcmp(field->type, "int") == 0 ||
1177 strcmp(field->type, "unsigned int") == 0 ||
1178 strcmp(field->type, "long") == 0) &&
1179 (len = strlen(field->name)) >= 2 &&
1180 strcmp(field->name + len - 2, "fd") == 0) {
1182 * /sys/kernel/tracing/events/syscalls/sys_enter*
1183 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1188 sc->arg_scnprintf[idx] = SCA_FD;
/*
 * Lazily populate trace->syscalls.table[id]: grow the table if this id
 * is beyond the current max (zeroing the newly exposed slots), look up
 * the syscall's tracepoint format (falling back to a per-syscall alias
 * name if present), strip the leading syscall-number field, and install
 * the per-argument formatters via syscall__set_arg_fmts().
 */
1196 static int trace__read_syscall_info(struct trace *trace, int id)
1200 const char *name = syscalltbl__name(trace->sctbl, id);
1205 if (id > trace->syscalls.max) {
1206 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1208 if (nsyscalls == NULL)
/* Zero only the slots between the old max and the new id ... */
1211 if (trace->syscalls.max != -1) {
1212 memset(nsyscalls + trace->syscalls.max + 1, 0,
1213 (id - trace->syscalls.max) * sizeof(*sc));
/* ... or the whole table on first allocation (max == -1) */
1215 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1218 trace->syscalls.table = nsyscalls;
1219 trace->syscalls.max = id;
1222 sc = trace->syscalls.table + id;
1225 sc->fmt = syscall_fmt__find(sc->name);
1227 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1228 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* Some syscalls' tracepoints use an alias name; retry with it */
1230 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1231 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1232 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1235 if (IS_ERR(sc->tp_format))
1238 sc->args = sc->tp_format->format.fields;
1239 sc->nr_args = sc->tp_format->format.nr_fields;
1241 * We need to check and discard the first variable '__syscall_nr'
1242 * or 'nr' that mean the syscall number. It is needless here.
1243 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1245 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1246 sc->args = sc->args->next;
/* exit/exit_group never return, so sys_exit will not be seen for them */
1250 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1252 return syscall__set_arg_fmts(sc);
/*
 * Translate the -e/--expr syscall-name qualifier list into syscall ids,
 * storing them in trace->ev_qualifier_ids. All invalid names are
 * reported together (comma separated) before failing, with a hint on
 * how to list valid syscall names; on failure the ids array is freed.
 */
1255 static int trace__validate_ev_qualifier(struct trace *trace)
1258 struct str_node *pos;
1260 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1261 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1262 sizeof(trace->ev_qualifier_ids.entries[0]));
1264 if (trace->ev_qualifier_ids.entries == NULL) {
1265 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1273 strlist__for_each_entry(pos, trace->ev_qualifier) {
1274 const char *sc = pos->s;
1275 int id = syscalltbl__id(trace->sctbl, sc);
/* Accumulate all bad names into one "Invalid syscall a, b, c" line */
1279 fputs("Error:\tInvalid syscall ", trace->output);
1282 fputs(", ", trace->output);
1285 fputs(sc, trace->output);
1288 trace->ev_qualifier_ids.entries[i++] = id;
1292 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1293 "\nHint:\tand: 'man syscalls'\n", trace->output);
1294 zfree(&trace->ev_qualifier_ids.entries);
1295 trace->ev_qualifier_ids.nr = 0;
1302 * args is to be interpreted as a series of longs but we need to handle
1303 * 8-byte unaligned accesses. args points to raw_data within the event
1304 * and raw_data is guaranteed to be 8-byte unaligned because it is
1305 * preceded by raw_size which is a u32. So we need to copy args to a temp
1306 * variable to read it. Most notably this avoids extended load instructions
1307 * on unaligned addresses
/*
 * Format a syscall's arguments into bf (at most size bytes), using the
 * per-argument pretty-printers installed by syscall__set_arg_fmts().
 * Zero-valued arguments are suppressed unless mapped to a string array.
 * When no tracepoint format could be read, fall back to printing the
 * raw long values. Returns the number of characters written.
 */
1310 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1311 unsigned char *args, struct trace *trace,
1312 struct thread *thread)
1318 if (sc->args != NULL) {
1319 struct format_field *field;
1321 struct syscall_arg arg = {
1328 for (field = sc->args; field;
1329 field = field->next, ++arg.idx, bit <<= 1) {
1333 /* special care for unaligned accesses */
1334 p = args + sizeof(unsigned long) * arg.idx;
1335 memcpy(&val, p, sizeof(val));
1338 * Suppress this argument if its value is zero and
1339 * and we don't have a string associated in an
/* ... strarray: index 0 may be a meaningful enum value */
1343 !(sc->arg_scnprintf &&
1344 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1345 sc->arg_parm[arg.idx]))
1348 printed += scnprintf(bf + printed, size - printed,
1349 "%s%s: ", printed ? ", " : "", field->name);
1350 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1353 arg.parm = sc->arg_parm[arg.idx];
1354 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1355 size - printed, &arg);
1357 printed += scnprintf(bf + printed, size - printed,
1361 } else if (IS_ERR(sc->tp_format)) {
1363 * If we managed to read the tracepoint /format file, then we
1364 * may end up not having any args, like with gettid(), so only
1365 * print the raw args when we didn't manage to read it.
1370 /* special care for unaligned accesses */
1371 p = args + sizeof(unsigned long) * i;
1372 memcpy(&val, p, sizeof(val));
1373 printed += scnprintf(bf + printed, size - printed,
1375 printed ? ", " : "", i, val);
/*
 * Signature for per-tracepoint sample handlers (trace__sys_enter,
 * trace__sys_exit, trace__vfs_getname, ...), stored in evsel->handler
 * and dispatched from trace__handle_event()/trace__process_sample().
 */
1383 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1384 union perf_event *event,
1385 struct perf_sample *sample);
/*
 * Return the struct syscall descriptor for the given id, reading it in
 * lazily via trace__read_syscall_info() on first use. Negative ids
 * (seen as "NR -1" sys_exit records, see comment below) are skipped
 * with a message only at higher verbosity. Returns NULL on any
 * failure, after printing a diagnostic to trace->output.
 */
1387 static struct syscall *trace__syscall_info(struct trace *trace,
1388 struct perf_evsel *evsel, int id)
1394 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1395 * before that, leaving at a higher verbosity level till that is
1396 * explained. Reproduced with plain ftrace with:
1398 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1399 * grep "NR -1 " /t/trace_pipe
1401 * After generating some load on the machine.
1405 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1406 id, perf_evsel__name(evsel), ++n);
/* Not yet cached? Read tracepoint format etc. for this id */
1411 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1412 trace__read_syscall_info(trace, id))
1415 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1418 return &trace->syscalls.table[id];
1422 fprintf(trace->output, "Problems reading syscall %d", id);
1423 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1424 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1425 fputs(" information\n", trace->output);
/*
 * Account one completed syscall into the per-thread, per-syscall-id
 * stats (used by the --summary output). The stats node for the id is
 * created on first use in ttrace->syscall_stats.
 */
1430 static void thread__update_stats(struct thread_trace *ttrace,
1431 int id, struct perf_sample *sample)
1433 struct int_node *inode;
1434 struct stats *stats;
1437 inode = intlist__findnew(ttrace->syscall_stats, id);
1441 stats = inode->priv;
1442 if (stats == NULL) {
1443 stats = malloc(sizeof(struct stats));
1447 inode->priv = stats;
/* Only compute a duration if we saw the matching sys_enter first */
1450 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1451 duration = sample->time - ttrace->entry_time;
1453 update_stats(stats, duration);
/*
 * If the current thread has a pending (unfinished) syscall entry line,
 * flush it with a trailing ") ..." marker — another event interleaved
 * before its sys_exit arrived. Clears entry_pending.
 */
1456 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1458 struct thread_trace *ttrace;
1462 if (trace->current == NULL)
1465 ttrace = thread__priv(trace->current);
1467 if (!ttrace->entry_pending)
1470 duration = sample->time - ttrace->entry_time;
1472 printed = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1473 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1474 ttrace->entry_pending = false;
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the
 * per-thread entry_str buffer. The line is printed immediately when
 * duration filtering/summary-only/min-stack are off; otherwise it is
 * kept pending so trace__sys_exit() can emit entry and exit together.
 */
1479 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1480 union perf_event *event __maybe_unused,
1481 struct perf_sample *sample)
1486 struct thread *thread;
1487 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1488 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1489 struct thread_trace *ttrace;
1494 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1495 ttrace = thread__trace(thread, trace->output);
1499 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
/* Lazily allocate the per-thread entry string buffer */
1501 if (ttrace->entry_str == NULL) {
1502 ttrace->entry_str = malloc(trace__entry_str_size);
1503 if (!ttrace->entry_str)
/* Flush any still-pending entry from a previous syscall first */
1507 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1508 trace__printf_interrupted_entry(trace, sample);
1510 ttrace->entry_time = sample->time;
1511 msg = ttrace->entry_str;
1512 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1514 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1515 args, trace, thread);
1518 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1519 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1520 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1523 ttrace->entry_pending = true;
1524 /* See trace__vfs_getname & trace__sys_exit */
1525 ttrace->filename.pending_open = false;
/* Track the most recently seen thread for interrupted-entry handling */
1528 if (trace->current != thread) {
1529 thread__put(trace->current);
1530 trace->current = thread__get(thread);
1534 thread__put(thread);
/*
 * Resolve the sample's callchain into the given cursor, bounded by
 * trace->max_stack. Returns non-zero (via the elided error path) when
 * either address resolution or callchain resolution fails.
 */
1538 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1539 struct perf_sample *sample,
1540 struct callchain_cursor *cursor)
1542 struct addr_location al;
1544 if (machine__resolve(trace->host, &al, sample) < 0 ||
1545 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
/*
 * Print the previously resolved callchain (in the global
 * callchain_cursor) to trace->output, one symbol per line.
 */
1551 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1553 /* TODO: user-configurable print_opts */
1554 const unsigned int print_opts = EVSEL__PRINT_SYM |
1556 EVSEL__PRINT_UNKNOWN_AS_ADDR;
/* 38 = left padding to line up under the trace output columns */
1558 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/*
 * raw_syscalls:sys_exit handler: compute the syscall duration, update
 * per-thread stats, associate a pending vfs_getname with the returned
 * fd for open-like syscalls, then print the completed line — either
 * appended to the pending entry or as a "continued" line — formatting
 * the return value per the syscall's fmt (errno name, timeout, hex,
 * child pid/comm). Optionally follows with the resolved callchain.
 */
1561 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1562 union perf_event *event __maybe_unused,
1563 struct perf_sample *sample)
1567 bool duration_calculated = false;
1568 struct thread *thread;
1569 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1570 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1571 struct thread_trace *ttrace;
1576 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1577 ttrace = thread__trace(thread, trace->output);
1582 thread__update_stats(ttrace, id, sample);
1584 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
/* open()-like success: remember fd -> pathname from vfs_getname */
1586 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1587 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1588 ttrace->filename.pending_open = false;
1589 ++trace->stats.vfs_getname;
1592 if (ttrace->entry_time) {
1593 duration = sample->time - ttrace->entry_time;
1594 if (trace__filter_duration(trace, duration))
1596 duration_calculated = true;
1597 } else if (trace->duration_filter)
1600 if (sample->callchain) {
1601 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1602 if (callchain_ret == 0) {
/* Too shallow a stack: skip this event per --min-stack */
1603 if (callchain_cursor.nr < trace->min_stack)
1609 if (trace->summary_only)
1612 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1614 if (ttrace->entry_pending) {
1615 fprintf(trace->output, "%-70s", ttrace->entry_str);
/* Entry was already printed earlier; mark this as its continuation */
1617 fprintf(trace->output, " ... [");
1618 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1619 fprintf(trace->output, "]: %s()", sc->name);
1622 if (sc->fmt == NULL) {
1624 fprintf(trace->output, ") = %ld", ret);
1625 } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1626 char bf[STRERR_BUFSIZE];
/* Both the symbolic errno name (ENOENT) and its message */
1627 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1628 *e = audit_errno_to_name(-ret);
1630 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1631 } else if (ret == 0 && sc->fmt->timeout)
1632 fprintf(trace->output, ") = 0 Timeout");
1633 else if (sc->fmt->hexret)
1634 fprintf(trace->output, ") = %#lx", ret);
/* Returns a pid (fork/clone/wait*): annotate with the child's comm */
1635 else if (sc->fmt->errpid) {
1636 struct thread *child = machine__find_thread(trace->host, ret, ret);
1638 if (child != NULL) {
1639 fprintf(trace->output, ") = %ld", ret);
1640 if (child->comm_set)
1641 fprintf(trace->output, " (%s)", thread__comm_str(child));
1647 fputc('\n', trace->output);
1649 if (callchain_ret > 0)
1650 trace__fprintf_callchain(trace, sample);
1651 else if (callchain_ret < 0)
1652 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1654 ttrace->entry_pending = false;
1657 thread__put(thread);
/*
 * probe:vfs_getname handler: copy the kernel-resolved pathname into
 * per-thread storage (growing the buffer as needed) and, when the
 * matching sys_enter line reserved a slot for it, splice the filename
 * into entry_str at the recorded position, truncating its head if it
 * does not fit. pending_open is consumed later by trace__sys_exit().
 */
1661 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1662 union perf_event *event __maybe_unused,
1663 struct perf_sample *sample)
1665 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1666 struct thread_trace *ttrace;
1667 size_t filename_len, entry_str_len, to_move;
1668 ssize_t remaining_space;
1670 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1675 ttrace = thread__priv(thread);
1679 filename_len = strlen(filename);
1680 if (filename_len == 0)
/* Grow the per-thread filename buffer to fit (plus NUL) */
1683 if (ttrace->filename.namelen < filename_len) {
1684 char *f = realloc(ttrace->filename.name, filename_len + 1);
1689 ttrace->filename.namelen = filename_len;
1690 ttrace->filename.name = f;
1693 strcpy(ttrace->filename.name, filename);
1694 ttrace->filename.pending_open = true;
/* No sys_enter placeholder waiting for this name? Done. */
1696 if (!ttrace->filename.ptr)
1699 entry_str_len = strlen(ttrace->entry_str);
1700 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1701 if (remaining_space <= 0)
/* Keep the tail of the path — the most informative part */
1704 if (filename_len > (size_t)remaining_space) {
1705 filename += filename_len - remaining_space;
1706 filename_len = remaining_space;
1709 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1710 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1711 memmove(pos + filename_len, pos, to_move);
1712 memcpy(pos, filename, filename_len);
1714 ttrace->filename.ptr = 0;
1715 ttrace->filename.entry_str_pos = 0;
1717 thread__put(thread);
/*
 * sched:sched_stat_runtime handler: accumulate on-CPU runtime (in ms)
 * both per thread and globally, for the thread summary. The trailing
 * fprintf is an error path, printed when the thread couldn't be set up.
 */
1722 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1723 union perf_event *event __maybe_unused,
1724 struct perf_sample *sample)
1726 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1727 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1728 struct thread *thread = machine__findnew_thread(trace->host,
1731 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1736 ttrace->runtime_ms += runtime_ms;
1737 trace->runtime_ms += runtime_ms;
1739 thread__put(thread);
1743 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1745 perf_evsel__strval(evsel, sample, "comm"),
1746 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1748 perf_evsel__intval(evsel, sample, "vruntime"));
/*
 * print_binary() callback for BPF output events: emit printable bytes
 * as-is and everything else as '.'; all other binary-printer ops
 * (addresses, padding, separators) are ignored.
 */
1752 static void bpf_output__printer(enum binary_printer_ops op,
1753 unsigned int val, void *extra)
1755 FILE *output = extra;
1756 unsigned char ch = (unsigned char)val;
1759 case BINARY_PRINT_CHAR_DATA:
1760 fprintf(output, "%c", isprint(ch) ? ch : '.');
1762 case BINARY_PRINT_DATA_BEGIN:
1763 case BINARY_PRINT_LINE_BEGIN:
1764 case BINARY_PRINT_ADDR:
1765 case BINARY_PRINT_NUM_DATA:
1766 case BINARY_PRINT_NUM_PAD:
1767 case BINARY_PRINT_SEP:
1768 case BINARY_PRINT_CHAR_PAD:
1769 case BINARY_PRINT_LINE_END:
1770 case BINARY_PRINT_DATA_END:
/*
 * Dump a BPF output event's raw payload to trace->output, 8 bytes per
 * line, using bpf_output__printer() for the character rendering.
 */
1776 static void bpf_output__fprintf(struct trace *trace,
1777 struct perf_sample *sample)
1779 print_binary(sample->raw_data, sample->raw_size, 8,
1780 bpf_output__printer, trace->output);
/*
 * Generic handler for --event tracepoints/BPF-output events: print a
 * timestamped "name:" line with either the hexdump of the BPF payload
 * or the libtraceevent-formatted tracepoint fields, honoring
 * --min-stack filtering and appending the callchain when resolved.
 */
1783 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1784 union perf_event *event __maybe_unused,
1785 struct perf_sample *sample)
1787 int callchain_ret = 0;
1789 if (sample->callchain) {
1790 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1791 if (callchain_ret == 0) {
1792 if (callchain_cursor.nr < trace->min_stack)
1798 trace__printf_interrupted_entry(trace, sample);
1799 trace__fprintf_tstamp(trace, sample->time, trace->output);
/* Keep columns aligned with syscall lines, which show a duration */
1801 if (trace->trace_syscalls)
1802 fprintf(trace->output, "( ): ");
1804 fprintf(trace->output, "%s:", evsel->name);
1806 if (perf_evsel__is_bpf_output(evsel)) {
1807 bpf_output__fprintf(trace, sample);
1808 } else if (evsel->tp_format) {
1809 event_format__fprintf(evsel->tp_format, sample->cpu,
1810 sample->raw_data, sample->raw_size,
1814 fprintf(trace->output, ")\n");
1816 if (callchain_ret > 0)
1817 trace__fprintf_callchain(trace, sample);
1818 else if (callchain_ret < 0)
1819 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
/*
 * Print a resolved address location for page fault output: optionally
 * "dso@" prefix, then "symbol+0xoffset" when resolved, falling back to
 * the raw map-relative or sample address.
 */
1824 static void print_location(FILE *f, struct perf_sample *sample,
1825 struct addr_location *al,
1826 bool print_dso, bool print_sym)
1829 if ((verbose > 0 || print_dso) && al->map)
1830 fprintf(f, "%s@", al->map->dso->long_name);
1832 if ((verbose > 0 || print_sym) && al->sym)
1833 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1834 al->addr - al->sym->start);
1836 fprintf(f, "0x%" PRIx64, al->addr);
1838 fprintf(f, "0x%" PRIx64, sample->addr);
/*
 * Page fault (maj/min) software event handler: print a line with the
 * faulting IP's symbol and the faulted address, resolving the latter
 * first as data (MAP__VARIABLE, map_type 'd') and falling back to code
 * (MAP__FUNCTION). Honors --min-stack and appends the callchain.
 */
1841 static int trace__pgfault(struct trace *trace,
1842 struct perf_evsel *evsel,
1843 union perf_event *event __maybe_unused,
1844 struct perf_sample *sample)
1846 struct thread *thread;
1847 struct addr_location al;
/* 'd' = data map; flipped elsewhere when resolved as code */
1848 char map_type = 'd';
1849 struct thread_trace *ttrace;
1851 int callchain_ret = 0;
1853 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1855 if (sample->callchain) {
1856 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1857 if (callchain_ret == 0) {
1858 if (callchain_cursor.nr < trace->min_stack)
1864 ttrace = thread__trace(thread, trace->output);
/* Major vs minor fault accounting (counter picked by attr.config) */
1868 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1873 if (trace->summary_only)
/* Resolve the instruction pointer that faulted */
1876 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1879 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1881 fprintf(trace->output, "%sfault [",
1882 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1885 print_location(trace->output, sample, &al, false, true);
1887 fprintf(trace->output, "] => ");
/* Resolve the faulted address: try data maps first ... */
1889 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
/* ... then executable maps if it wasn't a data address */
1893 thread__find_addr_location(thread, sample->cpumode,
1894 MAP__FUNCTION, sample->addr, &al);
1902 print_location(trace->output, sample, &al, true, false);
1904 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1906 if (callchain_ret > 0)
1907 trace__fprintf_callchain(trace, sample);
1908 else if (callchain_ret < 0)
1909 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1913 thread__put(thread);
/*
 * Record the first usable sample timestamp as the base for relative
 * timestamps (unless --time/full_time is in effect). Only trusts
 * sample->time when the evsel actually sampled PERF_SAMPLE_TIME.
 */
1917 static void trace__set_base_time(struct trace *trace,
1918 struct perf_evsel *evsel,
1919 struct perf_sample *sample)
1922 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1923 * and don't use sample->time unconditionally, we may end up having
1924 * some other event in the future without PERF_SAMPLE_TIME for good
1925 * reason, i.e. we may not be interested in its timestamps, just in
1926 * it taking place, picking some piece of information when it
1927 * appears in our event stream (vfs_getname comes to mind).
1929 if (trace->base_time == 0 && !trace->full_time &&
1930 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1931 trace->base_time = sample->time;
/*
 * perf_tool sample callback used in replay mode (perf.data): skip
 * filtered threads, establish the base time, then dispatch to the
 * evsel's registered tracepoint_handler.
 */
1934 static int trace__process_sample(struct perf_tool *tool,
1935 union perf_event *event,
1936 struct perf_sample *sample,
1937 struct perf_evsel *evsel,
1938 struct machine *machine __maybe_unused)
1940 struct trace *trace = container_of(tool, struct trace, tool);
1941 struct thread *thread;
1944 tracepoint_handler handler = evsel->handler;
1946 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1947 if (thread && thread__is_filtered(thread))
1950 trace__set_base_time(trace, evsel, sample);
1954 handler(trace, evsel, event, sample);
1957 thread__put(thread);
/*
 * 'perf trace record': build an argv for cmd_record() from fixed
 * record args plus the syscall tracepoints (raw_syscalls on current
 * kernels, syscalls:* on older ones like RHEL6), optional page-fault
 * events, and the user's remaining arguments.
 */
1961 static int trace__record(struct trace *trace, int argc, const char **argv)
1963 unsigned int rec_argc, i, j;
1964 const char **rec_argv;
1965 const char * const record_args[] = {
1972 const char * const sc_args[] = { "-e", };
1973 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1974 const char * const majpf_args[] = { "-e", "major-faults" };
1975 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1976 const char * const minpf_args[] = { "-e", "minor-faults" };
1977 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1979 /* +1 is for the event string below */
1980 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1981 majpf_args_nr + minpf_args_nr + argc;
1982 rec_argv = calloc(rec_argc + 1, sizeof(char *));
1984 if (rec_argv == NULL)
1988 for (i = 0; i < ARRAY_SIZE(record_args); i++)
1989 rec_argv[j++] = record_args[i];
1991 if (trace->trace_syscalls) {
1992 for (i = 0; i < sc_args_nr; i++)
1993 rec_argv[j++] = sc_args[i];
1995 /* event string may be different for older kernels - e.g., RHEL6 */
1996 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
1997 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
1998 else if (is_valid_tracepoint("syscalls:sys_enter"))
1999 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2001 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2006 if (trace->trace_pgfaults & TRACE_PFMAJ)
2007 for (i = 0; i < majpf_args_nr; i++)
2008 rec_argv[j++] = majpf_args[i];
2010 if (trace->trace_pgfaults & TRACE_PFMIN)
2011 for (i = 0; i < minpf_args_nr; i++)
2012 rec_argv[j++] = minpf_args[i];
/* Append the user's own arguments (workload, extra options) */
2014 for (i = 0; i < (unsigned int)argc; i++)
2015 rec_argv[j++] = argv[i];
2017 return cmd_record(j, rec_argv);
/* Forward declaration: summary printer used at the end of trace__run(). */
2020 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
/*
 * Try to add the probe:vfs_getname probe event (used to get pathnames
 * for open-like syscalls). Returns false when the probe is not set up
 * or lacks the "pathname" field; best-effort, callers can cope without.
 */
2022 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2024 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2029 if (perf_evsel__field(evsel, "pathname") == NULL) {
2030 perf_evsel__delete(evsel);
2034 evsel->handler = trace__vfs_getname;
2035 perf_evlist__add(evlist, evsel);
/*
 * Create a software page-fault evsel (config selects MAJ or MIN),
 * sampling every fault (period 1), with trace__pgfault as its handler.
 */
2039 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2041 struct perf_evsel *evsel;
2042 struct perf_event_attr attr = {
2043 .type = PERF_TYPE_SOFTWARE,
2047 attr.config = config;
2048 attr.sample_period = 1;
2050 event_attr_init(&attr);
2052 evsel = perf_evsel__new(&attr);
2054 evsel->handler = trace__pgfault;
/*
 * Live-mode event dispatcher: non-sample records go to
 * trace__process_event(); samples are matched to their evsel by id,
 * sanity-checked (tracepoints must carry a raw payload), and handed to
 * the evsel's tracepoint_handler.
 */
2059 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2061 const u32 type = event->header.type;
2062 struct perf_evsel *evsel;
2064 if (type != PERF_RECORD_SAMPLE) {
2065 trace__process_event(trace, trace->host, event, sample);
2069 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2070 if (evsel == NULL) {
2071 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2075 trace__set_base_time(trace, evsel, sample);
/* A tracepoint sample without raw data is malformed; report & skip */
2077 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2078 sample->raw_data == NULL) {
2079 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2080 perf_evsel__name(evsel), sample->tid,
2081 sample->cpu, sample->raw_size);
2083 tracepoint_handler handler = evsel->handler;
2084 handler(trace, evsel, event, sample);
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels
 * with their field accessors initialized, wiring the handlers and
 * stashing the evsels in trace->syscalls.events. With callchains on
 * and --kernel-syscall-graph off, kernel frames are excluded on
 * sys_exit since only the user path to the syscall is of interest.
 * Uses goto-cleanup to delete partially constructed evsels on failure.
 */
2088 static int trace__add_syscall_newtp(struct trace *trace)
2091 struct perf_evlist *evlist = trace->evlist;
2092 struct perf_evsel *sys_enter, *sys_exit;
2094 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2095 if (sys_enter == NULL)
2098 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2099 goto out_delete_sys_enter;
2101 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2102 if (sys_exit == NULL)
2103 goto out_delete_sys_enter;
2105 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2106 goto out_delete_sys_exit;
2108 perf_evlist__add(evlist, sys_enter);
2109 perf_evlist__add(evlist, sys_exit);
2111 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2113 * We're interested only in the user space callchain
2114 * leading to the syscall, allow overriding that for
2115 * debugging reasons using --kernel_syscall_callchains
2117 sys_exit->attr.exclude_callchain_kernel = 1;
2120 trace->syscalls.events.sys_enter = sys_enter;
2121 trace->syscalls.events.sys_exit = sys_exit;
2127 out_delete_sys_exit:
2128 perf_evsel__delete_priv(sys_exit);
2129 out_delete_sys_enter:
2130 perf_evsel__delete_priv(sys_enter);
/*
 * Turn the validated syscall-id qualifier list into an "id in (...)"
 * (or "id not in (...)" with --not) tracepoint filter expression and
 * append it to both the sys_enter and sys_exit evsels.
 */
2134 static int trace__set_ev_qualifier_filter(struct trace *trace)
2137 struct perf_evsel *sys_exit;
2138 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2139 trace->ev_qualifier_ids.nr,
2140 trace->ev_qualifier_ids.entries);
2145 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2147 sys_exit = trace->syscalls.events.sys_exit;
2148 err = perf_evsel__append_tp_filter(sys_exit, filter);
/*
 * Live tracing main loop. Sets up the evlist (syscall tracepoints,
 * vfs_getname, page faults, sched_stat_runtime), creates maps,
 * initializes symbols, configures callchains, prepares/starts an
 * optional forked workload, opens+mmaps the events, applies pid and
 * syscall-id filters, then drains the ring buffers until 'done' is
 * signalled, finishing with an optional thread summary. Error paths
 * at the bottom print a specific diagnostic then fall through to
 * evlist teardown.
 */
2159 static int trace__run(struct trace *trace, int argc, const char **argv)
2161 struct perf_evlist *evlist = trace->evlist;
2162 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2164 unsigned long before;
2165 const bool forks = argc > 0;
2166 bool draining = false;
2170 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2171 goto out_error_raw_syscalls;
2173 if (trace->trace_syscalls)
2174 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2176 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2177 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2178 if (pgfault_maj == NULL)
2180 perf_evlist__add(evlist, pgfault_maj);
2183 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2184 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2185 if (pgfault_min == NULL)
2187 perf_evlist__add(evlist, pgfault_min);
2191 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2192 trace__sched_stat_runtime)
2193 goto out_error_sched_stat_runtime;
2195 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2197 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2198 goto out_delete_evlist;
2201 err = trace__symbols_init(trace, evlist);
2203 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2204 goto out_delete_evlist;
2207 perf_evlist__config(evlist, &trace->opts, NULL);
2209 if (callchain_param.enabled) {
2210 bool use_identifier = false;
2212 if (trace->syscalls.events.sys_exit) {
2213 perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2214 &trace->opts, &callchain_param);
2215 use_identifier = true;
2219 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2220 use_identifier = true;
2224 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2225 use_identifier = true;
2228 if (use_identifier) {
2230 * Now we have evsels with different sample_ids, use
2231 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2232 * from a fixed position in each ring buffer record.
2234 * As of this the changeset introducing this comment, this
2235 * isn't strictly needed, as the fields that can come before
2236 * PERF_SAMPLE_ID are all used, but we'll probably disable
2237 * some of those for things like copying the payload of
2238 * pointer syscall arguments, and for vfs_getname we don't
2239 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2240 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2242 perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2243 perf_evlist__reset_sample_bit(evlist, ID);
2247 signal(SIGCHLD, sig_handler);
2248 signal(SIGINT, sig_handler);
/* Fork (but don't exec yet) the workload given on the command line */
2251 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2254 fprintf(trace->output, "Couldn't run the workload!\n");
2255 goto out_delete_evlist;
2259 err = perf_evlist__open(evlist);
2261 goto out_error_open;
2263 err = bpf__apply_obj_config();
2265 char errbuf[BUFSIZ];
2267 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2268 pr_err("ERROR: Apply config to BPF failed: %s\n",
2270 goto out_error_open;
2274 * Better not use !target__has_task() here because we need to cover the
2275 * case where no threads were specified in the command line, but a
2276 * workload was, and in that case we will fill in the thread_map when
2277 * we fork the workload in perf_evlist__prepare_workload.
2279 if (trace->filter_pids.nr > 0)
2280 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
/* System-wide: at least filter out perf trace's own pid */
2281 else if (thread_map__pid(evlist->threads, 0) == -1)
2282 err = perf_evlist__set_filter_pid(evlist, getpid());
2287 if (trace->ev_qualifier_ids.nr > 0) {
2288 err = trace__set_ev_qualifier_filter(trace);
2292 pr_debug("event qualifier tracepoint filter: %s\n",
2293 trace->syscalls.events.sys_exit->filter);
2296 err = perf_evlist__apply_filters(evlist, &evsel);
2298 goto out_error_apply_filters;
2300 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2302 goto out_error_mmap;
/* With --delay, enabling is deferred until after the workload starts */
2304 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2305 perf_evlist__enable(evlist);
2308 perf_evlist__start_workload(evlist);
2310 if (trace->opts.initial_delay) {
2311 usleep(trace->opts.initial_delay * 1000);
2312 perf_evlist__enable(evlist);
2315 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2316 evlist->threads->nr > 1 ||
2317 perf_evlist__first(evlist)->attr.inherit;
/* Main drain loop: remember the count to detect an idle iteration */
2319 before = trace->nr_events;
2321 for (i = 0; i < evlist->nr_mmaps; i++) {
2322 union perf_event *event;
2324 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2325 struct perf_sample sample;
2329 err = perf_evlist__parse_sample(evlist, event, &sample);
2331 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2335 trace__handle_event(trace, event, &sample);
2337 perf_evlist__mmap_consume(evlist, i);
/* Signalled: disable events, then drain what's left in the buffers */
2342 if (done && !draining) {
2343 perf_evlist__disable(evlist);
/* Nothing new this pass: poll (bounded once we're finishing) */
2349 if (trace->nr_events == before) {
2350 int timeout = done ? 100 : -1;
2352 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2353 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2363 thread__zput(trace->current);
2365 perf_evlist__disable(evlist);
2369 trace__fprintf_thread_summary(trace, trace->output);
2371 if (trace->show_tool_stats) {
2372 fprintf(trace->output, "Stats:\n "
2373 " vfs_getname : %" PRIu64 "\n"
2374 " proc_getname: %" PRIu64 "\n",
2375 trace->stats.vfs_getname,
2376 trace->stats.proc_getname);
2381 perf_evlist__delete(evlist);
2382 trace->evlist = NULL;
2383 trace->live = false;
2386 char errbuf[BUFSIZ];
2388 out_error_sched_stat_runtime:
2389 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2392 out_error_raw_syscalls:
2393 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2397 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2401 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2404 fprintf(trace->output, "%s\n", errbuf);
2405 goto out_delete_evlist;
2407 out_error_apply_filters:
2408 fprintf(trace->output,
2409 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2410 evsel->filter, perf_evsel__name(evsel), errno,
2411 str_error_r(errno, errbuf, sizeof(errbuf)));
2412 goto out_delete_evlist;
2415 fprintf(trace->output, "Not enough memory to run!\n");
2416 goto out_delete_evlist;
2419 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2420 goto out_delete_evlist;
/*
 * trace__replay - implement 'perf trace -i <perf.data>': read a previously
 * recorded session from file and feed its events through the same syscall /
 * pagefault handlers used for live tracing, then optionally print a summary.
 *
 * NOTE(review): this view of the file is elided — braces, error-handling
 * lines and returns between the visible statements are not shown here.
 */
2423 static int trace__replay(struct trace *trace)
2425 	const struct perf_evsel_str_handler handlers[] = {
2426 		{ "probe:vfs_getname", trace__vfs_getname, },
2428 	struct perf_data_file file = {
2430 		.mode = PERF_DATA_MODE_READ,
2431 		.force = trace->force,
2433 	struct perf_session *session;
2434 	struct perf_evsel *evsel;
	/* Wire up the tool callbacks needed to rebuild machine/thread state. */
2437 	trace->tool.sample	  = trace__process_sample;
2438 	trace->tool.mmap	  = perf_event__process_mmap;
2439 	trace->tool.mmap2	  = perf_event__process_mmap2;
2440 	trace->tool.comm	  = perf_event__process_comm;
2441 	trace->tool.exit	  = perf_event__process_exit;
2442 	trace->tool.fork	  = perf_event__process_fork;
2443 	trace->tool.attr	  = perf_event__process_attr;
2444 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2445 	trace->tool.build_id	  = perf_event__process_build_id;
2446 	trace->tool.namespaces	  = perf_event__process_namespaces;
	/* Replay must present events in timestamp order. */
2448 	trace->tool.ordered_events = true;
2449 	trace->tool.ordering_requires_timestamps = true;
2451 	/* add tid to output */
2452 	trace->multiple_threads = true;
2454 	session = perf_session__new(&file, false, &trace->tool);
2455 	if (session == NULL)
	/* Honor -p/-t filters recorded on the command line during replay. */
2458 	if (trace->opts.target.pid)
2459 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2461 	if (trace->opts.target.tid)
2462 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2464 	if (symbol__init(&session->header.env) < 0)
2467 	trace->host = &session->machines.host;
2469 	err = perf_session__set_tracepoints_handlers(session, handlers);
2473 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2474 						     "raw_syscalls:sys_enter");
2475 	/* older kernels have syscalls tp versus raw_syscalls */
2477 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2478 							     "syscalls:sys_enter");
2481 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2482 	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2483 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
	/* Same lookup/fallback dance for the syscall-exit tracepoint. */
2487 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2488 						     "raw_syscalls:sys_exit");
2490 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2491 							     "syscalls:sys_exit");
2493 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2494 	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2495 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
	/* Route any recorded software page-fault events to the pgfault printer. */
2499 	evlist__for_each_entry(session->evlist, evsel) {
2500 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2501 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2502 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2503 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2504 			evsel->handler = trace__pgfault;
2509 	err = perf_session__process_events(session);
2511 		pr_err("Failed to process events, error %d", err);
2513 	else if (trace->summary)
2514 		trace__fprintf_thread_summary(trace, trace->output);
2517 	perf_session__delete(session);
/*
 * trace__fprintf_threads_header - print the banner preceding the per-thread
 * summary table; returns the number of characters printed (fprintf result).
 */
2522 static size_t trace__fprintf_threads_header(FILE *fp)
2526 	printed  = fprintf(fp, "\n Summary of events:\n\n");
/*
 * Re-sort helper (see rb_resort.h): builds a secondary rbtree of per-syscall
 * statistics ordered by total time spent ('msecs', descending) so the summary
 * lists the most expensive syscalls first.  'entry' and 'nd' are identifiers
 * supplied by the DEFINE_RESORT_RB() macro body.
 */
2531 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2532 	struct stats 	*stats;
	/* Source nodes come from an intlist keyed by syscall id. */
2537 	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2538 	struct stats *stats = source->priv;
2540 	entry->syscall = source->i;
2541 	entry->stats   = stats;
	/* Total time = call count * average duration, converted ns -> msec. */
2542 	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
/*
 * thread__dump_stats - print one thread's per-syscall statistics table
 * (calls, total/min/avg/max time in msec, stddev %), sorted by total time.
 * Returns the number of characters printed.
 */
2545 static size_t thread__dump_stats(struct thread_trace *ttrace,
2546 				 struct trace *trace, FILE *fp)
	/* Re-sort the intlist of per-syscall stats by total time (see above). */
2551 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2553 	if (syscall_stats == NULL)
2556 	printed += fprintf(fp, "\n");
2558 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2559 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2560 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2562 	resort_rb__for_each_entry(nd, syscall_stats) {
2563 		struct stats *stats = syscall_stats_entry->stats;
			/* All durations are collected in ns; report in msec. */
2565 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2566 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2567 			double avg = avg_stats(stats);
2569 			u64 n = (u64) stats->n;
			/* Relative stddev as a percentage of the average. */
2571 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2572 			avg /= NSEC_PER_MSEC;
2574 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2575 			printed += fprintf(fp, "   %-15s", sc->name);
2576 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2577 					   n, syscall_stats_entry->msecs, min, avg);
2578 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2582 	resort_rb__delete(syscall_stats);
2583 	printed += fprintf(fp, "\n\n");
/*
 * trace__fprintf_thread - print one thread's summary line (comm, tid, event
 * count, share of all events, major/minor fault counts, runtime) followed by
 * its per-syscall stats table.  Returns characters printed.
 */
2588 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2591 	struct thread_trace *ttrace = thread__priv(thread);
	/* This thread's share of every event seen in the session. */
2597 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2599 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2600 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2601 	printed += fprintf(fp, "%.1f%%", ratio);
2603 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2605 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2607 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2608 	else if (fputc('\n', fp) != EOF)
2611 	printed += thread__dump_stats(ttrace, trace, fp);
/*
 * thread__nr_events - number of events accounted to a thread; tolerates a
 * NULL thread_trace (threads with no private state yet) by returning 0.
 */
2616 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2618 	return ttrace ? ttrace->nr_events : 0;
/*
 * Re-sort helper: orders the machine's thread rbtree by event count
 * (busiest thread first) for the summary.  The comparator uses '<' with the
 * resort machinery so that higher nr_events sorts earlier.
 */
2621 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2622 	struct thread *thread;
2625 	entry->thread = rb_entry(nd, struct thread, rb_node);
/*
 * trace__fprintf_thread_summary - print the end-of-run summary: a header
 * followed by one entry per traced thread, busiest thread first.
 * Returns characters printed.
 */
2628 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
	/* Re-sorted copy of the host machine's threads, by nr_events. */
2630 	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2631 	size_t printed = trace__fprintf_threads_header(fp);
2634 	if (threads == NULL) {
2635 		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2639 	resort_rb__for_each_entry(nd, threads)
2640 		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2642 	resort_rb__delete(threads);
2647 static int trace__set_duration(const struct option *opt, const char *str,
2648 int unset __maybe_unused)
2650 struct trace *trace = opt->value;
2652 trace->duration_filter = atof(str);
/*
 * trace__set_filter_pids - parse_options() callback for --filter-pids: parse
 * a CSV list of pids into trace->filter_pids.  Slot 0 is always the tracer's
 * own pid so perf trace never traces itself.
 */
2656 static int trace__set_filter_pids(const struct option *opt, const char *str,
2657 				  int unset __maybe_unused)
2661 	struct trace *trace = opt->value;
2663 	 * FIXME: introduce a intarray class, plain parse csv and create a
2664 	 * { int nr, int entries[] } struct...
2666 	struct intlist *list = intlist__new(str);
	/* +1 reserves the extra slot for our own pid (entries[0] below). */
2671 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2672 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2674 	if (trace->filter_pids.entries == NULL)
2677 	trace->filter_pids.entries[0] = getpid();
2679 	for (i = 1; i < trace->filter_pids.nr; ++i)
2680 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
	/* The temporary intlist is no longer needed once copied. */
2682 	intlist__delete(list);
/*
 * trace__open_output - open the -o/--output file for writing.  If the file
 * already exists and is non-empty it is first renamed to "<name>.old" so a
 * previous run's output is preserved.  Returns 0 on success, -errno if the
 * fopen() fails.
 *
 * NOTE(review): the rename() result is not checked here — on failure we
 * would silently overwrite the old output; confirm whether that is intended.
 */
2688 static int trace__open_output(struct trace *trace, const char *filename)
2692 	if (!stat(filename, &st) && st.st_size) {
2693 		char oldname[PATH_MAX];
2695 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2697 		rename(filename, oldname);
2700 	trace->output = fopen(filename, "w");
2702 	return trace->output == NULL ? -errno : 0;
/*
 * parse_pagefaults - parse_options() callback for -F/--pf: translate the
 * "all" / "maj" / "min" keyword into TRACE_PFMAJ/TRACE_PFMIN bits OR'ed
 * into the caller-supplied flag word (so repeated options accumulate).
 */
2705 static int parse_pagefaults(const struct option *opt, const char *str,
2706 			    int unset __maybe_unused)
2708 	int *trace_pgfaults = opt->value;
2710 	if (strcmp(str, "all") == 0)
2711 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2712 	else if (strcmp(str, "maj") == 0)
2713 		*trace_pgfaults |= TRACE_PFMAJ;
2714 	else if (strcmp(str, "min") == 0)
2715 		*trace_pgfaults |= TRACE_PFMIN;
/*
 * evlist__set_evsel_handler - assign the same sample handler to every
 * event selector in the evlist.
 */
2722 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2724 	struct perf_evsel *evsel;
2726 	evlist__for_each_entry(evlist, evsel)
2727 		evsel->handler = handler;
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
/*
 * trace__parse_events_option - parse_options() callback for -e/--event and
 * --expr: walk the comma-separated list in @str, routing each term either to
 * the syscall qualifier list (known syscall names or strace group files under
 * STRACE_GROUPS_DIR) or to the regular perf event parser.
 */
2738 static int trace__parse_events_option(const struct option *opt, const char *str,
2739 				      int unset __maybe_unused)
2741 	struct trace *trace = (struct trace *)opt->value;
2742 	const char *s = str;
2743 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2744 	int len = strlen(str), err = -1, list;
2745 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2746 	char group_name[PATH_MAX];
2748 	if (strace_groups_dir == NULL)
	/* A leading '!' negates the whole syscall qualifier list. */
2753 		trace->not_ev_qualifier = true;
	/* Split on ',' and classify each term individually. */
2757 		if ((sep = strchr(s, ',')) != NULL)
	/* Known syscall name -> goes to the qualifier list (lists[1]). */
2761 		if (syscalltbl__id(trace->sctbl, s) >= 0) {
	/* Or a readable strace-style group file under strace_groups_dir. */
2764 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2765 			if (access(group_name, R_OK) == 0)
2770 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2772 			lists[list] = malloc(len);
2773 			if (lists[list] == NULL)
2775 			strcpy(lists[list], s);
2785 	if (lists[1] != NULL) {
2786 		struct strlist_config slist_config = {
2787 			.dirname = strace_groups_dir,
2790 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2791 		if (trace->ev_qualifier == NULL) {
2792 			fputs("Not enough memory to parse event qualifier", trace->output);
2796 		if (trace__validate_ev_qualifier(trace))
	/* Everything else (lists[0]) is handed to the stock -e parser. */
2803 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2804 					       "event selector. use 'perf list' to list available events",
2805 					       parse_events_option);
2806 		err = parse_events_option(&o, lists[0], 0);
2815 int cmd_trace(int argc, const char **argv)
2817 const char *trace_usage[] = {
2818 "perf trace [<options>] [<command>]",
2819 "perf trace [<options>] -- <command> [<options>]",
2820 "perf trace record [<options>] [<command>]",
2821 "perf trace record [<options>] -- <command> [<options>]",
2824 struct trace trace = {
2833 .user_freq = UINT_MAX,
2834 .user_interval = ULLONG_MAX,
2835 .no_buffering = true,
2836 .mmap_pages = UINT_MAX,
2837 .proc_map_timeout = 500,
2841 .trace_syscalls = true,
2842 .kernel_syscallchains = false,
2843 .max_stack = UINT_MAX,
2845 const char *output_name = NULL;
2846 const struct option trace_options[] = {
2847 OPT_CALLBACK('e', "event", &trace, "event",
2848 "event/syscall selector. use 'perf list' to list available events",
2849 trace__parse_events_option),
2850 OPT_BOOLEAN(0, "comm", &trace.show_comm,
2851 "show the thread COMM next to its id"),
2852 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2853 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2854 trace__parse_events_option),
2855 OPT_STRING('o', "output", &output_name, "file", "output file name"),
2856 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2857 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2858 "trace events on existing process id"),
2859 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2860 "trace events on existing thread id"),
2861 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2862 "pids to filter (by the kernel)", trace__set_filter_pids),
2863 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2864 "system-wide collection from all CPUs"),
2865 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2866 "list of cpus to monitor"),
2867 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2868 "child tasks do not inherit counters"),
2869 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2870 "number of mmap data pages",
2871 perf_evlist__parse_mmap_pages),
2872 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2874 OPT_CALLBACK(0, "duration", &trace, "float",
2875 "show only events with duration > N.M ms",
2876 trace__set_duration),
2877 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2878 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2879 OPT_BOOLEAN('T', "time", &trace.full_time,
2880 "Show full timestamp, not time relative to first start"),
2881 OPT_BOOLEAN('s', "summary", &trace.summary_only,
2882 "Show only syscall summary with statistics"),
2883 OPT_BOOLEAN('S', "with-summary", &trace.summary,
2884 "Show all syscalls and summary with statistics"),
2885 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2886 "Trace pagefaults", parse_pagefaults, "maj"),
2887 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2888 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2889 OPT_CALLBACK(0, "call-graph", &trace.opts,
2890 "record_mode[,record_size]", record_callchain_help,
2891 &record_parse_callchain_opt),
2892 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2893 "Show the kernel callchains on the syscall exit path"),
2894 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2895 "Set the minimum stack depth when parsing the callchain, "
2896 "anything below the specified depth will be ignored."),
2897 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2898 "Set the maximum stack depth when parsing the callchain, "
2899 "anything beyond the specified depth will be ignored. "
2900 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2901 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2902 "per thread proc mmap processing timeout in ms"),
2903 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2904 "ms to wait before starting measurement after program "
2908 bool __maybe_unused max_stack_user_set = true;
2909 bool mmap_pages_user_set = true;
2910 const char * const trace_subcommands[] = { "record", NULL };
2914 signal(SIGSEGV, sighandler_dump_stack);
2915 signal(SIGFPE, sighandler_dump_stack);
2917 trace.evlist = perf_evlist__new();
2918 trace.sctbl = syscalltbl__new();
2920 if (trace.evlist == NULL || trace.sctbl == NULL) {
2921 pr_err("Not enough memory to run!\n");
2926 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2927 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2929 err = bpf__setup_stdout(trace.evlist);
2931 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2932 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2938 if (trace.trace_pgfaults) {
2939 trace.opts.sample_address = true;
2940 trace.opts.sample_time = true;
2943 if (trace.opts.mmap_pages == UINT_MAX)
2944 mmap_pages_user_set = false;
2946 if (trace.max_stack == UINT_MAX) {
2947 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2948 max_stack_user_set = false;
2951 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2952 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2953 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2956 if (callchain_param.enabled) {
2957 if (!mmap_pages_user_set && geteuid() == 0)
2958 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2960 symbol_conf.use_callchain = true;
2963 if (trace.evlist->nr_entries > 0)
2964 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2966 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2967 return trace__record(&trace, argc-1, &argv[1]);
2969 /* summary_only implies summary option, but don't overwrite summary if set */
2970 if (trace.summary_only)
2971 trace.summary = trace.summary_only;
2973 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2974 trace.evlist->nr_entries == 0 /* Was --events used? */) {
2975 pr_err("Please specify something to trace.\n");
2979 if (!trace.trace_syscalls && trace.ev_qualifier) {
2980 pr_err("The -e option can't be used with --no-syscalls.\n");
2984 if (output_name != NULL) {
2985 err = trace__open_output(&trace, output_name);
2987 perror("failed to create output file");
2992 trace.open_id = syscalltbl__id(trace.sctbl, "open");
2994 err = target__validate(&trace.opts.target);
2996 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2997 fprintf(trace.output, "%s", bf);
3001 err = target__parse_uid(&trace.opts.target);
3003 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3004 fprintf(trace.output, "%s", bf);
3008 if (!argc && target__none(&trace.opts.target))
3009 trace.opts.target.system_wide = true;
3012 err = trace__replay(&trace);
3014 err = trace__run(&trace, argc, argv);
3017 if (output_name != NULL)
3018 fclose(trace.output);